diff --git a/deploy/charts/testgen-services/Chart.yaml b/deploy/charts/testgen-services/Chart.yaml
index 79704ab7..8e98b830 100644
--- a/deploy/charts/testgen-services/Chart.yaml
+++ b/deploy/charts/testgen-services/Chart.yaml
@@ -20,7 +20,7 @@ dependencies:
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
+version: 0.1.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/deploy/charts/testgen-services/values.yaml b/deploy/charts/testgen-services/values.yaml
index eae5e615..af7ca7be 100644
--- a/deploy/charts/testgen-services/values.yaml
+++ b/deploy/charts/testgen-services/values.yaml
@@ -6,3 +6,9 @@ postgresql:
   fullnameOverride: postgresql
   auth:
     database: "datakitchen"
+  image:
+    repository: bitnamilegacy/postgresql
+
+global:
+  security:
+    allowInsecureImages: true
diff --git a/pyproject.toml b/pyproject.toml
index 2049bac5..37d1e0f9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dataops-testgen"
-version = "4.26.1"
+version = "4.32.5"
 description = "DataKitchen's Data Quality DataOps TestGen"
 authors = [
     { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" },
@@ -33,6 +33,7 @@ dependencies = [
     "sqlalchemy==1.4.46",
     "databricks-sql-connector==2.9.3",
     "snowflake-sqlalchemy==1.6.1",
+    "sqlalchemy-bigquery==1.14.1",
     "pyodbc==5.0.0",
     "psycopg2-binary==2.9.9",
     "pycryptodome==3.21",
diff --git a/testgen/__main__.py b/testgen/__main__.py
index 6362a868..c0d7a7f9 100644
--- a/testgen/__main__.py
+++ b/testgen/__main__.py
@@ -31,6 +31,7 @@
 from testgen.commands.run_observability_exporter import run_observability_exporter
 from testgen.commands.run_profiling_bridge import run_profiling_queries
 from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment
+from testgen.commands.run_test_metadata_exporter import run_test_metadata_exporter
 from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config
 from testgen.common import (
     configure_logging,
@@ -503,6 +504,25 @@ def export_data(configuration: Configuration, project_key: str, test_suite_key:
     click.echo("\nexport-observability completed successfully.\n")
 
 
+@click.option(
+    "--path",
+    help="Path to the templates folder. Defaults to path from project root.",
+    required=False,
+    default="testgen/template",
+)
+@cli.command("export-test-metadata", help="Exports current test metadata records to yaml files.")
+@pass_configuration
+def export_test_metadata(configuration: Configuration, path: str):
+    click.echo("export-test-metadata")
+    LOG.info("CurrentStep: Main Program - Test Metadata Export")
+    if not os.path.isdir(path):
+        LOG.error(f"Provided path {path} is not a directory. Please correct the --path option.")
+        return
+    run_test_metadata_exporter(path)
+    LOG.info("CurrentStep: Main Program - Test Metadata Export - DONE")
+    click.echo("\nexport-test-metadata completed successfully.\n")
+
+
 @cli.command("list-test-types", help="Lists all available TestGen test types.")
 @click.option("-d", "--display", help="Show command output in the terminal.", is_flag=True, default=False)
 @pass_configuration
diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py
index 0a22ee8c..5f70a59d 100644
--- a/testgen/commands/queries/execute_cat_tests_query.py
+++ b/testgen/commands/queries/execute_cat_tests_query.py
@@ -17,7 +17,6 @@ class CATTestParams(TypedDict):
 class CCATExecutionSQL:
     project_code = ""
     flavor = ""
-    concat_operator = ""
     test_suite = ""
     run_date = ""
     test_run_id = ""
@@ -35,8 +34,7 @@ def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, m
         self.test_suite_id = strTestSuiteId
         self.test_suite = strTestSuite
         self.project_code = strProjectCode
-        flavor_service = get_flavor_service(strSQLFlavor)
-        self.concat_operator = flavor_service.get_concat_operator()
+        self.flavor_service = get_flavor_service(strSQLFlavor)
         self.flavor = strSQLFlavor
         self.max_query_chars = max_query_chars
         self.today = date_service.get_now_as_string_with_offset(minutes_offset)
@@ -47,7 +45,7 @@ def _get_rollup_scores_sql(self) -> CRollupScoresSQL:
             self._rollup_scores_sql = CRollupScoresSQL(self.test_run_id, self.table_groups_id)
         return self._rollup_scores_sql
 
-    
+
     def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_cat_tests", no_bind: bool = False) -> tuple[str, dict | None]:
         query = read_template_sql_file(template_file_name, sub_directory)
         params = {
@@ -58,8 +56,9 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_
             "TEST_SUITE_ID": self.test_suite_id,
             "TABLE_GROUPS_ID": self.table_groups_id,
             "SQL_FLAVOR": self.flavor,
-            "ID_SEPARATOR": "`" if self.flavor == "databricks" else '"',
-            "CONCAT_OPERATOR": self.concat_operator,
+            "QUOTE": self.flavor_service.quote_character,
+            "VARCHAR_TYPE": self.flavor_service.varchar_type,
+            "CONCAT_OPERATOR": self.flavor_service.concat_operator,
             "SCHEMA_NAME": self.target_schema,
             "TABLE_NAME": self.target_table,
             "NOW_DATE": "GETDATE()",
@@ -73,7 +72,7 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_
         query = replace_params(query, params)
         query = replace_templated_functions(query, self.flavor)
 
-        if no_bind and self.flavor != "databricks":
+        if no_bind:
             # Adding escape character where ':' is referenced
             query = query.replace(":", "\\:")
 
diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py
index 93010829..65679ad3 100644
--- a/testgen/commands/queries/execute_tests_query.py
+++ b/testgen/commands/queries/execute_tests_query.py
@@ -1,7 +1,8 @@
 from typing import ClassVar, TypedDict
 
-from testgen.common import AddQuotesToIdentifierCSV, CleanSQL, ConcatColumnList, date_service, read_template_sql_file
-from testgen.common.database.database_service import replace_params
+from testgen.common import date_service, read_template_sql_file
+from testgen.common.clean_sql import CleanSQL, ConcatColumnList, quote_identifiers
+from testgen.common.database.database_service import get_flavor_service, replace_params
 
 
 class TestParams(TypedDict):
@@ -54,6 +55,7 @@ class CTestExecutionSQL:
     def __init__(self, strProjectCode, strFlavor, strTestSuiteId, 
strTestSuite, minutes_offset=0): self.project_code = strProjectCode self.flavor = strFlavor + self.flavor_service = get_flavor_service(strFlavor) self.test_suite_id = strTestSuiteId self.test_suite = strTestSuite self.today = date_service.get_now_as_string_with_offset(minutes_offset) @@ -100,20 +102,21 @@ def _get_query( "TEST_SUITE_ID": self.test_suite_id, "TEST_SUITE": self.test_suite, "SQL_FLAVOR": self.flavor, + "QUOTE": self.flavor_service.quote_character, "TEST_RUN_ID": self.test_run_id, "INPUT_PARAMETERS": self._get_input_parameters(), "RUN_DATE": self.run_date, "EXCEPTION_MESSAGE": self.exception_message, "START_TIME": self.today, "PROCESS_ID": self.process_id, - "VARCHAR_TYPE": "STRING" if self.flavor == "databricks" else "VARCHAR", + "VARCHAR_TYPE": self.flavor_service.varchar_type, "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(self.minutes_offset), **{key.upper(): value or "" for key, value in self.test_params.items()}, } if self.test_params: column_name = self.test_params["column_name"] - params["COLUMN_NAME"] = AddQuotesToIdentifierCSV(column_name) if column_name else "" + params["COLUMN_NAME"] = quote_identifiers(column_name, self.flavor) if column_name else "" # Shows contents without double-quotes for display and aggregate expressions params["COLUMN_NAME_NO_QUOTES"] = column_name or "" # Concatenates column list into single expression for relative entropy @@ -126,11 +129,13 @@ def _get_query( ) subset_condition = self.test_params["subset_condition"] - params["SUBSET_DISPLAY"] = subset_condition.replace("'", "''") if subset_condition else "" + params["SUBSET_DISPLAY"] = subset_condition.replace( + "'", self.flavor_service.escaped_single_quote + ) if subset_condition else "" query = replace_params(query, params) - if no_bind and self.flavor != "databricks": + if no_bind: # Adding escape character where ':' is referenced query = query.replace(":", "\\:") diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index 4f887af4..bf23b7bf 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -2,7 +2,7 @@ from typing import ClassVar, TypedDict from testgen.common import CleanSQL, date_service, read_template_sql_file -from testgen.common.database.database_service import replace_params +from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.read_file import get_template_files LOG = logging.getLogger("testgen") @@ -29,7 +29,10 @@ class CDeriveTestsSQL: _use_clean = False - def __init__(self): + def __init__(self, flavor): + self.sql_flavor = flavor + self.flavor_service = get_flavor_service(flavor) + today = date_service.get_now_as_string() self.run_date = today self.as_of_date = today @@ -47,7 +50,7 @@ def _get_params(self) -> dict: "GENERATION_SET": self.generation_set, "AS_OF_DATE": self.as_of_date, "DATA_SCHEMA": self.data_schema, - "ID_SEPARATOR": "`" if self.sql_flavor == "databricks" else '"', + "QUOTE": self.flavor_service.quote_character, } def _get_query(self, template_file_name: str, sub_directory: str | None = "generation") -> tuple[str, dict]: diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index 4eadab10..93dbe03d 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -1,9 +1,10 @@ +import re import typing from testgen.commands.queries.refresh_data_chars_query import 
CRefreshDataCharsSQL from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL from testgen.common import date_service, read_template_sql_file, read_template_yaml_file -from testgen.common.database.database_service import replace_params +from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.read_file import replace_templated_functions @@ -21,6 +22,7 @@ class CProfilingSQL: col_name = "" col_gen_type = "" col_type = "" + db_data_type = "" col_ordinal_position = "0" col_is_decimal = "" col_top_freq_update = "" @@ -98,6 +100,7 @@ def _get_params(self) -> dict: "COL_NAME_SANITIZED": self.col_name.replace("'", "''"), "COL_GEN_TYPE": self.col_gen_type, "COL_TYPE": self.col_type or "", + "DB_DATA_TYPE": self.db_data_type or "", "COL_POS": self.col_ordinal_position, "TOP_FREQ": self.col_top_freq_update, "PROFILE_RUN_ID": self.profile_run_id, @@ -118,6 +121,7 @@ def _get_params(self) -> dict: "CONTINGENCY_MAX_VALUES": self.contingency_max_values, "PROCESS_ID": self.process_id, "SQL_FLAVOR": self.flavor, + "QUOTE": get_flavor_service(self.flavor).quote_character } def _get_query( @@ -130,6 +134,7 @@ def _get_query( params = {} if query: + query = self._process_conditionals(query) if extra_params: params.update(extra_params) params.update(self._get_params()) @@ -139,6 +144,33 @@ def _get_query( return query, params + def _process_conditionals(self, query: str): + re_pattern = re.compile(r"^--\s+TG-(IF|ELSE|ENDIF)(?:\s+(\w+))?\s*$") + condition = None + updated_query = [] + for line in query.splitlines(True): + if re_match := re_pattern.match(line): + match re_match.group(1): + case "IF" if condition is None and re_match.group(2) is not None: + condition = bool(getattr(self, re_match.group(2))) + case "ELSE" if condition is not None: + condition = not condition + case "ENDIF" if condition is not None: + condition = None + case _: + raise ValueError("Template conditional misused") + elif condition is not False: + updated_query.append(line) + + if condition is not None: + raise ValueError("Template conditional misused") + + return "".join(updated_query) + + @property + def do_sample_bool(self): + return self.parm_do_sample == "Y" + def GetSecondProfilingColumnsQuery(self) -> tuple[str, dict]: # Runs on App database return self._get_query("secondary_profiling_columns.sql") @@ -260,7 +292,12 @@ def GetProfilingQuery(self) -> tuple[str, dict]: else: strQ += dctSnippetTemplate["strTemplate01_else"] - strQ += dctSnippetTemplate["strTemplate02_all"] + strQ += dctSnippetTemplate["strTemplate01_5"] + + if self.col_gen_type == "X": + strQ += dctSnippetTemplate["strTemplate02_X"] + else: + strQ += dctSnippetTemplate["strTemplate02_else"] if self.col_gen_type in ["A", "D", "N"]: strQ += dctSnippetTemplate["strTemplate03_ADN"] diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py index 414616f0..d6a0359d 100644 --- a/testgen/commands/queries/refresh_data_chars_query.py +++ b/testgen/commands/queries/refresh_data_chars_query.py @@ -1,5 +1,5 @@ from testgen.common import read_template_sql_file -from testgen.common.database.database_service import replace_params +from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.database.flavor.flavor_service import SQLFlavor from testgen.utils import chunk_queries @@ -44,43 +44,41 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "data_ query = replace_params(query, 
params) return query, params - def _get_mask_query(self, mask: str, is_include: bool) -> str: - escape = "" - if self.sql_flavor.startswith("mssql"): - escaped_underscore = "[_]" - elif self.sql_flavor == "snowflake": - escaped_underscore = "\\\\_" - escape = "ESCAPE '\\\\'" - elif self.sql_flavor == "redshift": - escaped_underscore = "\\\\_" - else: - escaped_underscore = "\\_" - - table_names = [ item.strip().replace("_", escaped_underscore) for item in mask.split(",") ] - sub_query = f""" - AND {"NOT" if not is_include else ""} ( - {" OR ".join([ f"(c.table_name LIKE '{item}' {escape})" for item in table_names ])} - ) - """ - - return sub_query - - def GetDDFQuery(self) -> tuple[str, dict]: - # Runs on Target database - query, params = self._get_query(f"schema_ddf_query_{self.sql_flavor}.sql", f"flavors/{self.sql_flavor}/data_chars") - + def _get_table_criteria(self) -> str: table_criteria = "" + flavor_service = get_flavor_service(self.sql_flavor) + if self.profiling_table_set: - table_criteria += f" AND c.table_name IN ({self.profiling_table_set})" + table_criteria += f" AND c.{flavor_service.ddf_table_ref} IN ({self.profiling_table_set})" if self.profiling_include_mask: - table_criteria += self._get_mask_query(self.profiling_include_mask, is_include=True) + include_table_names = [ + item.strip().replace("_", flavor_service.escaped_underscore) + for item in self.profiling_include_mask.split(",") + ] + table_criteria += f""" + AND ( + {" OR ".join([ f"(c.{flavor_service.ddf_table_ref} LIKE '{item}' {flavor_service.escape_clause})" for item in include_table_names ])} + ) + """ if self.profiling_exclude_mask: - table_criteria += self._get_mask_query(self.profiling_exclude_mask, is_include=False) - - query = query.replace("{TABLE_CRITERIA}", table_criteria) + exclude_table_names = [ + item.strip().replace("_", flavor_service.escaped_underscore) + for item in self.profiling_exclude_mask.split(",") + ] + table_criteria += f""" + AND NOT ( + {" OR ".join([ f"(c.{flavor_service.ddf_table_ref} LIKE '{item}' {flavor_service.escape_clause})" for item in exclude_table_names ])} + ) + """ + return table_criteria + + def GetDDFQuery(self) -> tuple[str, dict]: + # Runs on Target database + query, params = self._get_query(f"schema_ddf_query_{self.sql_flavor}.sql", f"flavors/{self.sql_flavor}/data_chars") + query = query.replace("{TABLE_CRITERIA}", self._get_table_criteria()) return query, params def GetRecordCountQueries(self, schema_tables: list[str]) -> list[tuple[str, None]]: diff --git a/testgen/commands/queries/test_parameter_validation_query.py b/testgen/commands/queries/test_parameter_validation_query.py index ec8cf408..c7f40c35 100644 --- a/testgen/commands/queries/test_parameter_validation_query.py +++ b/testgen/commands/queries/test_parameter_validation_query.py @@ -1,7 +1,7 @@ import typing from testgen.common import CleanSQL, date_service, read_template_sql_file -from testgen.common.database.database_service import replace_params +from testgen.common.database.database_service import get_flavor_service, replace_params class CTestParamValidationSQL: @@ -13,11 +13,13 @@ class CTestParamValidationSQL: test_ids: typing.ClassVar = [] exception_message = "" flag_val = "" + tg_schema = "" _use_clean = False def __init__(self, strFlavor, strTestSuiteId): self.flavor = strFlavor + self.flavor_service = get_flavor_service(strFlavor) self.test_suite_id = strTestSuiteId self.today = date_service.get_now_as_string() @@ -34,6 +36,8 @@ def _get_query(self, template_file_name: str, sub_directory: str | 
None = "valid "CAT_TEST_IDS": tuple(self.test_ids or []), "START_TIME": self.today, "NOW_TIMESTAMP": date_service.get_now_as_string(), + "DATA_SCHEMA": self.tg_schema, + "QUOTE": self.flavor_service.quote_character, } query = replace_params(query, params) return query, params @@ -47,7 +51,11 @@ def GetTestValidationColumns(self) -> tuple[str, dict]: def GetProjectTestValidationColumns(self) -> tuple[str, dict]: # Runs on Target database - return self._get_query("ex_get_project_column_list_generic.sql", "flavors/generic/validate_tests") + filename = "ex_get_project_column_list.sql" + try: + return self._get_query(filename, f"flavors/{self.flavor}/validate_tests") + except ModuleNotFoundError: + return self._get_query(filename, "flavors/generic/validate_tests") def PrepFlagTestsWithFailedValidation(self) -> tuple[str, dict]: # Runs on App database diff --git a/testgen/commands/run_generate_tests.py b/testgen/commands/run_generate_tests.py index c163fbab..71b48491 100644 --- a/testgen/commands/run_generate_tests.py +++ b/testgen/commands/run_generate_tests.py @@ -19,7 +19,7 @@ def run_test_gen_queries(table_group_id: str, test_suite: str, generation_set: s connection = Connection.get_by_table_group(table_group_id) set_target_db_params(connection.__dict__) - clsTests = CDeriveTestsSQL() + clsTests = CDeriveTestsSQL(connection.sql_flavor) LOG.info(f"CurrentStep: Retrieving General Parameters for Test Suite {test_suite}") params = get_test_generation_params(table_group_id, test_suite) @@ -32,7 +32,6 @@ def run_test_gen_queries(table_group_id: str, test_suite: str, generation_set: s clsTests.test_suite_id = params["test_suite_id"] if params["test_suite_id"] else "" clsTests.connection_id = str(connection.connection_id) clsTests.table_groups_id = table_group_id - clsTests.sql_flavor = connection.sql_flavor clsTests.data_schema = params["table_group_schema"] if params["profiling_as_of_date"] is not None: clsTests.as_of_date = params["profiling_as_of_date"].strftime("%Y-%m-%d %H:%M:%S") diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index 68f99336..f65a80ec 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -10,6 +10,7 @@ from testgen.common.models.scores import ScoreDefinition from testgen.common.models.table_group import TableGroup from testgen.common.read_file import get_template_files +from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml LOG = logging.getLogger("testgen") @@ -41,6 +42,7 @@ def _get_params_mapping() -> dict: "PROJECT_HOST": settings.PROJECT_DATABASE_HOST, "PROJECT_PW_ENCRYPTED": EncryptText(settings.PROJECT_DATABASE_PASSWORD), "PROJECT_HTTP_PATH": "", + "PROJECT_SERVICE_ACCOUNT_KEY": "", "PROJECT_SCHEMA": settings.PROJECT_DATABASE_SCHEMA, "PROFILING_TABLE_SET": settings.DEFAULT_PROFILING_TABLE_SET, "PROFILING_INCLUDE_MASK": settings.DEFAULT_PROFILING_INCLUDE_MASK, @@ -84,7 +86,9 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) -> user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", + suppress_logs=True, ) + import_metadata_records_from_yaml(params_mapping) ScoreDefinition.from_table_group( TableGroup( diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index 691f4c1c..236985bc 100644 --- a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -23,7 
+23,7 @@ set_target_db_params, write_to_app_db, ) -from testgen.common.database.database_service import empty_cache +from testgen.common.database.database_service import empty_cache, get_flavor_service from testgen.common.mixpanel_service import MixpanelService from testgen.common.models import with_database_session from testgen.common.models.connection import Connection @@ -279,12 +279,15 @@ def run_profiling_queries(table_group_id: str, username: str | None = None, spin column_count = len(lstResult) if lstResult: + flavor_service = get_flavor_service(connection.sql_flavor) + quote = flavor_service.quote_character + # Get distinct tables distinct_tables = set() for item in lstResult: schema_name = item["table_schema"] table_name = item["table_name"] - distinct_tables.add(f"{schema_name}.{table_name}") + distinct_tables.add(f"{quote}{schema_name}{quote}.{quote}{table_name}{quote}") # Convert the set to a list distinct_tables_list = list(distinct_tables) @@ -317,7 +320,8 @@ def run_profiling_queries(table_group_id: str, username: str | None = None, spin clsProfiling.data_schema = dctColumnRecord["table_schema"] clsProfiling.data_table = dctColumnRecord["table_name"] clsProfiling.col_name = dctColumnRecord["column_name"] - clsProfiling.col_type = dctColumnRecord["data_type"] + clsProfiling.col_type = dctColumnRecord["column_type"] + clsProfiling.db_data_type = dctColumnRecord["db_data_type"] clsProfiling.profile_run_id = profiling_run_id clsProfiling.col_is_decimal = dctColumnRecord["is_decimal"] clsProfiling.col_ordinal_position = dctColumnRecord["ordinal_position"] @@ -325,16 +329,11 @@ def run_profiling_queries(table_group_id: str, username: str | None = None, spin clsProfiling.parm_do_sample = "N" if clsProfiling.profile_use_sampling == "Y": - if dctSampleTables[clsProfiling.data_schema + "." + clsProfiling.data_table][0] > -1: - clsProfiling.parm_sample_size = dctSampleTables[ - clsProfiling.data_schema + "." + clsProfiling.data_table - ][0] - clsProfiling.sample_ratio = dctSampleTables[ - clsProfiling.data_schema + "." + clsProfiling.data_table - ][1] - clsProfiling.sample_percent_calc = dctSampleTables[ - clsProfiling.data_schema + "." 
+ clsProfiling.data_table - ][2] + table_identifier = f"{quote}{clsProfiling.data_schema}{quote}.{quote}{clsProfiling.data_table}{quote}" + if dctSampleTables[table_identifier][0] > -1: + clsProfiling.parm_sample_size = dctSampleTables[table_identifier][0] + clsProfiling.sample_ratio = dctSampleTables[table_identifier][1] + clsProfiling.sample_percent_calc = dctSampleTables[table_identifier][2] clsProfiling.parm_do_sample = clsProfiling.profile_use_sampling else: clsProfiling.parm_sample_size = 0 diff --git a/testgen/commands/run_refresh_data_chars.py b/testgen/commands/run_refresh_data_chars.py index 78489445..2c812559 100644 --- a/testgen/commands/run_refresh_data_chars.py +++ b/testgen/commands/run_refresh_data_chars.py @@ -7,6 +7,7 @@ execute_db_queries, fetch_dict_from_db, fetch_from_db_threaded, + get_flavor_service, write_to_app_db, ) from testgen.common.get_pipeline_parms import TestExecutionParams @@ -18,12 +19,14 @@ def run_refresh_data_chars_queries(params: TestExecutionParams, run_date: str, spinner: Spinner=None): LOG.info("CurrentStep: Initializing Data Characteristics Refresh") sql_generator = CRefreshDataCharsSQL(params, run_date, STAGING_TABLE) + flavor_service = get_flavor_service(params["sql_flavor"]) + quote = flavor_service.quote_character LOG.info("CurrentStep: Getting DDF for table group") ddf_results = fetch_dict_from_db(*sql_generator.GetDDFQuery(), use_target_db=True) distinct_tables = { - f"{item['table_schema']}.{item['table_name']}" + f"{quote}{item['table_schema']}{quote}.{quote}{item['table_name']}{quote}" for item in ddf_results } if distinct_tables: @@ -50,6 +53,7 @@ def run_refresh_data_chars_queries(params: TestExecutionParams, run_date: str, s "position", "general_type", "column_type", + "db_data_type", "record_ct", ] staging_records = [ @@ -62,8 +66,9 @@ def run_refresh_data_chars_queries(params: TestExecutionParams, run_date: str, s item["column_name"], item["ordinal_position"], item["general_type"], - item["data_type"], - count_map.get(f"{item['table_schema']}.{item['table_name']}", 0), + item["column_type"], + item["db_data_type"], + count_map.get(f"{quote}{item['table_schema']}{quote}.{quote}{item['table_name']}{quote}", 0), ] for item in ddf_results ] diff --git a/testgen/commands/run_test_metadata_exporter.py b/testgen/commands/run_test_metadata_exporter.py new file mode 100644 index 00000000..b204a554 --- /dev/null +++ b/testgen/commands/run_test_metadata_exporter.py @@ -0,0 +1,19 @@ +import logging + +from testgen import settings +from testgen.common.credentials import get_tg_schema +from testgen.common.models import with_database_session +from testgen.common.read_yaml_metadata_records import export_metadata_records_to_yaml + +LOG = logging.getLogger("testgen") + +def _get_params_mapping() -> dict: + return { + "SCHEMA_NAME": get_tg_schema(), + "TESTGEN_ADMIN_USER": settings.DATABASE_ADMIN_USER, + "TESTGEN_ADMIN_PASSWORD": settings.DATABASE_ADMIN_PASSWORD, + } + +@with_database_session +def run_test_metadata_exporter(templates_path) -> None: + export_metadata_records_to_yaml(_get_params_mapping(), templates_path) diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py index db6ba728..f31be1ba 100644 --- a/testgen/commands/run_test_parameter_validation.py +++ b/testgen/commands/run_test_parameter_validation.py @@ -45,6 +45,7 @@ def run_parameter_validation_queries( # Retrieve Current Project Column list LOG.info("CurrentStep: Retrieve Current Columns for Validation") + 
clsExecute.tg_schema = params["table_group_schema"] clsExecute.test_schemas = strSchemas lstProjectTestColumns = fetch_dict_from_db(*clsExecute.GetProjectTestValidationColumns(), use_target_db=True) @@ -99,7 +100,7 @@ def run_parameter_validation_queries( clsExecute.message = f"Missing table: {table_name}" clsExecute.test_ids = test_ids execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - + if invalid_tests: clsExecute.message = "Invalid test: schema, table, or column not defined" clsExecute.test_ids = invalid_tests diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index 5d532120..e144f07c 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -5,6 +5,7 @@ from testgen.common.credentials import get_tg_schema from testgen.common.database.database_service import replace_params from testgen.common.read_file import get_template_files +from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml LOG = logging.getLogger("testgen") @@ -95,6 +96,18 @@ def _refresh_static_metadata(params_mapping): user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", + suppress_logs=True, + ) + import_metadata_records_from_yaml(params_mapping) + + strQueryMetadataConstraints = read_template_sql_file("055_recreate_metadata_constraints.sql", "dbsetup") + strQueryMetadataConstraints = replace_params(strQueryMetadataConstraints, params_mapping) + execute_db_queries( + [(strQueryMetadataConstraints, None)], + user_override=params_mapping["TESTGEN_ADMIN_USER"], + password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], + user_type="schema_admin", + suppress_logs=True, ) diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py index 8c275ac4..27299615 100644 --- a/testgen/common/clean_sql.py +++ b/testgen/common/clean_sql.py @@ -1,7 +1,7 @@ -__all__ = ["AddQuotesToIdentifierCSV", "CleanSQL", "ConcatColumnList"] - import re +from testgen.common.database.database_service import get_flavor_service + def CleanSQL(strInput: str) -> str: # Use regular expression to remove comment text fenced by /*...*/ @@ -16,7 +16,7 @@ def CleanSQL(strInput: str) -> str: return " ".join(parts) -def AddQuotesToIdentifierCSV(strInput: str) -> str: +def quote_identifiers(identifiers: str, flavor: str) -> str: # Keywords -- identifiers to quote keywords = [ "select", @@ -26,14 +26,22 @@ def AddQuotesToIdentifierCSV(strInput: str) -> str: "by", "having", ] + flavor_service = get_flavor_service(flavor) + quote = flavor_service.quote_character quoted_values = [] - for value in strInput.split(","): + for value in identifiers.split(","): value = value.strip() - if value.startswith('"') and value.endswith('"'): + if value.startswith(quote) and value.endswith(quote): quoted_values.append(value) - elif any(c.isupper() or c.isspace() or value.lower() in keywords for c in value): - quoted_values.append(f'"{value}"') + elif any( + (flavor_service.default_uppercase and c.lower()) + or (not flavor_service.default_uppercase and c.isupper()) + or c.isspace() + or value.lower() in keywords + for c in value + ): + quoted_values.append(f"{quote}{value}{quote}") else: quoted_values.append(value) return ", ".join(quoted_values) diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 75016501..8adbe7cf 100644 --- a/testgen/common/database/database_service.py +++ 
b/testgen/common/database/database_service.py @@ -77,7 +77,7 @@ def set_target_db_params(connection_params: ConnectionParams) -> None: def get_flavor_service(flavor: SQLFlavor) -> FlavorService: module_path = f"testgen.common.database.flavor.{flavor}_flavor_service" - class_name = f"{flavor.capitalize()}FlavorService" + class_name = f"{flavor.replace('_', ' ').title().replace(' ', '')}FlavorService" module = importlib.import_module(module_path) flavor_class = getattr(module, class_name) return flavor_class() @@ -134,8 +134,9 @@ def execute_db_queries( user_override: str | None = None, password_override: str | None = None, user_type: UserType = "normal", + suppress_logs: bool = False, ) -> tuple[list[Any], list[int]]: - LOG.info(f"DB operation: execute_db_queries on {'Target' if use_target_db else 'App'} database (User type = {user_type})") + LOG.info(f"DB operation: execute_db_queries ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = {user_type})") with _init_db_connection(use_target_db, user_override, password_override, user_type) as connection: return_values: list[Any] = [] @@ -144,7 +145,8 @@ def execute_db_queries( LOG.info("No queries to process") for index, (query, params) in enumerate(queries): LOG.debug(f"Query: {query}") - LOG.info(f"Processing {index + 1} of {len(queries)} queries") + if not suppress_logs: + LOG.info(f"Processing {index + 1} of {len(queries)} queries") transaction = connection.begin() result = connection.execute(text(query), params) row_counts.append(result.rowcount) @@ -357,15 +359,17 @@ def _init_target_db_connection() -> Connection: engine = engine_cache.target_db if not engine: - connection_string = flavor_service.get_connection_string() - connect_args = flavor_service.get_connect_args() - try: - engine: Engine = create_engine(connection_string, connect_args=connect_args) - engine_cache.target_db = engine - + engine: Engine = create_engine( + flavor_service.get_connection_string(), + connect_args=flavor_service.get_connect_args(), + **flavor_service.get_engine_args(), + ) except SQLAlchemyError as e: raise ValueError(f"Failed to create engine for Target database '{flavor_service.dbname}' (User type = normal)") from e + else: + engine_cache.target_db = engine + connection: Connection = engine.connect() diff --git a/testgen/common/database/flavor/bigquery_flavor_service.py b/testgen/common/database/flavor/bigquery_flavor_service.py new file mode 100644 index 00000000..8e80f146 --- /dev/null +++ b/testgen/common/database/flavor/bigquery_flavor_service.py @@ -0,0 +1,22 @@ +from typing import Any + +from testgen.common.database.flavor.flavor_service import FlavorService + + +class BigqueryFlavorService(FlavorService): + + quote_character = "`" + escaped_single_quote = "\\'" + varchar_type = "STRING" + + def get_connection_string_head(self): + return "bigquery://" + + def get_connection_string_from_fields(self): + return f"bigquery://{self.service_account_key["project_id"] if self.service_account_key else ""}" + + def get_connect_args(self) -> dict: + return {} + + def get_engine_args(self) -> dict[str,Any]: + return {"credentials_info": self.service_account_key} if self.service_account_key else {} diff --git a/testgen/common/database/flavor/databricks_flavor_service.py b/testgen/common/database/flavor/databricks_flavor_service.py index 9ef750a9..b9b339ef 100644 --- a/testgen/common/database/flavor/databricks_flavor_service.py +++ b/testgen/common/database/flavor/databricks_flavor_service.py @@ -5,6 +5,10 @@ class 
DatabricksFlavorService(FlavorService): + quote_character = "`" + escaped_single_quote = "\\'" + varchar_type = "STRING" + def get_connection_string_head(self): return f"{self.flavor}://{self.username}:{quote_plus(self.password)}@" diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index a257b2da..9849f1bb 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -4,7 +4,7 @@ from testgen.common.encrypt import DecryptText -SQLFlavor = Literal["redshift", "snowflake", "mssql", "postgresql", "databricks"] +SQLFlavor = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "postgresql", "databricks"] class ConnectionParams(TypedDict): @@ -21,24 +21,19 @@ class ConnectionParams(TypedDict): private_key: bytes private_key_passphrase: bytes http_path: str + service_account_key: dict[str,Any] class FlavorService: - url = None - connect_by_url = None - username = None - password = None - host = None - port = None - dbname = None - flavor = None - dbschema = None - connect_by_key = None - private_key = None - private_key_passphrase = None - http_path = None - catalog = None - warehouse = None + concat_operator = "||" + quote_character = '"' + escaped_single_quote = "''" + escaped_underscore = "\\_" + escape_clause = "" + varchar_type = "VARCHAR(1000)" + ddf_table_ref = "table_name" + use_top = False + default_uppercase = False def init(self, connection_params: ConnectionParams): self.url = connection_params.get("url") or "" @@ -53,6 +48,7 @@ def init(self, connection_params: ConnectionParams): self.http_path = connection_params.get("http_path") or "" self.catalog = connection_params.get("catalog") or "" self.warehouse = connection_params.get("warehouse") or "" + self.service_account_key = connection_params.get("service_account_key", None) password = connection_params.get("project_pw_encrypted", None) if isinstance(password, memoryview) or isinstance(password, bytes): @@ -75,8 +71,8 @@ def get_pre_connection_queries(self) -> list[tuple[str, dict | None]]: def get_connect_args(self) -> dict: return {"connect_timeout": 3600} - def get_concat_operator(self) -> str: - return "||" + def get_engine_args(self) -> dict[str,Any]: + return {} def get_connection_string(self) -> str: if self.connect_by_url: diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index 7cdc23fe..f4e3f1be 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -5,6 +5,11 @@ class MssqlFlavorService(FlavorService): + + concat_operator = "+" + escaped_underscore = "[_]" + use_top = True + def get_connection_string_head(self): return f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@" @@ -29,6 +34,3 @@ def get_connect_args(self): if settings.SKIP_DATABASE_CERTIFICATE_VERIFICATION: connect_args["TrustServerCertificate"] = "yes" return connect_args - - def get_concat_operator(self): - return "+" diff --git a/testgen/common/database/flavor/postgresql_flavor_service.py b/testgen/common/database/flavor/postgresql_flavor_service.py index b9a04c32..65c10dd4 100644 --- a/testgen/common/database/flavor/postgresql_flavor_service.py +++ b/testgen/common/database/flavor/postgresql_flavor_service.py @@ -2,4 +2,5 @@ class PostgresqlFlavorService(RedshiftFlavorService): - pass + + escaped_underscore = "\\_" diff --git 
a/testgen/common/database/flavor/redshift_flavor_service.py b/testgen/common/database/flavor/redshift_flavor_service.py index ba17105e..36f89418 100644 --- a/testgen/common/database/flavor/redshift_flavor_service.py +++ b/testgen/common/database/flavor/redshift_flavor_service.py @@ -4,6 +4,9 @@ class RedshiftFlavorService(FlavorService): + + escaped_underscore = "\\\\_" + def init(self, connection_params: dict): super().init(connection_params) # This is for connection purposes. sqlalchemy 1.4.46 uses postgresql to connect to redshift database diff --git a/testgen/common/database/flavor/redshift_spectrum_flavor_service.py b/testgen/common/database/flavor/redshift_spectrum_flavor_service.py new file mode 100644 index 00000000..3a81bee5 --- /dev/null +++ b/testgen/common/database/flavor/redshift_spectrum_flavor_service.py @@ -0,0 +1,6 @@ +from testgen.common.database.flavor.redshift_flavor_service import RedshiftFlavorService + + +class RedshiftSpectrumFlavorService(RedshiftFlavorService): + + ddf_table_ref = "tablename" diff --git a/testgen/common/database/flavor/snowflake_flavor_service.py b/testgen/common/database/flavor/snowflake_flavor_service.py index 49c479bb..a7bad3d8 100644 --- a/testgen/common/database/flavor/snowflake_flavor_service.py +++ b/testgen/common/database/flavor/snowflake_flavor_service.py @@ -9,6 +9,10 @@ class SnowflakeFlavorService(FlavorService): + escaped_underscore = "\\\\_" + escape_clause = "ESCAPE '\\\\'" + default_uppercase = True + def get_connect_args(self): if self.connect_by_key: # https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#key-pair-authentication-support diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py index 79f5c5ed..3c37aacf 100644 --- a/testgen/common/get_pipeline_parms.py +++ b/testgen/common/get_pipeline_parms.py @@ -34,6 +34,7 @@ class TestGenerationParams(BaseParams): class TestExecutionParams(BaseParams): test_suite_id: str table_groups_id: str + table_group_schema: str profiling_table_set: str profiling_include_mask: str profiling_exclude_mask: str diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py index 660f51fd..84f71aa5 100644 --- a/testgen/common/models/connection.py +++ b/testgen/common/models/connection.py @@ -22,12 +22,12 @@ from testgen.common.database.database_service import get_flavor_service from testgen.common.database.flavor.flavor_service import SQLFlavor from testgen.common.models import get_current_session -from testgen.common.models.custom_types import EncryptedBytea +from testgen.common.models.custom_types import JSON_TYPE, EncryptedBytea, EncryptedJson from testgen.common.models.entity import ENTITY_HASH_FUNCS, Entity, EntityMinimal from testgen.common.models.table_group import TableGroup from testgen.utils import is_uuid4 -SQLFlavorCode = Literal["redshift", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks"] +SQLFlavorCode = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks"] @dataclass @@ -61,6 +61,7 @@ class Connection(Entity): private_key_passphrase: str = Column(EncryptedBytea) http_path: str = Column(String) warehouse: str = Column(String) + service_account_key: JSON_TYPE = Column(EncryptedJson) _get_by = "connection_id" _default_order_by = (asc(func.lower(connection_name)),) diff --git a/testgen/common/models/custom_types.py b/testgen/common/models/custom_types.py index b4a34276..e68726ba 100644 --- 
a/testgen/common/models/custom_types.py +++ b/testgen/common/models/custom_types.py @@ -1,10 +1,14 @@ +import json from datetime import UTC, datetime +from types import NoneType from sqlalchemy import Integer, String, TypeDecorator from sqlalchemy.dialects import postgresql from testgen.common.encrypt import DecryptText, EncryptText +JSON_TYPE = str | int | float | list | dict | NoneType + class NullIfEmptyString(TypeDecorator): impl = String @@ -22,12 +26,12 @@ def process_bind_param(self, value: bool | str | None, _dialect) -> str | None: if isinstance(value, bool): return "Y" if value else "N" return value - + def process_result_value(self, value: str | None, _dialect) -> bool | None: if isinstance(value, str): return value == "Y" return value - + class ZeroIfEmptyInteger(TypeDecorator): impl = Integer @@ -54,3 +58,12 @@ def process_bind_param(self, value: str, _dialect) -> bytes: def process_result_value(self, value: bytes, _dialect) -> str: return DecryptText(value) if value is not None else value + + +class EncryptedJson(EncryptedBytea): + + def process_bind_param(self, value: JSON_TYPE, _dialect) -> bytes: + return None if value is None else super().process_bind_param(json.dumps(value), _dialect) + + def process_result_value(self, value: bytes, _dialect) -> JSON_TYPE: + return None if value is None else json.loads(super().process_result_value(value, _dialect)) diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index efc19313..da848f76 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -160,14 +160,13 @@ def select_summary( SUM( CASE WHEN COALESCE(profile_anomaly_results.disposition, 'Confirmed') = 'Confirmed' - AND profile_anomaly_types.issue_likelihood = 'Possible' THEN 1 + AND profile_anomaly_types.issue_likelihood IN ('Possible', 'Potential PII') THEN 1 ELSE 0 END ) AS possible_ct, SUM( CASE - WHEN COALESCE(profile_anomaly_results.disposition, 'Confirmed') IN ('Dismissed', 'Inactive') - AND profile_anomaly_types.issue_likelihood <> 'Potential PII' THEN 1 + WHEN COALESCE(profile_anomaly_results.disposition, 'Confirmed') IN ('Dismissed', 'Inactive') THEN 1 ELSE 0 END ) AS dismissed_ct diff --git a/testgen/common/models/scheduler.py b/testgen/common/models/scheduler.py index 9f665e52..3cd9cb79 100644 --- a/testgen/common/models/scheduler.py +++ b/testgen/common/models/scheduler.py @@ -4,7 +4,7 @@ from uuid import UUID, uuid4 from cron_converter import Cron -from sqlalchemy import Column, String, func, select +from sqlalchemy import Boolean, Column, String, delete, func, select, update from sqlalchemy.dialects import postgresql from sqlalchemy.orm import InstrumentedAttribute @@ -27,6 +27,7 @@ class JobSchedule(Base): kwargs: dict[str, Any] = Column(postgresql.JSONB, nullable=False, default={}) cron_expr: str = Column(String, nullable=False) cron_tz: str = Column(String, nullable=False) + active: bool = Column(Boolean, default=True) @classmethod def select_where(cls, *clauses, order_by: str | InstrumentedAttribute | None = None) -> Iterable[Self]: @@ -34,7 +35,7 @@ def select_where(cls, *clauses, order_by: str | InstrumentedAttribute | None = N select(cls.id) .join(TestSuite, TestSuite.test_suite == cls.kwargs["test_suite_key"].astext) .join(TestDefinition, TestDefinition.test_suite_id == TestSuite.id) - .where(cls.key == RUN_TESTS_JOB_KEY) + .where(cls.key == RUN_TESTS_JOB_KEY, cls.active == True) .group_by(cls.id, TestSuite.test_suite) .having(func.count(TestDefinition.id) > 0) .subquery() @@ 
-44,11 +45,33 @@ def select_where(cls, *clauses, order_by: str | InstrumentedAttribute | None = N .join(test_definitions_count, test_definitions_count.c.id == cls.id) .where(*clauses) ) - non_test_runs_query = select(cls).where(cls.key != RUN_TESTS_JOB_KEY, *clauses) + non_test_runs_query = select(cls).where(cls.key != RUN_TESTS_JOB_KEY, cls.active == True, *clauses) query = test_runs_query.union_all(non_test_runs_query).order_by(order_by) return get_current_session().execute(query) + @classmethod + def delete(cls, job_id: str | UUID) -> None: + query = delete(cls).where(JobSchedule.id == UUID(job_id)) + db_session = get_current_session() + try: + db_session.execute(query) + except ValueError: + db_session.rollback() + else: + db_session.commit() + + @classmethod + def update_active(cls, job_id: str | UUID, active: bool) -> None: + query = update(cls).where(JobSchedule.id == UUID(job_id)).values(active=active) + db_session = get_current_session() + try: + db_session.execute(query) + except ValueError: + db_session.rollback() + else: + db_session.commit() + @classmethod def count(cls): return get_current_session().query(cls).count() diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index de0282cc..46e3da53 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -40,6 +40,7 @@ class TableGroupSummary(EntityMinimal): latest_profile_start: datetime latest_profile_table_ct: int latest_profile_column_ct: int + latest_profile_data_point_ct: int latest_anomalies_ct: int latest_anomalies_definite_ct: int latest_anomalies_likely_ct: int @@ -123,6 +124,7 @@ def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Itera latest_run.profiling_starttime, latest_run.table_ct, latest_run.column_ct, + latest_run.dq_total_data_points, latest_run.anomaly_ct, SUM( CASE @@ -141,14 +143,13 @@ def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Itera SUM( CASE WHEN COALESCE(latest_anomalies.disposition, 'Confirmed') = 'Confirmed' - AND anomaly_types.issue_likelihood = 'Possible' THEN 1 + AND anomaly_types.issue_likelihood IN ('Possible', 'Potential PII') THEN 1 ELSE 0 END ) AS possible_ct, SUM( CASE - WHEN COALESCE(latest_anomalies.disposition, 'Confirmed') IN ('Dismissed', 'Inactive') - AND anomaly_types.issue_likelihood <> 'Potential PII' THEN 1 + WHEN COALESCE(latest_anomalies.disposition, 'Confirmed') IN ('Dismissed', 'Inactive') THEN 1 ELSE 0 END ) AS dismissed_ct @@ -172,6 +173,7 @@ def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Itera latest_profile.profiling_starttime AS latest_profile_start, latest_profile.table_ct AS latest_profile_table_ct, latest_profile.column_ct AS latest_profile_column_ct, + latest_profile.dq_total_data_points AS latest_profile_data_point_ct, latest_profile.anomaly_ct AS latest_anomalies_ct, latest_profile.definite_ct AS latest_anomalies_definite_ct, latest_profile.likely_ct AS latest_anomalies_likely_ct, diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py new file mode 100644 index 00000000..6361b2b7 --- /dev/null +++ b/testgen/common/read_yaml_metadata_records.py @@ -0,0 +1,268 @@ +__all__ = ["export_metadata_records_to_yaml", "import_metadata_records_from_yaml"] + +import logging +from importlib.resources import as_file +from os import mkdir +from os.path import isdir +from os.path import sep as path_seperator + +from yaml import SafeDumper, safe_dump, safe_load + +from 
testgen.common.database.database_service import execute_db_queries, fetch_from_db_threaded +from testgen.common.read_file import get_template_files + +LOG = logging.getLogger("testgen") + + +TEST_TYPES_TEMPLATE_FOLDER = "dbsetup_test_types" +TEST_TYPES_PARENT_TABLE = "test_types" +TEST_TYPES_PARENT_KEY = "test_type" +TEST_TYPES_CHILD_TABLES = ["cat_test_conditions", "target_data_lookups", "test_templates"] + +# Fallback PKs +TEST_TYPES_DEFAULT_PK = { + "target_data_lookups": ["test_id", "sql_flavor", "error_type"], + "test_templates": ["test_type", "sql_flavor"], + "cat_test_conditions": ["test_type", "sql_flavor"], +} + +# child_col → parent_col for filtering +TEST_TYPES_PARENT_CHILD_COLUMN_MAP = { + "cat_test_conditions": { + "test_type": "test_type", + }, + "target_data_lookups": { + "test_type": "test_type", + "test_id": "id", + }, + "test_templates": { + "test_type": "test_type", + }, +} + +# Columns to treat as literal blocks (embedded special chars) +TEST_TYPES_LITERAL_FIELDS = { + "test_types": [ + "test_description", + "except_message", + "measure_uom_description", + "selection_criteria", + "dq_score_prevalence_formula", + "column_name_prompt", + "column_name_help", + "default_parm_values", + "default_parm_prompts", + "default_parm_help", + "threshold_description", + "usage_notes", + ], + "cat_test_conditions": [ + "measure", + "test_condition", + ], + "target_data_lookups": [ + "lookup_query", + ], +} + + +ANOMALY_TYPES_TEMPLATE_FOLDER = "dbsetup_anomaly_types" +ANOMALY_TYPES_PARENT_TABLE = "profile_anomaly_types" +ANOMALY_TYPES_PARENT_KEY = "anomaly_type" +ANOMALY_TYPES_CHILD_TABLES = ["target_data_lookups"] + +# Fallback PKs +ANOMALY_TYPES_DEFAULT_PK = { + "target_data_lookups": ["test_id", "sql_flavor", "error_type"], +} + +# child_col → parent_col for filtering +ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP = { + "target_data_lookups": { + "test_type": "anomaly_type", + "test_id": "id", + }, +} + +# Columns to treat as literal blocks (embedded special chars) +ANOMALY_TYPES_LITERAL_FIELDS = { + "profile_anomaly_types": [ + "anomaly_description", + "anomaly_criteria", + "detail_expression", + "suggested_action", + "dq_score_prevalence_formula", + ], + "target_data_lookups": [ + "lookup_query", + ], +} + + + +class LiteralString(str): + pass + +def _add_literal_representer(): + def _literal_representer(dumper, data): + # emit this string with | style + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + SafeDumper.add_representer(LiteralString, _literal_representer) + + +def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]]): + queries = [] + parent = data.get(parent_table) + if not isinstance(parent, dict): + raise TypeError(f"YAML key '{parent_table}' must be a dict") + + for table_name in child_tables: + records = parent.pop(table_name, []) + if not isinstance(records, list): + raise TypeError(f"YAML key '{table_name}' under parent must be a list") + + mapping = parent_child_column_map.get(table_name, {}) + + pk_cols = default_pk.get(table_name) or [parent_key] + + for record in records: + for child_col, parent_col in mapping.items(): + record.setdefault(child_col, parent.get(parent_col)) + + columns = list(record.keys()) + + insert_cols = ", ".join(columns) + insert_vals = ", ".join(f":{c}" for c in columns) + update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c not in pk_cols) + bound_values = {c: 
record[c] for c in columns} + + sql = f""" + INSERT INTO {params_mapping["SCHEMA_NAME"]}.{table_name} ({insert_cols}) + VALUES ({insert_vals}) + ON CONFLICT ({', '.join(pk_cols)}) DO UPDATE + SET {update_stmt}; + """ + queries.append((sql, bound_values)) + + columns = list(parent.keys()) + + insert_cols = ", ".join(columns) + insert_vals = ", ".join(f":{c}" for c in columns) + update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c != parent_key) + bound_values = {c: parent[c] for c in columns} + parent_insert_query = f""" + INSERT INTO {params_mapping["SCHEMA_NAME"]}.{parent_table} ({insert_cols}) + VALUES ({insert_vals}) + ON CONFLICT ({parent_key}) DO UPDATE + SET {update_stmt}; + """ + + queries = [(parent_insert_query, bound_values), *queries] + + execute_db_queries( + queries, + user_override=params_mapping["TESTGEN_ADMIN_USER"], + password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], + user_type="schema_admin", + suppress_logs=True, + ) + return + +def import_metadata_records_from_yaml(params_mapping: dict) -> None: + files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory=TEST_TYPES_TEMPLATE_FOLDER), key=lambda key: str(key)) + for yaml_file in files: + with as_file(yaml_file) as f: + with f.open("r") as file: + data = safe_load(file) + _process_yaml_for_import( + params_mapping, + data, + TEST_TYPES_PARENT_TABLE, + TEST_TYPES_PARENT_KEY, + TEST_TYPES_CHILD_TABLES, + TEST_TYPES_DEFAULT_PK, + TEST_TYPES_PARENT_CHILD_COLUMN_MAP, + ) + files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory=ANOMALY_TYPES_TEMPLATE_FOLDER), key=lambda key: str(key)) + for yaml_file in files: + with as_file(yaml_file) as f: + with f.open("r") as file: + LOG.info(f"Importing {yaml_file}") + data = safe_load(file) + _process_yaml_for_import( + params_mapping, + data, + ANOMALY_TYPES_PARENT_TABLE, + ANOMALY_TYPES_PARENT_KEY, + ANOMALY_TYPES_CHILD_TABLES, + ANOMALY_TYPES_DEFAULT_PK, + ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP, + ) + return + +def _wrap_literal(table_name: str, recs: list[dict], literal_fields: dict[str, list[str]]): + for rec in recs: + for fld in literal_fields.get(table_name, []): + val = rec.get(fld) + if isinstance(val, str) and val != "": + rec[fld] = LiteralString(val) + +def _process_records_for_export(params_mapping: dict, export_path:str, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]], literal_fields:dict[str, list[str]]) -> None: + if not isdir(export_path): + mkdir(export_path) + fetch_parent_query = f""" + SELECT * + FROM {params_mapping["SCHEMA_NAME"]}.{parent_table}; + """ + parent_records, parent_columns, _ = fetch_from_db_threaded( + [(fetch_parent_query, None)], + ) + for parent_record in parent_records: + parent_record_dict = dict(zip(parent_columns, parent_record, strict=False)) + for child_name in child_tables: + child_key = next(key for key, value in parent_child_column_map[child_name].items() if value==parent_key) + fetch_children_query = f""" + SELECT * FROM {params_mapping["SCHEMA_NAME"]}.{child_name} + WHERE {child_key} = '{parent_record_dict[parent_key]}' + ORDER BY {", ".join(default_pk[child_name])}; + """ + child_records, child_columns, _ = fetch_from_db_threaded( + [(fetch_children_query, None)], + ) + child_records_dict = [] + for child_record in child_records: + child_records_dict.append(dict(zip(child_columns, child_record, strict=False))) + _wrap_literal(child_name, child_records_dict, literal_fields) + parent_record_dict[child_name] = 
child_records_dict + + _wrap_literal(parent_table, [parent_record_dict], literal_fields) + payload = {parent_table: parent_record_dict} + out_file = f"{export_path}{path_seperator}{parent_table}_{parent_record_dict[parent_key].replace(' ','_')}.yaml" + LOG.info(f"Exporting {out_file}") + with open(out_file, "w") as f: + safe_dump(payload, f, sort_keys=False) + + +def export_metadata_records_to_yaml(params_mapping: dict, templates_path: str) -> None: + _add_literal_representer() + _process_records_for_export( + params_mapping, + f"{templates_path}{path_seperator}{TEST_TYPES_TEMPLATE_FOLDER}", + TEST_TYPES_PARENT_TABLE, + TEST_TYPES_PARENT_KEY, + TEST_TYPES_CHILD_TABLES, + TEST_TYPES_DEFAULT_PK, + TEST_TYPES_PARENT_CHILD_COLUMN_MAP, + TEST_TYPES_LITERAL_FIELDS, + ) + _process_records_for_export( + params_mapping, + f"{templates_path}{path_seperator}{ANOMALY_TYPES_TEMPLATE_FOLDER}", + ANOMALY_TYPES_PARENT_TABLE, + ANOMALY_TYPES_PARENT_KEY, + ANOMALY_TYPES_CHILD_TABLES, + ANOMALY_TYPES_DEFAULT_PK, + ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP, + ANOMALY_TYPES_LITERAL_FIELDS, + ) + return diff --git a/testgen/template/data_chars/data_chars_update.sql b/testgen/template/data_chars/data_chars_update.sql index dcad1454..ec16d4e0 100644 --- a/testgen/template/data_chars/data_chars_update.sql +++ b/testgen/template/data_chars/data_chars_update.sql @@ -120,27 +120,47 @@ WITH new_chars AS ( position, general_type, column_type, + db_data_type, functional_data_type, run_date FROM {SOURCE_TABLE} WHERE table_groups_id = :TABLE_GROUPS_ID +), +update_chars AS ( + UPDATE data_column_chars + SET ordinal_position = n.position, + general_type = n.general_type, + column_type = n.column_type, + db_data_type = n.db_data_type, + functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type), + last_mod_date = CASE WHEN n.db_data_type <> d.db_data_type THEN n.run_date ELSE d.last_mod_date END, + drop_date = NULL + FROM new_chars n + INNER JOIN data_column_chars d ON ( + n.table_groups_id = d.table_groups_id + AND n.schema_name = d.schema_name + AND n.table_name = d.table_name + AND n.column_name = d.column_name + ) + WHERE data_column_chars.table_id = d.table_id + AND data_column_chars.column_name = d.column_name + RETURNING data_column_chars.*, d.db_data_type as old_data_type ) -UPDATE data_column_chars -SET ordinal_position = n.position, - general_type = n.general_type, - column_type = n.column_type, - functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type), - last_mod_date = CASE WHEN n.column_type <> d.column_type THEN n.run_date ELSE d.last_mod_date END, - drop_date = NULL -FROM new_chars n - INNER JOIN data_column_chars d ON ( - n.table_groups_id = d.table_groups_id - AND n.schema_name = d.schema_name - AND n.table_name = d.table_name - AND n.column_name = d.column_name - ) -WHERE data_column_chars.table_id = d.table_id - AND data_column_chars.column_name = d.column_name; +INSERT INTO data_structure_log ( + element_id, + change_date, + change, + old_data_type, + new_data_type +) +SELECT u.column_id, + u.last_mod_date, + 'M', + u.old_data_type, + u.db_data_type + FROM update_chars u + WHERE u.old_data_type <> u.db_data_type; + -- Add new records WITH new_chars AS ( @@ -151,48 +171,65 @@ WITH new_chars AS ( position, general_type, column_type, + db_data_type, functional_data_type, run_date FROM {SOURCE_TABLE} WHERE table_groups_id = :TABLE_GROUPS_ID +), +inserted_records AS ( + INSERT INTO data_column_chars ( + table_groups_id, + schema_name, + table_name, + table_id, + 
column_name, + ordinal_position, + general_type, + column_type, + db_data_type, + functional_data_type, + add_date, + last_mod_date + ) + SELECT n.table_groups_id, + n.schema_name, + n.table_name, + dtc.table_id, + n.column_name, + n.position, + n.general_type, + n.column_type, + n.db_data_type, + n.functional_data_type, + n.run_date, + n.run_date + FROM new_chars n + INNER JOIN data_table_chars dtc ON ( + n.table_groups_id = dtc.table_groups_id + AND n.schema_name = dtc.schema_name + AND n.table_name = dtc.table_name + ) + LEFT JOIN data_column_chars d ON ( + n.table_groups_id = d.table_groups_id + AND n.schema_name = d.schema_name + AND n.table_name = d.table_name + AND n.column_name = d.column_name + ) + WHERE d.table_id IS NULL + RETURNING data_column_chars.* ) -INSERT INTO data_column_chars ( - table_groups_id, - schema_name, - table_name, - table_id, - column_name, - ordinal_position, - general_type, - column_type, - functional_data_type, - add_date, - last_mod_date - ) -SELECT n.table_groups_id, - n.schema_name, - n.table_name, - dtc.table_id, - n.column_name, - n.position, - n.general_type, - n.column_type, - n.functional_data_type, - n.run_date, - n.run_date -FROM new_chars n - INNER JOIN data_table_chars dtc ON ( - n.table_groups_id = dtc.table_groups_id - AND n.schema_name = dtc.schema_name - AND n.table_name = dtc.table_name - ) - LEFT JOIN data_column_chars d ON ( - n.table_groups_id = d.table_groups_id - AND n.schema_name = d.schema_name - AND n.table_name = d.table_name - AND n.column_name = d.column_name - ) -WHERE d.table_id IS NULL; +INSERT INTO data_structure_log ( + element_id, + change_date, + change, + new_data_type +) +SELECT i.column_id, + i.add_date, + 'A', + i.db_data_type + FROM inserted_records i; -- Mark dropped records WITH new_chars AS ( @@ -209,18 +246,32 @@ last_run AS ( FROM {SOURCE_TABLE} WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id +), +deleted_records AS ( + UPDATE data_column_chars + SET drop_date = l.last_run_date + FROM last_run l + INNER JOIN data_column_chars d ON (l.table_groups_id = d.table_groups_id) + LEFT JOIN new_chars n ON ( + d.table_groups_id = n.table_groups_id + AND d.schema_name = n.schema_name + AND d.table_name = n.table_name + AND d.column_name = n.column_name + ) + WHERE data_column_chars.table_id = d.table_id + AND data_column_chars.column_name = d.column_name + AND d.drop_date IS NULL + AND n.column_name IS NULL + RETURNING data_column_chars.* ) -UPDATE data_column_chars -SET drop_date = l.last_run_date -FROM last_run l - INNER JOIN data_column_chars d ON (l.table_groups_id = d.table_groups_id) - LEFT JOIN new_chars n ON ( - d.table_groups_id = n.table_groups_id - AND d.schema_name = n.schema_name - AND d.table_name = n.table_name - AND d.column_name = n.column_name - ) -WHERE data_column_chars.table_id = d.table_id - AND data_column_chars.column_name = d.column_name - AND d.drop_date IS NULL - AND n.column_name IS NULL; +INSERT INTO data_structure_log ( + element_id, + change_date, + change, + old_data_type +) +SELECT del.column_id, + del.drop_date, + 'D', + del.db_data_type + FROM deleted_records del; diff --git a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql index 40e7d585..01b65623 100644 --- a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql +++ b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql @@ -74,7 +74,7 @@ BEGIN IF lower_case_sql_flavor IN ('postgres', 'postgresql') THEN 
escaped_value := QUOTE_LITERAL(var_value); - ELSIF lower_case_sql_flavor IN ('redshift', 'snowflake') THEN + ELSIF lower_case_sql_flavor IN ('redshift', 'redshift_spectrum', 'snowflake') THEN escaped_value := TRIM(LEADING 'E' FROM QUOTE_LITERAL(var_value)); ELSIF lower_case_sql_flavor = 'mssql' THEN escaped_value := '''' || REPLACE(var_value, '''', '''''') || ''''; diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 09593c39..ad76f02c 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -40,6 +40,7 @@ CREATE TABLE stg_data_chars_updates ( position INTEGER, general_type VARCHAR(1), column_type VARCHAR(50), + db_data_type VARCHAR(50), functional_data_type VARCHAR(50), record_ct BIGINT ); @@ -78,7 +79,8 @@ CREATE TABLE connections ( private_key BYTEA, private_key_passphrase BYTEA, http_path VARCHAR(200), - warehouse VARCHAR(200) + warehouse VARCHAR(200), + service_account_key BYTEA ); CREATE TABLE table_groups @@ -246,6 +248,7 @@ CREATE TABLE profile_results ( position INTEGER, column_name VARCHAR(120), column_type VARCHAR(50), + db_data_type VARCHAR(50), general_type VARCHAR(1), record_ct BIGINT, value_ct BIGINT, @@ -339,6 +342,7 @@ CREATE TABLE profile_anomaly_results ( table_name VARCHAR(120), column_name VARCHAR(500), column_type VARCHAR(50), + db_data_type VARCHAR(50), anomaly_id VARCHAR(10), detail VARCHAR, disposition VARCHAR(20), -- Confirmed, Dismissed, Inactive @@ -365,15 +369,14 @@ CREATE TABLE profile_pair_rules ( CREATE TABLE data_structure_log ( - project_code VARCHAR(30), - connection_id BIGINT, - change_date TIMESTAMP, - schema_name VARCHAR(50), - table_name VARCHAR(100), - ordinal_position INTEGER, - column_name VARCHAR(100), - data_type VARCHAR(50), - status VARCHAR(10) + log_id UUID DEFAULT gen_random_uuid() + CONSTRAINT pk_dsl_id + PRIMARY KEY, + element_id UUID, + change_date TIMESTAMP, + change VARCHAR(10), + old_data_type VARCHAR(50), + new_data_type VARCHAR(50) ); CREATE TABLE data_table_chars ( @@ -418,6 +421,7 @@ CREATE TABLE data_column_chars ( ordinal_position INTEGER, general_type VARCHAR(1), column_type VARCHAR(50), + db_data_type VARCHAR(50), functional_data_type VARCHAR(50), description VARCHAR(1000), critical_data_element BOOLEAN, @@ -619,7 +623,9 @@ CREATE TABLE target_data_lookups ( sql_flavor VARCHAR(20) NOT NULL, lookup_type VARCHAR(10), lookup_query VARCHAR, - error_type VARCHAR(30) NOT NULL + error_type VARCHAR(30) NOT NULL, + CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk + PRIMARY KEY (test_id, sql_flavor, error_type) ); CREATE TABLE variant_codings ( @@ -900,6 +906,7 @@ CREATE TABLE job_schedules ( kwargs JSONB NOT NULL, cron_expr VARCHAR(50) NOT NULL, cron_tz VARCHAR(30) NOT NULL, + active BOOLEAN DEFAULT TRUE, UNIQUE (project_code, key, args, kwargs, cron_expr, cron_tz) ); diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 47d0e9a9..4b1c20a7 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -12,156 +12,8 @@ ALTER TABLE cat_test_conditions DROP CONSTRAINT cat_test_conditions_cat_tests_te TRUNCATE TABLE profile_anomaly_types; -INSERT INTO profile_anomaly_types - (id, anomaly_type, data_object, anomaly_name, anomaly_description, anomaly_criteria, 
detail_expression, issue_likelihood, suggested_action, dq_score_prevalence_formula, dq_score_risk_factor, dq_dimension) -VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%ch -ar%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighte -n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.', NULL, NULL, NULL), - ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.zero_length_ct > 0 OR (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))))', '''Dummy Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Completeness'), - ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.functional_data_type = ''Zip'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR EXISTS (SELECT 1 FROM UNNEST(STRING_TO_ARRAY(p.top_patterns, '' | '')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0 AND val NOT IN (''NNNNN'',''NNNNN-NNNN'',''NNNNNNNNN'')))', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type ELSE '''' END || CASE WHEN p.general_type = ''A'' THEN ''Patterns: '' || (SELECT string_agg(val, '','') FROM UNNEST(STRING_TO_ARRAY(top_patterns, '' | '')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0) || '', Dummy Values: '' || p.filled_value_ct::VARCHAR ELSE '''' END', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0', 'Validity'), - ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.', NULL, NULL, 'Consistency'), - ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. 
Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.', NULL, NULL, 'Consistency'), - ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Dummy: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33', 'Completeness'), - ('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A'' - AND functional_data_type NOT ILIKE ''Measurement%'' AND functional_data_type NOT IN (''Category'', ''Code'') - AND p.max_length > 3 - AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct) - AND p.distinct_pattern_ct BETWEEN 2 AND 4 - AND STRPOS(p.top_patterns, ''N'') > 0 - AND ( - ( (STRPOS(p.top_patterns, ''A'') > 0 OR STRPOS(p.top_patterns, ''a'') > 0) - AND SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.05) - OR - SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.1 - )', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.', '(p.record_ct - SPLIT_PART(p.top_patterns, ''|'', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), - ('1008', 'Table_Pattern_Mismatch', 'Multi-Col', 'Pattern Inconsistency Across Tables', 'Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic.', 'p.general_type = ''A'' - AND functional_data_type NOT ILIKE ''Measurement%'' AND functional_data_type NOT IN (''Category'', ''Code'') - AND p.max_length > 3 - AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct) - AND m.max_pattern_ct = 1 - AND m.column_ct > 1 - AND SPLIT_PART(p.top_patterns, ''|'', 2) <> SPLIT_PART(m.very_top_pattern, ''|'', 2) - AND SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, ''|'', 1)::NUMERIC < 0.1', '''Patterns: '' || SPLIT_PART(p.top_patterns, ''|'', 2) || '', '' || SPLIT_PART(ltrim(m.very_top_pattern, ''0''), ''|'', 2)', 'Likely', 'Review the profiled patterns for the same column in other tables. 
You may want to add a hygiene step to your processing to make patterns consistent.', NULL, NULL, 'Validity'), - ('1009', 'Leading_Spaces', 'Column', 'Leading Spaces Found in Column Values', 'Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.lead_space_ct > 0', '''Cases Found: '' || p.lead_space_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.lead_space_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), - ('1010', 'Quoted_Values', 'Column', 'Quoted Values Found in Column Values', 'Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.quoted_value_ct > 0', '''Cases Found: '' || p.quoted_value_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.quoted_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), - ('1011', 'Char_Column_Number_Values', 'Column', 'Character Column with Mostly Numeric Values', 'This column is defined as alpha, but more than 95% of its values are numeric. Numbers in alpha columns won''t sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve.', 'p.general_type = ''A'' - AND p.column_name NOT ILIKE ''%zip%'' - AND p.functional_data_type NOT ILIKE ''id%'' - AND p.functional_data_type NOT ILIKE ''Period%'' - AND p.value_ct > p.numeric_ct - AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), - ('1012', 'Char_Column_Date_Values', 'Column', 'Character Column with Mostly Date Values', 'This column is defined as alpha, but more than 95% of its values are dates. Dates in alpha columns might not sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve. ', 'p.general_type = ''A'' - AND p.value_ct > p.date_ct - AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.', 'p.date_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), - ('1013', 'Small Missing Value Ct', 'Column', 'Small Percentage of Missing Values Found', 'Under 3% of values in this column were found to be null, zero-length or dummy values, but values are not universally present. 
This could indicate unexpected missing values in a required column.', '(p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END - )::FLOAT / p.record_ct::FLOAT > 0.97 - AND (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END - ) < p.record_ct', '(p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END - ))::VARCHAR(20) || - '' of '' || p.record_ct::VARCHAR(20) || '' blank values: '' || - ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END - ))::NUMERIC(18, 5) - / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.', '(p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Completeness'), - ('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.', 'functional_data_type <> ''Boolean'' AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / - p.value_ct::FLOAT) > 97::FLOAT - AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / - NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT - / NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40) - || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.', '(p.record_ct - fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Validity'), - ('1015', 'Boolean_Value_Mismatch', 'Column', 'Unexpected Boolean Values Found', 'This column appears to contain boolean (True/False) data, but unexpected values were found. This could indicate inconsistent coding for the same intended values, potentially leading to downstream errors or inconsistent business logic. ', '(distinct_value_ct > 1 AND - ((lower(top_freq_values) ILIKE ''| true |%'' OR lower(top_freq_values) ILIKE ''| false |%'') AND NOT (lower(top_freq_values) ILIKE ''%| true |%'' AND lower(top_freq_values) ILIKE ''%| false |%'')) - OR ((lower(top_freq_values) ILIKE ''| yes |%'' OR lower(top_freq_values) ILIKE ''| no |%'' ) AND NOT (lower(top_freq_values) ILIKE ''%| yes |%'' AND lower(top_freq_values) ILIKE ''%| no |%'')) )', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text - ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', NULL, '0.66', 'Validity'), - ('1016', 'Potential_Duplicates', 'Column', 'Potential Duplicate Values Found', 'This column is largely unique, but some duplicate values are present. This pattern is uncommon and could indicate inadvertant duplication. 
', 'p.distinct_value_ct > 1000 - AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', '(p.value_ct - p.distinct_value_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Uniqueness'), - ('1017', 'Standardized_Value_Matches', 'Column', 'Similar Values Match When Standardized', 'When column values are standardized (removing spaces, single-quotes, periods and dashes), matching values are found in other records. This may indicate that formats should be further standardized to allow consistent comparisons for merges, joins and roll-ups. It could also indicate the presence of unintended duplicates.', 'p.general_type = ''A'' AND p.distinct_std_value_ct <> p.distinct_value_ct AND p.functional_data_type NOT LIKE ''Person%Name'' ', '''Distinct Values: '' || p.distinct_value_ct::VARCHAR - || '', Standardized: '' || p.distinct_std_value_ct::VARCHAR', 'Likely', 'Review standardized vs. raw data values for all matches. Correct data if values should be consistent.', '(p.distinct_value_ct - p.distinct_std_value_ct)::FLOAT/NULLIF(p.value_ct, 0)', '0.66', 'Uniqueness'), - ('1018', 'Unlikely_Date_Values', 'Column', 'Unlikely Dates out of Typical Range', 'Some date values in this column are earlier than 1900-01-01 or later than 30 years after Profiling date.', 'p.general_type = ''D'' - AND (p.min_date BETWEEN ''0001-01-02''::DATE AND ''1900-01-01''::DATE - OR p.max_date > CURRENT_DATE + INTERVAL ''30 year'')', '''Date Range: '' || p.min_date::VARCHAR || '' thru '' || p.max_date::VARCHAR', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.', '(COALESCE(p.before_100yr_date_ct,0)+COALESCE(p.distant_future_date_ct, 0))::FLOAT/NULLIF(p.record_ct, 0)', '0.66', 'Accuracy'), - ('1019', 'Recency_One_Year', 'Dates', 'Recency - No Table Dates within 1 Year', 'Among all date columns present in the table, none fall inside of one year from Profile date.', 'MAX(p.max_date) < CURRENT_DATE - INTERVAL ''1 year''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL, 'Timeliness'), - ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. ', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL, 'Timeliness'), - ('1021', 'Unexpected US States', 'Column', 'Unexpected Column Contains US States', 'This column is not labeled as a state, but contains mostly US State abbreviations. 
This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''STATE_USA'' - AND p.distinct_value_ct > 5 - AND NOT (p.column_name = ''st'' OR p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'' OR p.column_name ILIKE ''st_%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN '', Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.', NULL, '0.33', 'Consistency'), - ('1022', 'Unexpected Emails', 'Column', 'Unexpected Column Contains Emails', 'This column is not labeled as email, but contains mostly email addresses. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''EMAIL'' - AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.', NULL, '0.33', 'Consistency'), - ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'A small fraction (under 3%) of values in this column were found to be numeric. They could be erroneous.', 'p.general_type = ''A'' - AND p.numeric_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT < 0.03 - AND p.numeric_ct > 0', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), - ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1 - AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'') - AND SPLIT_PART(p.top_patterns, '' | '', 2) = ''NNN'' - AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.', '(NULLIF(p.record_ct, 0)::INT - SPLIT_PART(p.top_patterns, '' | '', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1', 'Validity'), - ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.', NULL, '0.66', 'Validity'), - ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', 'This column is defined as alpha, but values include numbers with percents or common units. 
Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.', NULL, '0.33', 'Consistency'), - ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.', NULL, NULL, 'Consistency'), - ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.', NULL, 'CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN 1 WHEN ''B'' THEN 0.66 WHEN ''C'' THEN 0.33 END', 'Validity'), - ('1028', 'Inconsistent_Casing', 'Column', 'Inconsistent Casing', 'Casing is inconsistent for a column representing an entity name or address elements. Mixed-Case and All-Upper-Case values were found in the same column.', 'mixed_case_ct > 0 AND upper_case_ct > 0 AND functional_data_type IN (''Address'', ''City'', ''Entity Name'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Mixed-Case: '' || p.mixed_case_ct::VARCHAR || '', All-Upper-Case: '' || p.upper_case_ct::VARCHAR || '' for Semantic Data Type: '' || p.functional_data_type || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Review your source data and follow-up with data owners to determine whether consistent casing should be applied at the source. 
If source data corrections are not possible, consider standardizing the column upon ingestion to ensure consistent casing.', 'LEAST(p.mixed_case_ct, p.upper_case_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity'), - ('1029', 'Non_Alpha_Name_Address', 'Column', 'Non-Alpha Name or Address', 'Entirely non-alphabetic values were found in a column representing an entity name or address element.', 'non_alpha_ct - zero_length_ct > 0 AND functional_data_type IN (''Address'', ''City'', ''Entity Name'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Non-Alpha Values: '' || (non_alpha_ct - zero_length_ct)::VARCHAR || '', Semantic Type: '' || p.functional_data_type || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Non-alphabetic values are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider assigning the processed value to null to reflect that data is missing.', '(non_alpha_ct - zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity'), - ('1030', 'Non_Alpha_Prefixed_Name', 'Column', 'Non-Alpha Prefixed Name', 'Non-alphabetic characters were found at the start of a column representing an entity name.', 'min_text < ''A'' AND LEFT(min_text, 1) NOT IN (''"'', '' '') AND RIGHT(min_text, 1) <> '''''''' AND functional_data_type IN (''City'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Minimum Value: '' || min_text', 'Definite', 'Values starting with a non-alphabetic character are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. It could also indicate flagging or coding of some kind that can be broken out in a separate column in processed data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider applying corrections directly to processed data where possible.', '0.25', '1.0', 'Validity'), - ('1031', 'Non_Printing_Chars', 'Column', 'Non-Printing Characters', 'Non-printing characters were found embedded in a text column.', 'non_printing_ct > 0', '''Non-Printing Chars: '' || non_printing_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Embedded non-printing characters are typically stripped from data. They affect filters and aggregations, and may cause problems for downstream users who don''t recognize their presence. Review your source data and follow-up with data owners to determine whether this data can be corrected upstream. 
If not, strip these characters from processed data.', 'non_printing_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity') -; - - TRUNCATE TABLE test_types; - -INSERT INTO test_types - (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, dq_score_prevalence_formula, dq_score_risk_factor, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active, result_visualization, result_visualization_params) -VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', '{VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) ) /NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', 'FLOOR(0.95 * max_length::FLOAT)', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the defined threshold, initially 95% of the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. 
This test would not be appropriate for an incremental or windowed dataset.', 'Y', 'line_chart', NULL), - ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself.', 'Y', 'line_chart', NULL), - ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. 
Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y', 'line_chart', NULL), - ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_DAYS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. ', 'Y', 'line_chart', NULL), - ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum > 0 AND functional_table_type LIKE''%cumulative%''', '1', '1.0', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y', 'line_chart', NULL), - ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. 
baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%''', '(({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/NULLIF({PRO_RECORD_CT}::FLOAT, 0))/NULLIF({PRO_RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y', 'line_chart', NULL), - ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 50 AND functional_data_type IN (''Code'', ''Category'', ''Attribute'', ''Description'') AND NOT coalesce(top_freq_values,'''') > ''''', 'ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DISTINCT_VALUE_CT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. 
A failure here would indicate missing records or a change in categories or value assignment.', 'Y', 'line_chart', NULL), - ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y', 'line_chart', NULL), - ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y', 'line_chart', NULL), - ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y', 'line_chart', NULL), - ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. 
baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '{RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y', 'line_chart', NULL), - ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, '1', '1.0', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. 
This would be appropriate for tables where all category values in the column are represented at least once.', 'Y', 'line_chart', NULL), - ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'functional_data_type IN (''Boolean'', ''Code'', ''Category'') AND top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND value_ct > 5', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. 
Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y', 'line_chart', NULL), - ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y', 'line_chart', NULL), - ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N'' AND functional_data_type ILIKE ''Measure%'' AND min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes senes for valid data.', 'Y', 'line_chart', NULL), - ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y', 'line_chart', NULL), - ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%'' AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_MONTHS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of month that you know legitimately have no records.', 'Y', 'line_chart', NULL), - ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'functional_data_type = ''Measurement'' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y', 'line_chart', NULL), - ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'functional_data_type = ''Measurement'' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y', 'line_chart', NULL), - ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, '(functional_data_type IN (''Attribute'', ''DateTime Stamp'', ''Phone'') OR functional_data_type ILIKE ''ID%'' OR functional_data_type ILIKE ''Period%'') AND fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''([*+\-%_])'', ''[\1]'', ''g''), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. 
The expected threshold is the number of records that fail to match the defined pattern.', 'Y', 'line_chart', NULL), - ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', '(ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF(''DAY'', ''{MIN_DATE}'', ''{MAX_DATE}''))::FLOAT)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed.', 'Y', 'line_chart', NULL), - ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct AND record_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y', 'line_chart', NULL), - ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', '({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y', 'line_chart', NULL), - ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y', 'line_chart', NULL), - ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10 AND functional_data_type NOT ILIKE ''Measurement%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y', 'line_chart', NULL), - ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that do not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y', 'line_chart', NULL), - ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%'' AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_WEEKS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y', 'line_chart', NULL), - ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. 
This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y', 'line_chart', NULL), - ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y', 'line_chart', NULL), - ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N', 'line_chart', NULL), - ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N', 'line_chart', NULL), - ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y', 'line_chart', NULL), - ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y', 'line_chart', NULL), - ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. 
Interpretation is based on the user-defined meaning of the test.', 'Y', 'line_chart', NULL), - - ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({THRESHOLD_VALUE}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y', 'line_chart', NULL), - ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', '(100.0 - {RESULT_MEASURE}::FLOAT)/100.0', '1.0', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y', 'line_chart', NULL), - - ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. 
Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y', 'line_chart', NULL), - - ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. 
New categories or combinations will cause failure.', 'Y', 'line_chart', NULL), - ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y', 'line_chart', NULL), - ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous measurements here. 
Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y', 'line_chart', NULL), - ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, '1', '0.75', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. 
Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y', 'line_chart', NULL), - ('1504', 'Aggregate_Balance_Percent', 'Aggregate Balance Percent', 'Aggregate measure per group within percent of reference', 'Tests that aggregate measure for each set of column values fall within a percent range above or below the measure for reference dataset', 'Aggregate measure per set of column values is outside percent range of reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Percent,Upper Tolerance Percent', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a percent|Allowable tolerance above the reference measure expressed as a percent', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerance you set -- that the sum of a measure or count of a value remains sufficiently consistent between categories. You could use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 5% below to 10% above the prior month. An error here means that one or more value combinations fail to match within the set tolerances. 
New categories or combinations will cause failure.', 'Y', 'line_chart', NULL), - ('1505', 'Aggregate_Balance_Range', 'Aggregate Balance Range', 'Aggregate measure per group within hard range of reference', 'Tests that aggregate measure for each set of column values fall within a hard range above or below the measure for reference dataset', 'Aggregate measure per set of column values is outside expected range of reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Constant,Upper Tolerance Constant', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a constant value|Allowable tolerance above the reference measure expressed as a constant value', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerances you define as specific values above or below the aggregate measure for the same categories in the reference dataset -- that the sum of a measure or count of a value remains sufficiently consistent between categories. For instance, you can use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 10000 dollars above or below the prior week. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.', 'Y', 'line_chart', NULL), - ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. 
Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y', 'line_chart', NULL), - ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y', 'line_chart', NULL), - ('1510', 'Dupe_Rows', 'Duplicate Rows', 'Rows are not duplicated in table', 'Tests for the absence of duplicate rows based on unique combination of column values', 'Column value combinations are duplicated in the table.', 'Duplicate records', NULL, NULL, '(({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'null', 'null', 'groupby_names', NULL, 'Columns to Compare', 'List of columns in the table that define a duplicate record when the combination of values is repeated on multiple rows', 'Fail', 'QUERY', 'table', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate value combinations', 'This test verifies that combinations of values are not repeated within the table. By default when auto-generated, the test considers all columns to protect against duplication of entire rows. If you know the minimum columns that should constitute a unique record, such as a set of ID''s, you should use those to make the test as sensitive as possible. 
Alternatively, if you know of columns you can always exclude, such as file_date or refresh_snapshot_id, remove them to tighten the test somewhat.', 'Y', 'line_chart', NULL), - ('1511', 'Table_Freshness', 'Table Freshness', 'Stale Table Not Updated', 'Confirms whether table has been updated based on data fingerprint', 'Table has not been updated.', 'Was Change Detected', NULL, 'TEMPLATE', '(({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.0', 'null', 'null', 'history_calculation,history_lookback,subset_condition,custom_query', NULL, 'History Aggregate,History Lookback,Record Subset Condition,Fingerprint Expression', 'Aggregate calculation to be performed on the N lookback results|Last N tests to use for history aggregate calculation|Condition defining a subset of records in main table|String expression combining key column measures into a distinct representation of table state', 'Log', 'QUERY', 'table', 'Recency', 'Recency', 'Most recent prior table fingerprint', 'This test compares the current table fingerprint, calculated signature of column contents, to confirm that the table has been updated. The table fingerprint is derived from a set of values and aggregates from columns most likely to change. This test allows you to track the schedule and frequency of updates and refreshes to the table.', 'Y', 'binary_chart', '{"legend":{"labels":{"0":"Stale","1":"Updated"}}}') -; - - TRUNCATE TABLE generation_sets; INSERT INTO generation_sets (generation_set, test_type) @@ -176,1619 +28,10 @@ VALUES ('Monitor', 'Recency'), TRUNCATE TABLE test_templates; -INSERT INTO test_templates (id, test_type, sql_flavor, template_name) -VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), - ('2002', 'Aggregate_Minimum', 'redshift', 'ex_aggregate_match_no_drops_generic.sql'), - ('2003', 'Distribution_Shift', 'redshift', 'ex_relative_entropy_generic.sql'), - ('2004', 'CUSTOM', 'redshift', 'ex_custom_query_generic.sql'), - ('2006', 'Aggregate_Balance', 'redshift', 'ex_aggregate_match_same_generic.sql'), - ('2007', 'Timeframe_Combo_Gain', 'redshift', 'ex_window_match_no_drops_generic.sql'), - ('2008', 'Timeframe_Combo_Match', 'redshift', 'ex_window_match_same_generic.sql'), - ('2009', 'Aggregate_Balance_Percent', 'redshift', 'ex_aggregate_match_percent_generic.sql'), - ('2010', 'Aggregate_Balance_Range', 'redshift', 'ex_aggregate_match_range_generic.sql'), - ('2011', 'Dupe_Rows', 'redshift', 'ex_dupe_rows_generic.sql'), - - ('2101', 'Combo_Match', 'snowflake', 'ex_data_match_generic.sql'), - ('2102', 'Aggregate_Minimum', 'snowflake', 'ex_aggregate_match_no_drops_generic.sql'), - ('2103', 'Distribution_Shift', 'snowflake', 'ex_relative_entropy_generic.sql'), - ('2104', 'CUSTOM', 'snowflake', 'ex_custom_query_generic.sql'), - ('2106', 'Aggregate_Balance', 'snowflake', 'ex_aggregate_match_same_generic.sql'), - ('2107', 'Timeframe_Combo_Gain', 'snowflake', 'ex_window_match_no_drops_generic.sql'), - ('2108', 'Timeframe_Combo_Match', 'snowflake', 'ex_window_match_same_generic.sql'), - ('2109', 'Aggregate_Balance_Percent', 'snowflake', 'ex_aggregate_match_percent_generic.sql'), - ('2110', 'Aggregate_Balance_Range', 'snowflake', 'ex_aggregate_match_range_generic.sql'), - ('2111', 'Dupe_Rows', 'snowflake', 'ex_dupe_rows_generic.sql'), - - ('2201', 'Combo_Match', 'mssql', 'ex_data_match_generic.sql'), - ('2202', 'Aggregate_Minimum', 'mssql', 'ex_aggregate_match_no_drops_generic.sql'), - ('2203', 'Distribution_Shift', 'mssql', 'ex_relative_entropy_mssql.sql'), - ('2204', 
'CUSTOM', 'mssql', 'ex_custom_query_generic.sql'), - ('2206', 'Aggregate_Balance', 'mssql', 'ex_aggregate_match_same_generic.sql'), - ('2207', 'Timeframe_Combo_Gain', 'mssql', 'ex_window_match_no_drops_generic.sql'), - ('2208', 'Timeframe_Combo_Match', 'mssql', 'ex_window_match_same_generic.sql'), - ('2209', 'Aggregate_Balance_Percent', 'mssql', 'ex_aggregate_match_percent_generic.sql'), - ('2210', 'Aggregate_Balance_Range', 'mssql', 'ex_aggregate_match_range_generic.sql'), - ('2211', 'Dupe_Rows', 'mssql', 'ex_dupe_rows_generic.sql'), - - ('2301', 'Combo_Match', 'postgresql', 'ex_data_match_generic.sql'), - ('2302', 'Aggregate_Minimum', 'postgresql', 'ex_aggregate_match_no_drops_generic.sql'), - ('2303', 'Distribution_Shift', 'postgresql', 'ex_relative_entropy_generic.sql'), - ('2304', 'CUSTOM', 'postgresql', 'ex_custom_query_generic.sql'), - ('2306', 'Aggregate_Balance', 'postgresql', 'ex_aggregate_match_same_generic.sql'), - ('2307', 'Timeframe_Combo_Gain', 'postgresql', 'ex_window_match_no_drops_postgresql.sql'), - ('2308', 'Timeframe_Combo_Match', 'postgresql', 'ex_window_match_same_postgresql.sql'), - ('2309', 'Aggregate_Balance_Percent', 'postgresql', 'ex_aggregate_match_percent_generic.sql'), - ('2310', 'Aggregate_Balance_Range', 'postgresql', 'ex_aggregate_match_range_generic.sql'), - ('2311', 'Dupe_Rows', 'postgresql', 'ex_dupe_rows_generic.sql'), - - ('2401', 'Combo_Match', 'databricks', 'ex_data_match_generic.sql'), - ('2402', 'Aggregate_Minimum', 'databricks', 'ex_aggregate_match_no_drops_generic.sql'), - ('2403', 'Distribution_Shift', 'databricks', 'ex_relative_entropy_generic.sql'), - ('2404', 'CUSTOM', 'databricks', 'ex_custom_query_generic.sql'), - ('2406', 'Aggregate_Balance', 'databricks', 'ex_aggregate_match_same_generic.sql'), - ('2407', 'Timeframe_Combo_Gain', 'databricks', 'ex_window_match_no_drops_databricks.sql'), - ('2408', 'Timeframe_Combo_Match', 'databricks', 'ex_window_match_same_databricks.sql'), - ('2409', 'Aggregate_Balance_Percent', 'databricks', 'ex_aggregate_match_percent_generic.sql'), - ('2410', 'Aggregate_Balance_Range', 'databricks', 'ex_aggregate_match_range_generic.sql'), - ('2411', 'Dupe_Rows', 'databricks', 'ex_dupe_rows_generic.sql'), - - ('2012', 'Table_Freshness', 'redshift', 'ex_table_changed_generic.sql'), - ('2112', 'Table_Freshness', 'snowflake', 'ex_table_changed_generic.sql'), - ('2212', 'Table_Freshness', 'mssql', 'ex_table_changed_mssql.sql'), - ('2312', 'Table_Freshness', 'postgresql', 'ex_table_changed_generic.sql'), - ('2412', 'Table_Freshness', 'databricks', 'ex_table_changed_generic.sql') -; - TRUNCATE TABLE cat_test_conditions; -INSERT INTO cat_test_conditions (id, test_type, sql_flavor, measure, test_operator, test_condition) -VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), - ('1002', 'Avg_Shift', 'redshift', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'), - ('1003', 'Condition_Flag', 'redshift', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1004', 'Constant', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1005', 'Daily_Record_Ct', 'redshift', 'DATEDIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', 
'{THRESHOLD_VALUE}'), - ('1006', 'Dec_Trunc', 'redshift', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), - ('1007', 'Distinct_Date_Ct', 'redshift', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('1008', 'Distinct_Value_Ct', 'redshift', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('1009', 'Email_Format', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1010', 'Future_Date', 'redshift', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'), - ('1011', 'Future_Date_1Y', 'redshift', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'), - ('1012', 'Incr_Avg_Shift', 'redshift', 'NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'), - ('1013', 'LOV_All', 'redshift', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('1014', 'LOV_Match', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1015', 'Min_Date', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1016', 'Min_Val', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1017', 'Missing_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('1018', 'Monthly_Rec_Ct', 'redshift', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), - ('1019', 'Outlier_Pct_Above', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('1020', 'Outlier_Pct_Below', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('1021', 'Pattern_Match', 'redshift', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM((NULLIF({COLUMN_NAME}, '''') SIMILAR TO ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'), - ('1022', 'Recency', 'redshift', 'DATEDIFF(''D'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'), - ('1023', 'Required', 'redshift', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), - ('1024', 'Row_Ct', 'redshift', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), - ('1025', 'Row_Ct_Pct', 'redshift', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'), - ('1026', 'Street_Addr_Pattern', 'redshift', '100.0*SUM(({COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('1027', 'US_State', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN 
('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1028', 'Unique', 'redshift', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('1029', 'Unique_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('1030', 'Weekly_Rec_Ct', 'redshift', 'MAX(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'), - ('2001', 'Alpha_Trunc', 'snowflake', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), - ('2002', 'Avg_Shift', 'snowflake', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV({COLUMN_NAME}::FLOAT),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'), - ('2003', 'Condition_Flag', 'snowflake', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2004', 'Constant', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2005', 'Daily_Record_Ct', 'snowflake', 'DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('2006', 'Dec_Trunc', 'snowflake', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), - ('2007', 'Distinct_Date_Ct', 'snowflake', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('2008', 'Distinct_Value_Ct', 'snowflake', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('2009', 'Email_Format', 'snowflake', 'SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2010', 'Future_Date', 'snowflake', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'), - ('2011', 'Future_Date_1Y', 'snowflake', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'), - ('2012', 'Incr_Avg_Shift', 'snowflake', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'), - ('2013', 'LOV_All', 'snowflake', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('2014', 'LOV_Match', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2015', 'Min_Date', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2016', 'Min_Val', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - 
('2017', 'Missing_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('2018', 'Monthly_Rec_Ct', 'snowflake', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), - ('2019', 'Outlier_Pct_Above', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('2020', 'Outlier_Pct_Below', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('2021', 'Pattern_Match', 'snowflake', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''''), ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'), - ('2022', 'Recency', 'snowflake', 'DATEDIFF(''D'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'), - ('2023', 'Required', 'snowflake', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), - ('2024', 'Row_Ct', 'snowflake', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), - ('2025', 'Row_Ct_Pct', 'snowflake', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'), - ('2026', 'Street_Addr_Pattern', 'snowflake', '100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$''))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('2027', 'US_State', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2028', 'Unique', 'snowflake', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('2029', 'Unique_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('2030', 'Weekly_Rec_Ct', 'snowflake', 'MAX(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'), - ('3001', 'Alpha_Trunc', 'mssql', 'MAX(LEN({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), - ('3002', 'Avg_Shift', 'mssql', 'ABS( (AVG(CAST({COLUMN_NAME} AS FLOAT)) - CAST({BASELINE_AVG} as FLOAT)) / SQRT(((COUNT({COLUMN_NAME})-1)*POWER(STDEV(CAST({COLUMN_NAME} AS FLOAT)), 2) + ({BASELINE_VALUE_CT}-1) * POWER(CAST({BASELINE_SD} as FLOAT), 2)) /NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0) ))', '>=', '{THRESHOLD_VALUE}'), - ('3003', 'Condition_Flag', 'mssql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', 
'{THRESHOLD_VALUE}'), - ('3004', 'Constant', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3005', 'Daily_Record_Ct', 'mssql', 'DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('3006', 'Dec_Trunc', 'mssql', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), - ('3007', 'Distinct_Date_Ct', 'mssql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('3008', 'Distinct_Value_Ct', 'mssql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('3009', 'Email_Format', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} NOT LIKE ''[A-Za-z0-9._''''%+-]%@[A-Za-z0-9.-]%.[A-Za-z][A-Za-z]%'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3010', 'Future_Date', 'mssql', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CONVERT(DATE, ''{RUN_DATE}'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3011', 'Future_Date_1Y', 'mssql', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, ''{RUN_DATE}'')) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3012', 'Incr_Avg_Shift', 'mssql', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS FLOAT) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'), - ('3013', 'LOV_All', 'mssql', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('3014', 'LOV_Match', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3015', 'Min_Date', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3016', 'Min_Val', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3017', 'Missing_Pct', 'mssql', 'ABS( 2.0 * ASIN( SQRT( CAST({BASELINE_VALUE_CT} AS FLOAT) / CAST({BASELINE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT(*), 0) AS FLOAT) )) )', '>=', '{THRESHOLD_VALUE}'), - ('3018', 'Monthly_Rec_Ct', 'mssql', '(MAX(DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}''AS DATE))) - MIN(DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}''AS DATE)))', '>', '{THRESHOLD_VALUE}'), - ('3019', 'Outlier_Pct_Above', 'mssql', 'CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '>', '{THRESHOLD_VALUE}'), - ('3020', 'Outlier_Pct_Below', 'mssql', 'CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS FLOAT) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '>', '{THRESHOLD_VALUE}'), - ('3021', 'Pattern_Match', 'mssql', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - CAST(SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') LIKE ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END) AS BIGINT)', '>', '{THRESHOLD_VALUE}'), - ('3022', 'Recency', 'mssql', 'DATEDIFF(day, MAX({COLUMN_NAME}), CAST(''{RUN_DATE}''AS DATE))', '>', '{THRESHOLD_VALUE}'), - ('3023', 'Required', 'mssql', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), - ('3024', 'Row_Ct', 'mssql', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), - ('3025', 'Row_Ct_Pct', 'mssql', 'ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT} ) AS FLOAT)/ CAST({BASELINE_CT} AS FLOAT), 2))', '>', 
'{THRESHOLD_VALUE}'), - ('3026', 'Street_Addr_Pattern', 'mssql', 'CAST(100.0*SUM(CASE WHEN UPPER({COLUMN_NAME}) LIKE ''[1-9]% [A-Z]% %'' AND CHARINDEX('' '', {COLUMN_NAME}) BETWEEN 2 AND 6 THEN 1 ELSE 0 END) as FLOAT) /CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '<', '{THRESHOLD_VALUE}'), - ('3027', 'US_State', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3028', 'Unique', 'mssql', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('3029', 'Unique_Pct', 'mssql', 'ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS FLOAT) / CAST({BASELINE_VALUE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS FLOAT) )) )', '>=', '{THRESHOLD_VALUE}'), - ('3030', 'Weekly_Rec_Ct', 'mssql', 'MAX(DATEDIFF(week, CAST(''1800-01-01'' AS DATE), {COLUMN_NAME})) - MIN(DATEDIFF(week, CAST(''1800-01-01'' AS DATE), {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, CAST(''1800-01-01'' AS DATE), {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'), - ('4001', 'Alpha_Trunc', 'postgresql', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), - ('4002', 'Avg_Shift', 'postgresql', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'), - ('4003', 'Condition_Flag', 'postgresql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4004', 'Constant', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4005', 'Daily_Record_Ct', 'postgresql', '<%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('4006', 'Dec_Trunc', 'postgresql', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), - ('4007', 'Distinct_Date_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('4008', 'Distinct_Value_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('4009', 'Email_Format', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4010', 'Future_Date', 'postgresql', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'), - ('4011', 'Future_Date_1Y', 'postgresql', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'), - ('4012', 'Incr_Avg_Shift', 'postgresql', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'), - ('4013', 'LOV_All', 'postgresql', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('4014', 'LOV_Match', 'postgresql', 
'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4015', 'Min_Date', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4016', 'Min_Val', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4017', 'Missing_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('4018', 'Monthly_Rec_Ct', 'postgresql', '(MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>)', '>', '{THRESHOLD_VALUE}'), - ('4019', 'Outlier_Pct_Above', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('4020', 'Outlier_Pct_Below', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('4021', 'Pattern_Match', 'postgresql', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') ~ ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4022', 'Recency', 'postgresql', '<%DATEDIFF_DAY;MAX({COLUMN_NAME});''{RUN_DATE}''::DATE%>', '>', '{THRESHOLD_VALUE}'), - ('4023', 'Required', 'postgresql', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('4024', 'Row_Ct', 'postgresql', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), - ('4025', 'Row_Ct_Pct', 'postgresql', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::DECIMAL(18,4) / {BASELINE_CT}::DECIMAL(18,4), 2))', '>', '{THRESHOLD_VALUE}'), - ('4026', 'Street_Addr_Pattern', 'postgresql', '100.0*SUM(CASE WHEN {COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'' THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('4027', 'US_State', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4028', 'Unique', 'postgresql', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('4029', 'Unique_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('4030', 'Weekly_Rec_Ct', 'postgresql', 'MAX(<%DATEDIFF_WEEK;''1800-01-01''::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;''1800-01-01''::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;''1800-01-01''::DATE;{COLUMN_NAME}%>)', '>', '{THRESHOLD_VALUE}'), - - ('1031', 'Variability_Increase', 'redshift', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', 
'>', '{THRESHOLD_VALUE}'), - ('1032', 'Variability_Decrease', 'redshift', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('2031', 'Variability_Increase', 'snowflake', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('2032', 'Variability_Decrease', 'snowflake', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('3031', 'Variability_Increase', 'mssql', '100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT)', '>', '{THRESHOLD_VALUE}'), - ('3032', 'Variability_Decrease', 'mssql', '100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT)', '<', '{THRESHOLD_VALUE}'), - ('4031', 'Variability_Increase', 'postgresql', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('4032', 'Variability_Decrease', 'postgresql', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('6031', 'Variability_Increase', 'databricks', '100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('6032', 'Variability_Decrease', 'databricks', '100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'), - - ('5001', 'Alpha_Trunc', 'trino', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), - ('5002', 'Avg_Shift', 'trino', 'ABS( (CAST(AVG({COLUMN_NAME} AS REAL)) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*STDDEV({COLUMN_NAME})^2 + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * CAST({BASELINE_SD} AS REAL)^2) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) ))', '>=', '{THRESHOLD_VALUE}'), - ('5003', 'Condition_Flag', 'trino', 'SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5004', 'Constant', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5005', 'Daily_Record_Ct', 'trino', 'DATE_DIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('5006', 'Dec_Trunc', 'trino', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), - ('5007', 'Distinct_Date_Ct', 'trino', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('5008', 'Distinct_Value_Ct', 'trino', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('5009', 'Email_Format', 'trino', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') != TRUE THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5010', 'Future_Date', 'trino', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST(''{RUN_DATE}'' AS DATE) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5011', 'Future_Date_1Y', 'trino', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE(''{RUN_DATE}'') + interval ''365'' day ) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5012', 'Incr_Avg_Shift', 'trino', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'), - ('5013', 'LOV_All', 'trino', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('5014', 'LOV_Match', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5015', 'Min_Date', 'trino', 'SUM(CASE WHEN 
{COLUMN_NAME} < CAST(''{BASELINE_VALUE}'' AS DATE) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5016', 'Min_Val', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5017', 'Missing_Pct', 'trino', 'ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) )))', '>=', '{THRESHOLD_VALUE}'), - ('5018', 'Monthly_Rec_Ct', 'trino', '(MAX(DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) - MIN(DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE)))', '>', '{THRESHOLD_VALUE}'), - ('5019', 'Outlier_Pct_Above', 'trino', 'CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)', '>', '{THRESHOLD_VALUE}'), - ('5020', 'Outlier_Pct_Below', 'trino', 'CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS REAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)', '>', '{THRESHOLD_VALUE}'), - ('5021', 'Pattern_Match', 'trino', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF({COLUMN_NAME}, '''') , ''{BASELINE_VALUE}'') = TRUE THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5022', 'Recency', 'trino', 'DATE_DIFF(''day'', MAX({COLUMN_NAME}), CAST(''{RUN_DATE}'' AS DATE))', '>', '{THRESHOLD_VALUE}'), - ('5023', 'Required', 'trino', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('5024', 'Row_Ct', 'trino', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), - ('5025', 'Row_Ct_Pct', 'trino', 'ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT}) AS DECIMAL(18,4)) /CAST( {BASELINE_CT} AS DECIMAL(18,4) ), 2))', '>', '{THRESHOLD_VALUE}'), - ('5026', 'Street_Addr_Pattern', 'trino', 'CAST(100.0*SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'') = TRUE THEN 1 ELSE 0 END) AS REAL )/ CAST(COUNT({COLUMN_NAME}) AS REAL)', '<', '{THRESHOLD_VALUE}'), - ('5027', 'US_State', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5028', 'Unique', 'trino', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('5029', 'Unique_Pct', 'trino', 'ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) )))', '>=', '{THRESHOLD_VALUE}'), - ('5030', 'Weekly_Rec_Ct', 'trino', 'MAX(DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), {COLUMN_NAME})) - MIN(DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), {COLUMN_NAME})) +1 - COUNT(DISTINCT DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'), - ('5031', 'Variability_Increase', 'trino', '100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD}', '>', 
'{THRESHOLD_VALUE}'), - ('5032', 'Variability_Decrease', 'trino', '100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'), - - ('6001', 'Alpha_Trunc', 'databricks', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), - ('6002', 'Avg_Shift', 'databricks', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV_SAMP({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'), - ('6003', 'Condition_Flag', 'databricks', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6004', 'Constant', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6005', 'Daily_Record_Ct', 'databricks', '<%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('6006', 'Dec_Trunc', 'databricks', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), - ('6007', 'Distinct_Date_Ct', 'databricks', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('6008', 'Distinct_Value_Ct', 'databricks', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('6009', 'Email_Format', 'databricks', 'SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6010', 'Future_Date', 'databricks', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'), - ('6011', 'Future_Date_1Y', 'databricks', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'), - ('6012', 'Incr_Avg_Shift', 'databricks', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'), - ('6013', 'LOV_All', 'databricks', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), - ('6014', 'LOV_Match', 'databricks', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6015', 'Min_Date', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6016', 'Min_Val', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6017', 'Missing_Pct', 'databricks', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('6018', 'Monthly_Rec_Ct', 'databricks', '(MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>)', '>', '{THRESHOLD_VALUE}'), - ('6019', 'Outlier_Pct_Above', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('6020', 'Outlier_Pct_Below', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('6021', 
'Pattern_Match', 'databricks', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::STRING, ''''), ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'), - ('6022', 'Recency', 'databricks', '<%DATEDIFF_DAY;MAX({COLUMN_NAME});''{RUN_DATE}''::DATE%>', '>', '{THRESHOLD_VALUE}'), - ('6023', 'Required', 'databricks', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), - ('6024', 'Row_Ct', 'databricks', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), - ('6025', 'Row_Ct_Pct', 'databricks', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'), - ('6026', 'Street_Addr_Pattern', 'databricks', '100.0*SUM((regexp_like({COLUMN_NAME}::STRING, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$''))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), - ('6027', 'US_State', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6028', 'Unique', 'databricks', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('6029', 'Unique_Pct', 'databricks', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('6030', 'Weekly_Rec_Ct', 'databricks', 'CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC(''week'', {COLUMN_NAME})) AS INT)', '>', '{THRESHOLD_VALUE}'), - - ('1033', 'Valid_Month', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2033', 'Valid_Month', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3033', 'Valid_Month', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4033', 'Valid_Month', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5033', 'Valid_Month', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6033', 'Valid_Month', 'databricks', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - - ('1034', 'Valid_US_Zip', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4034', 'Valid_US_Zip', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2034', 'Valid_US_Zip', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 
END)', '>', '{THRESHOLD_VALUE}'), - ('5034', 'Valid_US_Zip', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3034', 'Valid_US_Zip', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6034', 'Valid_US_Zip', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - - ('1035', 'Valid_US_Zip3', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4035', 'Valid_US_Zip3', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2035', 'Valid_US_Zip3', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5035', 'Valid_US_Zip3', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3035', 'Valid_US_Zip3', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6035', 'Valid_US_Zip3', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - - ('1036', 'Valid_Characters', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4036', 'Valid_Characters', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2036', 'Valid_Characters', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5036', 'Valid_Characters', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3036', 'Valid_Characters', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6036', 'Valid_Characters', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || 
CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}');
- TRUNCATE TABLE target_data_lookups;
-INSERT INTO target_data_lookups
-(id, test_id, error_type, test_type, sql_flavor, lookup_type, lookup_query)
-VALUES
- ('1001', '1004', 'Test Results', 'Alpha_Trunc', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'),
- ('1002', '1005', 'Test Results', 'Avg_Shift', 'redshift', NULL, 'SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1003', '1006', 'Test Results', 'Condition_Flag', 'redshift', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'),
- ('1004', '1007', 'Test Results', 'Constant', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1005', '1009', 'Test Results', 'Daily_Record_Ct', 'redshift', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;'),
- ('1006', '1011', 'Test Results', 'Dec_Trunc', 'redshift', NULL, 'SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;'),
- ('1007', '1012', 'Test Results', 'Distinct_Date_Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1008', '1013', 'Test Results', 'Distinct_Value_Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1009', '1014', 'Test Results', 'Email_Format', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1010', '1015', 'Test 
Results', 'Future_Date', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1011', '1016', 'Test Results', 'Future_Date_1Y', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1012', '1017', 'Test Results', 'Incr_Avg_Shift', 'redshift', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1013', '1018', 'Test Results', 'LOV_All', 'redshift', NULL, 'SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> ''{THRESHOLD_VALUE}'' LIMIT 500;'), - ('1014', '1019', 'Test Results', 'LOV_Match', 'redshift', NULL, 'SELECT DISTINCT NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1015', '1020', 'Test Results', 'Min_Date', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1016', '1021', 'Test Results', 'Min_Val', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;'), - ('1017', '1022', 'Test Results', 'Missing_Pct', 'redshift', NULL, 'SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '''' ;'), - ('1018', '1023', 'Test Results', 'Monthly_Rec_Ct', 'redshift', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''month'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''month'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'), - ('1019', '1024', 'Test Results', 'Outlier_Pct_Above', 'redshift', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", 
COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1020', '1025', 'Test Results', 'Outlier_Pct_Below', 'redshift', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1021', '1026', 'Test Results', 'Pattern_Match', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT SIMILAR TO ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'), - ('1022', '1028', 'Test Results', 'Recency', 'redshift', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF(''D'', col, ''{TEST_DATE}''::DATE) > {THRESHOLD_VALUE};'), - ('1023', '1030', 'Test Results', 'Required', 'redshift', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'), - ('1024', '1031', 'Test Results', 'Row_Ct', 'redshift', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'), - ('1025', '1032', 'Test Results', 'Row_Ct_Pct', 'redshift', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;'), - ('1026', '1033', 'Test Results', 'Street_Addr_Pattern', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1027', '1036', 'Test Results', 'US_State', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1028', '1034', 'Test Results', 'Unique', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1029', '1035', 'Test Results', 'Unique_Pct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1030', '1037', 'Test Results', 'Weekly_Rec_Ct', 'redshift', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''week'',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM 
{TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL ''1 week'' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''week'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'), - ('1031', '1040', 'Test Results', 'Variability_Increase', 'redshift', NULL, 'SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1032', '1041', 'Test Results', 'Variability_Decrease', 'redshift', NULL, 'SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - - ('1033', '1001', 'Profile Anomaly' , 'Suggested_Type', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1034', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1035', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), - ('1036', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY 
data_type, table_name;'), - ('1037', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type, table_name;'), - ('1038', '1006', 'Profile Anomaly' , 'No_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1039', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;' ), - ('1040', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'redshift', NULL, 'SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type;' ), - ('1041', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1042', '1010', 'Profile Anomaly' , 'Quoted_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 
ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ), - ('1043', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1044', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1045', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1046', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1047', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1048', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ), - ('1049', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'redshift', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ), - ('1050', '1018', 'Profile 
Anomaly' , 'Unlikely_Date_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < ''1900-01-01''::DATE) OR ("{COLUMN_NAME}" > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1051', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'redshift', NULL, 'created_in_ui' ), - ('1052', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'redshift', NULL, 'created_in_ui' ), - ('1053', '1021', 'Profile Anomaly' , 'Unexpected US States', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1054', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1055', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1056', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), - ('1057', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\\s(and|but|or|yet)\\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), - - ('1058', '1001', 'Profile Anomaly' , 'Suggested_Type', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), - ('1059', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1060', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM 
{TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), - ('1061', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1062', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1063', '1006', 'Profile Anomaly' , 'No_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1064', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', 
''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ), - ('1065', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'postgresql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY columns.table_name;' ), - ('1066', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1067', '1010', 'Profile Anomaly' , 'Quoted_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ), - ('1068', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), - ('1069', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;' ), - ('1070', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', 
''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1071', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1072', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1073', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ), - ('1074', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'postgresql', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ), - ('1075', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < ''1900-01-01''::DATE) OR ("{COLUMN_NAME}" > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1076', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'postgresql', NULL, 'created_in_ui' ), - ('1077', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'postgresql', NULL, 'created_in_ui' ), - ('1078', '1021', 'Profile Anomaly' , 'Unexpected US States', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1079', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1080', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), - ('1081', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY 
count DESC, "{COLUMN_NAME}" LIMIT 500;'), - ('1082', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\s(and|but|or|yet)\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), - - - ('1083', '1004', 'Test Results', 'Alpha_Trunc', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'), - ('1084', '1005', 'Test Results', 'Avg_Shift', 'postgresql', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1085', '1006', 'Test Results', 'Condition_Flag', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'), - ('1086', '1007', 'Test Results', 'Constant', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1087', '1009', 'Test Results', 'Daily_Record_Ct', 'postgresql', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL ''1 day'') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500;'), - ('1088', '1011', 'Test Results', 'Dec_Trunc', 'postgresql', NULL, 'SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, ''.'', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;'), - ('1089', '1012', 'Test Results', 'Distinct_Date_Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1090', '1013', 'Test Results', 'Distinct_Value_Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1091', '1014', 'Test Results', 'Email_Format', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ 
''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1092', '1015', 'Test Results', 'Future_Date', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1093', '1016', 'Test Results', 'Future_Date_1Y', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1094', '1017', 'Test Results', 'Incr_Avg_Shift', 'postgresql', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1095', '1018', 'Test Results', 'LOV_All', 'postgresql', NULL, 'SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", ''|'' ORDER BY "{COLUMN_NAME}" ASC) FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", ''|'' ORDER BY "{COLUMN_NAME}" ASC) <> ''{THRESHOLD_VALUE}'' LIMIT 500;'), - ('1096', '1019', 'Test Results', 'LOV_Match', 'postgresql', NULL, 'SELECT DISTINCT NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1097', '1020', 'Test Results', 'Min_Date', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1098', '1021', 'Test Results', 'Min_Val', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;'), - ('1099', '1022', 'Test Results', 'Missing_Pct', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '''' LIMIT 10;'), - ('1100', '1023', 'Test Results', 'Monthly_Rec_Ct', 'postgresql', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''month'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL ''1 month'') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''month'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'), - ('1101', '1024', 'Test 
Results', 'Outlier_Pct_Above', 'postgresql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1102', '1025', 'Test Results', 'Outlier_Pct_Below', 'postgresql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1103', '1026', 'Test Results', 'Pattern_Match', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT SIMILAR TO ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'), - ('1104', '1028', 'Test Results', 'Recency', 'postgresql', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE <%DATEDIFF_DAY;col;''{TEST_DATE}''::DATE%> > {THRESHOLD_VALUE};'), - ('1105', '1030', 'Test Results', 'Required', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'), - ('1106', '1031', 'Test Results', 'Row_Ct', 'postgresql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'), - ('1107', '1032', 'Test Results', 'Row_Ct_Pct', 'postgresql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte;'), - ('1108', '1033', 'Test Results', 'Street_Addr_Pattern', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1109', '1036', 'Test Results', 'US_State', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1110', '1034', 'Test Results', 'Unique', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1111', '1035', 'Test Results', 'Unique_Pct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1112', '1037', 'Test 
Results', 'Weekly_Rec_Ct', 'postgresql', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''week'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL ''1 week'' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''week'' , MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC(''week'', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''week'', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'), - ('1113', '1040', 'Test Results', 'Variability_Increase', 'postgresql', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1114', '1041', 'Test Results', 'Variability_Decrease', 'postgresql', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - - ('1115', '1001', 'Profile Anomaly' , 'Suggested_Type', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1116', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'') OR "{COLUMN_NAME}" LIKE '' '' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1117', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1118', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = 
''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1119', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1120', '1006', 'Profile Anomaly' , 'No_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1121', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'mssql', NULL, 'WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX(''| ''+ TRIM(value) + '' |'', ''| '' + ''{DETAIL_EXPRESSION}'' + '' |'' ) ASC) as row_num FROM STRING_SPLIT(''{DETAIL_EXPRESSION}'', ''|'') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', 
''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;' ), - ('1122', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name;' ), - ('1123', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1124', '1010', 'Profile Anomaly' , 'Quoted_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE ''"%"'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1125', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1126', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1127', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1128', '1014', 'Profile Anomaly' 
, 'Small Divergent Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1129', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1130', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;' ), - ('1131', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'mssql', NULL, 'WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC;' ), - ('1132', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", CAST( ''{PROFILE_RUN_DATE}'' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST(''1900-01-01'' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST(''{PROFILE_RUN_DATE}'' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), - ('1133', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'mssql', NULL, 'created_in_ui' ), - ('1134', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'mssql', NULL, 'created_in_ui' ), - ('1135', '1021', 'Profile Anomaly' , 'Unexpected US States', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), - ('1136', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), - ('1137', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1138', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";'), - ('1139', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'mssql', NULL, 'SELECT TOP 500 
"{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE ''%,%,%,%'' OR "{COLUMN_NAME}" LIKE ''%|%|%|%'' OR "{COLUMN_NAME}" LIKE ''%^%^%^%'' OR "{COLUMN_NAME}" LIKE ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' ) AND NOT ( "{COLUMN_NAME}" LIKE ''% and %'' OR "{COLUMN_NAME}" LIKE ''% but %'' OR "{COLUMN_NAME}" LIKE ''% or %'' OR "{COLUMN_NAME}" LIKE ''% yet %'' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '','', '''')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '' '', '''')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - - ('1140', '1004', 'Test Results', 'Alpha_Trunc', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;'), - ('1141', '1005', 'Test Results', 'Avg_Shift', 'mssql', NULL, 'SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1142', '1006', 'Test Results', 'Condition_Flag', 'mssql', NULL, 'SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY};'), - ('1143', '1007', 'Test Results', 'Constant', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";'), - ('1144', '1009', 'Test Results', 'Daily_Record_Ct', 'mssql', NULL, 'WITH - Pass0 as (select 1 as C union all select 1), --2 rows - Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows - Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows - Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows - Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows - All_Nums as (select row_number() over(order by C) as Number from Pass4), - tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), - - date_range as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, - CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, - DATEDIFF(DAY, - CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), - CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} ), - check_periods as ( SELECT d.min_period, d.max_period, t.number, - DATEADD(DAY, -(t.number - 1), d.max_period) AS check_period - FROM date_range d - INNER JOIN tally t - ON (d.period_ct >= t.number) ), - data_by_period as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - GROUP BY CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) ), - data_by_prd_with_prior_next as (SELECT check_period, - RANK() OVER (ORDER BY check_period DESC) as ranked, - ISNULL(d.record_ct, 0) as record_ct, - ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct, - ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct - FROM check_periods c - LEFT JOIN data_by_period d - ON (c.check_period = d.data_period) ) -SELECT check_period, record_ct, - CASE - WHEN record_ct = 0 THEN ''MISSING'' - ELSE ''Present'' - END as status - FROM data_by_prd_with_prior_next - WHERE 
record_ct = 0 - OR last_record_ct = 0 - OR next_record_ct = 0 -ORDER BY check_period DESC;'), - ('1145', '1011', 'Test Results', 'Dec_Trunc', 'mssql', NULL, 'WITH CTE AS ( SELECT LEN(SUBSTRING(CAST(ABS("{COLUMN_NAME}") % 1 AS VARCHAR) , 3, LEN("{COLUMN_NAME}"))) AS decimal_scale FROM {TARGET_SCHEMA}.{TABLE_NAME} ) SELECT DISTINCT TOP 500 decimal_scale,COUNT(*) AS count FROM cte GROUP BY decimal_scale ORDER BY COUNT(*) DESC; '), - ('1146', '1012', 'Test Results', 'Distinct_Date_Ct', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1147', '1013', 'Test Results', 'Distinct_Value_Ct', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1148', '1014', 'Test Results', 'Email_Format', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT LIKE ''%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%'' GROUP BY "{COLUMN_NAME}";'), - ('1149', '1015', 'Test Results', 'Future_Date', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, ''{TEST_DATE}'') GROUP BY "{COLUMN_NAME}";'), - ('1150', '1016', 'Test Results', 'Future_Date_1Y', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, ''{TEST_DATE}'')) GROUP BY "{COLUMN_NAME}";'), - ('1151', '1017', 'Test Results', 'Incr_Avg_Shift', 'mssql', NULL, 'SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_sum, NULLIF(CAST(COUNT("{COLUMN_NAME}") AS FLOAT), 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1152', '1018', 'Test Results', 'LOV_All', 'mssql', NULL, 'WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT STRING_AGG( "{COLUMN_NAME}", ''|'' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> ''{THRESHOLD_VALUE}'';'), - ('1153', '1019', 'Test Results', 'LOV_Match', 'mssql', NULL, 'SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ;'), - ('1154', '1020', 'Test Results', 'Min_Date', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST(''{BASELINE_VALUE}'' AS DATE) GROUP BY "{COLUMN_NAME}";'), - ('1155', '1021', 'Test Results', 'Min_Val', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE};'), - ('1156', '1022', 'Test Results', 'Missing_Pct', 'mssql', NULL, 'SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = '''';'), - ('1157', '1023', 'Test Results', 'Monthly_Rec_Ct', 'mssql', NULL, 'WITH - Pass0 as (select 1 as C union all select 1), --2 rows - Pass1 as (select 1 as 
C from Pass0 as A, Pass0 as B),--4 rows - Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows - Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows - Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows - All_Nums as (select row_number() over(order by C) as Number from Pass4), - tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), - - date_range as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, - CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, - DATEDIFF(MONTH, - CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), - CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} ), - check_periods as ( SELECT d.min_period, d.max_period, t.number, - DATEADD(MONTH, -(t.number - 1), d.max_period) AS check_period - FROM date_range d - INNER JOIN tally t - ON (d.period_ct >= t.number) ), - data_by_period as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - GROUP BY CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) ), - data_by_prd_with_prior_next as (SELECT check_period, - RANK() OVER (ORDER BY check_period DESC) as ranked, - ISNULL(d.record_ct, 0) as record_ct, - ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct, - ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct - FROM check_periods c - LEFT JOIN data_by_period d - ON (c.check_period = d.data_period) ) -SELECT check_period, record_ct, - CASE - WHEN record_ct = 0 THEN ''MISSING'' - ELSE ''Present'' - END as status - FROM data_by_prd_with_prior_next - WHERE record_ct = 0 - OR last_record_ct = 0 - OR next_record_ct = 0 -ORDER BY check_period DESC;'), - ('1158', '1024', 'Test Results', 'Outlier_Pct_Above', 'mssql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1159', '1025', 'Test Results', 'Outlier_Pct_Below', 'mssql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1160', '1026', 'Test Results', 'Pattern_Match', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT LIKE ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'), - ('1161', '1028', 'Test Results', 'Recency', 'mssql', NULL, 'SELECT DISTINCT col AS latest_date_available, CAST(''{TEST_DATE}'' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE DATEDIFF(day, col, CAST(''{TEST_DATE}'' AS DATE)) > {THRESHOLD_VALUE};'), - ('1162', '1030', 'Test Results', 'Required', 'mssql', NULL, 'SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL;'), - ('1163', '1031', 'Test Results', 'Row_Ct', 'mssql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(CAST(100 * (current_count - {THRESHOLD_VALUE}) AS NUMERIC) / 
CAST({THRESHOLD_VALUE} AS NUMERIC) ,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'), - ('1164', '1032', 'Test Results', 'Row_Ct_Pct', 'mssql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(CAST(100 * (current_count - {BASELINE_CT}) AS NUMERIC) / CAST({BASELINE_CT} AS NUMERIC) ,2)) AS row_count_pct_difference FROM cte;'), - ('1165', '1033', 'Test Results', 'Street_Addr_Pattern', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE UPPER("{COLUMN_NAME}") NOT LIKE ''[1-9]% [A-Z]% %'' AND CHARINDEX('' '', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;'), - ('1166', '1036', 'Test Results', 'US_State', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}";'), - ('1167', '1034', 'Test Results', 'Unique', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC;'), - ('1168', '1035', 'Test Results', 'Unique_Pct', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;'), - ('1169', '1037', 'Test Results', 'Weekly_Rec_Ct', 'mssql', NULL, 'WITH - Pass0 as (select 1 as C union all select 1), --2 rows - Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows - Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows - Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows - Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows - All_Nums as (select row_number() over(order by C) as Number from Pass4), - tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), - - date_range as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, - CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, - DATEDIFF(WEEK, - CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), - CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} ), - check_periods as ( SELECT d.min_period, d.max_period, t.number, - DATEADD(WEEK, -(t.number - 1), d.max_period) AS check_period - FROM date_range d - INNER JOIN tally t - ON (d.period_ct >= t.number) ), - data_by_period as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - GROUP BY CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) ), - data_by_prd_with_prior_next as (SELECT check_period, - RANK() OVER (ORDER BY check_period DESC) as ranked, - ISNULL(d.record_ct, 0) as record_ct, - ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct, - 
ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct - FROM check_periods c - LEFT JOIN data_by_period d - ON (c.check_period = d.data_period) ) -SELECT check_period, record_ct, - CASE - WHEN record_ct = 0 THEN ''MISSING'' - ELSE ''Present'' - END as status - FROM data_by_prd_with_prior_next - WHERE record_ct = 0 - OR last_record_ct = 0 - OR next_record_ct = 0 -ORDER BY check_period DESC;'), - ('1170', '1040', 'Test Results', 'Variability_Increase', 'mssql', NULL, 'SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1171', '1041', 'Test Results', 'Variability_Decrease', 'mssql', NULL, 'SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - - ('1172', '1001', 'Profile Anomaly' , 'Suggested_Type', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1173', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1174', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), - ('1175', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1176', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'snowflake', 
NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1177', '1006', 'Profile Anomaly' , 'No_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1178', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC;' ), - ('1179', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ), - ('1180', '1009', 'Profile Anomaly' , 'Leading_Spaces', 
'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1181', '1010', 'Profile Anomaly' , 'Quoted_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ), - ('1182', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), - ('1183', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), - ('1184', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1185', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1186', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1187', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ), - ('1188', '1017', 'Profile Anomaly' , 
'Standardized_Value_Matches', 'snowflake', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ), - ('1189', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < ''1900-01-01''::DATE) OR ("{COLUMN_NAME}" > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1190', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'snowflake', NULL, 'created_in_ui' ), - ('1191', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'snowflake', NULL, 'created_in_ui' ), - ('1192', '1021', 'Profile Anomaly' , 'Unexpected US States', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1193', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1194', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), - ('1195', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), - ('1196', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), - - ('1197', '1004', 'Test Results', 'Alpha_Trunc', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'), - ('1198', '1005', 'Test Results', 'Avg_Shift', 'snowflake', NULL, 'SELECT 
AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1199', '1006', 'Test Results', 'Condition_Flag', 'snowflake', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'), - ('1200', '1007', 'Test Results', 'Constant', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1201', '1009', 'Test Results', 'Daily_Record_Ct', 'snowflake', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500;'), - ('1202', '1011', 'Test Results', 'Dec_Trunc', 'snowflake', NULL, 'SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, ''.'', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;'), - ('1203', '1012', 'Test Results', 'Distinct_Date_Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1204', '1013', 'Test Results', 'Distinct_Value_Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1205', '1014', 'Test Results', 'Email_Format', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1206', '1015', 'Test Results', 'Future_Date', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1207', '1016', 'Test Results', 'Future_Date_1Y', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1208', '1017', 'Test Results', 'Incr_Avg_Shift', 'snowflake', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" 
::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1209', '1018', 'Test Results', 'LOV_All', 'snowflake', NULL, 'SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> ''{THRESHOLD_VALUE}'' LIMIT 500;'), - ('1210', '1019', 'Test Results', 'LOV_Match', 'snowflake', NULL, 'SELECT DISTINCT NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1211', '1020', 'Test Results', 'Min_Date', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1212', '1021', 'Test Results', 'Min_Val', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;'), - ('1213', '1022', 'Test Results', 'Missing_Pct', 'snowflake', NULL, 'SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '''' ;'), - ('1214', '1023', 'Test Results', 'Monthly_Rec_Ct', 'snowflake', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''month'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''month'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period;'), - ('1215', '1024', 'Test Results', 'Outlier_Pct_Above', 'snowflake', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1216', '1025', 'Test Results', 'Outlier_Pct_Below', 'snowflake', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1217', '1026', 'Test Results', 'Pattern_Match', 'snowflake', NULL, 
'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''''),''{BASELINE_VALUE}'') != 1 GROUP BY "{COLUMN_NAME}";'), - ('1218', '1028', 'Test Results', 'Recency', 'snowflake', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF(''D'', col, ''{TEST_DATE}''::DATE) > {THRESHOLD_VALUE};'), - ('1219', '1030', 'Test Results', 'Required', 'snowflake', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'), - ('1220', '1031', 'Test Results', 'Row_Ct', 'snowflake', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'), - ('1221', '1032', 'Test Results', 'Row_Ct_Pct', 'snowflake', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;'), - ('1222', '1033', 'Test Results', 'Street_Addr_Pattern', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1223', '1036', 'Test Results', 'US_State', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}" LIMIT 500;'), - ('1224', '1034', 'Test Results', 'Unique', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1225', '1035', 'Test Results', 'Unique_Pct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'), - ('1226', '1037', 'Test Results', 'Weekly_Rec_Ct', 'snowflake', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''week'',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL ''1 week'' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''week'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, 
p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period;'), - ('1227', '1040', 'Test Results', 'Variability_Increase', 'snowflake', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1228', '1041', 'Test Results', 'Variability_Decrease', 'snowflake', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - - ('1229', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'redshift', NULL, 'WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING (''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', POSITION('':'', ''{DETAIL_EXPRESSION}'') + 2), ''|''))) ) GROUP BY "{COLUMN_NAME}";'), - ('1231', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'mssql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', CHARINDEX('':'', ''{DETAIL_EXPRESSION}'') + 2, 999), ''|'')) GROUP BY "{COLUMN_NAME}";'), - ('1232', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING(''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'')) GROUP BY "{COLUMN_NAME}";'), - - ('1233', '1043', 'Test Results', 'Valid_Characters', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC;'), - ('1234', '1043', 'Test Results', 'Valid_Characters', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC LIMIT 20;'), - ('1235', '1043', 'Test Results', 'Valid_Characters', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", 
COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC;'), - ('1236', '1043', 'Test Results', 'Valid_Characters', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC;'), - - ('1237', '1044', 'Test Results', 'Valid_US_Zip', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1238', '1044', 'Test Results', 'Valid_US_Zip', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), - ('1239', '1044', 'Test Results', 'Valid_US_Zip', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1240', '1044', 'Test Results', 'Valid_US_Zip', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - - ('1241', '1045', 'Test Results', 'Valid_US_Zip3', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1242', '1045', 'Test Results', 'Valid_US_Zip3', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), - ('1243', '1045', 'Test Results', 'Valid_US_Zip3', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1244', '1045', 'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - - ('1245', '1500', 'Test Results', 'Aggregate_Balance', 'redshift', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT 
{GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1246', '1500', 'Test Results', 'Aggregate_Balance', 'snowflake', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1247', '1500', 'Test Results', 'Aggregate_Balance', 'mssql', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1248', '1500', 'Test Results', 'Aggregate_Balance', 'postgresql', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1249', '1501', 'Test Results', 'Aggregate_Minimum', 'redshift', NULL, 'SELECT * -FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 
{MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1250', '1501', 'Test Results', 'Aggregate_Minimum', 'snowflake', NULL, 'SELECT * -FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1251', '1501', 'Test Results', 'Aggregate_Minimum', 'mssql', NULL, 'SELECT * -FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1252', '1501', 'Test Results', 'Aggregate_Minimum', 'postgresql', NULL, 'SELECT * -FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1253', '1502', 'Test Results', 'Combo_Match', 'redshift', NULL, 'SELECT * - FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} - {HAVING_CONDITION} - EXCEPT - SELECT {MATCH_GROUPBY_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) test -ORDER BY {COLUMN_NAME_NO_QUOTES};'), - ('1254', '1502', 'Test Results', 'Combo_Match', 'snowflake', NULL, 'SELECT * - FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} - {HAVING_CONDITION} - EXCEPT - SELECT {MATCH_GROUPBY_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) test -ORDER BY {COLUMN_NAME_NO_QUOTES};'), - ('1255', '1502', 'Test Results', 'Combo_Match', 'mssql', NULL, 'SELECT * - FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY 
{COLUMN_NAME_NO_QUOTES} - {HAVING_CONDITION} - EXCEPT - SELECT {MATCH_GROUPBY_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) test -ORDER BY {COLUMN_NAME_NO_QUOTES};'), - ('1256', '1502', 'Test Results', 'Combo_Match', 'postgresql', NULL, 'SELECT * - FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} - {HAVING_CONDITION} - EXCEPT - SELECT {MATCH_GROUPBY_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) test -ORDER BY {COLUMN_NAME_NO_QUOTES};'), - ('1257', '1503', 'Test Results', 'Distribution_Shift', 'redshift', NULL, 'WITH latest_ver - AS ( SELECT {CONCAT_COLUMNS} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} ), -older_ver - AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} ) -SELECT COALESCE(l.category, o.category) AS category, - o.pct_of_total AS old_pct, - l.pct_of_total AS new_pct - FROM latest_ver l -FULL JOIN older_ver o - ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category)'), - ('1258', '1503', 'Test Results', 'Distribution_Shift', 'snowflake', NULL, 'WITH latest_ver - AS ( SELECT {CONCAT_COLUMNS} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} ), -older_ver - AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} ) -SELECT COALESCE(l.category, o.category) AS category, - o.pct_of_total AS old_pct, - l.pct_of_total AS new_pct - FROM latest_ver l -FULL JOIN older_ver o - ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category)'), - ('1259', '1503', 'Test Results', 'Distribution_Shift', 'mssql', NULL, 'WITH latest_ver - AS ( SELECT {CONCAT_COLUMNS} as category, - CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} ), -older_ver - AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, - CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} ) -SELECT COALESCE(l.category, o.category) AS category, - o.pct_of_total AS old_pct, - l.pct_of_total AS new_pct - FROM latest_ver l -FULL JOIN older_ver o - ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category)'), - ('1260', '1503', 'Test Results', 'Distribution_Shift', 'postgresql', NULL, 'WITH latest_ver - AS ( SELECT {CONCAT_COLUMNS} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} ), -older_ver - AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM 
{MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} ) -SELECT COALESCE(l.category, o.category) AS category, - o.pct_of_total AS old_pct, - l.pct_of_total AS new_pct - FROM latest_ver l -FULL JOIN older_ver o - ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category)'), - - ('1245', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'redshift', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES};'), - ('1246', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'snowflake', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES};'), - ('1247', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'mssql', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES};'), - ('1248', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'postgresql', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE 
{MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES};'), - - ('1245', '1505', 'Test Results', 'Aggregate_Balance_Range', 'redshift', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES};'), - ('1246', '1505', 'Test Results', 'Aggregate_Balance_Range', 'snowflake', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES};'), - ('1247', '1505', 'Test Results', 'Aggregate_Balance_Range', 'mssql', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES};'), - ('1248', '1505', 'Test Results', 'Aggregate_Balance_Range', 'postgresql', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 
{MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES};'), - - ('1261', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'redshift', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -GROUP BY {COLUMN_NAME_NO_QUOTES} - EXCEPT -SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -GROUP BY {COLUMN_NAME_NO_QUOTES}'), - ('1262', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'snowflake', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -GROUP BY {COLUMN_NAME_NO_QUOTES} - EXCEPT -SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -GROUP BY {COLUMN_NAME_NO_QUOTES}'), - ('1263', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'mssql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) -GROUP BY {COLUMN_NAME_NO_QUOTES} - EXCEPT -SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) -GROUP BY {COLUMN_NAME_NO_QUOTES}'), - ('1264', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'postgresql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -GROUP BY {COLUMN_NAME_NO_QUOTES} - EXCEPT -SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -GROUP BY {COLUMN_NAME_NO_QUOTES}'), - ('1265', '1509', 'Test Results', 'Timeframe_Combo_Match', 'redshift', NULL, ' ( -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -EXCEPT -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM 
{TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -) -UNION ALL -( -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} - EXCEPT -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -)'), - ('1266', '1509', 'Test Results', 'Timeframe_Combo_Match', 'snowflake', NULL, ' ( -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -EXCEPT -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -) -UNION ALL -( -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} - EXCEPT -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -)'), - ('1267', '1509', 'Test Results', 'Timeframe_Combo_Match', 'mssql', NULL, ' ( -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) -EXCEPT -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) -) -UNION ALL -( -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) - EXCEPT -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT 
MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) -)'), - ('1268', '1509', 'Test Results', 'Timeframe_Combo_Match', 'postgresql', NULL, ' ( -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -EXCEPT -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -) -UNION ALL -( -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} - EXCEPT -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -)'), - ('1269', '1100', 'Profile Anomaly', 'Potential_PII', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1270', '1100', 'Profile Anomaly', 'Potential_PII', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1271', '1100', 'Profile Anomaly', 'Potential_PII', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), - ('1272', '1100', 'Profile Anomaly', 'Potential_PII', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - - ('1273', '1001', 'Profile Anomaly' , 'Suggested_Type', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'), - ('1274', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;'), - ('1275', '1003', 'Profile 
Anomaly' , 'Invalid_Zip_USA', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;'), - ('1276', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS STRING) || '','' || CAST(numeric_scale AS STRING) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1277', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS STRING) || '','' || CAST(numeric_scale AS STRING) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1278', '1006', 'Profile Anomaly' , 'No_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ), - ('1279', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, 
`{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ), - ('1280', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ), - ('1281', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ), - ('1282', '1010', 'Profile Anomaly' , 'Quoted_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE ''"%"'' OR `{COLUMN_NAME}` ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;' ), - ('1283', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ), - ('1284', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ), - ('1285', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR 
LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;'), - ('1286', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;' ), - ('1287', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;' ), - ('1288', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;' ), - ('1289', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'databricks', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, '' '''',.-'', '''')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ), - ('1290', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE (`{COLUMN_NAME}` < ''1900-01-01''::DATE) OR (`{COLUMN_NAME}` > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ), - ('1291', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'databricks', NULL, 'created_in_ui' ), - ('1292', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'databricks', NULL, 'created_in_ui' ), - ('1293', '1021', 'Profile Anomaly' , 'Unexpected US States', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ), - ('1294', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ), - ('1295', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 
''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ), - ('1296', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') <> ''999'' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500;'), - ('1297', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;' ), - - ('1298', '1004', 'Test Results', 'Alpha_Trunc', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'), - ('1299', '1005', 'Test Results', 'Avg_Shift', 'databricks', NULL, 'SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1300', '1006', 'Test Results', 'Condition_Flag', 'databricks', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'), - ('1301', '1007', 'Test Results', 'Constant', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1302', '1009', 'Test Results', 'Daily_Record_Ct', 'databricks', NULL, 'WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM {TARGET_SCHEMA}.{TABLE_NAME}), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500;'), - ('1303', '1011', 'Test Results', 'Dec_Trunc', 'databricks', NULL, 'SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, ''.'', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;'), - ('1304', '1012', 'Test Results', 'Distinct_Date_Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'), - ('1305', '1013', 'Test Results', 'Distinct_Value_Ct', 
'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'), - ('1306', '1014', 'Test Results', 'Email_Format', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1307', '1015', 'Test Results', 'Future_Date', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1308', '1016', 'Test Results', 'Future_Date_1Y', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1309', '1017', 'Test Results', 'Incr_Avg_Shift', 'databricks', NULL, 'SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1310', '1018', 'Test Results', 'LOV_All', 'databricks', NULL, 'SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), ''|'') AS aggregated_values FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), ''|'') <> ''{THRESHOLD_VALUE}'' LIMIT 500;'), - ('1311', '1019', 'Test Results', 'LOV_Match', 'databricks', NULL, 'SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '''') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1312', '1020', 'Test Results', 'Min_Date', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1313', '1021', 'Test Results', 'Min_Val', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500;'), - ('1314', '1022', 'Test Results', 'Missing_Pct', 'databricks', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '''' LIMIT 10;'), - ('1315', '1023', 'Test Results', 'Monthly_Rec_Ct', 'databricks', NULL, 'WITH daterange AS( SELECT explode( sequence( date_trunc(''month'', (SELECT MIN(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc(''month'', (SELECT MAX(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc(''month'', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc(''month'', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = 
a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period;'), - ('1316', '1024', 'Test Results', 'Outlier_Pct_Above', 'databricks', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;'), - ('1317', '1025', 'Test Results', 'Outlier_Pct_Below', 'databricks', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;'), - ('1318', '1026', 'Test Results', 'Pattern_Match', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''''),''{BASELINE_VALUE}'') != 1 GROUP BY `{COLUMN_NAME}`;'), - ('1319', '1028', 'Test Results', 'Recency', 'databricks', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE ABS(<%DATEDIFF_DAY;col;''{TEST_DATE}''::DATE%>) > {THRESHOLD_VALUE};'), - ('1320', '1030', 'Test Results', 'Required', 'databricks', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL LIMIT 500;'), - ('1321', '1031', 'Test Results', 'Row_Ct', 'databricks', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'), - ('1322', '1032', 'Test Results', 'Row_Ct_Pct', 'databricks', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;'), - ('1323', '1033', 'Test Results', 'Street_Addr_Pattern', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;'), - ('1324', '1036', 'Test Results', 'US_State', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY `{COLUMN_NAME}` LIMIT 500;'), - ('1325', '1034', 'Test Results', 'Unique', 
'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;'), - ('1326', '1035', 'Test Results', 'Unique_Pct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;'), - ('1327', '1037', 'Test Results', 'Weekly_Rec_Ct', 'databricks', NULL, 'WITH daterange AS( SELECT explode(sequence( date_trunc(''week'', (SELECT min(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc(''week'', (SELECT max(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc(''week'', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc(''week'', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period;'), - ('1328', '1040', 'Test Results', 'Variability_Increase', 'databricks', NULL, 'SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1329', '1041', 'Test Results', 'Variability_Decrease', 'databricks', NULL, 'SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - - ('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', INSTR(''{DETAIL_EXPRESSION}'', '':'') + 2), ''\\|'')) AS value)) GROUP BY `{COLUMN_NAME}`;'), - ('1330', '1043', 'Test Results', 'Valid_Characters', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`, ''.*[[:cntrl:]].*'') OR `{COLUMN_NAME}`::STRING LIKE '' %'' OR `{COLUMN_NAME}`::STRING LIKE ''''''%'''''' OR `{COLUMN_NAME}`::STRING LIKE ''"%"'' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'), - ('1331', '1044', 'Test Results', 'Valid_US_Zip', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'), - ('1332', '1045', 'Test Results', 'Valid_US_Zip3', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'), - - ('1333', '1500', 'Test Results', 'Aggregate_Balance', 'databricks', NULL, 'SELECT * - FROM ( SELECT 
{GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1334', '1501', 'Test Results', 'Aggregate_Minimum', 'databricks', NULL, 'SELECT * -FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) -ORDER BY {GROUPBY_NAMES};'), - ('1335', '1502', 'Test Results', 'Combo_Match', 'databricks', NULL, 'SELECT * - FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} - {HAVING_CONDITION} - EXCEPT - SELECT {MATCH_GROUPBY_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) test -ORDER BY {COLUMN_NAME_NO_QUOTES};'), - ('1336', '1503', 'Test Results', 'Distribution_Shift', 'databricks', NULL, 'WITH latest_ver - AS ( SELECT {CONCAT_COLUMNS} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 - WHERE {SUBSET_CONDITION} - GROUP BY {COLUMN_NAME_NO_QUOTES} ), -older_ver - AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} ) -SELECT COALESCE(l.category, o.category) AS category, - o.pct_of_total AS old_pct, - l.pct_of_total AS new_pct - FROM latest_ver l -FULL JOIN older_ver o - ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category)'), - ('1248', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'databricks', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY 
{GROUPBY_NAMES};'), - ('1245', '1505', 'Test Results', 'Aggregate_Balance_Range', 'databricks', NULL, 'SELECT * - FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL - FROM - ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} - UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a - GROUP BY {GROUPBY_NAMES} ) s - WHERE (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL) - OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES};'), - ('1337', '1509', 'Test Results', 'Timeframe_Combo_Match', 'databricks', NULL, ' ( -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -EXCEPT -SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -) -UNION ALL -( -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} - EXCEPT -SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} -FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -)'), - ('1338', '1100', 'Profile Anomaly', 'Potential_PII', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'), - - ('1253', '1510', 'Test Results', 'Dupe_Rows', 'redshift', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - HAVING COUNT(*) > 1 -ORDER BY {GROUPBY_NAMES}'), - ('1254', '1510', 'Test Results', 'Dupe_Rows', 'snowflake', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - HAVING COUNT(*) > 1 -ORDER BY {GROUPBY_NAMES}'), - ('1255', '1510', 'Test Results', 'Dupe_Rows', 'mssql', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - HAVING COUNT(*) > 1 -ORDER BY {GROUPBY_NAMES}'), - ('1256', '1510', 'Test Results', 'Dupe_Rows', 'postgresql', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct - FROM {TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - HAVING COUNT(*) > 1 -ORDER BY {GROUPBY_NAMES}'), - ('1257', '1510', 'Test Results', 'Dupe_Rows', 'databricks', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct - FROM 
{TARGET_SCHEMA}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - HAVING COUNT(*) > 1 -ORDER BY {GROUPBY_NAMES}'), - ('1258', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'redshift', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" LIMIT 20) -UNION ALL -(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") -GROUP BY "{COLUMN_NAME}" LIMIT 20)'), - ('1259', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'postgresql', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" LIMIT 20) -UNION ALL -(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") -GROUP BY "{COLUMN_NAME}" LIMIT 20)'), - ('1260', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'mssql', NULL, 'SELECT TOP 20 ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" -UNION ALL -SELECT TOP 20 ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") -GROUP BY "{COLUMN_NAME}"'), - ('1261', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'snowflake', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" LIMIT 20) -UNION ALL -(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") -GROUP BY "{COLUMN_NAME}" LIMIT 20)'), - ('1262', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'databricks', NULL, '(SELECT ''Upper Case'' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}` -GROUP BY `{COLUMN_NAME}` LIMIT 20) -UNION ALL -(SELECT ''Mixed Case'' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} -WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`) -GROUP BY `{COLUMN_NAME}` LIMIT 20)'), - ('1263', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' -GROUP BY "{COLUMN_NAME}" LIMIT 500'), - ('1264', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' -GROUP BY "{COLUMN_NAME}" LIMIT 500'), - ('1265', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE 
"{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' -GROUP BY "{COLUMN_NAME}"'), - ('1266', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' -GROUP BY "{COLUMN_NAME}" LIMIT 500'), - ('1267', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` - WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > '''' -GROUP BY `{COLUMN_NAME}` LIMIT 500'), - ('1268', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), - ('1269', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), - ('1270', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"'), - ('1271', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" -WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), - ('1272', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` -WHERE `{COLUMN_NAME}` < ''A'' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN (''"'', '' '') AND RIGHT(`{COLUMN_NAME}`, 1) <> '''''''' -GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500'), - ('1273', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'redshift', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", - CHR(160), ''\x160''), - CHR(8201), ''\x8201''), - CHR(8203), ''\x8203''), - CHR(8204), ''\x8204''), - CHR(8205), ''\x8205''), - CHR(8206), ''\x8206''), - CHR(8207), ''\x8207''), - CHR(8239), ''\x8239''), - CHR(12288), ''\x12288''), - CHR(65279), ''\x65279'') as "{COLUMN_NAME}_content", - COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), - ('1274', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'postgresql', NULL, 'SELECT 
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", - CHR(160), ''\x160''), - CHR(8201), ''\x8201''), - CHR(8203), ''\x8203''), - CHR(8204), ''\x8204''), - CHR(8205), ''\x8205''), - CHR(8206), ''\x8206''), - CHR(8207), ''\x8207''), - CHR(8239), ''\x8239''), - CHR(12288), ''\x12288''), - CHR(65279), ''\x65279'') as "{COLUMN_NAME}_content", - COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), - ('1275', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'mssql', NULL, 'SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", - NCHAR(160), ''\x160''), - NCHAR(8201), ''\x8201''), - NCHAR(8203), ''\x8203''), - NCHAR(8204), ''\x8204''), - NCHAR(8205), ''\x8205''), - NCHAR(8206), ''\x8206''), - NCHAR(8207), ''\x8207''), - NCHAR(8239), ''\x8239''), - NCHAR(12288), ''\x12288''), - NCHAR(65279), ''\x65279'') AS "{COLUMN_NAME}_content", - COUNT(*) AS record_ct -FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"'), - ('1276', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'snowflake', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", - CHR(160), ''\x160''), - CHR(8201), ''\x8201''), - CHR(8203), ''\x8203''), - CHR(8204), ''\x8204''), - CHR(8205), ''\x8205''), - CHR(8206), ''\x8206''), - CHR(8207), ''\x8207''), - CHR(8239), ''\x8239''), - CHR(12288), ''\x12288''), - CHR(65279), ''\x65279'') as "{COLUMN_NAME}_content", - COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" -GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), - ('1277', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'databricks', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`{COLUMN_NAME}`, - ''\u00a0'', ''\x160''), - ''\u2009'', ''\x8201''), - ''\u200b'', ''\x8203''), - ''\u200c'', ''\x8204''), - ''\u200d'', ''\x8205''), - ''\u200e'', ''\x8206''), - ''\u200f'', ''\x8207''), - ''\u202f'', ''\x8239''), - ''\u3000'', ''\x12288''), - ''\ufeff'', ''\x65279'') as `{COLUMN_NAME}_content`, - COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` - WHERE TRANSLATE(`{COLUMN_NAME}`, ''\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff'', ''XXXXXXXXXX'') <> `{COLUMN_NAME}` -GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500') -; - - TRUNCATE TABLE variant_codings; INSERT INTO variant_codings (value_type, check_values) @@ -2091,16 +334,3 @@ VALUES ('measure', 'meter|m|metre'), ('pharma','priority review|pr|priority assessment'), ('pharma','tentative approval|ta|conditional approval'), ('pharma','off-label use|off-label|olu|unapproved use|unapproved'); - --- Replace constraints -ALTER TABLE test_templates - ADD CONSTRAINT test_templates_test_types_test_type_fk - FOREIGN KEY (test_type) REFERENCES 
test_types; - -ALTER TABLE test_results - ADD CONSTRAINT test_results_test_types_test_type_fk - FOREIGN KEY (test_type) REFERENCES test_types; - -ALTER TABLE cat_test_conditions - ADD CONSTRAINT cat_test_conditions_cat_tests_test_type_fk - FOREIGN KEY (test_type) REFERENCES test_types; diff --git a/testgen/template/dbsetup/055_recreate_metadata_constraints.sql b/testgen/template/dbsetup/055_recreate_metadata_constraints.sql new file mode 100644 index 00000000..2967dcd4 --- /dev/null +++ b/testgen/template/dbsetup/055_recreate_metadata_constraints.sql @@ -0,0 +1,17 @@ +-- ============================================================================== +-- | This recreates the constraints for the test metadata tables after being imported by yaml +-- ============================================================================== + +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_templates + ADD CONSTRAINT test_templates_test_types_test_type_fk + FOREIGN KEY (test_type) REFERENCES test_types; + +ALTER TABLE test_results + ADD CONSTRAINT test_results_test_types_test_type_fk + FOREIGN KEY (test_type) REFERENCES test_types; + +ALTER TABLE cat_test_conditions + ADD CONSTRAINT cat_test_conditions_cat_tests_test_type_fk + FOREIGN KEY (test_type) REFERENCES test_types; diff --git a/testgen/template/dbsetup/075_grant_role_rights.sql b/testgen/template/dbsetup/075_grant_role_rights.sql index 2f7fbf31..f8fb631e 100644 --- a/testgen/template/dbsetup/075_grant_role_rights.sql +++ b/testgen/template/dbsetup/075_grant_role_rights.sql @@ -31,6 +31,7 @@ GRANT SELECT, INSERT, DELETE, UPDATE ON {SCHEMA_NAME}.projects, {SCHEMA_NAME}.data_table_chars, {SCHEMA_NAME}.data_column_chars, + {SCHEMA_NAME}.data_structure_log, {SCHEMA_NAME}.auth_users, {SCHEMA_NAME}.score_definitions, {SCHEMA_NAME}.score_definition_criteria, diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml new file mode 100644 index 00000000..fae0ec4b --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml @@ -0,0 +1,84 @@ +profile_anomaly_types: + id: '1015' + anomaly_type: Boolean_Value_Mismatch + data_object: Column + anomaly_name: Unexpected Boolean Values Found + anomaly_description: "This column appears to contain boolean (True/False) data,\ + \ but unexpected values were found. This could indicate inconsistent coding for\ + \ the same intended values, potentially leading to downstream errors or inconsistent\ + \ business logic. " + anomaly_criteria: "(distinct_value_ct > 1 AND\n\t\t ((lower(top_freq_values)\ + \ ILIKE '| true |%' OR lower(top_freq_values) ILIKE '| false |%') AND NOT (lower(top_freq_values)\ + \ ILIKE '%| true |%' AND lower(top_freq_values) ILIKE '%| false |%'))\n\t\t OR\ + \ ((lower(top_freq_values) ILIKE '| yes |%' OR lower(top_freq_values) ILIKE '|\ + \ no |%' ) AND NOT (lower(top_freq_values) ILIKE '%| yes |%' AND lower(top_freq_values)\ + \ ILIKE '%| no |%')) )" + detail_expression: |- + CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text + ELSE 'Top Freq: ' || p.top_freq_values END + issue_likelihood: Likely + suggested_action: "Review your source data and follow-up with data owners to determine\ + \ whether this data needs to be corrected. 
" + dq_score_prevalence_formula: null + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1353' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1287' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC; + error_type: Profile Anomaly + - id: '1129' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1072' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1047' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1447' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1186' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml new file mode 100644 index 00000000..a7371d24 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml @@ -0,0 +1,97 @@ +profile_anomaly_types: + id: '1012' + anomaly_type: Char_Column_Date_Values + data_object: Column + anomaly_name: Character Column with Mostly Date Values + anomaly_description: "This column is defined as alpha, but more than 95% of its\ + \ values are dates. Dates in alpha columns might not sort correctly, and might\ + \ contradict user expectations downstream. It's also possible that more than one\ + \ type of information is stored in the column, making it harder to retrieve. \ + \ " + anomaly_criteria: |- + p.general_type = 'A' + AND p.value_ct > p.date_ct + AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC) + detail_expression: |- + ' Date Ct: ' || p.date_ct || ' of ' || p.value_ct || ' (Date Percent: ' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || ' )'::VARCHAR(200) + issue_likelihood: Likely + suggested_action: |- + Review your source data and ingestion process. 
Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column. + dq_score_prevalence_formula: |- + p.date_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1350' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + ( + SELECT 'Date' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NOT NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 10 + ) + UNION ALL + ( + SELECT 'Non-Date' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 10 + ) + ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1284' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + error_type: Profile Anomaly + - id: '1126' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1069' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1044' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
<%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1444' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1183' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml new file mode 100644 index 00000000..da49a9c1 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml @@ -0,0 +1,18 @@ +profile_anomaly_types: + id: '1026' + anomaly_type: Char_Column_Number_Units + data_object: Column + anomaly_name: Character Column with Numbers and Units + anomaly_description: |- + This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won't sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability. + anomaly_criteria: |- + p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ '(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$' + detail_expression: |- + 'Top Freq: ' || p.top_freq_values + issue_likelihood: Possible + suggested_action: |- + Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns. + dq_score_prevalence_formula: null + dq_score_risk_factor: '0.33' + dq_dimension: Consistency + target_data_lookups: [] diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml new file mode 100644 index 00000000..12cccad4 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml @@ -0,0 +1,97 @@ +profile_anomaly_types: + id: '1011' + anomaly_type: Char_Column_Number_Values + data_object: Column + anomaly_name: Character Column with Mostly Numeric Values + anomaly_description: |- + This column is defined as alpha, but more than 95% of its values are numeric. 
Numbers in alpha columns won't sort correctly, and might contradict user expectations downstream. It's also possible that more than one type of information is stored in the column, making it harder to retrieve. + anomaly_criteria: |- + p.general_type = 'A' + AND p.column_name NOT ILIKE '%zip%' + AND p.functional_data_type NOT ILIKE 'id%' + AND p.functional_data_type NOT ILIKE 'Period%' + AND p.value_ct > p.numeric_ct + AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC) + detail_expression: |- + 'Numeric Ct: ' || p.numeric_ct || ' of ' || p.value_ct || ' (Numeric Percent: ' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || ' )'::VARCHAR(200) + issue_likelihood: Likely + suggested_action: |- + Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column. + dq_score_prevalence_formula: |- + p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1349' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + ( + SELECT 'Numeric' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 10 + ) + UNION ALL + ( + SELECT 'Non-Numeric' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 10 + ) + ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1283' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + error_type: Profile Anomaly + - id: '1125' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1068' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT 
DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1043' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1443' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1182' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml new file mode 100644 index 00000000..987d9f06 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml @@ -0,0 +1,126 @@ +profile_anomaly_types: + id: '1007' + anomaly_type: Column_Pattern_Mismatch + data_object: Column + anomaly_name: Pattern Inconsistency Within Column + anomaly_description: "Alpha-numeric string data within this column conforms to 2-4\ + \ different patterns, with 95% matching the first pattern. This could indicate\ + \ data errors in the remaining values. 
" + anomaly_criteria: |- + p.general_type = 'A' + AND functional_data_type NOT ILIKE 'Measurement%' AND functional_data_type NOT IN ('Category', 'Code') + AND p.max_length > 3 + AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct) + AND p.distinct_pattern_ct BETWEEN 2 AND 4 + AND STRPOS(p.top_patterns, 'N') > 0 + AND ( + ( (STRPOS(p.top_patterns, 'A') > 0 OR STRPOS(p.top_patterns, 'a') > 0) + AND SPLIT_PART(p.top_patterns, '|', 3)::NUMERIC / SPLIT_PART(p.top_patterns, '|', 1)::NUMERIC < 0.05) + OR + SPLIT_PART(p.top_patterns, '|', 3)::NUMERIC / SPLIT_PART(p.top_patterns, '|', 1)::NUMERIC < 0.1 + ) + detail_expression: |- + 'Patterns: ' || p.top_patterns + issue_likelihood: Likely + suggested_action: |- + Review the values for any data that doesn't conform to the most common pattern and correct any data errors. + dq_score_prevalence_formula: |- + (p.record_ct - SPLIT_PART(p.top_patterns, '|', 1)::BIGINT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1345' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + ( + SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, + (SELECT TRIM(SPLIT('{DETAIL_EXPRESSION}', '|')[SAFE_OFFSET(3)]) AS top_pattern) b + WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern + GROUP BY b.top_pattern, `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 5 + ) + UNION ALL + ( + SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, + (SELECT TRIM(SPLIT('{DETAIL_EXPRESSION}', '|')[SAFE_OFFSET(5)]) AS top_pattern) b + WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern + GROUP BY b.top_pattern, `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 5 + ) + UNION ALL + ( + SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, + (SELECT TRIM(SPLIT('{DETAIL_EXPRESSION}', '|')[SAFE_OFFSET(7)]) AS top_pattern) b + WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern + GROUP BY b.top_pattern, `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 5 + ) + UNION ALL + ( + SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, + (SELECT TRIM(SPLIT('{DETAIL_EXPRESSION}', '|')[SAFE_OFFSET(9)]) AS top_pattern) b + WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern + GROUP BY b.top_pattern, `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 5 + ) + ORDER BY top_pattern DESC, count DESC; + error_type: Profile Anomaly + - id: '1279' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, 
`{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC; + error_type: Profile Anomaly + - id: '1121' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC; + error_type: Profile Anomaly + - id: '1064' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT 
trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC; + error_type: Profile Anomaly + - id: '1039' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC; + error_type: Profile Anomaly + - id: '1439' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: redshift_spectrum + lookup_type: 
null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC; + error_type: Profile Anomaly + - id: '1178' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC; + error_type: Profile 
Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml new file mode 100644 index 00000000..aea55e63 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml @@ -0,0 +1,80 @@ +profile_anomaly_types: + id: '1025' + anomaly_type: Delimited_Data_Embedded + data_object: Column + anomaly_name: Delimited Data Embedded in Column + anomaly_description: |- + Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form. + anomaly_criteria: |- + p.std_pattern_match = 'DELIMITED_DATA' + detail_expression: |- + CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text ELSE 'Top Freq: ' || p.top_freq_values END + issue_likelihood: Likely + suggested_action: |- + Review your source data and follow-up with data consumers to determine the most useful representation of this data. + dq_score_prevalence_formula: null + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1363' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') + AND NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'.*\s(and|but|or|yet)\s.*') + GROUP BY `{COLUMN_NAME}` + ORDER BY COUNT(*) DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1297' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1139' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1082' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 
500; + error_type: Profile Anomaly + - id: '1057' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1457' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1196' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml new file mode 100644 index 00000000..b09f8700 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml @@ -0,0 +1,126 @@ +profile_anomaly_types: + id: '1028' + anomaly_type: Inconsistent_Casing + data_object: Column + anomaly_name: Inconsistent Casing + anomaly_description: |- + Casing is inconsistent for a column representing an entity name or address elements. Mixed-Case and All-Upper-Case values were found in the same column. + anomaly_criteria: |- + mixed_case_ct > 0 AND upper_case_ct > 0 AND functional_data_type IN ('Address', 'City', 'Entity Name', 'Person Given Name', 'Person Last Name', 'Person Full Name') + detail_expression: |- + 'Mixed-Case: ' || p.mixed_case_ct::VARCHAR || ', All-Upper-Case: ' || p.upper_case_ct::VARCHAR || ' for Semantic Data Type: ' || p.functional_data_type || ', Records: ' || p.record_ct::VARCHAR + issue_likelihood: Definite + suggested_action: |- + Review your source data and follow-up with data owners to determine whether consistent casing should be applied at the source. If source data corrections are not possible, consider standardizing the column upon ingestion to ensure consistent casing. 
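Aside for reviewers, not part of the template: the casing split that the flavor-specific lookup queries further below express in SQL (`UPPER(col) = col` for the all-upper bucket, `col <> UPPER(col) AND col <> LOWER(col)` for the mixed bucket) can be sketched in Python roughly as follows. The function name and sample values are illustrative only.

```python
def casing_bucket(value: str) -> str | None:
    """Rough Python restatement of the two WHERE clauses in the casing lookups."""
    if value.upper() == value:
        # UPPER(col) = col; note this branch also matches purely non-alphabetic values
        return "Upper Case"
    if value != value.upper() and value != value.lower():
        # col <> UPPER(col) AND col <> LOWER(col)
        return "Mixed Case"
    return None  # all-lower-case values match neither lookup branch

for v in ["ACME CORP", "Acme Corp", "acme corp", "42"]:
    print(v, "->", casing_bucket(v))
```

The anomaly itself fires only when the profile shows both buckets populated for the same column, per the anomaly_criteria above (`mixed_case_ct > 0 AND upper_case_ct > 0`).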
+ dq_score_prevalence_formula: |- + LEAST(p.mixed_case_ct, p.upper_case_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '1.0' + dq_dimension: Validity + target_data_lookups: + - id: '1410' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + ( + SELECT 'Upper Case' AS casing, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE UPPER(CAST(`{COLUMN_NAME}` AS STRING)) = CAST(`{COLUMN_NAME}` AS STRING) + GROUP BY `{COLUMN_NAME}` + LIMIT 20 + ) + UNION ALL + ( + SELECT 'Mixed Case' AS casing, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS STRING) <> UPPER(CAST(`{COLUMN_NAME}` AS STRING)) + AND CAST(`{COLUMN_NAME}` AS STRING) <> LOWER(CAST(`{COLUMN_NAME}` AS STRING)) + GROUP BY `{COLUMN_NAME}` + LIMIT 20 + ); + error_type: Profile Anomaly + - id: '1262' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: databricks + lookup_type: null + lookup_query: |- + (SELECT 'Upper Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}` + GROUP BY `{COLUMN_NAME}` LIMIT 20) + UNION ALL + (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`) + GROUP BY `{COLUMN_NAME}` LIMIT 20) + error_type: Profile Anomaly + - id: '1260' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 20 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" + UNION ALL + SELECT TOP 20 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") + GROUP BY "{COLUMN_NAME}" + error_type: Profile Anomaly + - id: '1259' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" LIMIT 20) + UNION ALL + (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") + GROUP BY "{COLUMN_NAME}" LIMIT 20) + error_type: Profile Anomaly + - id: '1258' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: redshift + lookup_type: null + lookup_query: |- + (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" LIMIT 20) + UNION ALL + (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") + GROUP BY "{COLUMN_NAME}" LIMIT 20) + error_type: Profile Anomaly + - id: '1473' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" LIMIT 20) + UNION ALL + (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") + GROUP BY "{COLUMN_NAME}" LIMIT 20) + error_type: Profile Anomaly + - id: '1261' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" LIMIT 20) + UNION ALL + (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") + GROUP BY "{COLUMN_NAME}" LIMIT 20) + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml new file mode 100644 index 00000000..87576c2d --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml @@ -0,0 +1,83 @@ +profile_anomaly_types: + id: '1024' + anomaly_type: Invalid_Zip3_USA + data_object: Column + anomaly_name: Invalid USA ZIP-3 Format + anomaly_description: |- + The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern. + anomaly_criteria: |- + p.distinct_pattern_ct > 1 + AND (p.column_name ilike '%zip%' OR p.column_name ILIKE '%postal%') + AND SPLIT_PART(p.top_patterns, ' | ', 2) = 'NNN' + AND SPLIT_PART(p.top_patterns, ' | ', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50 + detail_expression: |- + 'Pattern: ' || p.top_patterns + issue_likelihood: Definite + suggested_action: |- + Review your source data, ingestion process, and any processing steps that update this column. 
+ dq_score_prevalence_formula: |- + (NULLIF(p.record_ct, 0)::INT - SPLIT_PART(p.top_patterns, ' | ', 1)::BIGINT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '1' + dq_dimension: Validity + target_data_lookups: + - id: '1362' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') <> '999' + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC, `{COLUMN_NAME}` + LIMIT 500; + error_type: Profile Anomaly + - id: '1296' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500; + error_type: Profile Anomaly + - id: '1138' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1081' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1056' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1456' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1195' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml new file mode 100644 index 00000000..03c47fc1 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml @@ -0,0 +1,79 @@ +profile_anomaly_types: + id: '1003' + anomaly_type: Invalid_Zip_USA + data_object: Column + anomaly_name: Invalid USA Zip Code Format + anomaly_description: |- + Some values present do not conform with the expected format of USA Zip Codes. 
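Aside, illustrative only: the lookup queries in this template and in the preceding ZIP-3 template rely on the same digit-masking trick, `TRANSLATE(col, '012345678', '999999999')`, which turns every digit into `9` so a well-formed ZIP collapses to one of a few shapes. A minimal Python sketch of the idea, with hypothetical sample values:

```python
# TRANSLATE(col, '012345678', '999999999'): every digit becomes '9' ('9' already is),
# letters and punctuation pass through untouched.
MASK = str.maketrans("012345678", "999999999")
VALID_ZIP_SHAPES = {"99999", "999999999", "99999-9999"}  # the ZIP-3 lookups expect exactly "999"

def is_suspect_zip(value: str) -> bool:
    return value.translate(MASK) not in VALID_ZIP_SHAPES

print(is_suspect_zip("60614"))       # False - masks to 99999
print(is_suspect_zip("60614-1234"))  # False - masks to 99999-9999
print(is_suspect_zip("6061"))        # True  - masks to 9999
print(is_suspect_zip("ABCDE"))       # True  - letters are left untouched
```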
+ anomaly_criteria: |- + p.functional_data_type = 'Zip' AND (p.general_type <> 'A' OR p.filled_value_ct > 0 OR EXISTS (SELECT 1 FROM UNNEST(STRING_TO_ARRAY(p.top_patterns, ' | ')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0 AND val NOT IN ('NNNNN','NNNNN-NNNN','NNNNNNNNN'))) + detail_expression: |- + CASE WHEN p.general_type = 'N' THEN 'Type: ' || p.column_type ELSE '' END || CASE WHEN p.general_type = 'A' THEN 'Patterns: ' || (SELECT string_agg(val, ',') FROM UNNEST(STRING_TO_ARRAY(top_patterns, ' | ')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0) || ', Dummy Values: ' || p.filled_value_ct::VARCHAR ELSE '' END + issue_likelihood: Definite + suggested_action: |- + Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made. + dq_score_prevalence_formula: null + dq_score_risk_factor: '1.0' + dq_dimension: Validity + target_data_lookups: + - id: '1341' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') NOT IN ('99999', '999999999', '99999-9999') + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Profile Anomaly + - id: '1275' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500; + error_type: Profile Anomaly + - id: '1117' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1060' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1035' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1435' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1174' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml new file mode 100644 index 00000000..a6dc9c91 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml @@ -0,0 +1,79 @@ +profile_anomaly_types: + id: '1009' + anomaly_type: Leading_Spaces + data_object: Column + anomaly_name: Leading Spaces Found in Column Values + anomaly_description: |- + Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors. + anomaly_criteria: |- + p.lead_space_ct > 0 + detail_expression: |- + 'Cases Found: ' || p.lead_space_ct::VARCHAR(10) + issue_likelihood: Likely + suggested_action: |- + Review your source data, ingestion process, and any processing steps that update this column. + dq_score_prevalence_formula: |- + p.lead_space_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1347' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^\s') + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1281' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1123' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1066' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1041' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1441' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1180' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml new file mode 100644 index 00000000..f6bc2d42 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml @@ -0,0 +1,92 @@ +profile_anomaly_types: + id: '1005' + anomaly_type: Multiple_Types_Major + data_object: Multi-Col + anomaly_name: Multiple Data Types per Column Name - Major + anomaly_description: |- + Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results. + anomaly_criteria: |- + m.general_type_ct > 1 + detail_expression: |- + 'Found ' || m.column_ct::VARCHAR || ' columns, ' || m.type_ct::VARCHAR(10) || ' types, ' || m.min_type || ' to ' || m.max_type + issue_likelihood: Likely + suggested_action: |- + Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren't led astray. + dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: Consistency + target_data_lookups: + - id: '1343' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, + CASE + WHEN LOWER(data_type) LIKE 'timestamp%' THEN LOWER(data_type) + WHEN LOWER(data_type) LIKE 'date' THEN LOWER(data_type) + WHEN LOWER(data_type) LIKE 'boolean' THEN 'boolean' + WHEN data_type = 'TEXT' THEN CONCAT('varchar(', CAST(character_maximum_length AS STRING), ')') + WHEN LOWER(data_type) LIKE 'char%' THEN CONCAT('char(', CAST(character_maximum_length AS STRING), ')') + WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' + WHEN LOWER(data_type) LIKE 'num%' THEN CONCAT('numeric(', CAST(numeric_precision AS STRING), ',', CAST(numeric_scale AS STRING), ')') + ELSE data_type + END AS data_type + FROM information_schema.columns + JOIN information_schema.tables + ON columns.table_name = tables.table_name + AND columns.table_schema = tables.table_schema + WHERE columns.table_schema = '{TARGET_SCHEMA}' + AND columns.column_name = '{COLUMN_NAME}' + AND tables.table_type = 'BASE TABLE' + ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1277' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND 
numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1119' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1062' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1037' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1437' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = 
'{COLUMN_NAME}' ORDER BY external_type, tablename; + error_type: Profile Anomaly + - id: '1176' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml new file mode 100644 index 00000000..554a78b7 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml @@ -0,0 +1,92 @@ +profile_anomaly_types: + id: '1004' + anomaly_type: Multiple_Types_Minor + data_object: Multi-Col + anomaly_name: Multiple Data Types per Column Name - Minor + anomaly_description: |- + Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format. + anomaly_criteria: |- + m.general_type_ct = 1 AND m.type_ct > 1 + detail_expression: |- + 'Found ' || m.column_ct::VARCHAR || ' columns, ' || m.type_ct::VARCHAR(10) || ' types, ' || m.min_type || ' to ' || m.max_type + issue_likelihood: Possible + suggested_action: |- + Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables. 
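Aside, illustrative only: the Major/Minor split between this template and the preceding one rests on two profile counters, `general_type_ct` (number of broad type families) and `type_ct` (number of exact declared types). A rough Python sketch of that classification; the family mapping below is an assumption for illustration, not the mapping used by the profiling code:

```python
def type_family(data_type: str) -> str:
    """Assumed grouping of concrete types into broad families (illustrative)."""
    dt = data_type.lower()
    if dt.startswith(("varchar", "char", "text", "string")):
        return "alpha"
    if dt.startswith(("int", "bigint", "numeric", "decimal", "float", "double")):
        return "numeric"
    if dt.startswith(("date", "timestamp", "time")):
        return "datetime"
    return dt

def multi_type_anomaly(declared_types: list[str]) -> str | None:
    type_ct = len(set(declared_types))
    general_type_ct = len({type_family(t) for t in declared_types})
    if general_type_ct > 1:
        return "Multiple_Types_Major"   # m.general_type_ct > 1
    if type_ct > 1:
        return "Multiple_Types_Minor"   # general_type_ct = 1 AND type_ct > 1
    return None

print(multi_type_anomaly(["varchar(20)", "numeric(12,2)"]))  # Multiple_Types_Major
print(multi_type_anomaly(["varchar(20)", "varchar(50)"]))    # Multiple_Types_Minor
print(multi_type_anomaly(["varchar(20)", "varchar(20)"]))    # None
```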
+ dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: Consistency + target_data_lookups: + - id: '1342' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, + CASE + WHEN LOWER(data_type) LIKE 'timestamp%' THEN LOWER(data_type) + WHEN LOWER(data_type) LIKE 'date' THEN LOWER(data_type) + WHEN LOWER(data_type) LIKE 'boolean' THEN 'boolean' + WHEN data_type = 'TEXT' THEN CONCAT('varchar(', CAST(character_maximum_length AS STRING), ')') + WHEN LOWER(data_type) LIKE 'char%' THEN CONCAT('char(', CAST(character_maximum_length AS STRING), ')') + WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' + WHEN LOWER(data_type) LIKE 'num%' THEN CONCAT('numeric(', CAST(numeric_precision AS STRING), ',', CAST(numeric_scale AS STRING), ')') + ELSE data_type + END AS data_type + FROM information_schema.columns + JOIN information_schema.tables + ON columns.table_name = tables.table_name + AND columns.table_schema = tables.table_schema + WHERE columns.table_schema = '{TARGET_SCHEMA}' + AND columns.column_name = '{COLUMN_NAME}' + AND tables.table_type = 'BASE TABLE' + ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1276' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1118' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1061' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, CASE WHEN 
data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1036' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name; + error_type: Profile Anomaly + - id: '1436' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename; + error_type: Profile Anomaly + - id: '1175' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml new file mode 100644 index 00000000..29978d5c --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml @@ -0,0 +1,80 @@ +profile_anomaly_types: + id: '1006' + anomaly_type: No_Values + data_object: Column + anomaly_name: No Column Values Present + anomaly_description: "This column is present in the 
table, but no values have been\ + \ ingested or assigned in any records. This could indicate missing data or a processing\ + \ error. Note that this considers dummy values and zero-length values as missing\ + \ data. " + anomaly_criteria: |- + (p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct + detail_expression: |- + 'Null: ' || p.null_value_ct::VARCHAR(10) || ', Dummy: ' || p.filled_value_ct::VARCHAR(10) || ', Zero Len: ' || p.zero_length_ct::VARCHAR(10) + issue_likelihood: Possible + suggested_action: |- + Review your source data, ingestion process, and any processing steps that update this column. + dq_score_prevalence_formula: |- + 1.0 + dq_score_risk_factor: '0.33' + dq_dimension: Completeness + target_data_lookups: + - id: '1344' + test_id: '1006' + test_type: No_Values + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1278' + test_id: '1006' + test_type: No_Values + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1120' + test_id: '1006' + test_type: No_Values + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1063' + test_id: '1006' + test_type: No_Values + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1038' + test_id: '1006' + test_type: No_Values + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1438' + test_id: '1006' + test_type: No_Values + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1177' + test_id: '1006' + test_type: No_Values + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml new file mode 100644 index 00000000..81d2d0ca --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml @@ -0,0 +1,93 @@ +profile_anomaly_types: + id: '1029' + anomaly_type: Non_Alpha_Name_Address + data_object: Column + anomaly_name: Non-Alpha Name or Address + anomaly_description: |- + Entirely non-alphabetic values were found in a column representing an entity name or address element. 
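Aside, illustrative only: the lookup queries below find entirely non-alphabetic values with `col = UPPER(col) AND col = LOWER(col) AND col > ''`, exploiting the fact that a string unaffected by both case conversions contains no cased letters. A minimal Python sketch with made-up values:

```python
def is_non_alpha(value: str) -> bool:
    """A value untouched by both UPPER() and LOWER() contains no (cased) letters."""
    return value == value.upper() and value == value.lower() and value > ""

for v in ["12345", "###", "Main St", "O'Brien", ""]:
    print(repr(v), is_non_alpha(v))
# 12345 and ### are flagged; Main St and O'Brien are not; '' fails the col > '' guard.
```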
+ anomaly_criteria: |- + non_alpha_ct - zero_length_ct > 0 AND functional_data_type IN ('Address', 'City', 'Entity Name', 'Person Given Name', 'Person Last Name', 'Person Full Name') + detail_expression: |- + 'Non-Alpha Values: ' || (non_alpha_ct - zero_length_ct)::VARCHAR || ', Semantic Type: ' || p.functional_data_type || ', Records: ' || p.record_ct::VARCHAR + issue_likelihood: Definite + suggested_action: |- + Non-alphabetic values are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider assigning the processed value to null to reflect that data is missing. + dq_score_prevalence_formula: |- + (non_alpha_ct - zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '1.0' + dq_dimension: Validity + target_data_lookups: + - id: '1411' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS STRING) = UPPER(CAST(`{COLUMN_NAME}` AS STRING)) + AND CAST(`{COLUMN_NAME}` AS STRING) = LOWER(CAST(`{COLUMN_NAME}` AS STRING)) + AND CAST(`{COLUMN_NAME}` AS STRING) > '' + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Profile Anomaly + - id: '1267' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > '' + GROUP BY `{COLUMN_NAME}` LIMIT 500 + error_type: Profile Anomaly + - id: '1265' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' + GROUP BY "{COLUMN_NAME}" + error_type: Profile Anomaly + - id: '1264' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' + GROUP BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1263' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' + GROUP BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1474' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' + GROUP BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1266' + 
test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' + GROUP BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml new file mode 100644 index 00000000..0281a7f0 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml @@ -0,0 +1,94 @@ +profile_anomaly_types: + id: '1030' + anomaly_type: Non_Alpha_Prefixed_Name + data_object: Column + anomaly_name: Non-Alpha Prefixed Name + anomaly_description: |- + Non-alphabetic characters were found at the start of a column representing an entity name. + anomaly_criteria: |- + min_text < 'A' AND LEFT(min_text, 1) NOT IN ('"', ' ') AND RIGHT(min_text, 1) <> '''' AND functional_data_type IN ('City', 'Person Given Name', 'Person Last Name', 'Person Full Name') + detail_expression: |- + 'Minimum Value: ' || min_text + issue_likelihood: Definite + suggested_action: |- + Values starting with a non-alphabetic character are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. It could also indicate flagging or coding of some kind that can be broken out in a separate column in processed data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider applying corrections directly to processed data where possible. 
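Aside, illustrative only: the anomaly_criteria above apply the check to the column's lexical minimum (`min_text`), since any value starting with a digit or most punctuation sorts before `'A'`; a leading double quote or space and a trailing apostrophe are excused as common benign cases, and the lookup queries below reuse the same predicate row by row. A rough Python restatement with hypothetical inputs:

```python
def non_alpha_prefix(value: str) -> bool:
    """Rough restatement of: value < 'A', no leading quote/space, no trailing apostrophe."""
    return (
        value < "A"
        and value[:1] not in ('"', " ")
        and value[-1:] != "'"
    )

print(non_alpha_prefix("123 Smith"))  # True  - digit prefix sorts before 'A'
print(non_alpha_prefix('"Smith"'))    # False - leading quote is tolerated
print(non_alpha_prefix("Smith"))      # False - ordinary name
```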
+ dq_score_prevalence_formula: |- + 0.25 + dq_score_risk_factor: '1.0' + dq_dimension: Validity + target_data_lookups: + - id: '1412' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS STRING) < 'A' + AND SUBSTR(CAST(`{COLUMN_NAME}` AS STRING), 1, 1) NOT IN ('"', ' ') + AND SUBSTR(CAST(`{COLUMN_NAME}` AS STRING), LENGTH(CAST(`{COLUMN_NAME}` AS STRING)), 1) <> '\'' + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Profile Anomaly + - id: '1272' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` < 'A' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN ('"', ' ') AND RIGHT(`{COLUMN_NAME}`, 1) <> '''' + GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500 + error_type: Profile Anomaly + - id: '1270' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" + error_type: Profile Anomaly + - id: '1269' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1268' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1475' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1271' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml new file mode 100644 index 00000000..6761e2bc --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml @@ -0,0 
+1,146 @@ +profile_anomaly_types: + id: '1031' + anomaly_type: Non_Printing_Chars + data_object: Column + anomaly_name: Non-Printing Characters + anomaly_description: |- + Non-printing characters were found embedded in a text column. + anomaly_criteria: |- + non_printing_ct > 0 + detail_expression: |- + 'Non-Printing Chars: ' || non_printing_ct::VARCHAR || ', Records: ' || p.record_ct::VARCHAR + issue_likelihood: Definite + suggested_action: |- + Embedded non-printing characters are typically stripped from data. They affect filters and aggregations, and may cause problems for downstream users who don't recognize their presence. Review your source data and follow-up with data owners to determine whether this data can be corrected upstream. If not, strip these characters from processed data. + dq_score_prevalence_formula: |- + non_printing_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '1.0' + dq_dimension: Validity + target_data_lookups: + - id: '1277' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`{COLUMN_NAME}`, + '\u00a0', '\x160'), + '\u2009', '\x8201'), + '\u200b', '\x8203'), + '\u200c', '\x8204'), + '\u200d', '\x8205'), + '\u200e', '\x8206'), + '\u200f', '\x8207'), + '\u202f', '\x8239'), + '\u3000', '\x12288'), + '\ufeff', '\x65279') as `{COLUMN_NAME}_content`, + COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}` + GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500 + error_type: Profile Anomaly + - id: '1275' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", + NCHAR(160), '\x160'), + NCHAR(8201), '\x8201'), + NCHAR(8203), '\x8203'), + NCHAR(8204), '\x8204'), + NCHAR(8205), '\x8205'), + NCHAR(8206), '\x8206'), + NCHAR(8207), '\x8207'), + NCHAR(8239), '\x8239'), + NCHAR(12288), '\x12288'), + NCHAR(65279), '\x65279') AS "{COLUMN_NAME}_content", + COUNT(*) AS record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" + error_type: Profile Anomaly + - id: '1274' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", + CHR(160), '\x160'), + CHR(8201), '\x8201'), + CHR(8203), '\x8203'), + CHR(8204), '\x8204'), + CHR(8205), '\x8205'), + CHR(8206), '\x8206'), + CHR(8207), '\x8207'), + CHR(8239), '\x8239'), + CHR(12288), '\x12288'), + CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1273' + test_id: '1031' + test_type: Non_Printing_Chars + 
sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", + CHR(160), '\x160'), + CHR(8201), '\x8201'), + CHR(8203), '\x8203'), + CHR(8204), '\x8204'), + CHR(8205), '\x8205'), + CHR(8206), '\x8206'), + CHR(8207), '\x8207'), + CHR(8239), '\x8239'), + CHR(12288), '\x12288'), + CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1476' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", + CHR(160), '\x160'), + CHR(8201), '\x8201'), + CHR(8203), '\x8203'), + CHR(8204), '\x8204'), + CHR(8205), '\x8205'), + CHR(8206), '\x8206'), + CHR(8207), '\x8207'), + CHR(8239), '\x8239'), + CHR(12288), '\x12288'), + CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly + - id: '1276' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", + CHR(160), '\x160'), + CHR(8201), '\x8201'), + CHR(8203), '\x8203'), + CHR(8204), '\x8204'), + CHR(8205), '\x8205'), + CHR(8206), '\x8206'), + CHR(8207), '\x8207'), + CHR(8239), '\x8239'), + CHR(12288), '\x12288'), + CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml new file mode 100644 index 00000000..6a115e85 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml @@ -0,0 +1,91 @@ +profile_anomaly_types: + id: '1002' + anomaly_type: Non_Standard_Blanks + data_object: Column + anomaly_name: Non-Standard Blank Values + anomaly_description: |- + Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR". 
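The per-flavor lookups that follow all test for the same family of blank-like tokens described above. A minimal Python sketch of that check, assuming the token list visible in the queries below (illustrative only, not the shipped implementation):

    # Illustrative only: mirrors the blank-like checks in the lookup queries below.
    import re

    DUMMY_TOKENS = {"blank", "error", "missing", "tbd", "n/a", "#na", "none", "null", "unknown"}
    # The queries also match the same tokens wrapped in () or [].
    WRAPPED = {f"({t})" for t in DUMMY_TOKENS} | {f"[{t}]" for t in DUMMY_TOKENS}
    REPEAT_RUN = re.compile(r"(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})")  # runs used to bypass entry requirements

    def looks_blank(value: str | None) -> bool:
        if value is None or value == "":
            return True
        lowered = value.lower()
        return (
            value in {".", "?", " "}
            or lowered in DUMMY_TOKENS
            or lowered in WRAPPED
            or bool(REPEAT_RUN.search(lowered))
        )

    print([v for v in ["N/A", "0000", "Alice", "", "(null)"] if looks_blank(v)])  # ['N/A', '0000', '', '(null)']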
+ anomaly_criteria: |- + (p.zero_length_ct > 0 OR (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip')))) + detail_expression: |- + 'Dummy Values: ' || p.filled_value_ct::VARCHAR || ', Empty String: ' || p.zero_length_ct::VARCHAR || ', Null: ' || p.null_value_ct::VARCHAR || ', Records: ' || p.record_ct::VARCHAR + issue_likelihood: Definite + suggested_action: |- + Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null. + dq_score_prevalence_formula: |- + p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '1.0' + dq_dimension: Completeness + target_data_lookups: + - id: '1340' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE ( + `{COLUMN_NAME}` IN ('.','?',' ') + OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r'-{2,}') + OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r'0{2,}') + OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r'9{2,}') + OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r'x{2,}') + OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r'z{2,}') + OR LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN ('blank','error','missing','tbd','n/a','#na','none','null','unknown') + OR LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN ('(blank)','(error)','(missing)','(tbd)','(n/a)','(#na)','(none)','(null)','(unknown)') + OR LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN ('[blank]','[error]','[missing]','[tbd]','[n/a]','[#na]','[none]','[null]','[unknown]') + OR `{COLUMN_NAME}` = '' + OR `{COLUMN_NAME}` IS NULL + ) + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1274' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1116' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN 
LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1059' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1034' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1434' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1173' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR 
LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml new file mode 100644 index 00000000..20e6fc37 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml @@ -0,0 +1,81 @@ +profile_anomaly_types: + id: '1016' + anomaly_type: Potential_Duplicates + data_object: Column + anomaly_name: Potential Duplicate Values Found + anomaly_description: "This column is largely unique, but some duplicate values are\ + \ present. This pattern is uncommon and could indicate inadvertant duplication. " + anomaly_criteria: |- + p.distinct_value_ct > 1000 + AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4 + detail_expression: |- + 'Top Freq: ' || p.top_freq_values + issue_likelihood: Possible + suggested_action: "Review your source data and follow-up with data owners to determine\ + \ whether this data needs to be corrected. " + dq_score_prevalence_formula: |- + (p.value_ct - p.distinct_value_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.33' + dq_dimension: Uniqueness + target_data_lookups: + - id: '1354' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + HAVING COUNT(*) > 1 + ORDER BY COUNT(*) DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1288' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1130' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1073' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1048' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + 
error_type: Profile Anomaly + - id: '1448' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1187' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml new file mode 100644 index 00000000..652fc467 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml @@ -0,0 +1,79 @@ +profile_anomaly_types: + id: '1100' + anomaly_type: Potential_PII + data_object: Column + anomaly_name: Personally Identifiable Information + anomaly_description: |- + This column contains data that could be Personally Identifiable Information (PII) + anomaly_criteria: |- + p.pii_flag > '' + detail_expression: |- + 'Risk: ' || CASE LEFT(p.pii_flag, 1) WHEN 'A' THEN 'HIGH' WHEN 'B' THEN 'MODERATE' WHEN 'C' THEN 'LOW' END || ', PII Type: ' || SUBSTRING(p.pii_flag, 3) + issue_likelihood: Potential PII + suggested_action: |- + PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data. 
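The detail expression above implies that pii_flag packs a risk grade into its first character and a PII type label starting at the third character. A hedged Python sketch of that decoding (the exact flag format, including the separator character in the example value, is an assumption inferred from the expression, not documented here):

    # Illustrative decoding of a pii_flag value such as "A-EMAIL" (format assumed from the
    # detail_expression above: risk letter first, PII type from the third character on).
    RISK_BY_PREFIX = {"A": "HIGH", "B": "MODERATE", "C": "LOW"}

    def decode_pii_flag(pii_flag: str) -> tuple[str, str]:
        risk = RISK_BY_PREFIX.get(pii_flag[:1], "UNKNOWN")
        pii_type = pii_flag[2:]  # SUBSTRING(p.pii_flag, 3) is 1-based, i.e. Python index 2
        return risk, pii_type

    print(decode_pii_flag("A-EMAIL"))  # ('HIGH', 'EMAIL')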
+ dq_score_prevalence_formula: null + dq_score_risk_factor: CASE LEFT(p.pii_flag, 1) WHEN 'A' THEN 1 WHEN 'B' THEN 0.66 + WHEN 'C' THEN 0.33 END + dq_dimension: Validity + target_data_lookups: + - id: '1408' + test_id: '1100' + test_type: Potential_PII + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1338' + test_id: '1100' + test_type: Potential_PII + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1271' + test_id: '1100' + test_type: Potential_PII + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Profile Anomaly + - id: '1272' + test_id: '1100' + test_type: Potential_PII + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1269' + test_id: '1100' + test_type: Potential_PII + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1470' + test_id: '1100' + test_type: Potential_PII + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1270' + test_id: '1100' + test_type: Potential_PII + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml new file mode 100644 index 00000000..c4a3499d --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml @@ -0,0 +1,80 @@ +profile_anomaly_types: + id: '1010' + anomaly_type: Quoted_Values + data_object: Column + anomaly_name: Quoted Values Found in Column Values + anomaly_description: |- + Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors. + anomaly_criteria: |- + p.quoted_value_ct > 0 + detail_expression: |- + 'Cases Found: ' || p.quoted_value_ct::VARCHAR(10) + issue_likelihood: Likely + suggested_action: |- + Review your source data, ingestion process, and any processing steps that update this column. 
+ dq_score_prevalence_formula: |- + p.quoted_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1348' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = '"' OR LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = "'" + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Profile Anomaly + - id: '1282' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500; + error_type: Profile Anomaly + - id: '1124' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1067' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1042' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1442' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly + - id: '1181' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml new file mode 100644 index 00000000..f6b3b36f --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml @@ -0,0 +1,74 @@ +profile_anomaly_types: + id: '1019' + anomaly_type: Recency_One_Year + data_object: Dates + anomaly_name: Recency - No Table Dates within 1 Year + anomaly_description: |- + Among all date 
columns present in the table, none fall inside of one year from Profile date. + anomaly_criteria: |- + MAX(p.max_date) < CURRENT_DATE - INTERVAL '1 year' + detail_expression: |- + 'Most Recent Date: ' || MAX(p.max_date)::VARCHAR + issue_likelihood: Possible + suggested_action: |- + Review your source data and follow-up with data owners to determine whether dates in table should be more recent. + dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: Timeliness + target_data_lookups: + - id: '1357' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1291' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: databricks + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1133' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: mssql + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1076' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1051' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: redshift + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1451' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1190' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml new file mode 100644 index 00000000..7f13ef99 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml @@ -0,0 +1,74 @@ +profile_anomaly_types: + id: '1020' + anomaly_type: Recency_Six_Months + data_object: Dates + anomaly_name: Recency - No Table Dates within 6 Months + anomaly_description: "Among all date columns present in the table, the most recent\ + \ date falls 6 months to 1 year back from Profile date. " + anomaly_criteria: |- + MAX(p.max_date) >= CURRENT_DATE - INTERVAL '1 year' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL '6 months' + detail_expression: |- + 'Most Recent Date: ' || MAX(p.max_date)::VARCHAR + issue_likelihood: Possible + suggested_action: |- + Review your source data and follow-up with data owners to determine whether dates in table should be more recent. 
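These two recency anomalies partition the same signal: the most recent date found across all date columns of the table. A small sketch of the window logic, using day-count approximations of the SQL INTERVALs (illustrative only):

    # Illustrative classification of a table's most recent date into the two recency windows above.
    from datetime import date, timedelta

    def recency_anomaly(most_recent: date, today: date | None = None) -> str | None:
        today = today or date.today()
        one_year_ago = today - timedelta(days=365)    # stand-in for INTERVAL '1 year'
        six_months_ago = today - timedelta(days=182)  # stand-in for INTERVAL '6 months'
        if most_recent < one_year_ago:
            return "Recency_One_Year"
        if most_recent < six_months_ago:
            return "Recency_Six_Months"
        return None  # dates are recent enough; no anomaly

    print(recency_anomaly(date(2023, 1, 15), today=date(2024, 6, 1)))  # Recency_One_Year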
+ dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: Timeliness + target_data_lookups: + - id: '1358' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1292' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: databricks + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1134' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: mssql + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1077' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1052' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: redshift + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1452' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly + - id: '1191' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml new file mode 100644 index 00000000..afb7893b --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml @@ -0,0 +1,72 @@ +profile_anomaly_types: + id: '1014' + anomaly_type: Small Divergent Value Ct + data_object: Column + anomaly_name: Small Percentage of Divergent Values Found + anomaly_description: |- + Under 3% of values in this column were found to be different from the most common value. This could indicate a data error. + anomaly_criteria: |- + functional_data_type <> 'Boolean' AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / + p.value_ct::FLOAT) > 97::FLOAT + AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / + NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT + detail_expression: |- + 'Single Value Pct: ' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT + / NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40) + || ', Value | Freq: ' || top_freq_values + issue_likelihood: Possible + suggested_action: |- + Review your source data and follow-up with data owners to determine whether this data needs to be corrected. 
+ dq_score_prevalence_formula: |- + (p.record_ct - fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.33' + dq_dimension: Validity + target_data_lookups: + - id: '1286' + test_id: '1014' + test_type: Small Divergent Value Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC; + error_type: Profile Anomaly + - id: '1128' + test_id: '1014' + test_type: Small Divergent Value Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1071' + test_id: '1014' + test_type: Small Divergent Value Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1046' + test_id: '1014' + test_type: Small Divergent Value Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1446' + test_id: '1014' + test_type: Small Divergent Value Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly + - id: '1185' + test_id: '1014' + test_type: Small Divergent Value Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml new file mode 100644 index 00000000..964d7eb8 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml @@ -0,0 +1,75 @@ +profile_anomaly_types: + id: '1013' + anomaly_type: Small Missing Value Ct + data_object: Column + anomaly_name: Small Percentage of Missing Values Found + anomaly_description: |- + Under 3% of values in this column were found to be null, zero-length or dummy values, but values are not universally present. This could indicate unexpected missing values in a required column. 
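The prevalence formula for the divergent-value case above is simply the share of records that do not hold the single dominant value. A worked sketch with hypothetical counts (fn_parsefreq extracts the top frequency from top_freq_values; value_ct is taken equal to record_ct here, i.e. no nulls, to keep the arithmetic simple):

    # Hypothetical counts: 10,000 populated records, the most common value appears 9,850 times.
    record_ct = 10_000
    top_value_freq = 9_850  # what fn_parsefreq(p.top_freq_values, 1, 2) would extract

    single_value_pct = 100.0 * top_value_freq / record_ct  # 98.5 -> inside the (97, 100) band, so flagged
    prevalence = (record_ct - top_value_freq) / record_ct  # 0.015, per the formula above

    print(single_value_pct, prevalence)  # 98.5 0.015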
+ anomaly_criteria: |- + (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END + )::FLOAT / p.record_ct::FLOAT > 0.97 + AND (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END + ) < p.record_ct + detail_expression: |- + (p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END + ))::VARCHAR(20) || + ' of ' || p.record_ct::VARCHAR(20) || ' blank values: ' || + ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END + ))::NUMERIC(18, 5) + / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || '%' + issue_likelihood: Possible + suggested_action: |- + Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded. + dq_score_prevalence_formula: |- + (p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.33' + dq_dimension: Completeness + target_data_lookups: + - id: '1285' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1127' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1070' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1045' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1445' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1184' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', 
'(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml new file mode 100644 index 00000000..9ef1f377 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml @@ -0,0 +1,94 @@ +profile_anomaly_types: + id: '1023' + anomaly_type: Small_Numeric_Value_Ct + data_object: Column + anomaly_name: Unexpected Numeric Values Found + anomaly_description: |- + A small fraction (under 3%) of values in this column were found to be numeric. They could be erroneous. + anomaly_criteria: |- + p.general_type = 'A' + AND p.numeric_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT < 0.03 + AND p.numeric_ct > 0 + detail_expression: |- + 'Numeric Ct: ' || p.numeric_ct || ' of ' || p.value_ct || ' (Numeric Percent: ' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || ' )'::VARCHAR(200) + issue_likelihood: Likely + suggested_action: |- + Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here. + dq_score_prevalence_formula: |- + p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT + dq_score_risk_factor: '0.66' + dq_dimension: Validity + target_data_lookups: + - id: '1361' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + ( + SELECT 'Numeric' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 10 + ) + UNION ALL + ( + SELECT 'Non-Numeric' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY count DESC + LIMIT 10 + ) + ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1295' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + error_type: Profile Anomaly + - id: '1137' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" 
WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1080' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1055' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1455' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly + - id: '1194' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml new file mode 100644 index 00000000..4eb691ab --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml @@ -0,0 +1,89 @@ +profile_anomaly_types: + id: '1017' + anomaly_type: Standardized_Value_Matches + data_object: Column + anomaly_name: Similar Values Match When Standardized + anomaly_description: |- + When column values are standardized (removing spaces, single-quotes, periods and dashes), matching values are found in other records. 
This may indicate that formats should be further standardized to allow consistent comparisons for merges, joins and roll-ups. It could also indicate the presence of unintended duplicates. + anomaly_criteria: "p.general_type = 'A' AND p.distinct_std_value_ct <> p.distinct_value_ct\ + \ AND p.functional_data_type NOT LIKE 'Person%Name' " + detail_expression: |- + 'Distinct Values: ' || p.distinct_value_ct::VARCHAR + || ', Standardized: ' || p.distinct_std_value_ct::VARCHAR + issue_likelihood: Likely + suggested_action: |- + Review standardized vs. raw data values for all matches. Correct data if values should be consistent. + dq_score_prevalence_formula: |- + (p.distinct_value_ct - p.distinct_std_value_ct)::FLOAT/NULLIF(p.value_ct, 0) + dq_score_risk_factor: '0.66' + dq_dimension: Uniqueness + target_data_lookups: + - id: '1355' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH cte AS ( + SELECT UPPER(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r"[ '\.\-\,]", '')) AS possible_standard_value, + COUNT(DISTINCT `{COLUMN_NAME}`) AS cnt + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY possible_standard_value + HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 + ) + SELECT DISTINCT a.`{COLUMN_NAME}`, b.possible_standard_value, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a + JOIN cte b + ON UPPER(REGEXP_REPLACE(CAST(a.`{COLUMN_NAME}` AS STRING), r"[ '\.\-\,]", '')) = b.possible_standard_value + GROUP BY a.`{COLUMN_NAME}`, b.possible_standard_value + ORDER BY b.possible_standard_value ASC, count DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1289' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1131' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC; + error_type: Profile Anomaly + - id: '1074' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as 
possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1049' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1449' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1188' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml new file mode 100644 index 00000000..9763b988 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml @@ -0,0 +1,80 @@ +profile_anomaly_types: + id: '1001' + anomaly_type: Suggested_Type + data_object: Column + anomaly_name: Suggested Data Type + anomaly_description: "Data stored as text all meets criteria for a more suitable\ + \ type. 
" + anomaly_criteria: |- + (functional_data_type NOT IN ('Boolean', 'Flag') ) AND (column_type ILIKE '%ch + ar%' OR column_type ILIKE 'text') AND NOT (datatype_suggestion ILIKE '%char%' OR datatype_suggestion ILIKE 'text') + detail_expression: |- + p.datatype_suggestion::VARCHAR(200) + issue_likelihood: Likely + suggested_action: |- + Consider changing the column data type to tighte + n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis. + dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: null + target_data_lookups: + - id: '1339' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY record_ct DESC + LIMIT 20; + error_type: Profile Anomaly + - id: '1273' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + error_type: Profile Anomaly + - id: '1115' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Profile Anomaly + - id: '1058' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + error_type: Profile Anomaly + - id: '1033' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Profile Anomaly + - id: '1433' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Profile Anomaly + - id: '1172' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml new file mode 100644 index 00000000..f8ea4cee --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml @@ -0,0 +1,95 @@ +profile_anomaly_types: + id: '1008' + anomaly_type: Table_Pattern_Mismatch + data_object: Multi-Col + anomaly_name: Pattern Inconsistency Across Tables + anomaly_description: |- + Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic. 
+ anomaly_criteria: |- + p.general_type = 'A' + AND functional_data_type NOT ILIKE 'Measurement%' AND functional_data_type NOT IN ('Category', 'Code') + AND p.max_length > 3 + AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct) + AND m.max_pattern_ct = 1 + AND m.column_ct > 1 + AND SPLIT_PART(p.top_patterns, '|', 2) <> SPLIT_PART(m.very_top_pattern, '|', 2) + AND SPLIT_PART(p.top_patterns, '|', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, '|', 1)::NUMERIC < 0.1 + detail_expression: |- + 'Patterns: ' || SPLIT_PART(p.top_patterns, '|', 2) || ', ' || SPLIT_PART(ltrim(m.very_top_pattern, '0'), '|', 2) + issue_likelihood: Likely + suggested_action: |- + Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent. + dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: Validity + target_data_lookups: + - id: '1346' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, columns.table_name + FROM information_schema.columns + JOIN information_schema.tables + ON columns.table_name = tables.table_name + AND columns.table_schema = tables.table_schema + WHERE columns.table_schema = '{TARGET_SCHEMA}' + AND columns.column_name = '{COLUMN_NAME}' + AND UPPER(tables.table_type) = 'BASE TABLE' + ORDER BY table_name; + error_type: Profile Anomaly + - id: '1280' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: databricks + lookup_type: null + lookup_query: "SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns\ + \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\ + \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\ + \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\ + \ TABLE' ORDER BY table_name; " + error_type: Profile Anomaly + - id: '1122' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name; + error_type: Profile Anomaly + - id: '1065' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name; + error_type: Profile Anomaly + - id: '1040' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type; + error_type: Profile Anomaly + - id: '1440' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT columnname AS column_name, tablename 
AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type; + error_type: Profile Anomaly + - id: '1179' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: snowflake + lookup_type: null + lookup_query: "SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns\ + \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\ + \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\ + \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\ + \ TABLE' ORDER BY table_name; " + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml new file mode 100644 index 00000000..a8574f95 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml @@ -0,0 +1,79 @@ +profile_anomaly_types: + id: '1022' + anomaly_type: Unexpected Emails + data_object: Column + anomaly_name: Unexpected Column Contains Emails + anomaly_description: |- + This column is not labeled as email, but contains mostly email addresses. This could indicate shifted or switched source data columns. + anomaly_criteria: |- + p.std_pattern_match = 'EMAIL' + AND NOT (p.column_name ILIKE '%email%' OR p.column_name ILIKE '%addr%') + detail_expression: |- + 'Value Range: ' || p.min_text || ' thru ' || max_text + issue_likelihood: Possible + suggested_action: |- + Review your source data and follow-up with data owners to determine whether column should be populated with email addresses. + dq_score_prevalence_formula: null + dq_score_risk_factor: '0.33' + dq_dimension: Consistency + target_data_lookups: + - id: '1360' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1294' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1136' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Profile Anomaly + - id: '1079' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1054' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1454' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: redshift_spectrum + lookup_type: 
null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1193' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml new file mode 100644 index 00000000..04790269 --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml @@ -0,0 +1,81 @@ +profile_anomaly_types: + id: '1021' + anomaly_type: Unexpected US States + data_object: Column + anomaly_name: Unexpected Column Contains US States + anomaly_description: |- + This column is not labeled as a state, but contains mostly US State abbreviations. This could indicate shifted or switched source data columns. + anomaly_criteria: |- + p.std_pattern_match = 'STATE_USA' + AND p.distinct_value_ct > 5 + AND NOT (p.column_name = 'st' OR p.column_name ILIKE '%state%' OR p.column_name ILIKE '%_st' OR p.column_name ILIKE 'st_%') + detail_expression: "'Value Range: ' || p.min_text || ' thru ' || max_text || CASE\ + \ WHEN p.top_freq_values > '' THEN ', Top Freq Values: ' || REPLACE(p.top_freq_values,\ + \ CHR(10), ' ; ') ELSE '' END " + issue_likelihood: Possible + suggested_action: |- + Review your source data and follow-up with data owners to determine whether column should be populated with US states. 
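For orientation, the {TARGET_SCHEMA}, {TABLE_NAME}, and {COLUMN_NAME} tokens in the lookup_query templates above are substituted at run time before the query reaches the target database. A minimal sketch of what the postgresql-flavor lookup resolves to, using invented names (schema sales_raw, table customers, column region_cd are illustrative only and not part of this change):

    -- Hypothetical rendering of the postgresql lookup with placeholders filled in
    SELECT DISTINCT "region_cd", COUNT(*) AS count
    FROM "sales_raw"."customers"
    GROUP BY "region_cd"
    ORDER BY "region_cd" DESC
    LIMIT 500;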
+ dq_score_prevalence_formula: null + dq_score_risk_factor: '0.33' + dq_dimension: Consistency + target_data_lookups: + - id: '1359' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1293' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1135' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Profile Anomaly + - id: '1078' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1053' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1453' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1192' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml new file mode 100644 index 00000000..23dc70fb --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml @@ -0,0 +1,83 @@ +profile_anomaly_types: + id: '1018' + anomaly_type: Unlikely_Date_Values + data_object: Column + anomaly_name: Unlikely Dates out of Typical Range + anomaly_description: |- + Some date values in this column are earlier than 1900-01-01 or later than 30 years after Profiling date. + anomaly_criteria: |- + p.general_type = 'D' + AND (p.min_date BETWEEN '0001-01-02'::DATE AND '1900-01-01'::DATE + OR p.max_date > CURRENT_DATE + INTERVAL '30 year') + detail_expression: |- + 'Date Range: ' || p.min_date::VARCHAR || ' thru ' || p.max_date::VARCHAR + issue_likelihood: Likely + suggested_action: |- + Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed. 
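The anomaly_criteria above flags a date column when its profiled minimum falls between 0001-01-02 and 1900-01-01, or when its maximum lands more than 30 years past the profiling date. A small worked check, written in PostgreSQL syntax to mirror the criterion; the two dates are invented profile values:

    -- '1897-03-15' trips the lower branch; '2091-06-01' trips the upper branch
    -- for any run date before 2061-06-01
    SELECT ('1897-03-15'::DATE BETWEEN '0001-01-02'::DATE AND '1900-01-01'::DATE)
           OR ('2091-06-01'::DATE > CURRENT_DATE + INTERVAL '30 year') AS anomaly_flagged;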
+ dq_score_prevalence_formula: |- + (COALESCE(p.before_100yr_date_ct,0)+COALESCE(p.distant_future_date_ct, 0))::FLOAT/NULLIF(p.record_ct, 0) + dq_score_risk_factor: '0.66' + dq_dimension: Accuracy + target_data_lookups: + - id: '1356' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, CAST(CAST('{PROFILE_RUN_DATE}' AS DATETIME) AS DATE) AS profile_run_date, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a + WHERE (CAST(`{COLUMN_NAME}` AS DATE) < DATE '1900-01-01') + OR (CAST(`{COLUMN_NAME}` AS DATE) > DATE_ADD(CAST(CAST('{PROFILE_RUN_DATE}' AS DATETIME) AS DATE), INTERVAL 30 YEAR)) + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC + LIMIT 500; + error_type: Profile Anomaly + - id: '1290' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1132' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Profile Anomaly + - id: '1075' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1050' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1450' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly + - id: '1189' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < 
'1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml new file mode 100644 index 00000000..a935d7fb --- /dev/null +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml @@ -0,0 +1,82 @@ +profile_anomaly_types: + id: '1027' + anomaly_type: Variant_Coded_Values + data_object: Variant + anomaly_name: Variant Codings for Same Values + anomaly_description: "This column contains more than one common variants that represent\ + \ a single value or state. This can occur when data is integrated from multiple\ + \ sources with different standards, or when free entry is permitted without validation.\ + \ The variations can cause confusion and error for downstream data users and multiple\ + \ versions of the truth. " + anomaly_criteria: |- + p.distinct_value_ct <= 20 + detail_expression: |- + 'Variants Found: ' || intersect_list + issue_likelihood: Definite + suggested_action: |- + Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes. + dq_score_prevalence_formula: null + dq_score_risk_factor: null + dq_dimension: Consistency + target_data_lookups: + - id: '1396' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN ( + SELECT TRIM(val) FROM UNNEST(SPLIT(SUBSTR('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) AS val + ) + GROUP BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1230' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}`; + error_type: Profile Anomaly + - id: '1231' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', CHARINDEX(':', '{DETAIL_EXPRESSION}') + 2, 999), '|')) GROUP BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1232' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1229' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT 
"{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1458' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + error_type: Profile Anomaly + - id: '1230' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}"; + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml new file mode 100644 index 00000000..57b2901a --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml @@ -0,0 +1,239 @@ +test_types: + id: '1500' + test_type: Aggregate_Balance + test_name_short: Aggregate Balance + test_name_long: Aggregate values per group match reference + test_description: |- + Tests for exact match in aggregate values for each set of column values vs. reference dataset + except_message: |- + Aggregate measure per set of column values does not exactly match reference dataset. + measure_uom: Mismatched measures + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Aggregate Expression + column_name_help: |- + Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])` + default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition + default_parm_values: null + default_parm_prompts: |- + Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition + default_parm_help: |- + Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. 
HAVING clause) - OPTIONAL + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Expected count of group totals not matching aggregate value + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It's ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn't changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it's built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure. + active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1400' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM ( + SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM ( + SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) a + GROUP BY {GROUPBY_NAMES} + ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1333' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1247' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL 
AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1248' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1245' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1462' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1246' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY 
{GROUPBY_NAMES}; + error_type: Test Results + test_templates: + - id: '2506' + test_type: Aggregate_Balance + sql_flavor: bigquery + template_name: ex_aggregate_match_same_generic.sql + - id: '2406' + test_type: Aggregate_Balance + sql_flavor: databricks + template_name: ex_aggregate_match_same_generic.sql + - id: '2206' + test_type: Aggregate_Balance + sql_flavor: mssql + template_name: ex_aggregate_match_same_generic.sql + - id: '2306' + test_type: Aggregate_Balance + sql_flavor: postgresql + template_name: ex_aggregate_match_same_generic.sql + - id: '2006' + test_type: Aggregate_Balance + sql_flavor: redshift + template_name: ex_aggregate_match_same_generic.sql + - id: '2506' + test_type: Aggregate_Balance + sql_flavor: redshift_spectrum + template_name: ex_aggregate_match_same_generic.sql + - id: '2106' + test_type: Aggregate_Balance + sql_flavor: snowflake + template_name: ex_aggregate_match_same_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml new file mode 100644 index 00000000..fcc0487c --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml @@ -0,0 +1,253 @@ +test_types: + id: '1504' + test_type: Aggregate_Balance_Percent + test_name_short: Aggregate Balance Percent + test_name_long: Aggregate measure per group within percent of reference + test_description: |- + Tests that aggregate measure for each set of column values fall within a percent range above or below the measure for reference dataset + except_message: |- + Aggregate measure per set of column values is outside percent range of reference dataset. + measure_uom: Mismatched measures + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Aggregate Expression + column_name_help: |- + Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])` + default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance + default_parm_values: null + default_parm_prompts: |- + Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Percent,Upper Tolerance Percent + default_parm_help: |- + Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. 
HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a percent|Allowable tolerance above the reference measure expressed as a percent + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Expected count of group totals not matching aggregate value + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerance you set -- that the sum of a measure or count of a value remains sufficiently consistent between categories. You could use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 5% below to 10% above the prior month. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure. + active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1404' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM ( + SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM ( + SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) a + GROUP BY {GROUPBY_NAMES} + ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1248' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1247' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1248' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1245' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1466' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY 
{GROUPBY_NAMES}; + error_type: Test Results + - id: '1246' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + test_templates: + - id: '2509' + test_type: Aggregate_Balance_Percent + sql_flavor: bigquery + template_name: ex_aggregate_match_percent_generic.sql + - id: '2409' + test_type: Aggregate_Balance_Percent + sql_flavor: databricks + template_name: ex_aggregate_match_percent_generic.sql + - id: '2209' + test_type: Aggregate_Balance_Percent + sql_flavor: mssql + template_name: ex_aggregate_match_percent_generic.sql + - id: '2309' + test_type: Aggregate_Balance_Percent + sql_flavor: postgresql + template_name: ex_aggregate_match_percent_generic.sql + - id: '2009' + test_type: Aggregate_Balance_Percent + sql_flavor: redshift + template_name: ex_aggregate_match_percent_generic.sql + - id: '2509' + test_type: Aggregate_Balance_Percent + sql_flavor: redshift_spectrum + template_name: ex_aggregate_match_percent_generic.sql + - id: '2109' + test_type: Aggregate_Balance_Percent + sql_flavor: snowflake + template_name: ex_aggregate_match_percent_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml new file mode 100644 index 00000000..320ccc37 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml @@ -0,0 +1,253 @@ +test_types: + id: '1505' + test_type: Aggregate_Balance_Range + test_name_short: Aggregate Balance Range + test_name_long: Aggregate measure per group within hard range of reference + test_description: |- + Tests that aggregate measure for each set of column values fall within a hard range above or below the measure for reference dataset + except_message: |- + Aggregate measure per set of column values is outside expected range of reference dataset. 
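A worked example of the percent-tolerance predicate used in the Aggregate_Balance_Percent lookups above. The figures are invented, and it assumes the lower tolerance is entered as a negative percent, since the bound is computed as match_total * (1 + {LOWER_TOLERANCE}/100.0):

    -- match_total = 1000, {LOWER_TOLERANCE} = -5, {UPPER_TOLERANCE} = 10,
    -- so the acceptable band is [950, 1100]
    SELECT (930  NOT BETWEEN 1000 * (1 + -5/100.0) AND 1000 * (1 + 10/100.0)) AS flagged_at_930,   -- TRUE, below the floor
           (1080 NOT BETWEEN 1000 * (1 + -5/100.0) AND 1000 * (1 + 10/100.0)) AS flagged_at_1080;  -- FALSE, inside the band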
+ measure_uom: Mismatched measures + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Aggregate Expression + column_name_help: |- + Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])` + default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance + default_parm_values: null + default_parm_prompts: |- + Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Constant,Upper Tolerance Constant + default_parm_help: |- + Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a constant value|Allowable tolerance above the reference measure expressed as a constant value + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Expected count of group totals not matching aggregate value + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerances you define as specific values above or below the aggregate measure for the same categories in the reference dataset -- that the sum of a measure or count of a value remains sufficiently consistent between categories. For instance, you can use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 10000 dollars above or below the prior month. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure. 
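The lookups that follow all share the same UNION ALL shape: aggregate the main table and the reference table separately, stack the two result sets, re-aggregate per group, and keep only the groups whose totals violate the comparison. A hedged sketch of what the generic template resolves to once placeholders are filled, using invented table and column names and a constant tolerance of 10000 either way (none of these names come from the change itself):

    -- {GROUPBY_NAMES} = product_code, {COLUMN_NAME_NO_QUOTES} = SUM(sales_amt),
    -- main table orders_curr vs. reference table orders_prior, both filtered the same way;
    -- the lower bound mirrors {LOWER_TOLERANCE} = -10000 in the match_total + {LOWER_TOLERANCE} form
    SELECT *
    FROM ( SELECT product_code, SUM(total) AS total, SUM(match_total) AS match_total
           FROM ( SELECT product_code, SUM(sales_amt) AS total, NULL AS match_total
                  FROM "warehouse"."orders_curr"
                  WHERE order_date >= '2024-01-01'
                  GROUP BY product_code
                  UNION ALL
                  SELECT product_code, NULL AS total, SUM(sales_amt) AS match_total
                  FROM "warehouse"."orders_prior"
                  WHERE order_date >= '2024-01-01'
                  GROUP BY product_code ) a
           GROUP BY product_code ) s
    WHERE (total IS NOT NULL AND match_total IS NULL)
       OR (total IS NULL AND match_total IS NOT NULL)
       OR (total NOT BETWEEN match_total + -10000 AND match_total + 10000)
    ORDER BY product_code;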
+ active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1405' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM ( + SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM ( + SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) a + GROUP BY {GROUPBY_NAMES} + ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1245' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1247' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1248' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY 
{MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1245' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1467' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1246' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + test_templates: + - id: '2510' + test_type: Aggregate_Balance_Range + sql_flavor: bigquery + template_name: ex_aggregate_match_range_generic.sql + - id: '2410' + test_type: Aggregate_Balance_Range + sql_flavor: databricks + template_name: ex_aggregate_match_range_generic.sql + - id: '2210' + test_type: Aggregate_Balance_Range + sql_flavor: mssql + 
template_name: ex_aggregate_match_range_generic.sql + - id: '2310' + test_type: Aggregate_Balance_Range + sql_flavor: postgresql + template_name: ex_aggregate_match_range_generic.sql + - id: '2010' + test_type: Aggregate_Balance_Range + sql_flavor: redshift + template_name: ex_aggregate_match_range_generic.sql + - id: '2510' + test_type: Aggregate_Balance_Range + sql_flavor: redshift_spectrum + template_name: ex_aggregate_match_range_generic.sql + - id: '2110' + test_type: Aggregate_Balance_Range + sql_flavor: snowflake + template_name: ex_aggregate_match_range_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml new file mode 100644 index 00000000..58462bf0 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml @@ -0,0 +1,239 @@ +test_types: + id: '1501' + test_type: Aggregate_Minimum + test_name_short: Aggregate Minimum + test_name_long: Aggregate values per group are at or above reference + test_description: |- + Tests that aggregate values for each set of column values are at least the same as reference dataset + except_message: |- + Aggregate measure per set of column values is not at least the same as reference dataset. + measure_uom: Mismatched measures + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Aggregate Expression + column_name_help: |- + Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])` + default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition + default_parm_values: null + default_parm_prompts: |- + Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition + default_parm_help: |- + Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Expected count of group totals below aggregate value + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. 
This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test. + active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1401' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM ( + SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM ( + SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) a + GROUP BY {GROUPBY_NAMES} + ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1334' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1251' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1252' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + 
GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1249' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1463' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1250' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + test_templates: + - id: '2502' + test_type: Aggregate_Minimum + sql_flavor: bigquery + template_name: ex_aggregate_match_no_drops_generic.sql + - id: '2402' + test_type: Aggregate_Minimum + sql_flavor: databricks + template_name: ex_aggregate_match_no_drops_generic.sql + - id: '2202' + test_type: Aggregate_Minimum + sql_flavor: mssql + template_name: ex_aggregate_match_no_drops_generic.sql + - id: '2302' + test_type: Aggregate_Minimum + sql_flavor: postgresql + template_name: ex_aggregate_match_no_drops_generic.sql + - id: '2002' + test_type: Aggregate_Minimum + sql_flavor: redshift + template_name: ex_aggregate_match_no_drops_generic.sql + - id: '2502' + test_type: Aggregate_Minimum + sql_flavor: redshift_spectrum + template_name: ex_aggregate_match_no_drops_generic.sql + - id: '2102' + test_type: Aggregate_Minimum + sql_flavor: snowflake + template_name: 
ex_aggregate_match_no_drops_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml new file mode 100644 index 00000000..97f00d83 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml @@ -0,0 +1,166 @@ +test_types: + id: '1004' + test_type: Alpha_Trunc + test_name_short: Alpha Truncation + test_name_long: Maximum character count consistent + test_description: |- + Tests that the maximum count of characters in a column value has not dropped vs. baseline data + except_message: |- + Maximum length of values has dropped from prior expected length. + measure_uom: Values over max + measure_uom_description: null + selection_criteria: |- + general_type ='A' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE '%window%' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( 'Constant', 'Boolean') ) AND NOT ( fn_charcount(top_patterns, E' \| ' ) = 1 AND fn_charcount(top_patterns, E' \| ' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, '|' , 2), 'N' , '' ) > '')) + dq_score_prevalence_formula: |- + {VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) ) /NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + FLOOR(0.95 * max_length::FLOAT) + default_parm_prompts: |- + Maximum String Length at Baseline + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Maximum length expected + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Alpha Truncation tests that the longest text value in a column hasn't become shorter than the defined threshold, initially 95% of the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset. 
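+ # Illustrative example (hypothetical numbers, not part of the seeded metadata): if the
+ # longest value at baseline was 40 characters, the default threshold is FLOOR(0.95 * 40) = 38,
+ # so the generated condition MAX(LENGTH({COLUMN_NAME})) < 38 fails only when every
+ # current value is shorter than 38 characters.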
+ active: Y + cat_test_conditions: + - id: '7001' + test_type: Alpha_Trunc + sql_flavor: bigquery + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6001' + test_type: Alpha_Trunc + sql_flavor: databricks + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3001' + test_type: Alpha_Trunc + sql_flavor: mssql + measure: |- + MAX(LEN({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4001' + test_type: Alpha_Trunc + sql_flavor: postgresql + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '1001' + test_type: Alpha_Trunc + sql_flavor: redshift + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '7001' + test_type: Alpha_Trunc + sql_flavor: redshift_spectrum + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '2001' + test_type: Alpha_Trunc + sql_flavor: snowflake + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5001' + test_type: Alpha_Trunc + sql_flavor: trino + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1364' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, LENGTH(CAST(`{COLUMN_NAME}` AS STRING)) AS current_max_length, {THRESHOLD_VALUE} AS previous_max_length + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, ( + SELECT MAX(LENGTH(CAST(`{COLUMN_NAME}` AS STRING))) AS max_length + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + ) a + WHERE LENGTH(CAST(`{COLUMN_NAME}` AS STRING)) = a.max_length + AND a.max_length < {THRESHOLD_VALUE} + LIMIT 500; + error_type: Test Results + - id: '1298' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + error_type: Test Results + - id: '1140' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ; + error_type: Test Results + - id: '1083' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + error_type: Test Results + - id: '1001' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT 
"{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + error_type: Test Results + - id: '1401' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + error_type: Test Results + - id: '1197' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml new file mode 100644 index 00000000..b5a0aaf6 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml @@ -0,0 +1,161 @@ +test_types: + id: '1005' + test_type: Avg_Shift + test_name_short: Average Shift + test_name_long: Column mean is consistent with reference + test_description: |- + Tests for statistically-significant shift in mean value for column from average calculated at baseline. + except_message: |- + Standardized difference between averages is over the selected threshold level. + measure_uom: Difference Measure + measure_uom_description: |- + Cohen's D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge) + selection_criteria: |- + general_type='N' AND distinct_value_ct > 10 AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%' + dq_score_prevalence_formula: |- + 2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0)) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value_ct,baseline_avg,baseline_sd,threshold_value + default_parm_values: |- + value_ct,avg_value,stdev_value,0.5::VARCHAR + default_parm_prompts: "Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold\ + \ Difference Measure " + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Standardized Difference Measure + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen's D, a statistical technique to identify significant shifts in a value. 
Cohen's D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it's reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. + active: Y + cat_test_conditions: + - id: '7002' + test_type: Avg_Shift + sql_flavor: bigquery + measure: |- + ROUND(ABS((AVG(SAFE_CAST({COLUMN_NAME} AS FLOAT64)) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})-1)*POW(STDDEV({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}-1)*POW({BASELINE_SD},2)) / NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0))),3) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6002' + test_type: Avg_Shift + sql_flavor: databricks + measure: |- + ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV_SAMP({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3002' + test_type: Avg_Shift + sql_flavor: mssql + measure: |- + ABS( (AVG(CAST({COLUMN_NAME} AS FLOAT)) - CAST({BASELINE_AVG} as FLOAT)) / SQRT(((COUNT({COLUMN_NAME})-1)*POWER(STDEV(CAST({COLUMN_NAME} AS FLOAT)), 2) + ({BASELINE_VALUE_CT}-1) * POWER(CAST({BASELINE_SD} as FLOAT), 2)) /NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4002' + test_type: Avg_Shift + sql_flavor: postgresql + measure: |- + ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1002' + test_type: Avg_Shift + sql_flavor: redshift + measure: |- + ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7002' + test_type: Avg_Shift + sql_flavor: redshift_spectrum + measure: |- + ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME})^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2002' + test_type: Avg_Shift + sql_flavor: snowflake + measure: |- + ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV({COLUMN_NAME}::FLOAT),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) )) + test_operator: '>=' + 
test_condition: |- + {THRESHOLD_VALUE} + - id: '5002' + test_type: Avg_Shift + sql_flavor: trino + measure: |- + ABS( (CAST(AVG({COLUMN_NAME} AS REAL)) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*STDDEV({COLUMN_NAME})^2 + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * CAST({BASELINE_SD} AS REAL)^2) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1365' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT AVG(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_average + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1299' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1141' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1084' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1002' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1402' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1198' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml new file mode 100644 index 00000000..bdbbc883 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml @@ -0,0 +1,69 @@ +test_types: + id: '1008' + test_type: CUSTOM + test_name_short: Custom Test + test_name_long: Custom-defined business rule + test_description: |- + Custom SQL Query Test + except_message: |- + Errors were detected according to test definition. + measure_uom: Errors found + measure_uom_description: |- + Count of errors identified by query + selection_criteria: null + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Test Focus + column_name_help: |- + Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another. 
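+ # Illustrative sketch only (hypothetical tables orders and order_lines): a custom query for
+ # an `Order Total Matches Detail` test should return one row per error, so the test passes
+ # when no rows come back, e.g.:
+ #   SELECT o.order_id, o.order_total, SUM(l.line_amount) AS detail_total
+ #   FROM {DATA_SCHEMA}.orders o
+ #   JOIN {DATA_SCHEMA}.order_lines l ON l.order_id = o.order_id
+ #   GROUP BY o.order_id, o.order_total
+ #   HAVING o.order_total <> SUM(l.line_amount)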
+ default_parm_columns: custom_query + default_parm_values: null + default_parm_prompts: |- + Custom SQL Query Returning Error Records + default_parm_help: |- + Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group. + default_severity: Fail + run_type: QUERY + test_scope: custom + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Expected count of errors found by custom query + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow up. + active: Y + cat_test_conditions: [] + target_data_lookups: [] + test_templates: + - id: '2504' + test_type: CUSTOM + sql_flavor: bigquery + template_name: ex_custom_query_generic.sql + - id: '2404' + test_type: CUSTOM + sql_flavor: databricks + template_name: ex_custom_query_generic.sql + - id: '2204' + test_type: CUSTOM + sql_flavor: mssql + template_name: ex_custom_query_generic.sql + - id: '2304' + test_type: CUSTOM + sql_flavor: postgresql + template_name: ex_custom_query_generic.sql + - id: '2004' + test_type: CUSTOM + sql_flavor: redshift + template_name: ex_custom_query_generic.sql + - id: '2504' + test_type: CUSTOM + sql_flavor: redshift_spectrum + template_name: ex_custom_query_generic.sql + - id: '2104' + test_type: CUSTOM + sql_flavor: snowflake + template_name: ex_custom_query_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml new file mode 100644 index 00000000..3630cc0c --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml @@ -0,0 +1,216 @@ +test_types: + id: '1502' + test_type: Combo_Match + test_name_short: Reference Match + test_name_long: Column values or combinations found in reference + test_description: |- + Tests for the presence of one or a set of column values in a reference table + except_message: |- + Column value combinations are not found in reference table values.
+ measure_uom: Missing values + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Categorical Column List + column_name_help: |- + Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous measurements here. Do not use numeric values unless they represent discrete categories. + default_parm_columns: subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition + default_parm_values: null + default_parm_prompts: |- + Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition + default_parm_help: |- + Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of non-matching value combinations + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.
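+ # Illustrative sketch (hypothetical tables sales and ref_products): validating that every
+ # product_code/size combination in the fact table also appears in the reference table reduces
+ # to a set difference, which is the pattern the lookup queries below follow:
+ #   SELECT product_code, size FROM sales GROUP BY product_code, size
+ #   EXCEPT
+ #   SELECT product_code, size FROM ref_products GROUP BY product_code, size
+ # Any row returned is a combination missing from the reference table.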
+ active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1402' + test_id: '1502' + test_type: Combo_Match + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT DISTINCT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1335' + test_id: '1502' + test_type: Combo_Match + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1255' + test_id: '1502' + test_type: Combo_Match + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1256' + test_id: '1502' + test_type: Combo_Match + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1253' + test_id: '1502' + test_type: Combo_Match + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1464' + test_id: '1502' + test_type: Combo_Match + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1254' + test_id: '1502' + test_type: Combo_Match + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + test_templates: + - id: '2501' + test_type: Combo_Match + sql_flavor: bigquery + template_name: ex_data_match_bigquery.sql + - id: '2401' + test_type: Combo_Match + sql_flavor: databricks + template_name: ex_data_match_generic.sql + - id: '2201' + test_type: Combo_Match + sql_flavor: mssql + template_name: ex_data_match_generic.sql + - id: '2301' + test_type: Combo_Match + sql_flavor: postgresql + template_name: ex_data_match_generic.sql + - id: '2001' + test_type: Combo_Match + sql_flavor: redshift + template_name: ex_data_match_generic.sql + - id: '2501' + test_type: Combo_Match + sql_flavor: redshift_spectrum + template_name: ex_data_match_generic.sql + - id: '2101' + test_type: Combo_Match + sql_flavor: snowflake + template_name: ex_data_match_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml new file mode 100644 index 00000000..6f01a0b5 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1006' + test_type: Condition_Flag + test_name_short: Custom Condition + test_name_long: Column values match pre-defined condition + test_description: |- + Tests that each record in the table matches a pre-defined, custom condition + except_message: |- + Value(s) found not matching defined condition. + measure_uom: Values Failing + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Test Focus + column_name_help: |- + Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped. + default_parm_columns: threshold_value,custom_query + default_parm_values: null + default_parm_prompts: |- + Threshold Error Count,Custom SQL Expression (TRUE on error) + default_parm_help: |- + The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table. + default_severity: Fail + run_type: CAT + test_scope: custom + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Count of records that don't meet test condition + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. 
Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test. + active: Y + cat_test_conditions: + - id: '7003' + test_type: Condition_Flag + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6003' + test_type: Condition_Flag + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3003' + test_type: Condition_Flag + sql_flavor: mssql + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4003' + test_type: Condition_Flag + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1003' + test_type: Condition_Flag + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7003' + test_type: Condition_Flag + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2003' + test_type: Condition_Flag + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5003' + test_type: Condition_Flag + sql_flavor: trino + measure: |- + SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1366' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {CUSTOM_QUERY} + LIMIT 500; + error_type: Test Results + - id: '1300' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} LIMIT 500; + error_type: Test Results + - id: '1142' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY}; + error_type: Test Results + - id: '1085' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + error_type: Test Results + - id: '1003' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + error_type: Test Results + - id: '1403' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + error_type: Test Results + - id: '1199' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml 
b/testgen/template/dbsetup_test_types/test_types_Constant.yaml new file mode 100644 index 00000000..b9e5033f --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1007' + test_type: Constant + test_name_short: Constant Match + test_name_long: All column values match constant value + test_description: |- + Tests that all values in the column match the constant value identified in baseline data + except_message: |- + A constant value is expected for this column. + measure_uom: Mismatched values + measure_uom_description: null + selection_criteria: |- + TEMPLATE + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value,threshold_value + default_parm_values: null + default_parm_prompts: |- + Constant Value at Baseline,Threshold Error Count + default_parm_help: |- + The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails. + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Count of records with unexpected values + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column. 
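+ # Illustrative example (hypothetical values): if profiling found the constant 'USD' in a
+ # currency column, the generated condition counts departures as
+ #   SUM(CASE WHEN currency <> 'USD' THEN 1 ELSE 0 END) > {THRESHOLD_VALUE}
+ # so the test fails once mismatched records exceed the acceptable error count.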
+ active: Y + cat_test_conditions: + - id: '7004' + test_type: Constant + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN {COLUMN_NAME} != {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6004' + test_type: Constant + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3004' + test_type: Constant + sql_flavor: mssql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4004' + test_type: Constant + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1004' + test_type: Constant + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7004' + test_type: Constant + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2004' + test_type: Constant + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5004' + test_type: Constant + sql_flavor: trino + measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1367' + test_id: '1007' + test_type: Constant + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1301' + test_id: '1007' + test_type: Constant + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1143' + test_id: '1007' + test_type: Constant + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1086' + test_id: '1007' + test_type: Constant + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1004' + test_id: '1007' + test_type: Constant + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1404' + test_id: '1007' + test_type: Constant + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" 
LIMIT 500; + error_type: Test Results + - id: '1200' + test_id: '1007' + test_type: Constant + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml new file mode 100644 index 00000000..d0d0ca27 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -0,0 +1,234 @@ +test_types: + id: '1009' + test_type: Daily_Record_Ct + test_name_short: Daily Records + test_name_long: All dates present within date range + test_description: |- + Tests for presence of every calendar date within min/max date range, per baseline data + except_message: |- + Not every date value between min and max dates is present, unlike at baseline. + measure_uom: Missing dates + measure_uom_description: null + selection_criteria: |- + general_type= 'D' AND date_days_present > 21 AND date_days_present - (DATEDIFF('day', '1800-01-05'::DATE, max_date) - DATEDIFF('day', '1800-01-05'::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_DAYS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Threshold Missing Calendar Days + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Completeness + health_dimension: Volume + threshold_description: |- + Missing calendar days within min/max range + result_visualization: line_chart + result_visualization_params: null + usage_notes: "Daily Records tests that at least one record is present for every\ + \ day within the minimum and maximum date range for the column. The test is relevant\ + \ for transactional data, where you would expect at least one transaction to be\ + \ recorded each day. A failure here would suggest missing records for the number\ + \ of days identified without data. You can adjust the threshold to accept a number\ + \ of days that you know legitimately have no records. 
" + active: Y + cat_test_conditions: + - id: '7005' + test_type: Daily_Record_Ct + sql_flavor: bigquery + measure: |- + DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), DAY), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), DAY), DAY) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, DAY)) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6005' + test_type: Daily_Record_Ct + sql_flavor: databricks + measure: |- + <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3005' + test_type: Daily_Record_Ct + sql_flavor: mssql + measure: |- + DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4005' + test_type: Daily_Record_Ct + sql_flavor: postgresql + measure: |- + <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1005' + test_type: Daily_Record_Ct + sql_flavor: redshift + measure: |- + DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7005' + test_type: Daily_Record_Ct + sql_flavor: redshift_spectrum + measure: |- + DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2005' + test_type: Daily_Record_Ct + sql_flavor: snowflake + measure: |- + DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5005' + test_type: Daily_Record_Ct + sql_flavor: trino + measure: |- + DATE_DIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1368' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH daterange AS ( + SELECT day AS all_dates + FROM UNNEST( + GENERATE_DATE_ARRAY( + (SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), + (SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) + ) + ) AS day + ), + existing_periods AS ( + SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY period + ), + p AS ( + SELECT d.all_dates AS missing_period, + MAX(b.period) AS prior_available_date, + MIN(c.period) AS next_available_date + FROM daterange d + LEFT JOIN existing_periods a ON d.all_dates = a.period + LEFT JOIN existing_periods b ON b.period < d.all_dates + LEFT JOIN existing_periods c ON c.period > d.all_dates + WHERE a.period IS NULL + AND d.all_dates BETWEEN b.period AND c.period + GROUP BY d.all_dates + ) + SELECT p.missing_period, p.prior_available_date, e.period_count AS prior_available_date_count, p.next_available_date, f.period_count AS next_available_date_count + FROM p + LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) + LEFT JOIN existing_periods f ON (p.next_available_date = f.period) + ORDER BY p.missing_period + LIMIT 500; + error_type: Test Results + - id: '1302' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH date_bounds AS( SELECT 
MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500; + error_type: Test Results + - id: '1144' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH + Pass0 as (select 1 as C union all select 1), --2 rows + Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows + Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows + Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows + Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows + All_Nums as (select row_number() over(order by C) as Number from Pass4), + tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), + + date_range as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, + CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, + DATEDIFF(DAY, + CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), + CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" ), + check_periods as ( SELECT d.min_period, d.max_period, t.number, + DATEADD(DAY, -(t.number - 1), d.max_period) AS check_period + FROM date_range d + INNER JOIN tally t + ON (d.period_ct >= t.number) ), + data_by_period as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + GROUP BY CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) ), + data_by_prd_with_prior_next as (SELECT check_period, + RANK() OVER (ORDER BY check_period DESC) as ranked, + ISNULL(d.record_ct, 0) as record_ct, + ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct, + ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct + FROM check_periods c + LEFT JOIN data_by_period d + ON (c.check_period = d.data_period) ) + SELECT check_period, record_ct, + CASE + WHEN record_ct = 0 THEN 'MISSING' + ELSE 'Present' + END as status + FROM data_by_prd_with_prior_next + WHERE record_ct = 0 + OR last_record_ct = 0 + OR next_record_ct = 0 + ORDER BY check_period DESC; + error_type: Test Results + - id: '1087' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500; + error_type: Test Results + - id: '1005' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500; + error_type: Test Results + - id: '1405' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500; + error_type: Test Results + - id: '1201' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT 
MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml new file mode 100644 index 00000000..b7554ca6 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml @@ -0,0 +1,168 @@ +test_types: + id: '1011' + test_type: Dec_Trunc + test_name_short: Decimal Truncation + test_name_long: Sum of fractional values at or above reference + test_description: |- + Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline + except_message: |- + The sum of fractional values is under baseline, which may indicate decimal truncation + measure_uom: Fractional sum + measure_uom_description: |- + The sum of all decimal values from all data for this column + selection_criteria: |- + fractional_sum > 0 AND functional_table_type LIKE'%cumulative%' + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + ROUND(fractional_sum, 0) + default_parm_prompts: |- + Sum of Fractional Values at Baseline + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Minimum expected sum of all fractional values + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset. 
+ active: Y + cat_test_conditions: + - id: '7006' + test_type: Dec_Trunc + sql_flavor: bigquery + measure: |- + SUM(ROUND(ABS(MOD({COLUMN_NAME}, 1)), 5)) + 1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6006' + test_type: Dec_Trunc + sql_flavor: databricks + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3006' + test_type: Dec_Trunc + sql_flavor: mssql + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4006' + test_type: Dec_Trunc + sql_flavor: postgresql + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '1006' + test_type: Dec_Trunc + sql_flavor: redshift + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '7006' + test_type: Dec_Trunc + sql_flavor: redshift_spectrum + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '2006' + test_type: Dec_Trunc + sql_flavor: snowflake + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5006' + test_type: Dec_Trunc + sql_flavor: trino + measure: |- + SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1369' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT LENGTH(SPLIT(CAST(`{COLUMN_NAME}` AS STRING), '.')[SAFE_OFFSET(1)]) AS decimal_scale, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY decimal_scale + LIMIT 500; + error_type: Test Results + - id: '1303' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale LIMIT 500; + error_type: Test Results + - id: '1145' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH CTE AS ( + SELECT LEN(SUBSTRING(CAST(ABS("{COLUMN_NAME}") % 1 AS VARCHAR), 3, LEN("{COLUMN_NAME}"))) AS decimal_scale + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + ) + SELECT DISTINCT TOP 500 decimal_scale, COUNT(*) AS count + FROM cte GROUP BY decimal_scale ORDER BY COUNT(*) DESC; + error_type: Test Results + - id: '1088' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT 500; + error_type: Test Results + - id: '1006' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500; + error_type: Test Results + - id: '1406' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY 
DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500; + error_type: Test Results + - id: '1202' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml new file mode 100644 index 00000000..e27cdb93 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml @@ -0,0 +1,165 @@ +test_types: + id: '1012' + test_type: Distinct_Date_Ct + test_name_short: Date Count + test_name_long: Count of distinct dates at or above reference + test_description: |- + Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data + except_message: |- + Drop in count of unique dates recorded in column. + measure_uom: Unique dates + measure_uom_description: |- + Count of unique dates in transactional date column + selection_criteria: |- + functional_data_type ILIKE 'Transactional Date%' AND date_days_present > 1 AND functional_table_type ILIKE '%cumulative%' + dq_score_prevalence_formula: |- + (({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/NULLIF({PRO_RECORD_CT}::FLOAT, 0))/NULLIF({PRO_RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value,threshold_value + default_parm_values: |- + date_days_present,date_days_present + default_parm_prompts: |- + Distinct Date Count at Baseline,Min Expected Date Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Timeliness + health_dimension: Recency + threshold_description: |- + Minimum distinct date count expected + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources. 
+ active: Y + cat_test_conditions: + - id: '7007' + test_type: Distinct_Date_Ct + sql_flavor: bigquery + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6007' + test_type: Distinct_Date_Ct + sql_flavor: databricks + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3007' + test_type: Distinct_Date_Ct + sql_flavor: mssql + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4007' + test_type: Distinct_Date_Ct + sql_flavor: postgresql + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '1007' + test_type: Distinct_Date_Ct + sql_flavor: redshift + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '7007' + test_type: Distinct_Date_Ct + sql_flavor: redshift_spectrum + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '2007' + test_type: Distinct_Date_Ct + sql_flavor: snowflake + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5007' + test_type: Distinct_Date_Ct + sql_flavor: trino + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1370' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` IS NOT NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC + LIMIT 500; + error_type: Test Results + - id: '1304' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + error_type: Test Results + - id: '1146' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1089' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + - id: '1007' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + - id: '1407' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + - id: '1203' + test_id: 
'1012' + test_type: Distinct_Date_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml new file mode 100644 index 00000000..d0382cb3 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml @@ -0,0 +1,164 @@ +test_types: + id: '1013' + test_type: Distinct_Value_Ct + test_name_short: Value Count + test_name_long: Count of distinct values has not dropped + test_description: |- + Tests that the count of unique values in the column has not changed from baseline. + except_message: |- + Count of unique values in column has changed from baseline. + measure_uom: Unique Values + measure_uom_description: null + selection_criteria: |- + distinct_value_ct between 2 and 10 AND value_ct > 50 AND functional_data_type IN ('Code', 'Category', 'Attribute', 'Description') AND NOT coalesce(top_freq_values,'') > '' + dq_score_prevalence_formula: |- + ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DISTINCT_VALUE_CT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value_ct,threshold_value + default_parm_values: |- + distinct_value_ct,distinct_value_ct + default_parm_prompts: |- + Distinct Value Count at Baseline,Min Expected Value Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected distinct value count + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment. 
+ active: Y + cat_test_conditions: + - id: '7008' + test_type: Distinct_Value_Ct + sql_flavor: bigquery + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '!=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6008' + test_type: Distinct_Value_Ct + sql_flavor: databricks + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '3008' + test_type: Distinct_Value_Ct + sql_flavor: mssql + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '4008' + test_type: Distinct_Value_Ct + sql_flavor: postgresql + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '1008' + test_type: Distinct_Value_Ct + sql_flavor: redshift + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '7008' + test_type: Distinct_Value_Ct + sql_flavor: redshift_spectrum + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '2008' + test_type: Distinct_Value_Ct + sql_flavor: snowflake + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '5008' + test_type: Distinct_Value_Ct + sql_flavor: trino + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1371' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` IS NOT NULL + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC + LIMIT 500; + error_type: Test Results + - id: '1305' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + error_type: Test Results + - id: '1147' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1090' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + - id: '1008' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + - id: '1408' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + - 
id: '1204' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml new file mode 100644 index 00000000..cd7f6c04 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml @@ -0,0 +1,241 @@ +test_types: + id: '1503' + test_type: Distribution_Shift + test_name_short: Distribution Shift + test_name_long: Probability distribution consistent with reference + test_description: |- + Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test + except_message: |- + Divergence between two distributions exceeds specified threshold. + measure_uom: Divergence level (0-1) + measure_uom_description: |- + Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence) + selection_criteria: null + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '0.75' + column_name_prompt: |- + Categorical Column List + column_name_help: |- + Specify one or more Categorical columns, separated by commas. Do not use continuous mesurements here. Do not use numeric values unless they represent discrete categories. + default_parm_columns: subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition + default_parm_values: null + default_parm_prompts: |- + Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition + default_parm_help: |- + Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL + default_severity: Warning + run_type: QUERY + test_scope: referential + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Expected maximum divergence level between 0 and 1 + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. 
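The Jensen-Shannon divergence described in the Distribution_Shift usage notes is computed in SQL by the ex_relative_entropy_* templates referenced later in this file; as a reference for what the score means, here is a small self-contained Python version (the category proportions are invented for illustration):

```python
import math

def js_divergence(p, q):
    """Jensen-Shannon divergence (base 2) between two aligned probability lists; 0 = identical, 1 = max."""
    m = [(pi + qi) / 2 for pi, qi in zip(p, q)]
    def kl(a, b):
        return sum(ai * math.log2(ai / bi) for ai, bi in zip(a, b) if ai > 0)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

# pct_of_total per category from the reference table and the table under test
old_pct = [0.50, 0.30, 0.20]
new_pct = [0.45, 0.35, 0.20]
print(round(js_divergence(old_pct, new_pct), 4))  # small value -> similar distributions
```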
Both tables must be present to run this test. + active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1403' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH latest_ver AS ( + SELECT {CONCAT_COLUMNS} AS category, + CAST(COUNT(*) AS FLOAT64) / SUM(COUNT(*)) OVER() AS pct_of_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` v1 + WHERE {SUBSET_CONDITION} + GROUP BY {CONCAT_COLUMNS} + ) + SELECT * + FROM latest_ver; + error_type: Test Results + - id: '1336' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + error_type: Test Results + - id: '1259' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + error_type: Test Results + - id: '1260' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + error_type: Test Results + - id: '1257' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + 
COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + error_type: Test Results + - id: '1465' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + error_type: Test Results + - id: '1258' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + error_type: Test Results + test_templates: + - id: '2503' + test_type: Distribution_Shift + sql_flavor: bigquery + template_name: ex_relative_entropy_bigquery.sql + - id: '2403' + test_type: Distribution_Shift + sql_flavor: databricks + template_name: ex_relative_entropy_generic.sql + - id: '2203' + test_type: Distribution_Shift + sql_flavor: mssql + template_name: ex_relative_entropy_mssql.sql + - id: '2303' + test_type: Distribution_Shift + sql_flavor: postgresql + template_name: ex_relative_entropy_generic.sql + - id: '2003' + test_type: Distribution_Shift + sql_flavor: redshift + template_name: ex_relative_entropy_generic.sql + - id: '2503' + test_type: Distribution_Shift + sql_flavor: redshift_spectrum + template_name: ex_relative_entropy_generic.sql + - id: '2103' + test_type: Distribution_Shift + sql_flavor: snowflake + template_name: ex_relative_entropy_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml new file mode 100644 index 00000000..af83785c --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml @@ -0,0 +1,159 @@ +test_types: + id: '1510' + test_type: Dupe_Rows + test_name_short: Duplicate Rows + test_name_long: Rows are not duplicated in table + test_description: |- + Tests for the absence of duplicate rows based on unique combination of column values + 
except_message: |- + Column value combinations are duplicated in the table. + measure_uom: Duplicate records + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + (({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: |- + null + column_name_help: |- + null + default_parm_columns: groupby_names + default_parm_values: null + default_parm_prompts: |- + Columns to Compare + default_parm_help: |- + List of columns in the table that define a duplicate record when the combination of values is repeated on multiple rows + default_severity: Fail + run_type: QUERY + test_scope: table + dq_dimension: Uniqueness + health_dimension: Schema Drift + threshold_description: |- + Expected count of duplicate value combinations + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test verifies that combinations of values are not repeated within the table. By default when auto-generated, the test considers all columns to protect against duplication of entire rows. If you know the minimum columns that should constitute a unique record, such as a set of ID's, you should use those to make the test as sensitive as possible. Alternatively, if you know of columns you can always exclude, such as file_date or refresh_snapshot_id, remove them to tighten the test somewhat. + active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1409' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES}; + error_type: Test Results + - id: '1257' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + error_type: Test Results + - id: '1255' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + error_type: Test Results + - id: '1256' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + error_type: Test Results + - id: '1253' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + error_type: Test Results + - id: '1472' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + error_type: Test Results + - id: '1254' + test_id: '1510' + test_type: 
Dupe_Rows + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + error_type: Test Results + test_templates: + - id: '2511' + test_type: Dupe_Rows + sql_flavor: bigquery + template_name: ex_dupe_rows_generic.sql + - id: '2411' + test_type: Dupe_Rows + sql_flavor: databricks + template_name: ex_dupe_rows_generic.sql + - id: '2211' + test_type: Dupe_Rows + sql_flavor: mssql + template_name: ex_dupe_rows_generic.sql + - id: '2311' + test_type: Dupe_Rows + sql_flavor: postgresql + template_name: ex_dupe_rows_generic.sql + - id: '2011' + test_type: Dupe_Rows + sql_flavor: redshift + template_name: ex_dupe_rows_generic.sql + - id: '2511' + test_type: Dupe_Rows + sql_flavor: redshift_spectrum + template_name: ex_dupe_rows_generic.sql + - id: '2111' + test_type: Dupe_Rows + sql_flavor: snowflake + template_name: ex_dupe_rows_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml new file mode 100644 index 00000000..928f9815 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1014' + test_type: Email_Format + test_name_short: Email Format + test_name_long: Email is correctly formatted + test_description: |- + Tests that non-blank, non-empty email addresses match the standard format + except_message: |- + Invalid email address formats found. + measure_uom: Invalid emails + measure_uom_description: |- + Number of emails that do not match standard format + selection_criteria: |- + std_pattern_match='EMAIL' + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Maximum Invalid Email Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of invalid email addresses + result_visualization: line_chart + result_visualization_params: null + usage_notes: null + active: Y + cat_test_conditions: + - id: '7009' + test_type: Email_Format + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN NOT REGEXP_CONTAINS(CAST({COLUMN_NAME} AS STRING), r'^[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+[.])+[A-Za-z]{2,}$') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6009' + test_type: Email_Format + sql_flavor: databricks + measure: |- + SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3009' + test_type: Email_Format + sql_flavor: mssql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} NOT LIKE '[A-Za-z0-9._''%+-]%@[A-Za-z0-9.-]%.[A-Za-z][A-Za-z]%' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4009' + test_type: Email_Format + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1009' + test_type: Email_Format + 
sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7009' + test_type: Email_Format + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2009' + test_type: Email_Format + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5009' + test_type: Email_Format + sql_flavor: trino + measure: |- + SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') != TRUE THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1372' + test_id: '1014' + test_type: Email_Format + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$') + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1306' + test_id: '1014' + test_type: Email_Format + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1148' + test_id: '1014' + test_type: Email_Format + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1091' + test_id: '1014' + test_type: Email_Format + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1009' + test_id: '1014' + test_type: Email_Format + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1409' + test_id: '1014' + test_type: Email_Format + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1205' + test_id: '1014' + test_type: Email_Format + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500; + 
error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml new file mode 100644 index 00000000..3f02dd24 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml @@ -0,0 +1,162 @@ +test_types: + id: '1015' + test_type: Future_Date + test_name_short: Past Dates + test_name_long: Latest date is prior to test run date + test_description: |- + Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data + except_message: |- + Future date found when absent in baseline data. + measure_uom: Future dates + measure_uom_description: null + selection_criteria: |- + general_type='D'AND future_date_ct = 0 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Maximum Future Date Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Timeliness + health_dimension: Recency + threshold_description: |- + Expected count of future dates + result_visualization: line_chart + result_visualization_params: null + usage_notes: null + active: Y + cat_test_conditions: + - id: '7010' + test_type: Future_Date + sql_flavor: bigquery + measure: |- + SUM(IF({COLUMN_NAME} > CAST(CAST('{RUN_DATE}' AS DATETIME) AS {COLUMN_TYPE}), 1, 0)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6010' + test_type: Future_Date + sql_flavor: databricks + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3010' + test_type: Future_Date + sql_flavor: mssql + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CONVERT(DATE, '{RUN_DATE}') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4010' + test_type: Future_Date + sql_flavor: postgresql + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1010' + test_type: Future_Date + sql_flavor: redshift + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7010' + test_type: Future_Date + sql_flavor: redshift_spectrum + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2010' + test_type: Future_Date + sql_flavor: snowflake + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5010' + test_type: Future_Date + sql_flavor: trino + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST('{RUN_DATE}' AS DATE) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1373' + test_id: '1015' + test_type: Future_Date + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE DATETIME_DIFF(`{COLUMN_NAME}`, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), DAY) > {THRESHOLD_VALUE} + GROUP BY 
`{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1307' + test_id: '1015' + test_type: Future_Date + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1149' + test_id: '1015' + test_type: Future_Date + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1092' + test_id: '1015' + test_type: Future_Date + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1010' + test_id: '1015' + test_type: Future_Date + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1410' + test_id: '1015' + test_type: Future_Date + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1206' + test_id: '1015' + test_type: Future_Date + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml new file mode 100644 index 00000000..0ce7f4a8 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1016' + test_type: Future_Date_1Y + test_name_short: Future Year + test_name_long: Future dates within year of test run date + test_description: |- + Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data + except_message: |- + Future date beyond one-year found when absent in baseline. 
+ measure_uom: Future dates post 1 year + measure_uom_description: null + selection_criteria: |- + general_type='D'AND future_date_ct > 0 AND max_date <='{AS_OF_DATE}'::DATE + INTERVAL'365 DAYS' + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Maximum Post 1-Year Future Date Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Timeliness + health_dimension: Recency + threshold_description: |- + Expected count of future dates beyond one year + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values. + active: Y + cat_test_conditions: + - id: '7011' + test_type: Future_Date_1Y + sql_flavor: bigquery + measure: |- + SUM(IF({COLUMN_NAME} > CAST(DATETIME_ADD(CAST('{RUN_DATE}' AS DATETIME), INTERVAL 1 YEAR) AS {COLUMN_TYPE}), 1, 0)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6011' + test_type: Future_Date_1Y + sql_flavor: databricks + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365)))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3011' + test_type: Future_Date_1Y + sql_flavor: mssql + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{RUN_DATE}')) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4011' + test_type: Future_Date_1Y + sql_flavor: postgresql + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365)))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1011' + test_type: Future_Date_1Y + sql_flavor: redshift + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365)))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7011' + test_type: Future_Date_1Y + sql_flavor: redshift_spectrum + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365)))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2011' + test_type: Future_Date_1Y + sql_flavor: snowflake + measure: |- + SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365)))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5011' + test_type: Future_Date_1Y + sql_flavor: trino + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE('{RUN_DATE}') + interval '365' day ) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1374' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE DATETIME_DIFF(`{COLUMN_NAME}`, DATE_ADD(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), INTERVAL 365 DAY), DAY) > {THRESHOLD_VALUE} + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1308' + test_id: '1016' + test_type: 
Future_Date_1Y + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1150' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1093' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1011' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1411' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1207' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml new file mode 100644 index 00000000..1e4c6259 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1017' + test_type: Incr_Avg_Shift + test_name_short: New Shift + test_name_long: New record mean is consistent with reference + test_description: |- + Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline. + except_message: |- + Significant shift in average of new values vs. 
baseline avg + measure_uom: Z-score of mean shift + measure_uom_description: |- + Absolute Z-score (number of SD's outside mean) of prior avg - incremental avg + selection_criteria: |- + general_type='N' AND distinct_value_ct > 10 AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%' + dq_score_prevalence_formula: |- + {RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value + default_parm_values: |- + value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2 + default_parm_prompts: |- + Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Maximum Z-Score (number of SD's beyond mean) expected + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself. 
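To unpack the incremental Z-score that the Incr_Avg_Shift measures below compute, here is a rough Python restatement (all numbers are invented; TestGen performs this arithmetic inside the flavor-specific SQL expressions):

```python
# Rough sketch, not TestGen code: isolate the mean of the rows added since baseline
# and express its shift as a Z-score against the baseline standard deviation.
baseline_ct, baseline_sum, baseline_avg, baseline_sd = 1000, 50_000.0, 50.0, 5.0

current_ct, current_sum = 1200, 61_800.0           # totals at test time (cumulative table)
new_ct  = current_ct - baseline_ct                 # 200 rows added since baseline
new_avg = (current_sum - baseline_sum) / new_ct    # 59.0: mean of only the new rows
z_score = abs((baseline_avg - new_avg) / baseline_sd)

print(z_score)          # 1.8
print(z_score >= 2)     # False -> within the default threshold of 2 standard deviations
```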
+ active: Y + cat_test_conditions: + - id: '7012' + test_type: Incr_Avg_Shift + sql_flavor: bigquery + measure: |- + COALESCE(ABS(({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME}) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD}), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6012' + test_type: Incr_Avg_Shift + sql_flavor: databricks + measure: |- + COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3012' + test_type: Incr_Avg_Shift + sql_flavor: mssql + measure: |- + COALESCE(ABS( ({BASELINE_AVG} - (SUM(CAST({COLUMN_NAME} AS FLOAT)) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS FLOAT) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4012' + test_type: Incr_Avg_Shift + sql_flavor: postgresql + measure: |- + COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1012' + test_type: Incr_Avg_Shift + sql_flavor: redshift + measure: |- + NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7012' + test_type: Incr_Avg_Shift + sql_flavor: redshift_spectrum + measure: |- + NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2012' + test_type: Incr_Avg_Shift + sql_flavor: snowflake + measure: |- + COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5012' + test_type: Incr_Avg_Shift + sql_flavor: trino + measure: |- + COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1375' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT AVG(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_average, + SUM(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_sum, + NULLIF(CAST(COUNT(`{COLUMN_NAME}`) AS FLOAT64), 0) AS current_value_count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1309' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1151' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_sum, NULLIF(CAST(COUNT("{COLUMN_NAME}") AS FLOAT), 0) as current_value_count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1094' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1012' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1412' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1208' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml new file mode 100644 index 00000000..96b8e33b --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1018' + test_type: LOV_All + test_name_short: Value Match All + test_name_long: List of expected values all present in column + test_description: |- + Tests that all values match a pipe-delimited list of expected values and that all expected values are present + except_message: |- + Column values found don't exactly match the expected list of values + measure_uom: Values found + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: null + default_parm_prompts: |- + List of Expected Values + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + List of values expected, in form ('Val1','Val2) + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once. 
+ active: Y + cat_test_conditions: + - id: '7013' + test_type: LOV_All + sql_flavor: bigquery + measure: |- + STRING_AGG(DISTINCT CAST({COLUMN_NAME} AS STRING), '|' ORDER BY {COLUMN_NAME}) + test_operator: '!=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6013' + test_type: LOV_All + sql_flavor: databricks + measure: |- + STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '3013' + test_type: LOV_All + sql_flavor: mssql + measure: |- + STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '4013' + test_type: LOV_All + sql_flavor: postgresql + measure: |- + STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '1013' + test_type: LOV_All + sql_flavor: redshift + measure: |- + LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '7013' + test_type: LOV_All + sql_flavor: redshift_spectrum + measure: |- + LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '2013' + test_type: LOV_All + sql_flavor: snowflake + measure: |- + LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + - id: '5013' + test_type: LOV_All + sql_flavor: trino + measure: |- + LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1376' + test_id: '1018' + test_type: LOV_All + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT lov + FROM ( + SELECT STRING_AGG(DISTINCT CAST(`{COLUMN_NAME}` AS STRING), '|' ORDER BY `{COLUMN_NAME}`) AS lov + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + ) + WHERE lov <> '{THRESHOLD_VALUE}' + LIMIT 500; + error_type: Test Results + - id: '1310' + test_id: '1018' + test_type: LOV_All + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT 500; + error_type: Test Results + - id: '1152' + test_id: '1018' + test_type: LOV_All + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; + error_type: Test Results + - id: '1095' + test_id: '1018' + test_type: LOV_All + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT 500; + error_type: Test Results + - id: '1013' + test_id: '1018' + test_type: LOV_All + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING 
LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + error_type: Test Results + - id: '1413' + test_id: '1018' + test_type: LOV_All + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + error_type: Test Results + - id: '1209' + test_id: '1018' + test_type: LOV_All + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml new file mode 100644 index 00000000..66567768 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1019' + test_type: LOV_Match + test_name_short: Value Match + test_name_long: All column values present in expected list + test_description: |- + Tests that all values in the column match the list-of-values identified in baseline data. + except_message: |- + Values not matching expected List-of-Values from baseline. + measure_uom: Non-matching records + measure_uom_description: null + selection_criteria: |- + functional_data_type IN ('Boolean', 'Code', 'Category') AND top_freq_values > '' AND distinct_value_ct BETWEEN 2 and 10 AND value_ct > 5 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value,threshold_value + default_parm_values: |- + '(' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, '|' , 2) > '' THEN ',''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, '|' , 2), '''' , '''''' ) ) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 4) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 4), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 6) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 6), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 8) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 8), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 10) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 10), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 12) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 12), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 14) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 14), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 16) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 16), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 18) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 18), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN 
SPLIT_PART(top_freq_values, '|' , 20) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 20), '''' , '''''' )) || '''' ELSE '' END, 2, 999) || ')',0 + default_parm_prompts: |- + List of Expected Values,Threshold Error Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + List of values expected, in form ('Val1','Val2) + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change. + active: Y + cat_test_conditions: + - id: '7014' + test_type: LOV_Match + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6014' + test_type: LOV_Match + sql_flavor: databricks + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3014' + test_type: LOV_Match + sql_flavor: mssql + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4014' + test_type: LOV_Match + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1014' + test_type: LOV_Match + sql_flavor: redshift + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7014' + test_type: LOV_Match + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2014' + test_type: LOV_Match + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5014' + test_type: LOV_Match + sql_flavor: trino + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1377' + test_id: '1019' + test_type: LOV_Match + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1311' + test_id: '1019' + test_type: LOV_Match + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1153' + test_id: '1019' + test_type: LOV_Match + sql_flavor: mssql + lookup_type: 
null + lookup_query: |- + SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ; + error_type: Test Results + - id: '1096' + test_id: '1019' + test_type: LOV_Match + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1014' + test_id: '1019' + test_type: LOV_Match + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1414' + test_id: '1019' + test_type: LOV_Match + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1210' + test_id: '1019' + test_type: LOV_Match + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml new file mode 100644 index 00000000..939dc27b --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1020' + test_type: Min_Date + test_name_short: Minimum Date + test_name_long: All dates on or after set minimum + test_description: |- + Tests that the earliest date referenced in the column is no earlier than baseline data + except_message: |- + The earliest date value found is before the earliest value at baseline. + measure_uom: Dates prior to limit + measure_uom_description: null + selection_criteria: |- + general_type='D'and min_date IS NOT NULL AND distinct_value_ct > 1 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value,threshold_value + default_parm_values: |- + min_date,0 + default_parm_prompts: |- + Minimum Date at Baseline,Threshold Error Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of dates prior to minimum + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It's appropriate where new records are added with more recent dates, but old dates do not change. 
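For orientation, a minimal PostgreSQL sketch of what the Minimum Date check boils down to once TestGen fills in the placeholders; the sample rows and the 2024-01-01 baseline minimum are invented, and the real query is rendered from the flavor-specific conditions that follow.

WITH sample(order_date) AS (
    VALUES (DATE '2023-12-30'), (DATE '2024-01-05'), (DATE '2024-02-10')
)
SELECT SUM(CASE WHEN order_date < DATE '2024-01-01' THEN 1 ELSE 0 END) AS dates_prior_to_limit
FROM sample;
-- Returns 1; with the default threshold error count of 0, the test would fail.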
+ active: Y + cat_test_conditions: + - id: '7015' + test_type: Min_Date + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < SAFE_CAST('{BASELINE_VALUE}' AS {COLUMN_TYPE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6015' + test_type: Min_Date + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3015' + test_type: Min_Date + sql_flavor: mssql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4015' + test_type: Min_Date + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1015' + test_type: Min_Date + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7015' + test_type: Min_Date + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2015' + test_type: Min_Date + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5015' + test_type: Min_Date + sql_flavor: trino + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < CAST('{BASELINE_VALUE}' AS DATE) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1378' + test_id: '1020' + test_type: Min_Date + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS DATE) < CAST(CAST('{BASELINE_VALUE}' AS DATETIME) AS DATE) + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1312' + test_id: '1020' + test_type: Min_Date + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1154' + test_id: '1020' + test_type: Min_Date + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1097' + test_id: '1020' + test_type: Min_Date + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1015' + test_id: '1020' + test_type: Min_Date + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1415' + test_id: '1020' + test_type: Min_Date + sql_flavor: redshift_spectrum + lookup_type: null + 
lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1211' + test_id: '1020' + test_type: Min_Date + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml new file mode 100644 index 00000000..8563d339 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml @@ -0,0 +1,162 @@ +test_types: + id: '1021' + test_type: Min_Val + test_name_short: Minimum Value + test_name_long: All values at or above set minimum + test_description: |- + Tests that the minimum value present in the column is no lower than the minimum value in baseline data + except_message: |- + Minimum column value less than baseline. + measure_uom: Values under limit + measure_uom_description: null + selection_criteria: |- + general_type='N' AND functional_data_type ILIKE 'Measure%' AND min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1)) + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value,threshold_value + default_parm_values: |- + min_value,0 + default_parm_prompts: |- + Minimum Value at Baseline,Threshold Error Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of values under limit + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data. 
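As a rough PostgreSQL illustration of the Minimum Value measure, with invented values and 0 standing in for the {BASELINE_VALUE} captured at profiling time; it mirrors the 1e-6 rounding buffer used by the conditions that follow.

WITH sample(amount) AS (
    VALUES (12.50), (0.00), (-3.20)
)
SELECT SUM(CASE WHEN amount < 0 - 1e-6 THEN 1 ELSE 0 END) AS values_under_limit
FROM sample;
-- Returns 1 (-3.20 falls below the baseline minimum); the default threshold error count is 0.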
+ active: Y + cat_test_conditions: + - id: '7016' + test_type: Min_Val + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6016' + test_type: Min_Val + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3016' + test_type: Min_Val + sql_flavor: mssql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4016' + test_type: Min_Val + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1016' + test_type: Min_Val + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7016' + test_type: Min_Val + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2016' + test_type: Min_Val + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5016' + test_type: Min_Val + sql_flavor: trino + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1379' + test_id: '1021' + test_type: Min_Val + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, (ABS(CAST(`{COLUMN_NAME}` AS NUMERIC)) - ABS({BASELINE_VALUE})) AS difference_from_baseline + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS NUMERIC) < {BASELINE_VALUE} + LIMIT 500; + error_type: Test Results + - id: '1313' + test_id: '1021' + test_type: Min_Val + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500; + error_type: Test Results + - id: '1155' + test_id: '1021' + test_type: Min_Val + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE}; + error_type: Test Results + - id: '1098' + test_id: '1021' + test_type: Min_Val + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + error_type: Test Results + - id: '1016' + test_id: '1021' + test_type: Min_Val + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + error_type: Test Results + - id: '1416' + test_id: '1021' + test_type: Min_Val 
+ sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + error_type: Test Results + - id: '1212' + test_id: '1021' + test_type: Min_Val + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml new file mode 100644 index 00000000..67069e25 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1022' + test_type: Missing_Pct + test_name_short: Percent Missing + test_name_long: Consistent ratio of missing values + test_description: |- + Tests for statistically-significant shift in percentage of missing values in column vs. baseline data + except_message: |- + Significant shift in percent of missing values vs. baseline. + measure_uom: Difference measure + measure_uom_description: |- + Cohen's H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge) + selection_criteria: |- + record_ct <> value_ct + dq_score_prevalence_formula: |- + 2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0)) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_ct,baseline_value_ct,threshold_value + default_parm_values: |- + record_ct,value_ct,2::VARCHAR(10) + default_parm_prompts: |- + Baseline Record Count,Baseline Value Count,Standardized Difference Measure + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Completeness + health_dimension: Data Drift + threshold_description: |- + Expected maximum Cohen's H Difference + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test uses Cohen's H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time. 
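To make the Cohen's H comparison concrete: the measure implemented by the flavor-specific conditions below is |2*asin(sqrt(p_baseline)) - 2*asin(sqrt(p_current))|, where each p is the fraction of populated (non-null) rows. A minimal PostgreSQL sketch with invented counts (baseline: 95 of 100 rows populated; current sample: 8 of 10):

WITH sample(customer_id) AS (
    VALUES ('a'), ('b'), (NULL), ('c'), (NULL), ('d'), ('e'), ('f'), ('g'), ('h')
)
SELECT ABS( 2.0 * ASIN(SQRT(95.0 / 100.0))
          - 2.0 * ASIN(SQRT(COUNT(customer_id)::FLOAT / NULLIF(COUNT(*), 0)::FLOAT)) ) AS cohens_h
FROM sample;
-- Returns roughly 0.48, a small-to-moderate shift on the scale described above,
-- well under the default threshold of 2 that would trigger the Warning.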
+ active: Y + cat_test_conditions: + - id: '7017' + test_type: Missing_Pct + sql_flavor: bigquery + measure: |- + ABS(2.0 * ASIN(SQRT({BASELINE_VALUE_CT} / {BASELINE_CT})) - 2.0 * ASIN(SQRT(COUNT({COLUMN_NAME}) / NULLIF(COUNT(*),0)))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6017' + test_type: Missing_Pct + sql_flavor: databricks + measure: |- + ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3017' + test_type: Missing_Pct + sql_flavor: mssql + measure: |- + ABS( 2.0 * ASIN( SQRT( CAST({BASELINE_VALUE_CT} AS FLOAT) / CAST({BASELINE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT(*), 0) AS FLOAT) )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4017' + test_type: Missing_Pct + sql_flavor: postgresql + measure: |- + ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1017' + test_type: Missing_Pct + sql_flavor: redshift + measure: |- + ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7017' + test_type: Missing_Pct + sql_flavor: redshift_spectrum + measure: |- + ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2017' + test_type: Missing_Pct + sql_flavor: snowflake + measure: |- + ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5017' + test_type: Missing_Pct + sql_flavor: trino + measure: |- + ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) ))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1380' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` IS NULL OR CAST(`{COLUMN_NAME}` AS STRING) = '' + LIMIT 10; + error_type: Test Results + - id: '1314' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT 10; + error_type: Test Results + - id: '1156' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = ''; + error_type: Test Results + - id: '1099' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: 
VARCHAR(255) = '' LIMIT 10; + error_type: Test Results + - id: '1017' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + error_type: Test Results + - id: '1417' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + error_type: Test Results + - id: '1213' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml new file mode 100644 index 00000000..af459f04 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml @@ -0,0 +1,229 @@ +test_types: + id: '1023' + test_type: Monthly_Rec_Ct + test_name_short: Monthly Records + test_name_long: At least one date per month present within date range + test_description: |- + Tests for presence of at least one date per calendar month within min/max date range, per baseline data + except_message: |- + At least one date per month expected in min/max date range. + measure_uom: Missing months + measure_uom_description: |- + Calendar months without date values present + selection_criteria: |- + functional_data_type ILIKE 'Transactional Date%' AND date_days_present > 1 AND functional_table_type ILIKE '%cumulative%' AND date_months_present > 2 AND date_months_present - (datediff( 'MON' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_MONTHS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Threshold Count of Months without Dates + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Completeness + health_dimension: Volume + threshold_description: |- + Expected maximum count of calendar months without dates present + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records. 
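A sketch of the gap count that the conditions below compute: the number of calendar months spanned by the column's min/max range, minus the number of distinct months actually present. This PostgreSQL version uses an invented table and a plain month-index calculation in place of the templated DATEDIFF_MONTH helper:

-- January, February and April 2024 are present; March is missing.
WITH sample(event_date) AS (
    VALUES (DATE '2024-01-15'), (DATE '2024-02-03'), (DATE '2024-04-22')
)
SELECT ( (EXTRACT(YEAR FROM MAX(event_date)) * 12 + EXTRACT(MONTH FROM MAX(event_date)))
       - (EXTRACT(YEAR FROM MIN(event_date)) * 12 + EXTRACT(MONTH FROM MIN(event_date))) + 1 )
       - COUNT(DISTINCT DATE_TRUNC('month', event_date)) AS missing_months
FROM sample;
-- Returns 1; the test fails when missing_months exceeds the threshold (default 0).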
+ active: Y + cat_test_conditions: + - id: '7018' + test_type: Monthly_Rec_Ct + sql_flavor: bigquery + measure: |- + DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), MONTH), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), MONTH), MONTH) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, MONTH)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6018' + test_type: Monthly_Rec_Ct + sql_flavor: databricks + measure: |- + (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3018' + test_type: Monthly_Rec_Ct + sql_flavor: mssql + measure: |- + (MAX(DATEDIFF(month, {COLUMN_NAME}, CAST('{RUN_DATE}'AS DATE))) - MIN(DATEDIFF(month, {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, CAST('{RUN_DATE}'AS DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4018' + test_type: Monthly_Rec_Ct + sql_flavor: postgresql + measure: |- + (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1018' + test_type: Monthly_Rec_Ct + sql_flavor: redshift + measure: |- + (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7018' + test_type: Monthly_Rec_Ct + sql_flavor: redshift_spectrum + measure: |- + (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2018' + test_type: Monthly_Rec_Ct + sql_flavor: snowflake + measure: |- + (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5018' + test_type: Monthly_Rec_Ct + sql_flavor: trino + measure: |- + (MAX(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) - MIN(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1381' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH daterange AS ( + SELECT month AS all_dates + FROM UNNEST( + GENERATE_DATE_ARRAY( + DATE_TRUNC((SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), MONTH), + DATE_TRUNC((SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), MONTH), + INTERVAL 1 MONTH + ) + ) AS month + ), + existing_periods AS ( + SELECT DISTINCT DATE_TRUNC(CAST(`{COLUMN_NAME}` AS DATE), MONTH) AS period, COUNT(1) AS period_count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY period + ), + p AS ( + SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, 
MIN(c.period) AS next_available_month + FROM daterange d + LEFT JOIN existing_periods a ON d.all_dates = a.period + LEFT JOIN existing_periods b ON b.period < d.all_dates + LEFT JOIN existing_periods c ON c.period > d.all_dates + WHERE a.period IS NULL + AND d.all_dates BETWEEN b.period AND c.period + GROUP BY d.all_dates + ) + SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count + FROM p + LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) + LEFT JOIN existing_periods f ON (p.next_available_month = f.period) + ORDER BY p.missing_period; + error_type: Test Results + - id: '1315' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period; + error_type: Test Results + - id: '1157' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH + Pass0 as (select 1 as C union all select 1), --2 rows + Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows + Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows + Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows + Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows + All_Nums as (select row_number() over(order by C) as Number from Pass4), + tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), + + date_range as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, + CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, + DATEDIFF(MONTH, + CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), + CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" ), + check_periods as ( SELECT d.min_period, d.max_period, t.number, + DATEADD(MONTH, -(t.number - 1), d.max_period) AS check_period + FROM date_range d + INNER JOIN tally t + ON (d.period_ct >= t.number) ), + data_by_period as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + GROUP BY CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) ), + 
data_by_prd_with_prior_next as (SELECT check_period, + RANK() OVER (ORDER BY check_period DESC) as ranked, + ISNULL(d.record_ct, 0) as record_ct, + ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct, + ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct + FROM check_periods c + LEFT JOIN data_by_period d + ON (c.check_period = d.data_period) ) + SELECT check_period, record_ct, + CASE + WHEN record_ct = 0 THEN 'MISSING' + ELSE 'Present' + END as status + FROM data_by_prd_with_prior_next + WHERE record_ct = 0 + OR last_record_ct = 0 + OR next_record_ct = 0 + ORDER BY check_period DESC; + error_type: Test Results + - id: '1100' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + error_type: Test Results + - id: '1018' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + error_type: Test Results + - id: '1418' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT 
DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + error_type: Test Results + - id: '1214' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml new file mode 100644 index 00000000..1901fdac --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml @@ -0,0 +1,168 @@ +test_types: + id: '1024' + test_type: Outlier_Pct_Above + test_name_short: Outliers Above + test_name_long: Consistent outlier counts over 2 SD above mean + test_description: |- + Tests that percent of outliers over 2 SD above Mean doesn't exceed threshold + except_message: |- + Percent of outliers exceeding 2 SD above the mean is greater than expected threshold. 
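Before the flavor-specific conditions below, a minimal PostgreSQL sketch of the Outliers Above measure: the fraction of non-null values above the baseline mean plus two baseline standard deviations. The readings, baseline mean (10.0), and baseline SD (1.5) are invented:

WITH sample(reading) AS (
    VALUES (10.0), (11.0), (9.5), (10.5), (25.0)
)
SELECT SUM(CASE WHEN reading > 10.0 + (2.0 * 1.5) THEN 1 ELSE 0 END)::FLOAT
       / NULLIF(COUNT(reading), 0)::FLOAT AS pct_over_limit
FROM sample;
-- One of five readings exceeds the 13.0 upper limit, so the measure is 0.20,
-- above the default 0.05 threshold, and the test would flag it (default severity: Warning).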
+ measure_uom: Pct records over limit + measure_uom_description: null + selection_criteria: |- + functional_data_type = 'Measurement' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE '%latitude%' AND column_name NOT ilike '%longitude%' + dq_score_prevalence_formula: |- + GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_avg,baseline_sd,threshold_value + default_parm_values: |- + avg_value,stdev_value,0.05 + default_parm_prompts: |- + Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Expected maximum pct records over upper 2 SD limit + result_visualization: line_chart + result_visualization_params: null + usage_notes: "This test counts the number of data points that may be considered\ + \ as outliers, determined by whether their value exceeds 2 standard deviations\ + \ above the mean at baseline. Assuming a normal distribution, a small percentage\ + \ (defaulted to 5%) of outliers is expected. The actual number may vary for different\ + \ distributions. The expected threshold reflects the maximum percentage of outliers\ + \ you expect to see. This test uses the baseline mean rather than the mean for\ + \ the latest dataset to capture systemic shift as well as individual outliers. " + active: Y + cat_test_conditions: + - id: '7019' + test_type: Outlier_Pct_Above + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT64) > {BASELINE_AVG} + 2*{BASELINE_SD} THEN 1 ELSE 0 END) / NULLIF(COUNT({COLUMN_NAME}),0) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6019' + test_type: Outlier_Pct_Above + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3019' + test_type: Outlier_Pct_Above + sql_flavor: mssql + measure: |- + CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4019' + test_type: Outlier_Pct_Above + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1019' + test_type: Outlier_Pct_Above + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7019' + test_type: Outlier_Pct_Above + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2019' + test_type: Outlier_Pct_Above + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN 
{COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5019' + test_type: Outlier_Pct_Above + sql_flavor: trino + measure: |- + CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1382' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2 * {BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS FLOAT64) > ({BASELINE_AVG} + (2 * {BASELINE_SD})) + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC; + error_type: Test Results + - id: '1316' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC; + error_type: Test Results + - id: '1158' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS FLOAT) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1101' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1019' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1419' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1215' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + test_templates: [] diff --git 
a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml new file mode 100644 index 00000000..0d9b45cb --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml @@ -0,0 +1,168 @@ +test_types: + id: '1025' + test_type: Outlier_Pct_Below + test_name_short: Outliers Below + test_name_long: Consistent outlier counts under 2 SD below mean + test_description: |- + Tests that percent of outliers over 2 SD below Mean doesn't exceed threshold + except_message: |- + Percent of outliers exceeding 2 SD below the mean is greater than expected threshold. + measure_uom: Pct records under limit + measure_uom_description: null + selection_criteria: |- + functional_data_type = 'Measurement' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE '%latitude%' AND column_name NOT ilike '%longitude%' + dq_score_prevalence_formula: |- + GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_avg,baseline_sd,threshold_value + default_parm_values: |- + avg_value,stdev_value,0.05 + default_parm_prompts: |- + Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Expected maximum pct records over lower 2 SD limit + result_visualization: line_chart + result_visualization_params: null + usage_notes: "This test counts the number of data points that may be considered\ + \ as outliers, determined by whether their value exceeds 2 standard deviations\ + \ below the mean at baseline. Assuming a normal distribution, a small percentage\ + \ (defaulted to 5%) of outliers is expected. The actual number may vary for different\ + \ distributions. The expected threshold reflects the maximum percentage of outliers\ + \ you expect to see. This test uses the baseline mean rather than the mean for\ + \ the latest dataset to capture systemic shift as well as individual outliers. 
" + active: Y + cat_test_conditions: + - id: '7020' + test_type: Outlier_Pct_Below + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT64) < {BASELINE_AVG} - 2*{BASELINE_SD} THEN 1 ELSE 0 END) / NULLIF(COUNT({COLUMN_NAME}),0) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6020' + test_type: Outlier_Pct_Below + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3020' + test_type: Outlier_Pct_Below + sql_flavor: mssql + measure: |- + CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS FLOAT) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4020' + test_type: Outlier_Pct_Below + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1020' + test_type: Outlier_Pct_Below + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7020' + test_type: Outlier_Pct_Below + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2020' + test_type: Outlier_Pct_Below + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5020' + test_type: Outlier_Pct_Below + sql_flavor: trino + measure: |- + CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS REAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1383' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2 * {BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE CAST(`{COLUMN_NAME}` AS FLOAT64) < ({BASELINE_AVG} + (2 * {BASELINE_SD})) + GROUP BY `{COLUMN_NAME}` + ORDER BY `{COLUMN_NAME}` DESC; + error_type: Test Results + - id: '1317' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC; + error_type: Test Results + - id: '1159' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} 
+ (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1102' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1020' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1420' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + - id: '1216' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml new file mode 100644 index 00000000..03f123e6 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml @@ -0,0 +1,162 @@ +test_types: + id: '1026' + test_type: Pattern_Match + test_name_short: Pattern Match + test_name_long: Column values match alpha-numeric pattern + test_description: |- + Tests that all values in the column match the same alpha-numeric pattern identified in baseline data + except_message: |- + Alpha values do not match consistent pattern in baseline. 
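To make the pattern logic concrete: profiling reduces each value to a character pattern, and the default baseline parameter below translates the dominant pattern into a regular expression (A becomes [A-Z], N becomes [0-9], a becomes [a-z]). A minimal PostgreSQL sketch with an invented product_code column and pattern:

WITH sample(product_code) AS (
    VALUES ('AB-1234'), ('CD-5678'), ('bad_code')
)
SELECT COUNT(NULLIF(product_code, ''))
       - SUM(CASE WHEN NULLIF(product_code, '') ~ '[A-Z][A-Z][-][0-9][0-9][0-9][0-9]' THEN 1 ELSE 0 END)
       AS pattern_mismatches
FROM sample;
-- 'bad_code' does not match, so pattern_mismatches = 1; the default threshold error count is 0.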
+ measure_uom: Pattern Mismatches + measure_uom_description: null + selection_criteria: |- + (functional_data_type IN ('Attribute', 'DateTime Stamp', 'Phone') OR functional_data_type ILIKE 'ID%' OR functional_data_type ILIKE 'Period%') AND fn_charcount(top_patterns, E' \| ' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, '|' , 2), 'N' , '' ) > '' AND distinct_value_ct > 10 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value,threshold_value + default_parm_values: |- + TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, ' | ', 2), '([*+\-%_])', '[\1]', 'g'), 'A', '[A-Z]'), 'N', '[0-9]'), 'a', '[a-z]')),0 + default_parm_prompts: |- + Pattern at Baseline,Threshold Error Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of pattern mismatches + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern. + active: Y + cat_test_conditions: + - id: '7021' + test_type: Pattern_Match + sql_flavor: bigquery + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CAST(REGEXP_CONTAINS(CAST(NULLIF({COLUMN_NAME}, '') AS STRING), r'{BASELINE_VALUE}') AS INT64)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6021' + test_type: Pattern_Match + sql_flavor: databricks + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::STRING, ''), '{BASELINE_VALUE}')::BIGINT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3021' + test_type: Pattern_Match + sql_flavor: mssql + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - CAST(SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') LIKE '{BASELINE_VALUE}' THEN 1 ELSE 0 END) AS BIGINT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4021' + test_type: Pattern_Match + sql_flavor: postgresql + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') ~ '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1021' + test_type: Pattern_Match + sql_flavor: redshift + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM((NULLIF({COLUMN_NAME}, '') SIMILAR TO '{BASELINE_VALUE}')::BIGINT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7021' + test_type: Pattern_Match + sql_flavor: redshift_spectrum + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM((NULLIF({COLUMN_NAME}, '') SIMILAR TO '{BASELINE_VALUE}')::BIGINT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2021' + test_type: Pattern_Match + sql_flavor: snowflake + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''), '{BASELINE_VALUE}')::BIGINT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5021' + test_type: Pattern_Match + sql_flavor: trino + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF({COLUMN_NAME}, '') , '{BASELINE_VALUE}') = TRUE 
THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1384' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE NOT REGEXP_CONTAINS(NULLIF(CAST(`{COLUMN_NAME}` AS STRING), ''), r'{BASELINE_VALUE}') + GROUP BY `{COLUMN_NAME}`; + error_type: Test Results + - id: '1318' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}`; + error_type: Test Results + - id: '1160' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1103' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1021' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1421' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1217' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml new file mode 100644 index 00000000..69aedb37 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml @@ -0,0 +1,162 @@ +test_types: + id: '1028' + test_type: Recency + test_name_short: Recency + test_name_long: Latest date within expected range of test date + test_description: |- + Tests that the latest date in column is within a set number of days of the test date + except_message: |- + Most recent date value not within expected days of test date. 
+ measure_uom: Days before test + measure_uom_description: |- + Number of days that most recent date precedes the date of test + selection_criteria: |- + general_type= 'D' AND max_date <= run_date AND NOT column_name IN ( 'filedate' , 'file_date' ) AND NOT functional_data_type IN ('Future Date', 'Schedule Date') AND DATEDIFF( 'DAY' , max_date, run_date) <= 62 + dq_score_prevalence_formula: |- + (ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF('DAY', '{MIN_DATE}', '{MAX_DATE}'))::FLOAT)/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + CASE WHEN DATEDIFF( 'DAY' , max_date, run_date) <= 3 THEN DATEDIFF('DAY', max_date, run_date) + 3 WHEN DATEDIFF('DAY', max_date, run_date) <= 7 then DATEDIFF('DAY', max_date, run_date) + 7 WHEN DATEDIFF( 'DAY' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( 'DAY' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( 'DAY' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( 'DAY' , max_date, run_date)::FLOAT / 30.0) * 30 END + default_parm_prompts: |- + Threshold Maximum Days before Test + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Timeliness + health_dimension: Recency + threshold_description: |- + Expected maximum count of days preceding test date + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. 
+ active: Y + cat_test_conditions: + - id: '7022' + test_type: Recency + sql_flavor: bigquery + measure: |- + CAST((DATETIME_DIFF(DATETIME_TRUNC(CAST(CAST('{RUN_DATE}' AS DATETIME) AS {COLUMN_TYPE}), DAY), DATETIME_TRUNC(MAX({COLUMN_NAME}), DAY), DAY)) AS INT64) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6022' + test_type: Recency + sql_flavor: databricks + measure: |- + <%DATEDIFF_DAY;MAX({COLUMN_NAME});'{RUN_DATE}'::DATE%> + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3022' + test_type: Recency + sql_flavor: mssql + measure: |- + DATEDIFF(day, MAX({COLUMN_NAME}), CAST('{RUN_DATE}'AS DATE)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4022' + test_type: Recency + sql_flavor: postgresql + measure: |- + <%DATEDIFF_DAY;MAX({COLUMN_NAME});'{RUN_DATE}'::DATE%> + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1022' + test_type: Recency + sql_flavor: redshift + measure: |- + DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7022' + test_type: Recency + sql_flavor: redshift_spectrum + measure: |- + DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2022' + test_type: Recency + sql_flavor: snowflake + measure: |- + DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5022' + test_type: Recency + sql_flavor: trino + measure: |- + DATE_DIFF('day', MAX({COLUMN_NAME}), CAST('{RUN_DATE}' AS DATE)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1385' + test_id: '1028' + test_type: Recency + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}) AS test_run_date + FROM (SELECT DATE_TRUNC(MAX(`{COLUMN_NAME}`), DAY) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) + WHERE DATETIME_DIFF(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), col, DAY) > {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1319' + test_id: '1028' + test_type: Recency + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1161' + test_id: '1028' + test_type: Recency + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1104' + test_id: '1028' + test_type: Recency + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1022' + test_id: '1028' + test_type: Recency + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM 
(SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1422' + test_id: '1028' + test_type: Recency + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1218' + test_id: '1028' + test_type: Recency + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml new file mode 100644 index 00000000..1149fbb5 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml @@ -0,0 +1,161 @@ +test_types: + id: '1030' + test_type: Required + test_name_short: Required Entry + test_name_long: Required non-null value present + test_description: |- + Tests that a non-null value is present in each record for the column, consistent with baseline data + except_message: |- + Every record for this column is expected to be filled, but some are missing. + measure_uom: Missing values + measure_uom_description: null + selection_criteria: |- + record_ct = value_ct AND record_ct > 10 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Threshold Missing Value Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Completeness + health_dimension: Schema Drift + threshold_description: |- + Expected count of missing values + result_visualization: line_chart + result_visualization_params: null + usage_notes: null + active: Y + cat_test_conditions: + - id: '7023' + test_type: Required + sql_flavor: bigquery + measure: |- + COUNT(*) - COUNT({COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6023' + test_type: Required + sql_flavor: databricks + measure: |- + COUNT(*) - COUNT( {COLUMN_NAME} ) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3023' + test_type: Required + sql_flavor: mssql + measure: |- + COUNT(*) - COUNT( {COLUMN_NAME} ) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4023' + test_type: Required + sql_flavor: postgresql + measure: |- + COUNT(*) - COUNT({COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1023' + test_type: Required + sql_flavor: redshift + measure: |- + COUNT(*) - COUNT( {COLUMN_NAME} ) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7023' + test_type: Required + sql_flavor: redshift_spectrum + measure: |- + COUNT(*) - COUNT( {COLUMN_NAME} ) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2023' + test_type: Required + sql_flavor: snowflake + measure: |- + COUNT(*) - COUNT( {COLUMN_NAME} ) + test_operator: '>' + 
test_condition: |- + {THRESHOLD_VALUE} + - id: '5023' + test_type: Required + sql_flavor: trino + measure: |- + COUNT(*) - COUNT({COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1386' + test_id: '1030' + test_type: Required + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT * + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` IS NULL + LIMIT 500; + error_type: Test Results + - id: '1320' + test_id: '1030' + test_type: Required + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL LIMIT 500; + error_type: Test Results + - id: '1162' + test_id: '1030' + test_type: Required + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 500 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL; + error_type: Test Results + - id: '1105' + test_id: '1030' + test_type: Required + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + error_type: Test Results + - id: '1023' + test_id: '1030' + test_type: Required + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + error_type: Test Results + - id: '1423' + test_id: '1030' + test_type: Required + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + error_type: Test Results + - id: '1219' + test_id: '1030' + test_type: Required + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml new file mode 100644 index 00000000..776bea6a --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml @@ -0,0 +1,165 @@ +test_types: + id: '1031' + test_type: Row_Ct + test_name_short: Row Count + test_name_long: Number of rows is at or above threshold + test_description: |- + Tests that the count of records has not decreased from the baseline count. + except_message: |- + Row count less than baseline count. + measure_uom: Row count + measure_uom_description: null + selection_criteria: |- + TEMPLATE + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({THRESHOLD_VALUE}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: null + default_parm_prompts: |- + Threshold Minimum Record Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: table + dq_dimension: Completeness + health_dimension: Volume + threshold_description: |- + Expected minimum row count + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Because this tests the row count against a constant minimum threshold, it's appropriate for any dataset, as long as the number of rows doesn't radically change from refresh to refresh. But it's not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset. 
+ active: Y + cat_test_conditions: + - id: '7024' + test_type: Row_Ct + sql_flavor: bigquery + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6024' + test_type: Row_Ct + sql_flavor: databricks + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3024' + test_type: Row_Ct + sql_flavor: mssql + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4024' + test_type: Row_Ct + sql_flavor: postgresql + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '1024' + test_type: Row_Ct + sql_flavor: redshift + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '7024' + test_type: Row_Ct + sql_flavor: redshift_spectrum + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '2024' + test_type: Row_Ct + sql_flavor: snowflake + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5024' + test_type: Row_Ct + sql_flavor: trino + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1387' + test_id: '1031' + test_type: Row_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH cte AS ( + SELECT COUNT(*) AS current_count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + ) + SELECT current_count, + ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) / CAST({THRESHOLD_VALUE} AS FLOAT64), 2)) AS row_count_pct_decrease + FROM cte + WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1321' + test_id: '1031' + test_type: Row_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1163' + test_id: '1031' + test_type: Row_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(CAST(100 * (current_count - {THRESHOLD_VALUE}) AS NUMERIC) / CAST({THRESHOLD_VALUE} AS NUMERIC) ,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1106' + test_id: '1031' + test_type: Row_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1024' + test_id: '1031' + test_type: Row_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1424' + test_id: '1031' + test_type: Row_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + - id: '1220' + test_id: '1031' + test_type: Row_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml new file mode 100644 index 00000000..5b5ab463 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml @@ -0,0 +1,165 @@ +test_types: + id: '1032' + test_type: Row_Ct_Pct + test_name_short: Row Range + test_name_long: Number of rows within percent range of threshold + test_description: |- + Tests that the count of records is within a percentage above or below the baseline count. + except_message: |- + Row Count is outside of threshold percent of baseline count. + measure_uom: Percent of baseline + measure_uom_description: |- + Row count percent above or below baseline + selection_criteria: |- + TEMPLATE + dq_score_prevalence_formula: |- + (100.0 - {RESULT_MEASURE}::FLOAT)/100.0 + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_ct,threshold_value + default_parm_values: null + default_parm_prompts: |- + Baseline Record Count,Threshold Pct Above or Below Baseline + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: table + dq_dimension: Completeness + health_dimension: Volume + threshold_description: |- + Expected percent window below or above baseline + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline. 
+ active: Y + cat_test_conditions: + - id: '7025' + test_type: Row_Ct_Pct + sql_flavor: bigquery + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT}) / {BASELINE_CT}, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6025' + test_type: Row_Ct_Pct + sql_flavor: databricks + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3025' + test_type: Row_Ct_Pct + sql_flavor: mssql + measure: |- + ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT} ) AS FLOAT)/ CAST({BASELINE_CT} AS FLOAT), 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4025' + test_type: Row_Ct_Pct + sql_flavor: postgresql + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::DECIMAL(18,4) / {BASELINE_CT}::DECIMAL(18,4), 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1025' + test_type: Row_Ct_Pct + sql_flavor: redshift + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7025' + test_type: Row_Ct_Pct + sql_flavor: redshift_spectrum + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2025' + test_type: Row_Ct_Pct + sql_flavor: snowflake + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5025' + test_type: Row_Ct_Pct + sql_flavor: trino + measure: |- + ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT}) AS DECIMAL(18,4)) /CAST( {BASELINE_CT} AS DECIMAL(18,4) ), 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1388' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH cte AS ( + SELECT COUNT(*) AS current_count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + ) + SELECT current_count, {BASELINE_CT} AS baseline_count, + ABS(ROUND(100 * (current_count - {BASELINE_CT}) / CAST({BASELINE_CT} AS FLOAT64), 2)) AS row_count_pct_difference + FROM cte; + error_type: Test Results + - id: '1322' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte; + error_type: Test Results + - id: '1164' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(CAST(100 * (current_count - {BASELINE_CT}) AS NUMERIC) / CAST({BASELINE_CT} AS NUMERIC) ,2)) AS row_count_pct_difference FROM cte; + error_type: Test Results + - id: '1107' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte; + error_type: Test 
Results + - id: '1025' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte; + error_type: Test Results + - id: '1425' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte; + error_type: Test Results + - id: '1221' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml new file mode 100644 index 00000000..759e5a34 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml @@ -0,0 +1,165 @@ +test_types: + id: '1033' + test_type: Street_Addr_Pattern + test_name_short: Street Address + test_name_long: Enough street address entries match defined pattern + test_description: |- + Tests for percent of records matching standard street address pattern. + except_message: |- + Percent of values matching standard street address format is under expected threshold. + measure_uom: Percent matches + measure_uom_description: |- + Percent of records that match street address pattern + selection_criteria: |- + (std_pattern_match='STREET_ADDR') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35) + dq_score_prevalence_formula: |- + ({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 75 + default_parm_prompts: |- + Threshold Pct that Match Address Pattern + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected percent of records that match standard street address pattern + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + The street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries. 
+ active: Y + cat_test_conditions: + - id: '7026' + test_type: Street_Addr_Pattern + sql_flavor: bigquery + measure: |- + 100.0 * SUM(CAST(REGEXP_CONTAINS(CAST({COLUMN_NAME} AS STRING), r'^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') AS INT64)) / NULLIF(COUNT({COLUMN_NAME}),0) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6026' + test_type: Street_Addr_Pattern + sql_flavor: databricks + measure: |- + 100.0*SUM((regexp_like({COLUMN_NAME}::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3026' + test_type: Street_Addr_Pattern + sql_flavor: mssql + measure: |- + CAST(100.0*SUM(CASE WHEN UPPER({COLUMN_NAME}) LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', {COLUMN_NAME}) BETWEEN 2 AND 6 THEN 1 ELSE 0 END) as FLOAT) /CAST(COUNT({COLUMN_NAME}) AS FLOAT) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4026' + test_type: Street_Addr_Pattern + sql_flavor: postgresql + measure: |- + 100.0*SUM(CASE WHEN {COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '1026' + test_type: Street_Addr_Pattern + sql_flavor: redshift + measure: |- + 100.0*SUM(({COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '7026' + test_type: Street_Addr_Pattern + sql_flavor: redshift_spectrum + measure: |- + 100.0*SUM(({COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '2026' + test_type: Street_Addr_Pattern + sql_flavor: snowflake + measure: |- + 100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5026' + test_type: Street_Addr_Pattern + sql_flavor: trino + measure: |- + CAST(100.0*SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') = TRUE THEN 1 ELSE 0 END) AS REAL )/ CAST(COUNT({COLUMN_NAME}) AS REAL) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1389' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[0-9]{1,5}[A-Za-z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[A-Za-z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') + GROUP BY `{COLUMN_NAME}` + ORDER BY COUNT(*) DESC + LIMIT 500; + error_type: Test Results + - id: '1323' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, 
'^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + error_type: Test Results + - id: '1165' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + error_type: Test Results + - id: '1108' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1026' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1426' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1222' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml new file mode 100644 index 00000000..9149319b --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml @@ -0,0 +1,69 @@ +test_types: + id: '1511' + test_type: Table_Freshness + test_name_short: Table Freshness + test_name_long: Stale Table Not Updated + test_description: |- + Confirms whether table has been updated based on data fingerprint + except_message: |- + Table has not been updated. 
+ measure_uom: Was Change Detected + measure_uom_description: null + selection_criteria: |- + TEMPLATE + dq_score_prevalence_formula: |- + (({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.0' + column_name_prompt: |- + null + column_name_help: |- + null + default_parm_columns: history_calculation,history_lookback,subset_condition,custom_query + default_parm_values: null + default_parm_prompts: |- + History Aggregate,History Lookback,Record Subset Condition,Fingerprint Expression + default_parm_help: |- + Aggregate calculation to be performed on the N lookback results|Last N tests to use for history aggregate calculation|Condition defining a subset of records in main table|String expression combining key column measures into a distinct representation of table state + default_severity: Log + run_type: QUERY + test_scope: table + dq_dimension: Recency + health_dimension: Recency + threshold_description: |- + Most recent prior table fingerprint + result_visualization: binary_chart + result_visualization_params: '{"legend":{"labels":{"0":"Stale","1":"Updated"}}}' + usage_notes: |- + This test compares the current table fingerprint, calculated signature of column contents, to confirm that the table has been updated. The table fingerprint is derived from a set of values and aggregates from columns most likely to change. This test allows you to track the schedule and frequency of updates and refreshes to the table. + active: Y + cat_test_conditions: [] + target_data_lookups: [] + test_templates: + - id: '2512' + test_type: Table_Freshness + sql_flavor: bigquery + template_name: ex_table_changed_bigquery.sql + - id: '2412' + test_type: Table_Freshness + sql_flavor: databricks + template_name: ex_table_changed_generic.sql + - id: '2212' + test_type: Table_Freshness + sql_flavor: mssql + template_name: ex_table_changed_mssql.sql + - id: '2312' + test_type: Table_Freshness + sql_flavor: postgresql + template_name: ex_table_changed_generic.sql + - id: '2012' + test_type: Table_Freshness + sql_flavor: redshift + template_name: ex_table_changed_generic.sql + - id: '2512' + test_type: Table_Freshness + sql_flavor: redshift_spectrum + template_name: ex_table_changed_generic.sql + - id: '2112' + test_type: Table_Freshness + sql_flavor: snowflake + template_name: ex_table_changed_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml new file mode 100644 index 00000000..dd72a774 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml @@ -0,0 +1,183 @@ +test_types: + id: '1508' + test_type: Timeframe_Combo_Gain + test_name_short: Timeframe No Drops + test_name_long: Latest timeframe has at least all value combinations from prior + period + test_description: |- + Tests that column values in most recent time-window include at least same as prior time window + except_message: |- + Column values in most recent time-window don't include all values in prior window. + measure_uom: Mismatched values + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Categorical Column List + column_name_help: |- + Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. 
Do not use numeric values unless they represent discrete categories. + default_parm_columns: window_date_column,window_days,subset_condition + default_parm_values: null + default_parm_prompts: |- + Date Column for Time Windows,Time Window in Days,Record Subset Condition + default_parm_help: |- + The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Expected count of missing value combinations + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table. + active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1406' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY) + AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT DISTINCT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + GROUP BY {COLUMN_NAME_NO_QUOTES}; + error_type: Test Results + - id: '1263' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + GROUP BY {COLUMN_NAME_NO_QUOTES} + error_type: Test Results + - id: '1264' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT 
MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + error_type: Test Results + - id: '1261' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + error_type: Test Results + - id: '1468' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + error_type: Test Results + - id: '1262' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + error_type: Test Results + test_templates: + - id: '2507' + test_type: Timeframe_Combo_Gain + sql_flavor: bigquery + template_name: ex_window_match_no_drops_bigquery.sql + - id: '2407' + test_type: Timeframe_Combo_Gain + sql_flavor: databricks + template_name: ex_window_match_no_drops_databricks.sql + - id: '2207' + test_type: Timeframe_Combo_Gain + sql_flavor: mssql + template_name: ex_window_match_no_drops_generic.sql + - id: '2307' + test_type: Timeframe_Combo_Gain + sql_flavor: postgresql + template_name: ex_window_match_no_drops_postgresql.sql + - id: '2007' + test_type: Timeframe_Combo_Gain + sql_flavor: redshift + template_name: ex_window_match_no_drops_generic.sql + - id: '2507' + test_type: Timeframe_Combo_Gain + sql_flavor: redshift_spectrum + template_name: 
ex_window_match_no_drops_generic.sql + - id: '2107' + test_type: Timeframe_Combo_Gain + sql_flavor: snowflake + template_name: ex_window_match_no_drops_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml new file mode 100644 index 00000000..af62dff3 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml @@ -0,0 +1,291 @@ +test_types: + id: '1509' + test_type: Timeframe_Combo_Match + test_name_short: Timeframe Match + test_name_long: Column value combinations from latest timeframe same as prior period + test_description: |- + Tests for presence of same column values in most recent time-window vs. prior time window + except_message: |- + Column values don't match in most recent time-windows. + measure_uom: Mismatched values + measure_uom_description: null + selection_criteria: null + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: |- + Categorical Column List + column_name_help: |- + Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories. + default_parm_columns: window_date_column,window_days,subset_condition + default_parm_values: null + default_parm_prompts: |- + Date Column for Time Windows,Time Window in Days,Record Subset Condition + default_parm_help: null + default_severity: Fail + run_type: QUERY + test_scope: referential + dq_dimension: Consistency + health_dimension: Data Drift + threshold_description: |- + Expected count of non-matching value combinations + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table. 
+ active: Y + cat_test_conditions: [] + target_data_lookups: + - id: '1407' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + ( + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + EXCEPT DISTINCT + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY) + AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + ) + UNION ALL + ( + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY) + AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + EXCEPT DISTINCT + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + ); + error_type: Test Results + - id: '1337' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: databricks + lookup_type: null + lookup_query: |2- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + ) + error_type: Test Results + - id: '1267' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: mssql + lookup_type: null + lookup_query: |2- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + 
EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + ) + error_type: Test Results + - id: '1268' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: postgresql + lookup_type: null + lookup_query: |2- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + error_type: Test Results + - id: '1265' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: redshift + lookup_type: null + lookup_query: |2- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE 
{SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + error_type: Test Results + - id: '1469' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |2- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + error_type: Test Results + - id: '1266' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: snowflake + lookup_type: null + lookup_query: |2- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + error_type: Test Results + test_templates: + - id: '2508' + test_type: Timeframe_Combo_Match + sql_flavor: bigquery + 
template_name: ex_window_match_same_bigquery.sql + - id: '2408' + test_type: Timeframe_Combo_Match + sql_flavor: databricks + template_name: ex_window_match_same_databricks.sql + - id: '2208' + test_type: Timeframe_Combo_Match + sql_flavor: mssql + template_name: ex_window_match_same_generic.sql + - id: '2308' + test_type: Timeframe_Combo_Match + sql_flavor: postgresql + template_name: ex_window_match_same_postgresql.sql + - id: '2008' + test_type: Timeframe_Combo_Match + sql_flavor: redshift + template_name: ex_window_match_same_generic.sql + - id: '2508' + test_type: Timeframe_Combo_Match + sql_flavor: redshift_spectrum + template_name: ex_window_match_same_generic.sql + - id: '2108' + test_type: Timeframe_Combo_Match + sql_flavor: snowflake + template_name: ex_window_match_same_generic.sql diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml new file mode 100644 index 00000000..f2d22996 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml @@ -0,0 +1,164 @@ +test_types: + id: '1036' + test_type: US_State + test_name_short: US State + test_name_long: Column value is two-letter US state code + test_description: |- + Tests that the recorded column value is a valid US state. + except_message: |- + Column Value is not a valid US state. + measure_uom: Not US States + measure_uom_description: |- + Values that do not match 2-character US state abbreviations. + selection_criteria: |- + general_type= 'A' AND column_name ILIKE '%state%' AND distinct_value_ct < 70 AND max_length = 2 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Threshold Count not Matching State Abbreviations + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of values that are not US state abbreviations + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.
+ active: Y + cat_test_conditions: + - id: '7027' + test_type: US_State + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6027' + test_type: US_State + sql_flavor: databricks + measure: |- + SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3027' + test_type: US_State + sql_flavor: mssql + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4027' + test_type: US_State + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1027' + test_type: US_State + sql_flavor: redshift + measure: |- + SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7027' + test_type: US_State + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2027' + test_type: US_State + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + 
test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5027' + test_type: US_State + sql_flavor: trino + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1390' + test_id: '1036' + test_type: US_State + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + GROUP BY `{COLUMN_NAME}` + LIMIT 500; + error_type: Test Results + - id: '1324' + test_id: '1036' + test_type: US_State + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT 500; + error_type: Test Results + - id: '1166' + test_id: '1036' + test_type: US_State + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}"; + error_type: Test Results + - id: '1109' + test_id: '1036' + test_type: US_State + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1027' + test_id: '1036' + test_type: US_State + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1427' + test_id: '1036' + test_type: US_State + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + - id: '1223' + test_id: '1036' + test_type: US_State + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml new file mode 100644 index 00000000..c9cc6ca9 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml @@ -0,0 +1,165 @@ +test_types: + id: '1034' + test_type: Unique + test_name_short: Unique Values + test_name_long: Each column value is unique + test_description: |- + Tests that no values for the column are repeated in multiple records. + except_message: |- + Column values should be unique per row. + measure_uom: Duplicate values + measure_uom_description: |- + Count of non-unique values + selection_criteria: |- + record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Threshold Duplicate Value Count + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Uniqueness + health_dimension: Schema Drift + threshold_description: |- + Expected count of duplicate values + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It's also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.
+ active: Y + cat_test_conditions: + - id: '7028' + test_type: Unique + sql_flavor: bigquery + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6028' + test_type: Unique + sql_flavor: databricks + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3028' + test_type: Unique + sql_flavor: mssql + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4028' + test_type: Unique + sql_flavor: postgresql + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1028' + test_type: Unique + sql_flavor: redshift + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7028' + test_type: Unique + sql_flavor: redshift_spectrum + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2028' + test_type: Unique + sql_flavor: snowflake + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5028' + test_type: Unique + sql_flavor: trino + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1391' + test_id: '1034' + test_type: Unique + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + HAVING COUNT(*) > 1 + ORDER BY COUNT(*) DESC + LIMIT 500; + error_type: Test Results + - id: '1325' + test_id: '1034' + test_type: Unique + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500; + error_type: Test Results + - id: '1167' + test_id: '1034' + test_type: Unique + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC; + error_type: Test Results + - id: '1110' + test_id: '1034' + test_type: Unique + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1028' + test_id: '1034' + test_type: Unique + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1428' + test_id: '1034' + test_type: Unique + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1224' + test_id: '1034' + test_type: Unique + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml new file mode 100644 index 00000000..7665c977 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml @@ -0,0 +1,164 @@ +test_types: + id: '1035' + test_type: Unique_Pct + test_name_short: Percent Unique + test_name_long: Consistent ratio of unique values + test_description: |- + Tests for statistically-significant shift in percentage of unique values vs. baseline data. + except_message: |- + Significant shift in percent of unique values vs. baseline. + measure_uom: Difference measure + measure_uom_description: |- + Cohen's H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge) + selection_criteria: |- + distinct_value_ct > 10 AND functional_data_type NOT ILIKE 'Measurement%' + dq_score_prevalence_formula: |- + 2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0)) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_value_ct,baseline_unique_ct,threshold_value + default_parm_values: |- + value_ct,distinct_value_ct,0.5 + default_parm_prompts: |- + Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1) + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Uniqueness + health_dimension: Data Drift + threshold_description: |- + Expected maximum Cohen's H Difference + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen's H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time. 
+ active: Y + cat_test_conditions: + - id: '7029' + test_type: Unique_Pct + sql_flavor: bigquery + measure: |- + ABS(2.0 * ASIN(SQRT({BASELINE_UNIQUE_CT}/{BASELINE_VALUE_CT})) - 2.0 * ASIN(SQRT(COUNT(DISTINCT {COLUMN_NAME}) / NULLIF(COUNT({COLUMN_NAME}),0)))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6029' + test_type: Unique_Pct + sql_flavor: databricks + measure: |- + ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3029' + test_type: Unique_Pct + sql_flavor: mssql + measure: |- + ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS FLOAT) / CAST({BASELINE_VALUE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS FLOAT) )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4029' + test_type: Unique_Pct + sql_flavor: postgresql + measure: |- + ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1029' + test_type: Unique_Pct + sql_flavor: redshift + measure: |- + ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7029' + test_type: Unique_Pct + sql_flavor: redshift_spectrum + measure: |- + ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2029' + test_type: Unique_Pct + sql_flavor: snowflake + measure: |- + ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) ) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5029' + test_type: Unique_Pct + sql_flavor: trino + measure: |- + ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) ))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1392' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY `{COLUMN_NAME}` + ORDER BY COUNT(*) DESC + LIMIT 500; + error_type: Test Results + - id: '1326' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + error_type: Test Results + - id: '1168' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY 
COUNT(*) DESC; + error_type: Test Results + - id: '1111' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1029' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1429' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + - id: '1225' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml new file mode 100644 index 00000000..fdef7072 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml @@ -0,0 +1,168 @@ +test_types: + id: '1043' + test_type: Valid_Characters + test_name_short: Valid Characters + test_name_long: Column contains no invalid characters + test_description: |- + Tests for the presence of non-printing characters, leading spaces, or surrounding quotes. + except_message: |- + Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found. + measure_uom: Invalid records + measure_uom_description: |- + Expected count of values with invalid characters + selection_criteria: |- + general_type = 'A' + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: null + default_parm_help: |- + The acceptable number of records with invalid character values present. + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Threshold Invalid Value Count + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream. 
+ active: N + cat_test_conditions: + - id: '7036' + test_type: Valid_Characters + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN REGEXP_REPLACE({COLUMN_NAME}, r'[\u00A0\u200B\uFEFF\u202F\u2009\u3000\u200C]', 'X') != {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE "'%'" OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6036' + test_type: Valid_Characters + sql_flavor: databricks + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3036' + test_type: Valid_Characters + sql_flavor: mssql + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4036' + test_type: Valid_Characters + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1036' + test_type: Valid_Characters + sql_flavor: redshift + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7036' + test_type: Valid_Characters + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2036' + test_type: Valid_Characters + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5036' + test_type: Valid_Characters + sql_flavor: trino + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1397' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE 
REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'[\u00A0\u200B\uFEFF\u202F\u2001\u3000\u2004\u200C]') + OR CAST(`{COLUMN_NAME}` AS STRING) LIKE ' %' + OR CAST(`{COLUMN_NAME}` AS STRING) LIKE '\''%' + OR CAST(`{COLUMN_NAME}` AS STRING) LIKE '"%' + GROUP BY `{COLUMN_NAME}` + ORDER BY record_ct DESC + LIMIT 20; + error_type: Test Results + - id: '1330' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + error_type: Test Results + - id: '1235' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + error_type: Test Results + - id: '1234' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT 20; + error_type: Test Results + - id: '1233' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + error_type: Test Results + - id: '1459' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + error_type: Test Results + - id: '1236' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml 
b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml new file mode 100644 index 00000000..32e74026 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml @@ -0,0 +1,103 @@ +test_types: + id: '1042' + test_type: Valid_Month + test_name_short: Valid Month + test_name_long: Valid calendar month in expected format + test_description: |- + Tests for the presence of a valid representation of a calendar month consistent with the format at baseline. + except_message: |- + Column values are not a valid representation of a calendar month consistent with the format at baseline. + measure_uom: Invalid months + measure_uom_description: null + selection_criteria: |- + functional_data_type = 'Period Month' + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value,baseline_value + default_parm_values: |- + 0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN '''January'',''February'',''March'',''April'',''May'',''June'',''July'',''August'',''September'',''October'',''November'',''December''' WHEN max_length > 3 AND upper(min_text) = min_text THEN '''JANUARY'',''FEBRUARY'',''MARCH'',''APRIL'',''MAY'',''JUNE'',''JULY'',''AUGUST'',''SEPTEMBER'',''OCTOBER'',''NOVEMBER'',''DECEMBER''' WHEN max_length > 3 AND lower(min_text) = min_text THEN '''january'',''february'',''march'',''april'',''may'',''june'',''july'',''august'',''september'',''october'',''november'',''december''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN '''Jan'',''Feb'',''Mar'',''Apr'',''May'',''Jun'',''Jul'',''Aug'',''Sep'',''Oct'',''Nov'',''Dec''' WHEN max_length = 3 AND upper(min_text) = min_text THEN '''JAN'',''FEB'',''MAR'',''APR'',''MAY'',''JUN'',''JUL'',''AUG'',''SEP'',''OCT'',''NOV'',''DEC''' WHEN max_length = 3 AND lower(min_text) = min_text THEN '''jan'',''feb'',''mar'',''apr'',''may'',''jun'',''jul'',''aug'',''sep'',''oct'',''nov'',''dec''' WHEN max_length = 2 AND min_text = '01' THEN '''01'',''02'',''03'',''04'',''05'',''06'',''07'',''08'',''09'',''10'',''11'',''12''' WHEN max_length = 2 AND min_text = '1' THEN '''1'',''2'',''3'',''4'',''5'',''6'',''7'',''8'',''9'',''10'',''11'',''12''' WHEN min_value = 1 THEN '1,2,3,4,5,6,7,8,9,10,11,12' ELSE 'NULL' END + default_parm_prompts: |- + Threshold Invalid Months,Valid Month List + default_parm_help: |- + The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas. 
+ default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Expected count of invalid months + result_visualization: line_chart + result_visualization_params: null + usage_notes: null + active: N + cat_test_conditions: + - id: '7033' + test_type: Valid_Month + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6033' + test_type: Valid_Month + sql_flavor: databricks + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3033' + test_type: Valid_Month + sql_flavor: mssql + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4033' + test_type: Valid_Month + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1033' + test_type: Valid_Month + sql_flavor: redshift + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7033' + test_type: Valid_Month + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2033' + test_type: Valid_Month + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5033' + test_type: Valid_Month + sql_flavor: trino + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: [] + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml new file mode 100644 index 00000000..6e8929c5 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml @@ -0,0 +1,163 @@ +test_types: + id: '1044' + test_type: Valid_US_Zip + test_name_short: Valid US Zip + test_name_long: Valid USA Postal Codes + test_description: |- + Tests that postal codes match the 5 or 9 digit standard US format + except_message: |- + Invalid US Zip Code formats found. 
+ measure_uom: Invalid Zip Codes + measure_uom_description: |- + Expected count of values with invalid Zip Codes + selection_criteria: |- + functional_data_type = 'Zip' + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: null + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Threshold Invalid Value Count + result_visualization: line_chart + result_visualization_params: null + usage_notes: null + active: Y + cat_test_conditions: + - id: '7034' + test_type: Valid_US_Zip + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN REGEXP_REPLACE({COLUMN_NAME}, r'[0-9]', '9') NOT IN ('99999','999999999','99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6034' + test_type: Valid_US_Zip + sql_flavor: databricks + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3034' + test_type: Valid_US_Zip + sql_flavor: mssql + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4034' + test_type: Valid_US_Zip + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1034' + test_type: Valid_US_Zip + sql_flavor: redshift + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7034' + test_type: Valid_US_Zip + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2034' + test_type: Valid_US_Zip + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5034' + test_type: Valid_US_Zip + sql_flavor: trino + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1398' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') NOT IN ('99999', '999999999', '99999-9999') + GROUP BY `{COLUMN_NAME}` + ORDER BY record_ct DESC + LIMIT 20; + error_type: Test Results + - id: '1331' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS 
record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + error_type: Test Results + - id: '1239' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + - id: '1238' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + error_type: Test Results + - id: '1237' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + - id: '1460' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + - id: '1240' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml new file mode 100644 index 00000000..acba07f0 --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml @@ -0,0 +1,164 @@ +test_types: + id: '1045' + test_type: Valid_US_Zip3 + test_name_short: 'Valid US Zip-3 ' + test_name_long: Valid USA Zip-3 Prefix + test_description: |- + Tests that postal codes match the 3 digit format of a regional prefix. + except_message: |- + Invalid 3-digit US Zip Code regional prefix formats found. 
+ measure_uom: Invalid Zip-3 Prefix + measure_uom_description: |- + Expected count of values with invalid Zip-3 Prefix Codes + selection_criteria: |- + functional_data_type = 'Zip3' + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: null + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Validity + health_dimension: Schema Drift + threshold_description: |- + Threshold Invalid Zip3 Count + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk. + active: Y + cat_test_conditions: + - id: '7035' + test_type: Valid_US_Zip3 + sql_flavor: bigquery + measure: |- + SUM(CASE WHEN REGEXP_REPLACE({COLUMN_NAME}, r'[0-9]', '9') != '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6035' + test_type: Valid_US_Zip3 + sql_flavor: databricks + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3035' + test_type: Valid_US_Zip3 + sql_flavor: mssql + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4035' + test_type: Valid_US_Zip3 + sql_flavor: postgresql + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1035' + test_type: Valid_US_Zip3 + sql_flavor: redshift + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7035' + test_type: Valid_US_Zip3 + sql_flavor: redshift_spectrum + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2035' + test_type: Valid_US_Zip3 + sql_flavor: snowflake + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5035' + test_type: Valid_US_Zip3 + sql_flavor: trino + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1399' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') != '999' + GROUP BY `{COLUMN_NAME}` + ORDER BY record_ct DESC + LIMIT 20; + error_type: Test Results + - id: '1332' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: databricks + 
lookup_type: null + lookup_query: |- + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + error_type: Test Results + - id: '1243' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + - id: '1242' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + error_type: Test Results + - id: '1241' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + - id: '1461' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + - id: '1244' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml new file mode 100644 index 00000000..6f476d0a --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml @@ -0,0 +1,165 @@ +test_types: + id: '1041' + test_type: Variability_Decrease + test_name_short: Variability Decrease + test_name_long: Variability has decreased below threshold + test_description: |- + Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue. + except_message: |- + The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue. 
+ measure_uom: Pct SD shift + measure_uom_description: |- + Percent of baseline Standard Deviation + selection_criteria: |- + general_type = 'N' AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2) + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_sd,threshold_value + default_parm_values: |- + stdev_value, 80 + default_parm_prompts: |- + Std Deviation at Baseline,Expected Minimum Percent + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Expected minimum pct of baseline Standard Deviation (SD) + result_visualization: line_chart + result_visualization_params: null + usage_notes: "This test looks for percent shifts in standard deviation as a measure\ + \ of the stability of a measure over time. A significant change could indicate\ + \ that new values are erroneous, or that the cohort being evaluated is significantly\ + \ different from baseline. A decrease in particular could indicate an improved\ + \ process, better precision in measurement, the elimination of outliers, or a\ + \ more homogeneous cohort. " + active: Y + cat_test_conditions: + - id: '7032' + test_type: Variability_Decrease + sql_flavor: bigquery + measure: |- + 100.0 * STDDEV(CAST({COLUMN_NAME} AS FLOAT64)) / {BASELINE_SD} + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '6032' + test_type: Variability_Decrease + sql_flavor: databricks + measure: |- + 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '3032' + test_type: Variability_Decrease + sql_flavor: mssql + measure: |- + 100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '4032' + test_type: Variability_Decrease + sql_flavor: postgresql + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '1032' + test_type: Variability_Decrease + sql_flavor: redshift + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '7032' + test_type: Variability_Decrease + sql_flavor: redshift_spectrum + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '2032' + test_type: Variability_Decrease + sql_flavor: snowflake + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + - id: '5032' + test_type: Variability_Decrease + sql_flavor: trino + measure: |- + 100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD} + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1395' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT STDDEV_POP(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_standard_deviation + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test 
Results + - id: '1329' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1171' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1114' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1032' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1432' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1228' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml new file mode 100644 index 00000000..ec4a921a --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml @@ -0,0 +1,169 @@ +test_types: + id: '1040' + test_type: Variability_Increase + test_name_short: Variability Increase + test_name_long: Variability has increased above threshold + test_description: |- + Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure. + except_message: |- + The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue. 
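+  # Illustrative arithmetic (hypothetical numbers, not from any real run): with a baseline standard deviation of 5.0
+  # and a current standard deviation of 6.5, the measure is 100 * 6.5 / 5.0 = 130 Pct SD shift, which exceeds the
+  # default threshold of 120 and raises a Warning.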
+ measure_uom: Pct SD shift + measure_uom_description: |- + Percent of baseline Standard Deviation + selection_criteria: |- + general_type = 'N' AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2) + dq_score_prevalence_formula: |- + 1 + dq_score_risk_factor: '0.75' + column_name_prompt: null + column_name_help: null + default_parm_columns: baseline_sd,threshold_value + default_parm_values: |- + stdev_value,120 + default_parm_prompts: |- + Std Deviation at Baseline,Expected Maximum Percent + default_parm_help: null + default_severity: Warning + run_type: CAT + test_scope: column + dq_dimension: Accuracy + health_dimension: Data Drift + threshold_description: |- + Expected maximum pct of baseline Standard Deviation (SD) + result_visualization: line_chart + result_visualization_params: null + usage_notes: "This test looks for percent shifts in standard deviation as a measure\ + \ of the stability of a measure over time. A significant change could indicate\ + \ that new values are erroneous, or that the cohort being evaluated is significantly\ + \ different from baseline. An increase in particular could mark new problems\ + \ in measurement, a more heterogeneous cohort, or that significant outliers have\ + \ been introduced. Consider this test along with Average Shift and New Shift.\ + \ If the average shifts as well, there may be a fundamental shift in the dataset\ + \ or process used to collect the data point. This might suggest a data shift\ + \ that should be noted and assessed by business users. If the average does not\ + \ shift, this may point to a data quality or data collection problem. 
" + active: Y + cat_test_conditions: + - id: '7031' + test_type: Variability_Increase + sql_flavor: bigquery + measure: |- + 100.0 * STDDEV(CAST({COLUMN_NAME} AS FLOAT64)) / {BASELINE_SD} + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6031' + test_type: Variability_Increase + sql_flavor: databricks + measure: |- + 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3031' + test_type: Variability_Increase + sql_flavor: mssql + measure: |- + 100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4031' + test_type: Variability_Increase + sql_flavor: postgresql + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1031' + test_type: Variability_Increase + sql_flavor: redshift + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7031' + test_type: Variability_Increase + sql_flavor: redshift_spectrum + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2031' + test_type: Variability_Increase + sql_flavor: snowflake + measure: |- + 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5031' + test_type: Variability_Increase + sql_flavor: trino + measure: |- + 100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD} + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1394' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + SELECT STDDEV_POP(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_standard_deviation + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1328' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: databricks + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`; + error_type: Test Results + - id: '1170' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: mssql + lookup_type: null + lookup_query: |- + SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1113' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1031' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: redshift + lookup_type: null + lookup_query: |- + SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1431' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + - id: '1227' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + SELECT 
STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml new file mode 100644 index 00000000..c774e4df --- /dev/null +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -0,0 +1,229 @@ +test_types: + id: '1037' + test_type: Weekly_Rec_Ct + test_name_short: Weekly Records + test_name_long: At least one date per week present within date range + test_description: |- + Tests for presence of at least one date per calendar week within min/max date range, per baseline data + except_message: |- + At least one date per week expected in min/max date range. + measure_uom: Missing weeks + measure_uom_description: |- + Calendar weeks without date values present + selection_criteria: |- + functional_data_type ILIKE 'Transactional Date%' AND date_days_present > 1 AND functional_table_type ILIKE '%cumulative%' AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF('week', '1800-01-05'::DATE, max_date) - DATEDIFF('week', '1800-01-05'::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75 + dq_score_prevalence_formula: |- + ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_WEEKS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0) + dq_score_risk_factor: '1.0' + column_name_prompt: null + column_name_help: null + default_parm_columns: threshold_value + default_parm_values: |- + 0 + default_parm_prompts: |- + Threshold Weeks without Dates + default_parm_help: null + default_severity: Fail + run_type: CAT + test_scope: column + dq_dimension: Completeness + health_dimension: Volume + threshold_description: |- + Expected maximum count of calendar weeks without dates present + result_visualization: line_chart + result_visualization_params: null + usage_notes: |- + Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records. 
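+  # Illustrative arithmetic (hypothetical numbers, not from any real run): if the column's min date falls in calendar
+  # week 10 and its max date in week 20, the range spans 20 - 10 + 1 = 11 weeks; with dates present in only 9 distinct
+  # weeks, the measure is 11 - 9 = 2 missing weeks, exceeding the default threshold of 0.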
+ active: Y + cat_test_conditions: + - id: '7030' + test_type: Weekly_Rec_Ct + sql_flavor: bigquery + measure: |- + DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), WEEK), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), WEEK), WEEK) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, WEEK)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '6030' + test_type: Weekly_Rec_Ct + sql_flavor: databricks + measure: |- + CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC('week', {COLUMN_NAME})) AS INT) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '3030' + test_type: Weekly_Rec_Ct + sql_flavor: mssql + measure: |- + MAX(DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME})) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '4030' + test_type: Weekly_Rec_Ct + sql_flavor: postgresql + measure: |- + MAX(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '1030' + test_type: Weekly_Rec_Ct + sql_flavor: redshift + measure: |- + MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '7030' + test_type: Weekly_Rec_Ct + sql_flavor: redshift_spectrum + measure: |- + MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '2030' + test_type: Weekly_Rec_Ct + sql_flavor: snowflake + measure: |- + MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + - id: '5030' + test_type: Weekly_Rec_Ct + sql_flavor: trino + measure: |- + MAX(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) +1 - COUNT(DISTINCT DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} + target_data_lookups: + - id: '1393' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: bigquery + lookup_type: null + lookup_query: |- + WITH daterange AS ( + SELECT week_start AS all_dates + FROM UNNEST( + GENERATE_DATE_ARRAY( + DATE_TRUNC((SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), WEEK), + DATE_TRUNC((SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), WEEK), + INTERVAL 7 DAY + ) + ) AS week_start + ), + existing_periods AS ( + SELECT DISTINCT DATE_TRUNC(CAST(`{COLUMN_NAME}` AS DATE), WEEK) AS period, COUNT(1) AS period_count + FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + GROUP BY period + ), + p AS ( + SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week + FROM daterange d + LEFT JOIN existing_periods a ON d.all_dates = a.period + LEFT JOIN 
existing_periods b ON b.period < d.all_dates + LEFT JOIN existing_periods c ON c.period > d.all_dates + WHERE a.period IS NULL + AND d.all_dates BETWEEN b.period AND c.period + GROUP BY d.all_dates + ) + SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count + FROM p + LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) + LEFT JOIN existing_periods f ON (p.next_available_week = f.period) + ORDER BY p.missing_period; + error_type: Test Results + - id: '1327' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: databricks + lookup_type: null + lookup_query: |- + WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period; + error_type: Test Results + - id: '1169' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: mssql + lookup_type: null + lookup_query: |- + WITH + Pass0 as (select 1 as C union all select 1), --2 rows + Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows + Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows + Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows + Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows + All_Nums as (select row_number() over(order by C) as Number from Pass4), + tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), + + date_range as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, + CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, + DATEDIFF(WEEK, + CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), + CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" ), + check_periods as ( SELECT d.min_period, d.max_period, t.number, + DATEADD(WEEK, -(t.number - 1), d.max_period) AS check_period + FROM date_range d + INNER JOIN tally t + ON (d.period_ct >= t.number) ), + data_by_period as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + GROUP BY CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) ), + data_by_prd_with_prior_next as (SELECT check_period, + RANK() OVER (ORDER BY check_period DESC) as ranked, + ISNULL(d.record_ct, 0) as record_ct, + ISNULL(LAG(d.record_ct) OVER (ORDER BY 
check_period), 0) as last_record_ct, + ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct + FROM check_periods c + LEFT JOIN data_by_period d + ON (c.check_period = d.data_period) ) + SELECT check_period, record_ct, + CASE + WHEN record_ct = 0 THEN 'MISSING' + ELSE 'Present' + END as status + FROM data_by_prd_with_prior_next + WHERE record_ct = 0 + OR last_record_ct = 0 + OR next_record_ct = 0 + ORDER BY check_period DESC; + error_type: Test Results + - id: '1112' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: postgresql + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + error_type: Test Results + - id: '1030' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: redshift + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + error_type: Test Results + - id: '1430' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: redshift_spectrum + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT 
DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + error_type: Test Results + - id: '1226' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: snowflake + lookup_type: null + lookup_query: |- + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period; + error_type: Test Results + test_templates: [] diff --git a/testgen/template/dbupgrade/0148_incremental_upgrade.sql b/testgen/template/dbupgrade/0148_incremental_upgrade.sql index b69d2b1d..20f3c53f 100644 --- a/testgen/template/dbupgrade/0148_incremental_upgrade.sql +++ b/testgen/template/dbupgrade/0148_incremental_upgrade.sql @@ -3,4 +3,3 @@ SET SEARCH_PATH TO {SCHEMA_NAME}; UPDATE test_definitions SET id = gen_random_uuid() WHERE id IS NULL; - \ No newline at end of file diff --git a/testgen/template/dbupgrade/0151_incremental_upgrade.sql b/testgen/template/dbupgrade/0151_incremental_upgrade.sql new file mode 100644 index 00000000..730562e8 --- /dev/null +++ b/testgen/template/dbupgrade/0151_incremental_upgrade.sql @@ -0,0 +1,5 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE target_data_lookups + ADD CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk + PRIMARY KEY (test_id, sql_flavor, error_type); diff --git a/testgen/template/dbupgrade/0152_incremental_upgrade.sql b/testgen/template/dbupgrade/0152_incremental_upgrade.sql new file mode 100644 index 00000000..2184830e --- /dev/null +++ b/testgen/template/dbupgrade/0152_incremental_upgrade.sql @@ -0,0 +1,27 @@ 
+SET SEARCH_PATH TO {SCHEMA_NAME}; + +CREATE OR REPLACE FUNCTION fn_quote_literal_escape(var_value varchar, sql_flavor varchar) RETURNS varchar + LANGUAGE plpgsql +AS +$$ +DECLARE + escaped_value varchar; + lower_case_sql_flavor varchar; +BEGIN + lower_case_sql_flavor := LOWER(sql_flavor); + + IF lower_case_sql_flavor IN ('postgres', 'postgresql') THEN + escaped_value := QUOTE_LITERAL(var_value); + ELSIF lower_case_sql_flavor IN ('redshift', 'redshift_spectrum', 'snowflake') THEN + escaped_value := TRIM(LEADING 'E' FROM QUOTE_LITERAL(var_value)); + ELSIF lower_case_sql_flavor = 'mssql' THEN + escaped_value := '''' || REPLACE(var_value, '''', '''''') || ''''; + ELSIF lower_case_sql_flavor = 'databricks' THEN + escaped_value := '''' || REPLACE(REPLACE(var_value, '\', '\\'), '''', '\''') || ''''; + ELSE + RAISE EXCEPTION 'Invalid sql_flavor name: %', sql_flavor; + END IF; + + RETURN escaped_value; +END; +$$; diff --git a/testgen/template/dbupgrade/0153_incremental_upgrade.sql b/testgen/template/dbupgrade/0153_incremental_upgrade.sql new file mode 100644 index 00000000..91277507 --- /dev/null +++ b/testgen/template/dbupgrade/0153_incremental_upgrade.sql @@ -0,0 +1,3 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE connections ADD COLUMN service_account_key BYTEA; diff --git a/testgen/template/dbupgrade/0154_incremental_upgrade.sql b/testgen/template/dbupgrade/0154_incremental_upgrade.sql new file mode 100644 index 00000000..07eed7ad --- /dev/null +++ b/testgen/template/dbupgrade/0154_incremental_upgrade.sql @@ -0,0 +1,3 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE job_schedules ADD COLUMN active BOOLEAN DEFAULT TRUE; diff --git a/testgen/template/dbupgrade/0155_incremental_upgrade.sql b/testgen/template/dbupgrade/0155_incremental_upgrade.sql new file mode 100644 index 00000000..b7a30946 --- /dev/null +++ b/testgen/template/dbupgrade/0155_incremental_upgrade.sql @@ -0,0 +1,14 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +DROP TABLE data_structure_log; + +CREATE TABLE data_structure_log ( + log_id UUID DEFAULT gen_random_uuid() + CONSTRAINT pk_dsl_id + PRIMARY KEY, + element_id UUID, + change_date TIMESTAMP, + change VARCHAR(10), + old_column_type VARCHAR(50), + new_column_type VARCHAR(50) +); diff --git a/testgen/template/dbupgrade/0156_incremental_upgrade.sql b/testgen/template/dbupgrade/0156_incremental_upgrade.sql new file mode 100644 index 00000000..9031a8c4 --- /dev/null +++ b/testgen/template/dbupgrade/0156_incremental_upgrade.sql @@ -0,0 +1,21 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE data_structure_log RENAME COLUMN old_column_type TO old_data_type; +ALTER TABLE data_structure_log RENAME COLUMN new_column_type TO new_data_type; + +ALTER TABLE stg_data_chars_updates ADD COLUMN db_data_type VARCHAR(50); +ALTER TABLE profile_results ADD COLUMN db_data_type VARCHAR(50); +ALTER TABLE profile_anomaly_results ADD COLUMN db_data_type VARCHAR(50); +ALTER TABLE data_column_chars ADD COLUMN db_data_type VARCHAR(50); + +UPDATE profile_results + SET db_data_type = column_type + WHERE db_data_type IS NULL; + +UPDATE profile_anomaly_results + SET db_data_type = column_type + WHERE db_data_type IS NULL; + +UPDATE data_column_chars + SET db_data_type = column_type + WHERE db_data_type IS NULL; diff --git a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql index 2f821506..b6268c52 100644 --- a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql +++ 
b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql @@ -5,9 +5,35 @@ INSERT INTO working_agg_cat_tests column_names, test_types, test_definition_ids, test_actions, test_descriptions, test_parms, test_measures, test_conditions) + +-- Column types from latest profile_results +WITH column_types AS ( + SELECT pr.table_groups_id, + pr.connection_id, + pr.schema_name, + pr.table_name, + pr.column_name, + pr.column_type + FROM profile_results pr + INNER JOIN ( + SELECT table_groups_id, + connection_id, + schema_name, + table_name, + column_name, + MAX(run_date) AS max_run_date + FROM profile_results + GROUP BY table_groups_id, connection_id, schema_name, table_name, column_name + ) latest + ON pr.table_groups_id = latest.table_groups_id + AND pr.schema_name = latest.schema_name + AND pr.table_name = latest.table_name + AND pr.column_name = latest.column_name + AND pr.run_date = latest.max_run_date +), + -- Test details from each test type -WITH test_detail - AS ( +test_detail AS ( SELECT t.test_suite_id, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{RUN_DATE}'::TIMESTAMP as test_time, @@ -29,9 +55,10 @@ WITH test_detail -- Standard Measure start 'CAST(' || -- Nested parm replacements - part of query, not Python parms - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( + REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( c.measure, - '{COLUMN_NAME}', '{ID_SEPARATOR}' || COALESCE(t.column_name, '') || '{ID_SEPARATOR}'), + '{COLUMN_NAME}', '{QUOTE}' || COALESCE(t.column_name, '') || '{QUOTE}'), + '{COLUMN_TYPE}', COALESCE(ct.column_type, '')), '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), @@ -40,16 +67,17 @@ WITH test_detail '{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ), '{BASELINE_SD}', COALESCE(t.baseline_sd, '') ), '{CUSTOM_QUERY}', COALESCE(t.custom_query, '')), - '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '') ) + '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '')) -- Standard measure end with pipe delimiter - || ' AS VARCHAR(1000) ) {CONCAT_OPERATOR} ''|'' ' as measure, + || ' AS {VARCHAR_TYPE}) {CONCAT_OPERATOR} ''|'' ' as measure, -- Standard CASE for condition starts 'CASE WHEN ' || -- Nested parm replacements - standard - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( + REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( c.measure || c.test_operator || c.test_condition, - '{COLUMN_NAME}', '{ID_SEPARATOR}' || COALESCE(t.column_name, '') || '{ID_SEPARATOR}'), + '{COLUMN_NAME}', '{QUOTE}' || COALESCE(t.column_name, '') || '{QUOTE}'), + '{COLUMN_TYPE}', COALESCE(ct.column_type, '')), '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), @@ -58,20 +86,28 @@ WITH test_detail '{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ), '{BASELINE_SD}', COALESCE(t.baseline_sd, '') ), '{CUSTOM_QUERY}', COALESCE(t.custom_query, '')), - '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '') ) + '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '')) -- Standard case ends || ' THEN ''0,'' ELSE ''1,'' END' as condition FROM test_definitions t INNER JOIN cat_test_conditions c ON (t.test_type = c.test_type AND '{SQL_FLAVOR}' = c.sql_flavor) + INNER JOIN test_suites s + ON t.test_suite_id = s.id + LEFT JOIN column_types 
ct + ON s.table_groups_id = ct.table_groups_id + AND t.schema_name = ct.schema_name + AND t.table_name = ct.table_name + AND t.column_name = ct.column_name WHERE t.test_suite_id = '{TEST_SUITE_ID}' AND t.schema_name = '{SCHEMA_NAME}' AND t.table_name = '{TABLE_NAME}' AND COALESCE(t.test_active, 'Y') = 'Y' ), -test_detail_split - AS ( SELECT test_suite_id, schema_name, table_name, test_time, + +test_detail_split AS ( + SELECT test_suite_id, schema_name, table_name, test_time, column_name, test_type, test_definition_id, test_action, test_description, parms, measure, condition, SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name @@ -79,7 +115,9 @@ test_detail_split FLOOR( SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name ORDER BY t.column_name ROWS UNBOUNDED PRECEDING ) / {MAX_QUERY_CHARS} ) + 1 as query_split_no - FROM test_detail t ) + FROM test_detail t +) + SELECT '{TEST_RUN_ID}' as test_run_id, d.schema_name, d.table_name, d.query_split_no as cat_sequence, diff --git a/testgen/template/exec_cat_tests/ex_cat_test_query.sql b/testgen/template/exec_cat_tests/ex_cat_test_query.sql index c544be2b..3013780c 100644 --- a/testgen/template/exec_cat_tests/ex_cat_test_query.sql +++ b/testgen/template/exec_cat_tests/ex_cat_test_query.sql @@ -4,4 +4,4 @@ SELECT '{TEST_RUN_ID}' as test_run_id, '{CAT_SEQUENCE}' as cat_sequence, {TEST_MEASURES} as measure_results, {TEST_CONDITIONS} as test_results - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql b/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql new file mode 100644 index 00000000..1e3c93ae --- /dev/null +++ b/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql @@ -0,0 +1,27 @@ +SELECT '{PROJECT_CODE}' AS project_code, + CURRENT_TIMESTAMP() AS refresh_timestamp, + c.table_schema, + c.table_name, + c.column_name, + CASE + WHEN LOWER(c.data_type) LIKE 'timestamp%' THEN LOWER(c.data_type) + WHEN LOWER(c.data_type) = 'date' THEN 'date' + WHEN LOWER(c.data_type) = 'bool' THEN 'boolean' + ELSE LOWER(c.data_type) + END AS column_type, + c.data_type AS db_data_type, + NULL AS character_maximum_length, + c.ordinal_position, + CASE + WHEN LOWER(c.data_type) = 'string' THEN 'A' + WHEN LOWER(c.data_type) = 'bool' THEN 'B' + WHEN LOWER(c.data_type) IN ('date', 'datetime', 'timestamp') THEN 'D' + WHEN LOWER(c.data_type) = 'time' THEN 'T' + WHEN LOWER(c.data_type) IN ('int64', 'float64') THEN 'N' + WHEN REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') THEN 'N' + ELSE 'X' + END AS general_type, + REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') AS is_decimal +FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` c +WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} +ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql new file mode 100644 index 00000000..03ccee36 --- /dev/null +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql @@ -0,0 +1,46 @@ +SELECT '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{START_TIME}' AS starttime, + CURRENT_TIMESTAMP AS endtime, + '{SCHEMA_NAME}' AS 
schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + '{SKIP_ERRORS}' AS threshold_value, + {SKIP_ERRORS} AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + NULL as result_signal, + CASE WHEN COUNT(*) > {SKIP_ERRORS} THEN 0 ELSE 1 END AS result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CAST(COUNT(*) AS STRING), + ' error(s) identified, ', + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ELSE 'No errors found.' + END AS result_message, + COUNT(*) AS result_measure, + '{SUBSET_DISPLAY}' AS subset_condition, + NULL AS result_query +FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + + EXCEPT DISTINCT + + SELECT {MATCH_GROUPBY_NAMES} + FROM `{MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}` + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} +) test; diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql new file mode 100644 index 00000000..0aee6ead --- /dev/null +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql @@ -0,0 +1,54 @@ +-- Relative Entropy: measured by Jensen-Shannon Divergence +-- Smoothed and normalized version of KL divergence, +-- with scores between 0 (identical) and 1 (maximally different), +-- when using the base-2 logarithm. Formula is: +-- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m) +-- Log base 2 of x = LN(x)/LN(2) +WITH latest_ver AS ( + SELECT {CONCAT_COLUMNS} AS category, + CAST(COUNT(*) AS FLOAT64) / CAST(SUM(COUNT(*)) OVER () AS FLOAT64) AS pct_of_total + FROM `{SCHEMA_NAME}.{TABLE_NAME}` v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} +), +older_ver AS ( + SELECT {CONCAT_MATCH_GROUPBY} AS category, + CAST(COUNT(*) AS FLOAT64) / CAST(SUM(COUNT(*)) OVER () AS FLOAT64) AS pct_of_total + FROM `{MATCH_SCHEMA_NAME}.{TABLE_NAME}` v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} +), +dataset AS ( + SELECT COALESCE(l.category, o.category) AS category, + COALESCE(o.pct_of_total, 0.0000001) AS old_pct, + COALESCE(l.pct_of_total, 0.0000001) AS new_pct, + (COALESCE(o.pct_of_total, 0.0000001) + COALESCE(l.pct_of_total, 0.0000001)) / 2.0 AS avg_pct + FROM latest_ver l + FULL JOIN older_ver o + ON l.category = o.category +) +SELECT '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{START_TIME}' AS starttime, + CURRENT_TIMESTAMP AS endtime, + '{SCHEMA_NAME}' AS schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + -- '{GROUPBY_NAMES}' as column_names, + '{THRESHOLD_VALUE}' AS threshold_value, + NULL AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + NULL as result_signal, + CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END AS result_code, + CONCAT('Divergence Level: ', CAST(js_divergence AS STRING), ', Threshold: {THRESHOLD_VALUE}.') AS result_message, + js_divergence AS result_measure, + '{SUBSET_DISPLAY}' AS subset_condition, + NULL AS result_query +FROM ( + SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) AS js_divergence + FROM dataset +) rslt; diff 
--git a/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql new file mode 100644 index 00000000..70d97b32 --- /dev/null +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql @@ -0,0 +1,30 @@ +SELECT '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{START_TIME}' AS starttime, + CURRENT_TIMESTAMP AS endtime, + '{SCHEMA_NAME}' AS schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + '{SKIP_ERRORS}' AS threshold_value, + {SKIP_ERRORS} AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + fingerprint AS result_signal, + /* Fails if table is the same */ + CASE WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 END AS result_code, + CASE + WHEN fingerprint = '{BASELINE_VALUE}' THEN 'No table change detected.' + ELSE 'Table change detected.' + END AS result_message, + CASE + WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 + END AS result_measure, + '{SUBSET_DISPLAY}' AS subset_condition, + NULL AS result_query +FROM ( + SELECT {CUSTOM_QUERY} AS fingerprint + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} +) test; diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql new file mode 100644 index 00000000..5ba04cfd --- /dev/null +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql @@ -0,0 +1,44 @@ +SELECT + '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{START_TIME}' AS starttime, + CURRENT_TIMESTAMP AS endtime, + '{SCHEMA_NAME}' AS schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + '{SKIP_ERRORS}' AS threshold_value, + {SKIP_ERRORS} AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + NULL as result_signal, + CASE WHEN COUNT(*) > {SKIP_ERRORS} THEN 0 ELSE 1 END AS result_code, + CASE + WHEN COUNT(*) > 0 THEN CONCAT( + CAST(COUNT(*) AS STRING), ' error(s) identified, ', + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) AS result_measure, + '{SUBSET_DISPLAY}' AS subset_condition, + NULL AS result_query + FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY) + AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT DISTINCT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + GROUP BY {COLUMN_NAME_NO_QUOTES} + ) test; diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql new file mode 100644 index 00000000..c16c158e --- /dev/null +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql @@ -0,0 +1,78 @@ +SELECT '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{START_TIME}' AS starttime, + CURRENT_TIMESTAMP AS endtime, + '{SCHEMA_NAME}' AS schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + '{SKIP_ERRORS}' AS threshold_value, + {SKIP_ERRORS} AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + NULL as result_signal, + CASE WHEN COUNT(*) > {SKIP_ERRORS} THEN 0 ELSE 1 END AS result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CAST(COUNT(*) AS STRING), + ' error(s) identified, ', + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) AS result_measure, + '{SUBSET_DISPLAY}' AS subset_condition, + NULL AS result_query +FROM ( + -- Values in the prior timeframe but not in the latest + ( + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_ADD( + (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), + INTERVAL -{WINDOW_DAYS} DAY + ) + EXCEPT DISTINCT + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_ADD( + (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), + INTERVAL -2 * {WINDOW_DAYS} DAY + ) + AND {WINDOW_DATE_COLUMN} < DATE_ADD( + (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), + INTERVAL -{WINDOW_DAYS} DAY + ) + ) + UNION ALL + -- Values in the latest timeframe but not in the prior + ( + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_ADD( + (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), + INTERVAL -2 * {WINDOW_DAYS} DAY + ) + AND {WINDOW_DATE_COLUMN} < DATE_ADD( + (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), + INTERVAL -{WINDOW_DAYS} DAY + ) + EXCEPT DISTINCT + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME} + FROM `{SCHEMA_NAME}.{TABLE_NAME}` + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= DATE_ADD( + (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), + INTERVAL -{WINDOW_DAYS} DAY + ) + ) +) test; diff --git a/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql new file mode 100644 index 00000000..da6811be --- /dev/null +++ b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql @@ -0,0 +1,161 @@ +INSERT INTO test_definitions (table_groups_id, profile_run_id, test_type, test_suite_id, + schema_name, table_name, + skip_errors, test_active, last_auto_gen_date, profiling_as_of_date, + lock_refresh, history_calculation, history_lookback, custom_query ) +WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date + FROM profile_results p + INNER JOIN profiling_runs r + ON (p.profile_run_id = r.id) + INNER JOIN test_suites ts + ON p.project_code = ts.project_code + AND p.connection_id = ts.connection_id + WHERE p.project_code = '{PROJECT_CODE}' + AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID + AND ts.id = '{TEST_SUITE_ID}' + AND p.run_date::DATE <= '{AS_OF_DATE}' + GROUP BY r.table_groups_id), +curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct + FROM last_run lr + INNER JOIN profile_results p + ON (lr.table_groups_id = p.table_groups_id + AND lr.last_run_date = p.run_date) ), +locked AS (SELECT schema_name, table_name + FROM test_definitions + WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID + AND test_suite_id = '{TEST_SUITE_ID}' + AND test_type = 'Table_Freshness' + AND lock_refresh = 'Y'), +-- IDs - TOP 2 +id_cols + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + distinct_value_ct, + ROW_NUMBER() OVER (PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN functional_data_type 
ILIKE 'ID-Unique%' THEN 1 + WHEN functional_data_type = 'ID-Secondary' THEN 2 + ELSE 3 + END, distinct_value_ct DESC, column_name) AS rank + FROM curprof + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'ID%'), +-- Process Date - TOP 1 +process_date_cols + AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + distinct_value_ct, + ROW_NUMBER() OVER (PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN column_name ILIKE '%mod%' THEN 1 + WHEN column_name ILIKE '%up%' THEN 1 + WHEN column_name ILIKE '%cr%' THEN 2 + WHEN column_name ILIKE '%in%' THEN 2 + END , distinct_value_ct DESC, column_name) AS rank + FROM curprof + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'process%'), +-- Transaction Date - TOP 1 +tran_date_cols + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + distinct_value_ct, + ROW_NUMBER() OVER (PARTITION BY schema_name, table_name + ORDER BY + distinct_value_ct DESC, column_name) AS rank + FROM curprof + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'transactional date%' + OR functional_data_type ILIKE 'period%' + OR functional_data_type = 'timestamp' ), + +-- Numeric Measures +numeric_cols + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, +/* + -- Subscores + distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score, + (max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS range_score, + LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2)) AS nontriviality_score, + stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS variability_score, + 1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)) AS null_penalty, +*/ + -- Weighted score + ( + 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + + 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + + 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) + ) AS change_detection_score + FROM curprof + WHERE general_type = 'N' + AND (functional_data_type ILIKE 'Measure%' OR functional_data_type IN ('Sequence', 'Constant')) + ), +numeric_cols_ranked + AS ( SELECT *, + ROW_NUMBER() OVER (PARTITION BY schema_name, table_name + ORDER BY change_detection_score DESC, column_name) as rank + FROM numeric_cols + WHERE change_detection_score IS NOT NULL), +combined + AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order + FROM id_cols + WHERE rank <= 2 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order + FROM process_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order + FROM tran_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order + FROM numeric_cols_ranked + WHERE rank = 1 ), +newtests AS ( + SELECT profile_run_id, schema_name, table_name, + 'CAST(COUNT(*) AS STRING) || "|" || ' || + STRING_AGG( + REPLACE( + CASE + WHEN general_type = 'D' THEN + 
'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(COUNT(DISTINCT @@@) AS STRING)' + WHEN general_type = 'A' THEN + 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(COUNT(DISTINCT @@@) AS STRING) || "|" || CAST(SUM(LENGTH(@@@)) AS STRING)' + WHEN general_type = 'N' THEN + 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(SUM(@@@) AS STRING) || "|" || CAST(ROUND(AVG(@@@), 5) AS STRING) || "|" || CAST(ROUND(STDDEV(CAST(@@@ AS FLOAT64)), 5) AS STRING)' + END, + '@@@', '`' || column_name || '`'), + ' || "|" || ' + ORDER BY element_type, fingerprint_order, column_name + ) as fingerprint + FROM combined + GROUP BY profile_run_id, schema_name, table_name +) +SELECT '{TABLE_GROUPS_ID}'::UUID as table_groups_id, + n.profile_run_id, + 'Table_Freshness' AS test_type, + '{TEST_SUITE_ID}' AS test_suite_id, + n.schema_name, n.table_name, + 0 as skip_errors, 'Y' as test_active, + + '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, + '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date, + 'N' as lock_refresh, + 'Value' as history_calculation, + 1 as history_lookback, + fingerprint as custom_query +FROM newtests n +INNER JOIN test_types t + ON ('Table_Freshness' = t.test_type + AND 'Y' = t.active) +LEFT JOIN generation_sets s + ON (t.test_type = s.test_type + AND '{GENERATION_SET}' = s.generation_set) +LEFT JOIN locked l + ON (n.schema_name = l.schema_name + AND n.table_name = l.table_name) +WHERE (s.generation_set IS NOT NULL + OR '{GENERATION_SET}' = '') + AND l.schema_name IS NULL; diff --git a/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql new file mode 100644 index 00000000..4fdfcc6e --- /dev/null +++ b/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql @@ -0,0 +1,30 @@ +WITH stats AS ( + SELECT + COUNT(*) * 1.0 AS record_ct, + ROUND(CAST({PROFILE_SAMPLE_PERCENT} AS FLOAT64) * COUNT(*) * 1.0 / 100.0) AS calc_sample_ct, + CAST({PROFILE_SAMPLE_MIN_COUNT} AS FLOAT64) AS min_sample_ct, + CAST(999000 AS FLOAT64) AS max_sample_ct + FROM `{SAMPLING_TABLE}` +) +SELECT '{SAMPLING_TABLE}' AS schema_table, + CASE + WHEN record_ct <= min_sample_ct THEN -1 + WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct + WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct + ELSE {PROFILE_SAMPLE_MIN_COUNT} + END AS sample_count, + CASE + WHEN record_ct <= min_sample_ct THEN 1 + WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct + WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct + ELSE record_ct / min_sample_ct + END AS sample_ratio, + ROUND( + CASE + WHEN record_ct <= min_sample_ct THEN 100 + WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct + WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct + ELSE 100.0 * min_sample_ct / record_ct + END, + 4) AS sample_percent_calc +FROM stats; diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml new file mode 100644 index 00000000..5d0456af --- /dev/null +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml @@ -0,0 +1,273 @@ +--- +strTemplate01_sampling: | + WITH target_table AS ( + SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` WHERE RAND() * 100 < {SAMPLE_PERCENT_CALC} + ) + SELECT +strTemplate01_else: | + 
WITH target_table AS ( + SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` + ) + SELECT +strTemplate01_5: | + {CONNECTION_ID} as connection_id, + '{PROJECT_CODE}' as project_code, + '{TABLE_GROUPS_ID}' as table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT(`{COL_NAME}`) AS value_ct, + COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, + SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, +strTemplate02_else: | + COUNT(`{COL_NAME}`) AS value_ct, + COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, + SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, +strTemplate03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, + MAX(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS max_length, + AVG(NULLIF(LENGTH(CAST(`{COL_NAME}` AS STRING)), 0)) AS avg_length, +strTemplate03_else: NULL as min_length, + NULL as max_length, + NULL as avg_length, +strTemplate04_A: SUM( + CASE + WHEN REGEXP_CONTAINS(TRIM(CAST(`{COL_NAME}` AS STRING)), r'^0(\.0*)?$') THEN 1 + ELSE 0 + END + ) AS zero_value_ct, +strTemplate04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, +strTemplate04_else: NULL as zero_value_ct, +strTemplate05_A: | + COUNT( + DISTINCT UPPER( + REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r"[ '\.,-]", "") + ) + ) as distinct_std_value_ct, + SUM(CASE WHEN `{COL_NAME}` = '' THEN 1 ELSE 0 END) AS zero_length_ct, + SUM(CASE WHEN `{COL_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) AS lead_space_ct, + SUM( + CASE + WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE '"%"' + OR LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE "'%'" THEN 1 + ELSE 0 + END + ) AS quoted_value_ct, + SUM( + CASE + WHEN REGEXP_CONTAINS(CAST(`{COL_NAME}` AS STRING), r'.*[0-9].*') THEN 1 + ELSE 0 + END + ) AS includes_digit_ct, + SUM( + CASE + WHEN CAST(`{COL_NAME}` AS STRING) IN ('.', '?', ' ') THEN 1 + WHEN REGEXP_CONTAINS(LOWER(CAST(`{COL_NAME}` AS STRING)), r'^\s*[-09xz]{2,}\s*$') THEN 1 + WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + ELSE 0 + END + ) AS filled_value_ct, + LEFT(MIN(NULLIF(`{COL_NAME}`, '')), 100) AS min_text, + LEFT(MAX(NULLIF(`{COL_NAME}`, '')), 100) AS max_text, + SUM( CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` <> LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS upper_case_ct, + SUM( CASE WHEN `{COL_NAME}` = LOWER(`{COL_NAME}`) AND `{COL_NAME}` <> UPPER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS lower_case_ct, + SUM( CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` = LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS non_alpha_ct, + COUNTIF( + TRANSLATE( + CAST(`{COL_NAME}` AS STRING), + CODE_POINTS_TO_STRING([160, 8201, 8203, 8204, 8205, 8206, 8207, 8239, 12288, 65279]), + REPEAT('X', 10) + ) <> CAST(`{COL_NAME}` AS STRING) + ) as non_printing_ct, + SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct, + CASE + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, 
r'^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'STREET_ADDR' + WHEN SAFE_DIVIDE(SUM(CASE WHEN `{COL_NAME}` IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'STATE_USA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'PHONE_USA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'EMAIL' + WHEN SAFE_DIVIDE(SUM(CASE WHEN TRANSLATE(`{COL_NAME}`, '012345678', '999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'ZIP_USA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[\w\s\-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'FILE_NAME' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([0-9]{4}[- ]){3}[0-9]{4}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'CREDIT_CARD' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') + AND NOT REGEXP_CONTAINS(`{COL_NAME}`, r'\s(and|but|or|yet)\s') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'DELIMITED_DATA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$') + AND CAST(SUBSTR(`{COL_NAME}`, 1, 3) AS INT64) NOT BETWEEN 734 AND 749 + AND SUBSTR(`{COL_NAME}`, 1, 3) <> '666' + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'SSN' + END AS std_pattern_match, +strTemplate05_else: NULL as distinct_std_value_ct, + NULL as zero_length_ct, + NULL as lead_space_ct, + NULL as quoted_value_ct, + NULL as includes_digit_ct, + NULL as filled_value_ct, + NULL as min_text, + NULL as max_text, + NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, + NULL as non_printing_ct, + NULL as numeric_ct, + NULL as date_ct, + NULL as std_pattern_match, +strTemplate06_A_patterns: | + ( + SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_patterns + FROM ( + SELECT CONCAT(CAST(ct AS STRING), ' | ', pattern) AS val, + ct + FROM ( + SELECT pattern, + COUNT(*) AS ct + FROM ( + SELECT REGEXP_REPLACE( + REGEXP_REPLACE( + REGEXP_REPLACE(CAST({COL_NAME} AS STRING), r'[a-z]', 'a'), + r'[A-Z]', 'A'), + r'[0-9]', 'N') AS pattern + FROM `target_table` + WHERE {COL_NAME} > ' ' + AND ( + SELECT MAX(LENGTH(CAST({COL_NAME} AS STRING))) + FROM `target_table` + ) BETWEEN 3 AND {PARM_MAX_PATTERN_LENGTH} + ) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY ct DESC + LIMIT 5 + ) + ) ps + ) as top_patterns, +strTemplate06_else: NULL as top_patterns, +strTemplate07_A_freq: | + ( + SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_freq_values + FROM ( + SELECT CONCAT(CAST(ct AS STRING), ' | ', CAST({COL_NAME} AS STRING)) AS val, + ct + FROM ( + SELECT {COL_NAME}, + COUNT(*) AS ct + FROM `target_table` + WHERE {COL_NAME} > ' ' + GROUP BY {COL_NAME} + HAVING {COL_NAME} > ' ' + ORDER BY ct DESC, {COL_NAME} DESC + LIMIT 10 + ) + ) ps + ) as top_freq_values, 
+strTemplate07_else: NULL as top_freq_values, +strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, + MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, + MAX(`{COL_NAME}`) AS max_value, + AVG(CAST(`{COL_NAME}` AS FLOAT64)) AS avg_value, + STDDEV(CAST(`{COL_NAME}` AS FLOAT64)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +strTemplate08_else: NULL as min_value, + NULL as min_value_over_0, + NULL as max_value, + NULL as avg_value, + NULL as stdev_value, + NULL as percentile_25, + NULL as percentile_50, + NULL as percentile_75, +strTemplate10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum, +strTemplate10_else: NULL as fractional_sum, +strTemplate11_D: | + MIN(`{COL_NAME}`) AS min_date, -- Other flavors have a minimum threshold of 0001-01-01, but BigQuery doesn't make it easy to to the same + MAX(`{COL_NAME}`) as max_date, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 12 THEN 1 END) AS before_1yr_date_ct, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 60 THEN 1 END) AS before_5yr_date_ct, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 240 THEN 1 END) AS before_20yr_date_ct, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 1200 THEN 1 END) AS before_100yr_date_ct, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY) BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY) BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, + COUNT(CASE WHEN SAFE_CAST(DATE(`{COL_NAME}`) AS DATE) > SAFE_CAST(DATE('{RUN_DATE}') AS DATE) THEN 1 END) AS future_date_ct, + COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), SAFE_CAST(DATE('{RUN_DATE}') AS DATE), MONTH) > 240 THEN 1 END) AS distant_future_date_ct, + COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present, + COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), WEEK)) AS date_weeks_present, + COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH)) AS date_months_present, + +strTemplate11_else: NULL as min_date, + NULL as max_date, + NULL as before_1yr_date_ct, + NULL as before_5yr_date_ct, + NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL as within_1yr_date_ct, + NULL as within_1mo_date_ct, + NULL as future_date_ct, + NULL as distant_future_date_ct, + NULL as date_days_present, + NULL as date_weeks_present, + NULL as date_months_present, +strTemplate12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, +strTemplate12_else: NULL as boolean_true_ct, +strTemplate13_ALL: NULL AS datatype_suggestion, +strTemplate14_A_do_patterns: | + ( + SELECT + COUNT(DISTINCT REGEXP_REPLACE( + REGEXP_REPLACE( + REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r'[a-z]', 'a'), + r'[A-Z]', 'A' + ), + r'[0-9]', 'N' + )) AS pattern_ct + FROM `target_table` + WHERE `{COL_NAME}` > ' ' + ) as distinct_pattern_ct, + SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS 
embedded_space_ct, + AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, +strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, + SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct, + AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, +strTemplate14_else: NULL as distinct_pattern_ct, + NULL as embedded_space_ct, + NULL as avg_embedded_spaces, +strTemplate15_ALL: NULL as functional_data_type, + NULL as functional_table_type, +strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id " +strTemplate98_sampling: ' FROM target_table' +strTemplate98_else: ' FROM target_table' + +strTemplate99_N: | + , + (SELECT + PERCENTILE_CONT(`{COL_NAME}`, 0.25) OVER() AS pct_25, + PERCENTILE_CONT(`{COL_NAME}`, 0.50) OVER() AS pct_50, + PERCENTILE_CONT(`{COL_NAME}`, 0.75) OVER() AS pct_75 + FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile +strTemplate99_N_sampling: | + , + (SELECT + APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(25)] AS pct_25, + APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(50)] AS pct_50, + APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(75)] AS pct_75 + FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile +strTemplate99_else: ; +strTemplate100_sampling: ' ' diff --git a/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql new file mode 100644 index 00000000..8bb81c50 --- /dev/null +++ b/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql @@ -0,0 +1,55 @@ +WITH counts AS ( + SELECT + `{COL_NAME}` AS col_val, + COUNT(*) AS ct + FROM `{DATA_SCHEMA}.{DATA_TABLE}` + WHERE `{COL_NAME}` > ' ' +-- TG-IF do_sample_bool + AND RAND() * 100 < {SAMPLE_PERCENT_CALC} +-- TG-ENDIF + GROUP BY `{COL_NAME}` +), +ranked AS ( + SELECT + col_val, + ct, + ROW_NUMBER() OVER (ORDER BY ct DESC, col_val ASC) AS rn + FROM counts +), +top10 AS ( + -- top 10 formatted rows + SELECT + rn, + CONCAT('| ', CAST(col_val AS STRING), ' | ', CAST(ct AS STRING)) AS val + FROM ranked + WHERE rn <= 10 + ORDER BY rn +), +others_agg AS ( + SELECT + 11 AS rn, + CONCAT( + '| Other Values (', + CAST(COUNT(DISTINCT col_val) AS STRING), + ') | ', + CAST(SUM(ct) AS STRING) + ) AS val, + COUNT(*) AS other_row_count + FROM ranked + WHERE rn > 10 +), +all_vals AS ( + SELECT * FROM top10 + UNION ALL + SELECT rn, val FROM others_agg WHERE other_row_count > 0 +) +SELECT + '{PROJECT_CODE}' AS project_code, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + '{COL_NAME}' AS column_name, + (SELECT STRING_AGG(val, '\n' ORDER BY rn) FROM all_vals) AS top_freq_values, + (SELECT TO_HEX(MD5(STRING_AGG(CAST(col_val AS STRING), '|' ORDER BY col_val))) + FROM counts + ) AS distinct_value_hash; diff --git a/testgen/template/flavors/bigquery/profiling/templated_functions.yaml b/testgen/template/flavors/bigquery/profiling/templated_functions.yaml new file mode 100644 index 00000000..d6b91b0a --- /dev/null +++ b/testgen/template/flavors/bigquery/profiling/templated_functions.yaml @@ -0,0 +1,49 @@ +IS_NUM: CASE + WHEN REGEXP_CONTAINS(CAST({$1} AS STRING), + r'^\s*[+-]?\$?\s*[0-9]+(,[0-9]{3})*(\.[0-9]*)?[%]?\s*$') THEN 1 + ELSE 0 + END + +IS_DATE: | + CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS */ + WHEN SAFE.PARSE_DATETIME('%F %H:%M:%S %6f', CAST({$1} AS STRING)) 
IS NOT NULL THEN 1 + + /* YYYY-MM-DD HH:MM:SS */ + WHEN SAFE.PARSE_DATETIME('%F %H:%M:%S', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* YYYYMMDDHHMMSSSSSS */ + WHEN SAFE.PARSE_DATETIME('%Y%m%d%H%M%S%6f', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* YYYYMMDDHHMMSS */ + WHEN SAFE.PARSE_DATETIME('%Y%m%d%H%M%S', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* YYYYMMDD */ + WHEN LENGTH(CAST({$1} AS STRING)) = 8 AND SAFE.PARSE_DATE('%Y%m%d', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* Exclude anything else long */ + WHEN LENGTH(CAST({$1} AS STRING)) > 11 THEN 0 + + /* YYYY-MON-DD */ + WHEN SAFE.PARSE_DATE('%Y-%b-%d', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* YYYY-MM-DD */ + WHEN SAFE.PARSE_DATE('%F', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* MM/DD/YYYY */ + WHEN SAFE.PARSE_DATE('%m/%d/%Y', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* MM/DD/YY */ + WHEN SAFE.PARSE_DATE('%m/%d/%y', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* MM-DD-YYYY */ + WHEN SAFE.PARSE_DATE('%m-%d-%Y', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* MM-DD-YY */ + WHEN SAFE.PARSE_DATE('%m-%d-%y', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + /* DD-MON-YYYY */ + WHEN SAFE.PARSE_DATE('%d-%b-%Y', CAST({$1} AS STRING)) IS NOT NULL THEN 1 + + ELSE 0 + END diff --git a/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql new file mode 100644 index 00000000..8a465da2 --- /dev/null +++ b/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql @@ -0,0 +1,3 @@ +select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns +from `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` +where table_schema in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql b/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql index 0f3e4dd4..0cfb56f6 100644 --- a/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql +++ b/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql @@ -9,7 +9,8 @@ SELECT '{PROJECT_CODE}' AS project_code, WHEN lower(c.full_data_type) IN ('double', 'float') THEN 'numeric' WHEN lower(c.full_data_type) LIKE 'decimal%' THEN 'numeric(' || c.numeric_precision || ',' || c.numeric_scale || ')' ELSE lower(c.full_data_type) - END AS data_type, + END AS column_type, + c.full_data_type AS db_data_type, c.character_maximum_length, c.ordinal_position, CASE diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql index ad1f7581..6e5184d6 100644 --- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql +++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql @@ -32,15 +32,15 @@ SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT 
MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) + AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) GROUP BY {COLUMN_NAME_NO_QUOTES} EXCEPT SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) GROUP BY {COLUMN_NAME_NO_QUOTES} ) test; diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql index 2fe39587..7a078dc7 100644 --- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql +++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql @@ -32,28 +32,28 @@ SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( ( -SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) EXCEPT -SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) + AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) ) UNION ALL ( -SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) + AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) EXCEPT -SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD(day, 
- {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}`.`{TABLE_NAME}`)) ) ) test; diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml index d7612bd3..4c2cbaa4 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml @@ -1,7 +1,7 @@ --- strTemplate01_sampling: "SELECT " strTemplate01_else: "SELECT " -strTemplate02_all: | +strTemplate01_5: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -11,8 +11,14 @@ strTemplate02_all: | {COL_POS} AS position, '{COL_NAME_SANITIZED}' AS column_name, '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT(`{COL_NAME}`) AS value_ct, + COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, + SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, +strTemplate02_else: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, @@ -151,7 +157,7 @@ strTemplate07_A_freq: ( SELECT LEFT(CONCAT_WS(' | ', collect_list(val)), 1000) FROM ( SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || `{COL_NAME}` as val, COUNT(*) as ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` WHERE `{COL_NAME}` > ' ' GROUP BY `{COL_NAME}` HAVING `{COL_NAME}` > ' ' @@ -244,7 +250,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces, @@ -262,23 +268,23 @@ strTemplate15_ALL: NULL as functional_data_type, strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)' +strTemplate98_sampling: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)' -strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}' +strTemplate98_else: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`' strTemplate99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` LIMIT 1) pctile strTemplate99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile strTemplate99_else: ' ' diff --git 
a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql index 483fb373..7def8c78 100644 --- a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql +++ b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql @@ -4,7 +4,10 @@ AS (SELECT `{COL_NAME}`, COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` +-- TG-IF do_sample_bool + TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) +-- TG-ENDIF WHERE `{COL_NAME}` > ' ' GROUP BY `{COL_NAME}` ), @@ -30,6 +33,6 @@ SELECT '{PROJECT_CODE}' as project_code, )), '^#^', '\n') AS top_freq_values, (SELECT MD5(CONCAT_WS('|', ARRAY_SORT(COLLECT_LIST(NULLIF(dist_col_name,''))))) as dvh FROM (SELECT DISTINCT `{COL_NAME}` as dist_col_name - FROM {DATA_SCHEMA}.{DATA_TABLE}) a + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`) a ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql index b8db09e4..098da4d4 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql @@ -33,13 +33,13 @@ SELECT '{TEST_TYPE}' as test_type, FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} UNION ALL SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) a diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql index 872acfe5..fe60101f 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql @@ -33,13 +33,13 @@ SELECT '{TEST_TYPE}' as test_type, FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} UNION ALL SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) a diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql index d97df8a8..89845709 100644 --- 
a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql @@ -33,13 +33,13 @@ SELECT '{TEST_TYPE}' as test_type, FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} UNION ALL SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) a diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql index 7fb3787e..3fb69cc8 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql @@ -33,13 +33,13 @@ SELECT '{TEST_TYPE}' as test_type, FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} UNION ALL SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) a diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql index 0640907e..838ea5c0 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql @@ -32,25 +32,25 @@ SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( ( SELECT {GROUPBY_NAMES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} EXCEPT SELECT {MATCH_COLUMN_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) UNION ( SELECT {MATCH_COLUMN_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} EXCEPT SELECT {GROUPBY_NAMES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql index 84317e8a..0c0c0b19 100644 --- 
a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql @@ -31,13 +31,13 @@ SELECT '{TEST_TYPE}' as test_type, '{SUBSET_DISPLAY}' as subset_condition, NULL as result_query FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {COLUMN_NAME_NO_QUOTES} {HAVING_CONDITION} EXCEPT SELECT {MATCH_GROUPBY_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql index 4cdbf875..61137108 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql @@ -31,7 +31,7 @@ SELECT '{TEST_TYPE}' as test_type, '{SUBSET_DISPLAY}' as subset_condition, NULL as result_query FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql index d7030589..fb717344 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql @@ -21,18 +21,18 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_DESCRIPTION}' as test_description FROM ( ( SELECT {COLUMN_NAME} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} EXCEPT SELECT {COLUMN_NAME} - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) UNION ( SELECT {COLUMN_NAME} - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} EXCEPT SELECT {COLUMN_NAME} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) ); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql index 0e6e0647..84be7315 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql @@ -7,13 +7,13 @@ WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {SCHEMA_NAME}.{TABLE_NAME} v1 + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} v1 WHERE {SUBSET_CONDITION} GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} v2 WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} ), dataset diff --git 
a/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql index da660156..bf573f78 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql @@ -28,6 +28,6 @@ SELECT '{TEST_TYPE}' as test_type, '{SUBSET_DISPLAY}' as subset_condition, NULL as result_query FROM ( SELECT {CUSTOM_QUERY} as fingerprint - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql index 3993e1a4..81d0784e 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql @@ -32,15 +32,15 @@ SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) GROUP BY {COLUMN_NAME_NO_QUOTES} EXCEPT SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) GROUP BY {COLUMN_NAME_NO_QUOTES} ) test; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql index 42e603be..3bb2e84b 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql @@ -32,28 +32,28 @@ SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( ( -SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) EXCEPT -SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Prior Timeframe' as missing_from, 
{COLUMN_NAME_NO_QUOTES} +FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) ) UNION ALL ( -SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) - AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) EXCEPT -SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) ) ) test; diff --git a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql b/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql similarity index 100% rename from testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql rename to testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql diff --git a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql b/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql index 44d659ca..8b113f7c 100644 --- a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql +++ b/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql @@ -11,12 +11,20 @@ SELECT '{PROJECT_CODE}' as project_code, WHEN c.data_type = 'char' THEN 'char(' + CAST(c.character_maximum_length AS VARCHAR) + ')' WHEN c.data_type = 'numeric' THEN 'numeric(' + CAST(c.numeric_precision AS VARCHAR) + ',' + CAST(c.numeric_scale AS VARCHAR) + ')' - ELSE c.data_type END AS data_type, + ELSE c.data_type END AS column_type, + CASE + WHEN c.data_type LIKE '%char' OR c.data_type LIKE '%binary' + THEN c.data_type + '(' + CAST(c.character_maximum_length AS VARCHAR) + ')' + WHEN c.data_type IN ('datetime2', 'datetimeoffset', 'time') + THEN c.data_type + '(' + CAST(c.datetime_precision AS VARCHAR) + ')' + WHEN c.data_type IN ('numeric', 'decimal') + THEN c.data_type + '(' + CAST(c.numeric_precision AS 
VARCHAR) + ',' + + CAST(c.numeric_scale AS VARCHAR) + ')' + ELSE c.data_type END AS db_data_type, c.character_maximum_length, c.ordinal_position, CASE WHEN LOWER(c.data_type) LIKE '%char%' - OR c.data_type LIKE '%text%' THEN 'A' WHEN c.data_type = 'bit' THEN 'B' diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql index be5fa577..7b26cbab 100644 --- a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql @@ -7,13 +7,13 @@ WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total - FROM {SCHEMA_NAME}.{TABLE_NAME} v1 + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" v1 WHERE {SUBSET_CONDITION} GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total - FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + FROM "{MATCH_SCHEMA_NAME}"."{TABLE_NAME}" v2 WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} ), dataset diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql index 5189c442..978a46dd 100644 --- a/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql @@ -28,6 +28,6 @@ SELECT '{TEST_TYPE}' as test_type, '{SUBSET_DISPLAY}' as subset_condition, NULL as result_query FROM ( SELECT {CUSTOM_QUERY} as fingerprint - FROM {SCHEMA_NAME}.{TABLE_NAME} WITH (NOLOCK) + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WITH (NOLOCK) WHERE {SUBSET_CONDITION} ) test; diff --git a/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql index 4ed918a2..3f8be00e 100644 --- a/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql @@ -14,7 +14,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date AND ts.id = '{TEST_SUITE_ID}' AND p.run_date::DATE <= '{AS_OF_DATE}' GROUP BY r.table_groups_id), -curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, +curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct FROM last_run lr INNER JOIN profile_results p @@ -28,7 +28,7 @@ locked AS (SELECT schema_name, table_name AND lock_refresh = 'Y'), -- IDs - TOP 2 id_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -42,7 +42,7 @@ id_cols AND functional_data_type ILIKE 'ID%'), -- Process Date - TOP 1 process_date_cols - AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, distinct_value_ct, 
ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -57,7 +57,7 @@ process_date_cols AND functional_data_type ILIKE 'process%'), -- Transaction Date - TOP 1 tran_date_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -70,7 +70,7 @@ tran_date_cols -- Numeric Measures numeric_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, /* -- Subscores distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score, @@ -98,19 +98,19 @@ numeric_cols_ranked FROM numeric_cols WHERE change_detection_score IS NOT NULL), combined - AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order + AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, column_type, 10 + rank AS fingerprint_order FROM id_cols WHERE rank <= 2 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, column_type, 20 + rank AS fingerprint_order FROM process_date_cols WHERE rank = 1 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, column_type, 30 + rank AS fingerprint_order FROM tran_date_cols WHERE rank = 1 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, column_type, 40 + rank AS fingerprint_order FROM numeric_cols_ranked WHERE rank = 1 ), newtests AS ( @@ -123,7 +123,8 @@ newtests AS ( CASE WHEN general_type = 'D' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS NVARCHAR)' WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS NVARCHAR) + ''|'' + CAST(SUM(LEN(@@@)) AS NVARCHAR)' - WHEN general_type = 'N' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(@@@) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(@@@), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' + WHEN general_type = 'N' AND column_type ILIKE '%int%' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(CAST(@@@ AS BIGINT)) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(CAST(@@@ AS DECIMAL(30,5))), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' + WHEN general_type = 'N' AND column_type NOT ILIKE '%int%' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(@@@) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(@@@), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' END, '@@@', '"' || column_name || '"' ), diff --git 
a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml index 2b8aae99..75ed4598 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml @@ -1,7 +1,7 @@ --- strTemplate01_sampling: "SELECT " strTemplate01_else: "SELECT " -strTemplate02_all: | +strTemplate01_5: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -11,8 +11,14 @@ strTemplate02_all: | {COL_POS} AS position, '{COL_NAME_SANITIZED}' AS column_name, '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct, + NULL AS distinct_value_ct, + SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, +strTemplate02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, @@ -137,9 +143,9 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) AS pattern - FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) WHERE "{COL_NAME}" > ' ' AND ((SELECT MAX(LEN("{COL_NAME}")) - FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH})) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH})) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC @@ -149,7 +155,7 @@ strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ') WITHIN GROUP (ORDER FROM ( SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) + ' | ' + "{COL_NAME}" as val, COUNT(*) as ct - FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" HAVING "{COL_NAME}" > ' ' @@ -241,7 +247,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" CO 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct - FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, @@ -259,23 +265,23 @@ strTemplate15_ALL: NULL as functional_data_type, strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)' +strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)' -strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)' +strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)' strTemplate99_N: | , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY 
"{COL_NAME}") OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) pctile strTemplate99_N_sampling: | , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile strTemplate99_else: ' ' diff --git a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql index cdb368fe..54505605 100644 --- a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql +++ b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql @@ -4,7 +4,10 @@ AS (SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) +-- TG-ENDIF WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -29,8 +32,8 @@ SELECT '{PROJECT_CODE}' as project_code, (SELECT CONVERT(VARCHAR(40), HASHBYTES('MD5', STRING_AGG( NULLIF(dist_col_name,''), '|') WITHIN GROUP (ORDER BY dist_col_name)), 2) as dvh FROM (SELECT DISTINCT "{COL_NAME}" as dist_col_name - FROM {DATA_SCHEMA}.{DATA_TABLE}) a + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") a ) as distinct_value_hash FROM consol_vals; --- Convert function has style = 2 : The characters 0x aren't added to the left of the converted result for style 2. \ No newline at end of file +-- Convert function has style = 2 : The characters 0x aren't added to the left of the converted result for style 2. 
diff --git a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql b/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql index 5e3136ca..aca74a15 100644 --- a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql @@ -14,7 +14,17 @@ SELECT '{PROJECT_CODE}' as project_code, || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')', '') ELSE c.data_type - END AS data_type, + END AS column_type, + CASE + WHEN c.data_type ILIKE 'char%' OR c.data_type ILIKE 'bit%' + THEN c.data_type || '(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + WHEN c.data_type = 'numeric' + THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')', '') + WHEN c.data_type ILIKE 'time%' + THEN c.data_type || '(' || CAST(c.datetime_precision AS VARCHAR) || ')' + ELSE c.data_type + END AS db_data_type, COALESCE(c.character_maximum_length, CASE WHEN c.data_type IN ('text', 'character varying') THEN 65535 END) as character_maximum_length, c.ordinal_position, @@ -26,7 +36,7 @@ SELECT '{PROJECT_CODE}' as project_code, WHEN c.data_type ILIKE 'date' OR c.data_type ILIKE 'timestamp%' THEN 'D' - WHEN c.data_type ILIKE 'time without time zone' + WHEN c.data_type ILIKE 'time with%' THEN 'T' WHEN LOWER(c.data_type) IN ('bigint', 'integer', 'smallint', 'double precision', 'real', 'numeric', 'money') THEN 'N' diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql index 178998b0..31b99ee1 100644 --- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql @@ -32,15 +32,15 @@ SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} EXCEPT SELECT {COLUMN_NAME_NO_QUOTES} - FROM {SCHEMA_NAME}.{TABLE_NAME} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} ) test; diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql index 4a6aaee4..eda6d933 100644 --- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql @@ -32,28 +32,28 @@ 
SELECT '{TEST_TYPE}' as test_type, NULL as result_query FROM ( ( -SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} EXCEPT -SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} ) UNION ALL ( -SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} EXCEPT -SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} -FROM {SCHEMA_NAME}.{TABLE_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} +FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} ) ) test; diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml index 763dd4b7..6bf6631f 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml @@ -1,7 +1,7 @@ --- strTemplate01_sampling: "SELECT " strTemplate01_else: "SELECT " -strTemplate02_all: | +strTemplate01_5: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -11,8 +11,14 @@ strTemplate02_all: | {COL_POS} AS position, '{COL_NAME_SANITIZED}' AS column_name, '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, +strTemplate02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS 
NULL THEN 1 ELSE 0 END) AS null_value_ct, @@ -25,7 +31,7 @@ strTemplate03_else: NULL as min_length, strTemplate04_A: SUM(CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, strTemplate04_else: NULL as zero_value_ct, strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, SUM(CASE @@ -113,9 +119,9 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DE "{COL_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') AS pattern - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC @@ -126,7 +132,7 @@ strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1 FROM ( SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" HAVING "{COL_NAME}" > ' ' @@ -135,10 +141,10 @@ strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1 ) AS top_freq_values, strTemplate07_else: NULL as top_freq_values, strTemplate08_N: MIN("{COL_NAME}") AS min_value, - MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MIN(CASE WHEN "{COL_NAME}"::NUMERIC > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, - AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, - STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, + AVG(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS avg_value, + STDDEV(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS stdev_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, @@ -219,7 +225,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') ) AS pattern_ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, @@ -237,23 +243,23 @@ strTemplate15_ALL: NULL as functional_data_type, strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)' +strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)' -strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE} ' +strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' strTemplate99_N: | , (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, 
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile strTemplate99_N_sampling: | , (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile strTemplate99_else: ' ' diff --git a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql index 857af0d3..b9b0c3d6 100644 --- a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql @@ -3,7 +3,10 @@ WITH ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) +-- TG-ENDIF WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -24,5 +27,5 @@ SELECT '{PROJECT_CODE}' as project_code, '{COL_NAME}' as column_name, REPLACE(STRING_AGG(val, '^#^' ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(STRING_AGG(DISTINCT "{COL_NAME}", '|' ORDER BY "{COL_NAME}")) as dvh - FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql b/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql index aa081ff5..cf61e7ca 100644 --- a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql +++ b/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql @@ -11,7 +11,15 @@ SELECT '{PROJECT_CODE}' as project_code, WHEN c.data_type = 'numeric' THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')', '') - ELSE c.data_type END AS data_type, + ELSE c.data_type END AS column_type, + CASE + WHEN c.data_type ILIKE 'char%' + THEN c.data_type || '(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + WHEN c.data_type = 'numeric' + THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')', '') + ELSE c.data_type + END AS db_data_type, c.character_maximum_length, c.ordinal_position, CASE diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml index 1596dd1d..8ee6eed3 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ 
b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml @@ -1,7 +1,7 @@ --- strTemplate01_sampling: "SELECT " strTemplate01_else: "SELECT " -strTemplate02_all: | +strTemplate01_5: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -11,8 +11,14 @@ strTemplate02_all: | {COL_POS} AS position, '{COL_NAME_SANITIZED}' AS column_name, '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +strTemplate02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, @@ -92,9 +98,9 @@ strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORD "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, @@ -103,7 +109,7 @@ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY FROM ( SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" HAVING "{COL_NAME}" > ' ' @@ -136,17 +142,17 @@ strTemplate11_D: CASE ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, MAX("{COL_NAME}") as max_date, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, + COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, + COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct, - COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as 
date_weeks_present, - COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, + COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct, + COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_days_present, + COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_weeks_present, + COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_months_present, strTemplate11_else: NULL as min_date, NULL as max_date, @@ -172,7 +178,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, @@ -190,23 +196,23 @@ strTemplate15_ALL: NULL as functional_data_type, strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} ' +strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' -strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}' +strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' strTemplate99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile strTemplate99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile strTemplate99_else: ' ' diff --git a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql index 84c25587..58b86519 100644 --- a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql +++ b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql @@ -3,8 +3,11 @@ WITH ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' +-- TG-IF do_sample_bool + AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ENDIF GROUP BY "{COL_NAME}" ), consol_vals AS ( @@ -25,5 +28,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|') WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh - FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql new file mode 100644 index 00000000..76ded622 --- /dev/null +++ 
b/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql @@ -0,0 +1,38 @@ +SELECT '{PROJECT_CODE}' AS project_code, + CURRENT_TIMESTAMP AT TIME ZONE 'UTC' AS refresh_timestamp, + c.schemaname AS table_schema, + c.tablename AS table_name, + c.columnname AS column_name, + c.external_type AS column_type, + c.external_type AS db_data_type, + NULLIF( + REGEXP_SUBSTR(c.external_type, 'char\\(([0-9]+)\\)', 1, 1, 'e'), + '' + ) AS character_maximum_length, + c.columnnum AS ordinal_position, + CASE + WHEN c.external_type = 'string' + OR c.external_type ILIKE 'varchar%' + OR c.external_type ILIKE 'char%' + THEN 'A' + WHEN c.external_type = 'boolean' + THEN 'B' + WHEN c.external_type IN ('date', 'timestamp') + THEN 'D' + WHEN c.external_type IN ('long', 'double', 'float') + OR c.external_type ILIKE '%int%' + OR c.external_type ILIKE 'decimal%' + THEN 'N' + ELSE 'X' + END AS general_type, + CASE + WHEN REGEXP_SUBSTR(c.external_type, 'decimal\\([0-9]+,([0-9]+)\\)', 1, 1, 'e') > 0 + THEN 1 + ELSE 0 + END AS is_decimal +FROM svv_external_columns c +WHERE c.schemaname = '{DATA_SCHEMA}' + {TABLE_CRITERIA} +ORDER BY c.schemaname, + c.tablename, + c.columnnum \ No newline at end of file diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql new file mode 100644 index 00000000..9a62c3d6 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql @@ -0,0 +1,23 @@ +WITH stats + AS (SELECT COUNT(*)::FLOAT as record_ct, + ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, + CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, + CAST(999000 as FLOAT) as max_sample_ct + FROM {SAMPLING_TABLE} ) +SELECT '{SAMPLING_TABLE}' as schema_table, + CASE WHEN record_ct <= min_sample_ct THEN -1 + WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct + WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct + ELSE {PROFILE_SAMPLE_MIN_COUNT} + END as sample_count, + CASE WHEN record_ct <= min_sample_ct THEN 1 + WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct + WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct + ELSE record_ct / min_sample_ct + END as sample_ratio, + ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 + WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct + WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct + ELSE 100.0 * min_sample_ct / record_ct + END, 4) as sample_percent_calc + FROM stats; diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml new file mode 100644 index 00000000..80b7a583 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml @@ -0,0 +1,219 @@ +--- +strTemplate01_sampling: "SELECT " +strTemplate01_else: "SELECT " +strTemplate01_5: | + {CONNECTION_ID} as connection_id, + '{PROJECT_CODE}' as project_code, + '{TABLE_GROUPS_ID}' as table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS 
db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +strTemplate02_else: | + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + MAX(LEN("{COL_NAME}")) AS max_length, + AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, +strTemplate03_else: NULL as min_length, + NULL as max_length, + NULL as avg_length, +strTemplate04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +strTemplate04_else: NULL as zero_value_ct, +strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, + COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, + COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END ) AS lead_space_ct, + COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, + COUNT( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct, + COUNT( CASE + WHEN LENGTH("{COL_NAME}") > 0 AND "{COL_NAME}" IN ('.', '?', ' ') THEN 1 + WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 + WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + END ) AS filled_value_ct, + LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, + LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, + COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, + COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, + COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, + SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, + CASE + WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' + WHEN SUM( CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA' + WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' + WHEN SUM( CASE WHEN 
"{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' + WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' + WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' + WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' + WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' + AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA' + WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' + AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' + AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' + END as std_pattern_match, +strTemplate05_else: NULL as distinct_std_value_ct, + NULL as zero_length_ct, + NULL as lead_space_ct, + NULL as quoted_value_ct, + NULL as includes_digit_ct, + NULL as filled_value_ct, + NULL as min_text, + NULL as max_text, + NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, + NULL as non_printing_ct, + NULL as numeric_ct, + NULL as date_ct, + NULL as std_pattern_match, +strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, + COUNT(*) AS ct + FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') AS pattern + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC) as ps) AS top_patterns, +strTemplate06_else: NULL as top_patterns, +strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals + FROM ( + SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, + COUNT(*) as ct + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + WHERE "{COL_NAME}" > ' ' + GROUP BY "{COL_NAME}" + HAVING "{COL_NAME}" > ' ' + ORDER BY COUNT(*), "{COL_NAME}" DESC + ) ps + ) AS top_freq_values, +strTemplate07_else: NULL as top_freq_values, +strTemplate08_N: MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, + STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, + MIN(pct_25) as percentile_25, + MIN(pct_50) as percentile_50, + MIN(pct_75) as percentile_75, +strTemplate08_else: NULL as min_value, + NULL as min_value_over_0, + NULL as max_value, + NULL as avg_value, + NULL as stdev_value, + NULL as percentile_25, + NULL as percentile_50, + NULL as percentile_75, +strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, + +strTemplate10_else: NULL as fractional_sum, + +strTemplate11_D: CASE + WHEN MIN("{COL_NAME}") IS NULL THEN NULL + ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') + END as min_date, + MAX("{COL_NAME}") as max_date, + COUNT( CASE WHEN DATEDIFF('MON', 
"{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, + COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, + COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, + COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, + COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct, + COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, + COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, + COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, + +strTemplate11_else: NULL as min_date, + NULL as max_date, + NULL as before_1yr_date_ct, + NULL as before_5yr_date_ct, + NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL as within_1yr_date_ct, + NULL as within_1mo_date_ct, + NULL as future_date_ct, + NULL as distant_future_date_ct, + NULL as date_days_present, + NULL as date_weeks_present, + NULL as date_months_present, + +strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, + +strTemplate12_else: NULL as boolean_true_ct, + +strTemplate13_ALL: NULL AS datatype_suggestion, +strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') + ) AS pattern_ct + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, + SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, + AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, + +strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, + SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, + AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, + +strTemplate14_else: NULL as distinct_pattern_ct, + NULL as embedded_space_ct, + NULL as avg_embedded_spaces, + +strTemplate15_ALL: NULL as functional_data_type, + NULL as functional_table_type, + +strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" + +strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' + +strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' + +strTemplate99_N: | + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile + +strTemplate99_N_sampling: | + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile + +strTemplate99_else: ' ' + +strTemplate100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql 
b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql new file mode 100644 index 00000000..58b86519 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql @@ -0,0 +1,32 @@ +-- Get Freqs for selected columns +WITH ranked_vals AS ( + SELECT "{COL_NAME}", + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + WHERE "{COL_NAME}" > ' ' +-- TG-IF do_sample_bool + AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ENDIF + GROUP BY "{COL_NAME}" +), +consol_vals AS ( + SELECT COALESCE(CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || CAST(ct AS VARCHAR) + ELSE NULL + END, '| Other Values (' || CAST(COUNT(DISTINCT "{COL_NAME}") as VARCHAR) || ') | ' || CAST(SUM(ct) as VARCHAR) ) AS val, + MIN(rn) as min_rn + FROM ranked_vals + GROUP BY CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || CAST(ct AS VARCHAR) + ELSE NULL + END +) +SELECT '{PROJECT_CODE}' as project_code, + '{DATA_SCHEMA}' as schema_name, + '{RUN_DATE}' as run_date, + '{DATA_TABLE}' as table_name, + '{COL_NAME}' as column_name, + REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, + ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|') + WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM consol_vals; diff --git a/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml b/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml new file mode 100644 index 00000000..4953e254 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml @@ -0,0 +1,101 @@ +IS_NUM: CASE + WHEN {$1} ~ '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 + ELSE 0 + END + +IS_DATE: CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS */ + WHEN {$1} ~ + '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$' + THEN CASE + WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200 + AND ( + (SUBSTRING({$1}, 6, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 31) + OR (SUBSTRING({$1}, 6, 2) IN ('04', '06', '09') + AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 30) + OR (SUBSTRING({$1}, 6, 2) = '02' + AND SUBSTRING({$1}, 9, 2)::INT ::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* YYYYMMDDHHMMSSSSSS or YYYYMMDD */ + WHEN {$1} ~ + '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$' + OR {$1} ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$' + THEN CASE + WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200 + AND ( + (SUBSTRING({$1}, 5, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 31) + OR (SUBSTRING({$1}, 5, 2) IN ('04', '06', '09') + AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 30) + OR (SUBSTRING({$1}, 5, 2) = '02' + AND SUBSTRING({$1}, 7, 2)::INT::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* Exclude anything else long */ + WHEN LENGTH({$1}) > 11 THEN 0 + /* YYYY-MMM/MM-DD */ + WHEN REGEXP_REPLACE(UPPER({$1}), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12') + ~ '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]' + THEN CASE + WHEN SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1800 AND 2200 + AND ( + (UPPER(SPLIT_PART({$1}, '-', 2)) IN 
('01', '03', '05', '07', '08', + '1', '3', '5', '7', '8', '10', '12', + 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', + 'OCT', 'DEC') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 31) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11', + 'APR', 'JUN', 'SEP', 'NOV') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 30) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('02', '2', 'FEB') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* MM/-DD/-YY/YYYY */ + WHEN REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$' + OR REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$' + THEN + CASE + WHEN SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12 + AND ( + (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12) + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31) + OR (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11) + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30) + OR (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT = 2 + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29) + ) + AND + ('20' + RIGHT(SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200 + THEN 1 + ELSE 0 + END + /* DD-MMM-YYYY */ + WHEN UPPER({$1}) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]' + THEN + CASE + WHEN SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1800 AND 2200 + AND ( + (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 31) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV') + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 30) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) = 'FEB' + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + ELSE 0 + END + diff --git a/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql new file mode 100644 index 00000000..83cc6091 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql @@ -0,0 +1,3 @@ +select concat(concat(concat(schemaname, '.'), concat(tablename, '.')), columnname) as columns +from svv_external_columns +where schemaname in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql b/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql index b2fa5a4f..6e90f897 100644 --- a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql +++ b/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql @@ -14,7 +14,17 @@ SELECT '{PROJECT_CODE}' as project_code, WHEN c.data_type ILIKE 'num%' THEN 'numeric(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')' ELSE c.data_type - END AS data_type, + END AS column_type, + CASE + WHEN c.data_type = 'TEXT' + THEN 'VARCHAR(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + WHEN c.data_type = 'NUMBER' + THEN c.data_type || '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')' + WHEN c.data_type ILIKE 'TIME%' + THEN c.data_type || '(' || CAST(c.datetime_precision AS VARCHAR) || ')' + ELSE c.data_type + END AS db_data_type, c.character_maximum_length, c.ordinal_position, CASE diff --git 
a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml index bc0f1e7d..a42e3e29 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml @@ -1,7 +1,7 @@ --- strTemplate01_sampling: "SELECT " strTemplate01_else: "SELECT " -strTemplate02_all: | +strTemplate01_5: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -11,8 +11,14 @@ strTemplate02_all: | {COL_POS} AS position, '{COL_NAME_SANITIZED}' AS column_name, '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +strTemplate02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, @@ -100,9 +106,9 @@ strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (OR "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, @@ -111,7 +117,7 @@ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY FROM ( SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" HAVING "{COL_NAME}" > ' ' @@ -177,7 +183,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REP '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, @@ -195,9 +201,9 @@ strTemplate15_ALL: NULL as functional_data_type, strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id " -strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} SAMPLE ({SAMPLE_SIZE} rows)' +strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows)' -strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}' +strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' strTemplate99_N: | , @@ -205,7 +211,7 @@ strTemplate99_N: | PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile strTemplate99_N_sampling: | , @@ -213,7 +219,7 @@ strTemplate99_N_sampling: | PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, 
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile strTemplate99_else: ; diff --git a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql index 3e186892..7b80fc70 100644 --- a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql +++ b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql @@ -3,7 +3,10 @@ WITH ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + SAMPLE ({SAMPLE_SIZE} rows) +-- TG-ENDIF WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -25,5 +28,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT NULLIF("{COL_NAME}", ''), '|') WITHIN GROUP (ORDER BY NULLIF("{COL_NAME}", ''))) as dvh - FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml index e3ee9f83..313f79bd 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml @@ -1,7 +1,7 @@ --- strTemplate01_sampling: "SELECT " strTemplate01_else: "SELECT " -strTemplate02_all: | +strTemplate01_5: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -11,8 +11,14 @@ strTemplate02_all: | {COL_POS} AS position, '{COL_NAME_SANITIZED}' AS column_name, '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, +strTemplate02_X: | + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +strTemplate02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, @@ -113,9 +119,9 @@ strTemplate06_A_patterns: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC LIMIT 5) as ps) AS top_patterns, @@ -123,7 +129,7 @@ strTemplate06_else: NULL as top_patterns, strTemplate07_A_freq: ( SELECT SUBSTRING(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) as concat_vals FROM ( SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" HAVING "{COL_NAME}" > ' ' @@ -215,7 
+221,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM {DATA_SCHEMA}.{DATA_TABLE} + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct, AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces, @@ -233,23 +239,23 @@ strTemplate15_ALL: NULL as functional_data_type, strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})' +strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})' -strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}' +strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' strTemplate99_N: | , (SELECT APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25, APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50, APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile strTemplate99_N_sampling: | , (SELECT APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25, APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50, APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 - FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile strTemplate99_else: ' ' diff --git a/testgen/template/gen_query_tests/gen_dupe_rows_test.sql b/testgen/template/gen_query_tests/gen_dupe_rows_test.sql index 5f416ec8..3e75460c 100644 --- a/testgen/template/gen_query_tests/gen_dupe_rows_test.sql +++ b/testgen/template/gen_query_tests/gen_dupe_rows_test.sql @@ -15,7 +15,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date AND p.run_date::DATE <= '{AS_OF_DATE}' GROUP BY r.table_groups_id), curprof AS (SELECT p.schema_name, p.table_name, p.profile_run_id, - STRING_AGG('{ID_SEPARATOR}' || p.column_name || '{ID_SEPARATOR}', ', ' ORDER BY p.position) as unique_by_columns + STRING_AGG('{QUOTE}' || p.column_name || '{QUOTE}', ', ' ORDER BY p.position) as unique_by_columns FROM last_run lr INNER JOIN profile_results p ON (lr.table_groups_id = p.table_groups_id diff --git a/testgen/template/observability/get_event_data.sql b/testgen/template/observability/get_event_data.sql index 704fdc2f..0e0b8714 100644 --- a/testgen/template/observability/get_event_data.sql +++ b/testgen/template/observability/get_event_data.sql @@ -4,7 +4,7 @@ Select COALESCE(ts.component_name, tg.table_groups_name) as dataset_name, tg.table_group_schema as schema, c.connection_name as connection_name, - c.project_db as project_db, + COALESCE(NULLIF(c.project_db, ''), c.connection_name) as project_db, tg.profile_sample_min_count as profile_sample_minimum_count, ts.table_groups_id as table_groups_id, tg.profile_use_sampling as profile_use_sampling, diff --git a/testgen/template/profiling/datatype_suggestions.sql b/testgen/template/profiling/datatype_suggestions.sql index 6eff3b95..785350a4 100644 --- a/testgen/template/profiling/datatype_suggestions.sql +++ b/testgen/template/profiling/datatype_suggestions.sql @@ -47,6 +47,7 @@ SET datatype_suggestion = AND POSITION('+' IN pr.top_freq_values) > 0 THEN CASE WHEN '{SQL_FLAVOR}' = 'redshift' THEN 'TIMESTAMPZ' + WHEN '{SQL_FLAVOR}' = 'redshift_spectrum' THEN 'TIMESTAMPZ' WHEN '{SQL_FLAVOR}' = 'postgresql' THEN 
'TIMESTAMPZ' WHEN '{SQL_FLAVOR}' = 'snowflake' THEN 'TIMESTAMP_TZ' WHEN '{SQL_FLAVOR}' LIKE 'mssql%' THEN 'DATETIMEOFFSET' @@ -61,6 +62,7 @@ SET datatype_suggestion = AND POSITION(':' IN pr.top_freq_values) > 0 THEN CASE WHEN '{SQL_FLAVOR}' = 'redshift' THEN 'TIMESTAMP' + WHEN '{SQL_FLAVOR}' = 'redshift_spectrum' THEN 'TIMESTAMP' WHEN '{SQL_FLAVOR}' = 'postgresql' THEN 'TIMESTAMP' WHEN '{SQL_FLAVOR}' = 'snowflake' THEN 'TIMESTAMP_NTZ' WHEN '{SQL_FLAVOR}' LIKE 'mssql%' THEN 'DATETIME2' diff --git a/testgen/template/profiling/profile_anomalies_screen_column.sql b/testgen/template/profiling/profile_anomalies_screen_column.sql index e7c2b5dc..f1faf012 100644 --- a/testgen/template/profiling/profile_anomalies_screen_column.sql +++ b/testgen/template/profiling/profile_anomalies_screen_column.sql @@ -1,6 +1,6 @@ INSERT INTO profile_anomaly_results (project_code, table_groups_id, profile_run_id, anomaly_id, - schema_name, table_name, column_name, column_type, detail) + schema_name, table_name, column_name, column_type, db_data_type, detail) SELECT p.project_code, p.table_groups_id, p.profile_run_id, @@ -9,6 +9,7 @@ SELECT p.project_code, p.table_name, p.column_name, p.column_type, + p.db_data_type, {DETAIL_EXPRESSION} AS detail FROM profile_results p LEFT JOIN v_inactive_anomalies i diff --git a/testgen/template/profiling/profile_anomalies_screen_multi_column.sql b/testgen/template/profiling/profile_anomalies_screen_multi_column.sql index af315502..7c2cfed4 100644 --- a/testgen/template/profiling/profile_anomalies_screen_multi_column.sql +++ b/testgen/template/profiling/profile_anomalies_screen_multi_column.sql @@ -48,10 +48,10 @@ WITH mults AS ( SELECT p.project_code, ) INSERT INTO profile_anomaly_results (project_code, table_groups_id, profile_run_id, anomaly_id, - schema_name, table_name, column_name, column_type, detail) + schema_name, table_name, column_name, column_type, db_data_type, detail) SELECT project_code, table_groups_id, profile_run_id, anomaly_id, schema_name, '(multi-table)' as table_name, - column_name, '(multiple)' as column_type, + column_name, '(multiple)' as column_type, '(multiple)' as db_data_type, detail || ' , Tables: ' || table_list AS detail FROM subset GROUP BY project_code, table_groups_id, profile_run_id, anomaly_id, diff --git a/testgen/template/profiling/profile_anomalies_screen_variants.sql b/testgen/template/profiling/profile_anomalies_screen_variants.sql index f3e603e0..e4b69be2 100644 --- a/testgen/template/profiling/profile_anomalies_screen_variants.sql +++ b/testgen/template/profiling/profile_anomalies_screen_variants.sql @@ -1,6 +1,6 @@ INSERT INTO profile_anomaly_results (project_code, table_groups_id, profile_run_id, anomaly_id, - schema_name, table_name, column_name, column_type, detail) + schema_name, table_name, column_name, column_type, db_data_type, detail) WITH all_matches AS ( SELECT p.project_code, p.table_groups_id, @@ -9,6 +9,7 @@ WITH all_matches p.table_name, p.column_name, p.column_type, + p.db_data_type, fn_extract_distinct_items(STRING_AGG(fn_extract_intersecting_items(LOWER(fn_extract_top_values(p.top_freq_values)), v.check_values, '|'), '|'), @@ -32,9 +33,10 @@ WITH all_matches p.schema_name, p.table_name, p.column_name, - p.column_type ) + p.column_type, + p.db_data_type ) SELECT project_code, table_groups_id, profile_run_id, :ANOMALY_ID AS anomaly_id, - schema_name, table_name, column_name, column_type, + schema_name, table_name, column_name, column_type, db_data_type, {DETAIL_EXPRESSION} AS detail FROM all_matches; diff --git 
a/testgen/template/profiling/project_update_profile_results_to_estimates.sql b/testgen/template/profiling/project_update_profile_results_to_estimates.sql index 278302d0..e5a8741f 100644 --- a/testgen/template/profiling/project_update_profile_results_to_estimates.sql +++ b/testgen/template/profiling/project_update_profile_results_to_estimates.sql @@ -24,8 +24,8 @@ set sample_ratio = :PROFILE_SAMPLE_RATIO, future_date_ct = ROUND(future_date_ct * :PROFILE_SAMPLE_RATIO, 0), boolean_true_ct = ROUND(boolean_true_ct * :PROFILE_SAMPLE_RATIO, 0) where profile_run_id = :PROFILE_RUN_ID -and schema_name = split_part(:SAMPLING_TABLE, '.', 1) -and table_name = split_part(:SAMPLING_TABLE, '.', 2) +and schema_name = TRIM(SPLIT_PART(:SAMPLING_TABLE, '.', 1), :QUOTE) +and table_name = TRIM(SPLIT_PART(:SAMPLING_TABLE, '.', 2), :QUOTE) and sample_ratio IS NULL; diff --git a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql index 1498ae87..f7a1474f 100644 --- a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql +++ b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql @@ -17,7 +17,9 @@ SELECT cat_test_id, schema_name AS schema_name, table_name AS table_name, - TRIM(UNNEST(STRING_TO_ARRAY(column_name, ','))) as column_name + TRIM(TRIM(UNNEST(ARRAY_REMOVE( + REGEXP_SPLIT_TO_ARRAY(column_name, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), + '' )), ' '), '{QUOTE}') as column_name FROM test_definitions d INNER JOIN test_types t ON d.test_type = t.test_type @@ -26,23 +28,27 @@ AND t.test_scope = 'referential' AND t.test_type NOT LIKE 'Aggregate_%' UNION - -- FROM: groupby_names (should be referential) + -- FROM: groupby_names SELECT cat_test_id, schema_name AS schema_name, table_name AS table_name, - TRIM(UNNEST(STRING_TO_ARRAY(groupby_names, ','))) as column_name + TRIM(TRIM(UNNEST(ARRAY_REMOVE( + REGEXP_SPLIT_TO_ARRAY(groupby_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), + '' )), ' '), '{QUOTE}') AS column_name FROM test_definitions d INNER JOIN test_types t ON d.test_type = t.test_type WHERE test_suite_id = :TEST_SUITE_ID AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope IN ('column', 'referential') + AND t.test_scope IN ('column', 'referential', 'table') UNION -- FROM: window_date_column (referential) SELECT cat_test_id, schema_name AS schema_name, table_name AS table_name, - TRIM(UNNEST(STRING_TO_ARRAY(window_date_column, ','))) as column_name + TRIM(TRIM(UNNEST(ARRAY_REMOVE( + REGEXP_SPLIT_TO_ARRAY(window_date_column, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), + '' )), ' '), '{QUOTE}') as column_name FROM test_definitions d INNER JOIN test_types t ON d.test_type = t.test_type @@ -54,7 +60,9 @@ SELECT cat_test_id, match_schema_name AS schema_name, match_table_name AS table_name, - TRIM(UNNEST(STRING_TO_ARRAY(match_column_names, ','))) as column_name + TRIM(TRIM(UNNEST(ARRAY_REMOVE( + REGEXP_SPLIT_TO_ARRAY(match_column_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), + '' )), ' '), '{QUOTE}') as column_name FROM test_definitions d INNER JOIN test_types t ON d.test_type = t.test_type @@ -67,7 +75,9 @@ SELECT cat_test_id, match_schema_name AS schema_name, match_table_name AS table_name, - TRIM(UNNEST(STRING_TO_ARRAY(match_groupby_names, ','))) as column_name + TRIM(TRIM(UNNEST(ARRAY_REMOVE( + REGEXP_SPLIT_TO_ARRAY(match_groupby_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), + '' )), ' '), '{QUOTE}') as column_name FROM test_definitions d INNER JOIN test_types t ON d.test_type = t.test_type diff --git a/testgen/ui/assets/flavors/bigquery.svg 
b/testgen/ui/assets/flavors/bigquery.svg new file mode 100644 index 00000000..8793b381 --- /dev/null +++ b/testgen/ui/assets/flavors/bigquery.svg @@ -0,0 +1,26 @@ + [BigQuery icon SVG markup] diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css index b413f980..dedd11fa 100644 --- a/testgen/ui/assets/style.css +++ b/testgen/ui/assets/style.css @@ -253,6 +253,10 @@ div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="st justify-content: center; } +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.flex-end) { + flex-wrap: wrap; +} + .stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.no-flex-gap) { gap: unset; } @@ -439,6 +443,45 @@ div[data-testid="stPopoverBody"]:has(i.tg-header--help-wrapper) { } /* */ +/* Summary counts component */ +.tg-summary-counts--label { + margin-bottom: 4px; +} + +.tg-summary-counts { + height: 100%; + display: flex; + flex-flow: row nowrap; + align-items: flex-start; + justify-content: flex-start; + gap: 16px; +} + +.tg-summary-counts--item { + display: flex; + flex-flow: row nowrap; + align-items: stretch; + gap: 8px; +} + +.tg-summary-counts--bar { + width: 4px; +} + +.tg-summary-counts--value { + line-height: 1.2; +} + +.tg-summary-counts--value > div:first-child { + color: var(--caption-text-color); + font-size: 12px; +} + +.tg-summary-counts--value > div:last-child { + font-size: 16px; +} +/* */ + /* Export Menu */ .st-key-tg--export-popover [data-testid="stPopoverButton"] > div:last-child { display: none; diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 0a2b43f4..5d6aa7e5 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -1,7 +1,7 @@ /** * @import { FileValue } from './file_input.js'; * @import { VanState } from '../van.min.js'; - * + * * @typedef Flavor * @type {object} * @property {string} label @@ -9,19 +9,20 @@ * @property {string} icon * @property {string} flavor * @property {string} connection_string - * + * * @typedef ConnectionStatus * @type {object} * @property {string} message * @property {boolean} successful * @property {string?} details - * + * * @typedef Connection * @type {object} * @property {string} connection_id * @property {string} connection_name * @property {string} sql_flavor * @property {string} sql_flavor_code + * @property {string} project_code * @property {string} project_host * @property {string} project_port * @property {string} project_db @@ -35,23 +36,25 @@ * @property {string?} http_path * @property {string?} warehouse * @property {ConnectionStatus?} status - * + * * @typedef FormState * @type {object} * @property {boolean} dirty * @property {boolean} valid - * + * * @typedef FieldsCache * @type {object} * @property {FileValue} privateKey - * + * @property {FileValue} serviceAccountKey + * * @typedef Properties * @type {object} * @property {Connection} connection * @property {Array.<Flavor>} flavors * @property {boolean} disableFlavor * @property {FileValue?} cachedPrivateKeyFile - * @property {string?} dynamicConnectionUrl + * @property {FileValue?} cachedServiceAccountKeyFile + * @property {string?} dynamicConnectionUrl * @property {(c: Connection, state: FormState, cache?: FieldsCache) => void} onChange */ import van from '../van.min.js'; @@ -72,6 +75,7 @@ const clearSentinel = ''; const secretsPlaceholder = ''; const defaultPorts = { redshift: '5439',
redshift_spectrum: '5439', azure_mssql: '1433', synapse_mssql: '1433', mssql: '1433', @@ -81,7 +85,7 @@ const defaultPorts = { }; /** - * + * * @param {Properties} props * @param {(any|undefined)} saveButton * @returns {HTMLElement} @@ -103,6 +107,7 @@ const ConnectionForm = (props, saveButton) => { const connectionMaxThreads = van.state(connection?.max_threads ?? 4); const connectionQueryChars = van.state(connection?.max_query_chars ?? 9000); const privateKeyFile = van.state(getValue(props.cachedPrivateKeyFile) ?? null); + const serviceAccountKeyFile = van.state(getValue(props.cachedServiceAccountKeyFile) ?? null); const updatedConnection = van.state({ project_code: connection.project_code, @@ -120,6 +125,7 @@ const ConnectionForm = (props, saveButton) => { http_path: connection?.http_path ?? '', warehouse: connection?.warehouse ?? '', url: connection?.url ?? '', + service_account_key: connection?.service_account_key ?? '', sql_flavor_code: connectionFlavor.rawVal ?? '', connection_name: connectionName.rawVal ?? '', max_threads: connectionMaxThreads.rawVal ?? 4, @@ -155,6 +161,15 @@ const ConnectionForm = (props, saveButton) => { connection, dynamicConnectionUrl, ), + redshift_spectrum: () => RedshiftSpectrumForm( + updatedConnection, + getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), + (formValue, isValid) => { + updatedConnection.val = {...updatedConnection.val, ...formValue}; + setFieldValidity('redshift_spectrum_form', isValid); + }, + connection, + ), azure_mssql: () => AzureMSSQLForm( updatedConnection, getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), @@ -195,7 +210,6 @@ const ConnectionForm = (props, saveButton) => { connection, dynamicConnectionUrl, ), - snowflake: () => SnowflakeForm( updatedConnection, getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), @@ -218,6 +232,17 @@ const ConnectionForm = (props, saveButton) => { connection, dynamicConnectionUrl, ), + bigquery: () => BigqueryForm( + updatedConnection, + getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), + (formValue, fileValue, isValid) => { + updatedConnection.val = {...updatedConnection.val, ...formValue}; + serviceAccountKeyFile.val = fileValue; + setFieldValidity('bigquery_form', isValid); + }, + connection, + getValue(props.cachedServiceAccountKeyFile) ?? 
null + ), }; const setFieldValidity = (field, validity) => { @@ -235,7 +260,7 @@ const ConnectionForm = (props, saveButton) => { const selectedFlavorCode = connectionFlavor.val; const previousFlavorCode = connectionFlavor.oldVal; const updatedConnection_ = updatedConnection.rawVal; - + const isCustomPort = updatedConnection_?.project_port !== defaultPorts[previousFlavorCode]; if (selectedFlavorCode !== previousFlavorCode && (!isCustomPort || !updatedConnection_?.project_port)) { updatedConnection.val = {...updatedConnection_, project_port: defaultPorts[selectedFlavorCode]}; @@ -260,7 +285,11 @@ const ConnectionForm = (props, saveButton) => { const fieldsValidity = validityPerField.val; const isValid = Object.keys(fieldsValidity).length > 0 && Object.values(fieldsValidity).every(v => v); - props.onChange?.(updatedConnection.val, { dirty: dirty.val, valid: isValid }, { privateKey: privateKeyFile.rawVal }); + props.onChange?.( + updatedConnection.val, + { dirty: dirty.val, valid: isValid }, + { privateKey: privateKeyFile.rawVal, serviceAccountKey: serviceAccountKeyFile.rawVal } + ); }); return div( @@ -517,6 +546,8 @@ const RedshiftForm = ( ); }; +const RedshiftSpectrumForm = RedshiftForm; + const PostgresqlForm = RedshiftForm; const AzureMSSQLForm = RedshiftForm; @@ -723,7 +754,7 @@ const DatabricksForm = ( * @param {VanState} connection * @param {Flavor} flavor * @param {boolean} maskPassword - * @param {(params: Partial, isValid: boolean) => void} onChange + * @param {(params: Partial, fileValue: FileValue, isValid: boolean) => void} onChange * @param {Connection?} originalConnection * @param {string?} cachedFile * @param {VanState} dynamicConnectionUrl @@ -975,7 +1006,7 @@ const SnowflakeForm = ( isValid.val = Object.values(validityPerField).every(v => v); }, validators: [ - required, + requiredIf(() => !originalConnection?.connection_id || !originalConnection?.private_key), sizeLimit(200 * 1024 * 1024), ], }), @@ -1000,6 +1031,84 @@ const SnowflakeForm = ( ); }; +/** + * @param {VanState} connection + * @param {Flavor} flavor + * @param {(params: Partial, fileValue: FileValue, isValid: boolean) => void} onChange + * @param {Connection?} originalConnection + * @param {string?} originalConnection + * @param {FileValue?} cachedFile + * @returns {HTMLElement} + */ +const BigqueryForm = ( + connection, + flavor, + onChange, + originalConnection, + cachedFile, +) => { + const isValid = van.state(false); + const serviceAccountKey = van.state(connection.rawVal.service_account_key ?? null); + const projectId = van.state(""); + const serviceAccountKeyFileRaw = van.state(cachedFile); + + const validityPerField = {}; + + van.derive(() => { + projectId.val = serviceAccountKey.val?.project_id ?? ''; + isValid.val = !!projectId.val; + }); + + van.derive(() => { + onChange({ service_account_key: serviceAccountKey.val, project_db: projectId.val }, serviceAccountKeyFileRaw.val, isValid.val); + }); + + return div( + {class: 'flex-column fx-gap-3 fx-flex'}, + div( + { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, + Caption({content: 'Service Account Key', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), + + () => { + return div( + { class: 'flex-column fx-gap-3' }, + FileInput({ + name: 'service_account_key', + label: 'Upload service account key (.json)', + placeholder: (originalConnection?.connection_id && originalConnection?.service_account_key) + ? 
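
The flavor-switch handler above keeps a user-entered port when the flavor changes. A plain-value sketch of that rule (illustrative, without VanJS state): the port is replaced with the new flavor's default only when it is empty or still equal to the previous flavor's default:

// Mirrors the isCustomPort check in the hunk above, with plain values instead of van.state.
const defaultPorts = { redshift: '5439', postgresql: '5432', mssql: '1433' };

function nextPort(currentPort, previousFlavor, selectedFlavor) {
    const isCustomPort = currentPort !== defaultPorts[previousFlavor];
    if (selectedFlavor !== previousFlavor && (!isCustomPort || !currentPort)) {
        return defaultPorts[selectedFlavor];    // fall back to the new flavor's default
    }
    return currentPort;                         // keep the custom port
}

console.log(nextPort('5439', 'redshift', 'postgresql'));  // -> "5432" (still the default, so replaced)
console.log(nextPort('6000', 'redshift', 'postgresql'));  // -> "6000" (custom port preserved)
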
'Drop file here or browse files to replace existing key' + : undefined, + value: serviceAccountKeyFileRaw, + onChange: (value, state) => { + let isFieldValid = state.valid; + try { + if (value?.content) { + serviceAccountKey.val = JSON.parse(atob(value.content.split(',')?.[1] ?? '')); + } + } catch (err) { + console.error(err); + isFieldValid = false; + } + serviceAccountKeyFileRaw.val = value; + validityPerField['service_account_key'] = isFieldValid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => !originalConnection?.connection_id || !originalConnection?.service_account_key), + sizeLimit(20 * 1024), + ], + }), + ); + }, + + div( + { class: 'text-caption text-right' }, + () => `Project ID: ${projectId.val}`, + ), + ), + ); +}; + function extractPrefix(url) { const parts = (url ?? '').split('@'); if (!parts[0]) { diff --git a/testgen/ui/components/frontend/js/components/crontab_input.js b/testgen/ui/components/frontend/js/components/crontab_input.js index a1f2c638..3bb489a3 100644 --- a/testgen/ui/components/frontend/js/components/crontab_input.js +++ b/testgen/ui/components/frontend/js/components/crontab_input.js @@ -8,7 +8,7 @@ * @typedef CronSample * @type {object} * @property {string?} error - * @property {string?} sample + * @property {string[]?} samples * @property {string?} readable_expr * * @typedef InitialValue @@ -85,8 +85,8 @@ const CrontabInput = (/** @type Options */ props) => { }), ), Portal( - {target: domId.val, align: 'right', style: 'width: 450px;', opened}, - () => ContabEditorPortal( + {target: domId.val, align: 'right', style: 'width: 500px;', opened}, + () => CrontabEditorPortal( { onChange: onEditorChange, onClose: () => opened.val = false, @@ -103,7 +103,7 @@ const CrontabInput = (/** @type Options */ props) => { * @param {import('../van.min.js').VanState} expr * @returns {HTMLElement} */ -const ContabEditorPortal = ({sample, ...options}, expr) => { +const CrontabEditorPortal = ({sample, ...options}, expr) => { const mode = van.state(expr.rawVal ? determineMode(expr.rawVal) : 'x_hours'); const xHoursState = { @@ -286,11 +286,6 @@ const ContabEditorPortal = ({sample, ...options}, expr) => { { class: () => `${mode.val === 'certain_days' ? 
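
The BigqueryForm above derives the connection's project database from the uploaded key: the file input yields a base64 data URL, the JSON payload is decoded, and project_id drives both the displayed caption and form validity. A self-contained sketch of that decode step (illustrative; the error handling mirrors the try/catch in the hunk above):

// Decode a data-URL service account key and pull out project_id, as the form above does.
function parseServiceAccountKey(dataUrl) {
    try {
        const base64 = (dataUrl ?? '').split(',')[1] ?? '';
        const key = JSON.parse(atob(base64));
        return { key, projectId: key.project_id ?? '', valid: Boolean(key.project_id) };
    } catch (err) {
        console.error(err);
        return { key: null, projectId: '', valid: false };
    }
}

const sampleDataUrl = 'data:application/json;base64,' + btoa('{"project_id": "my-gcp-project"}');
console.log(parseServiceAccountKey(sampleDataUrl).projectId);  // -> "my-gcp-project"

Note that the requiredIf validator makes the upload optional when editing a connection that already has a stored key, and sizeLimit caps the file at 20 KB.
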
'' : 'hidden'}`}, div( {class: 'flex-row fx-gap-2 mb-2'}, - Checkbox({ - label: 'Sunday', - checked: certainDaysState.sunday, - onChange: (v) => certainDaysState.sunday.val = v, - }), Checkbox({ label: 'Monday', checked: certainDaysState.monday, @@ -301,22 +296,20 @@ const ContabEditorPortal = ({sample, ...options}, expr) => { checked: certainDaysState.tuesday, onChange: (v) => certainDaysState.tuesday.val = v, }), - ), - div( - {class: 'flex-row fx-gap-2 mb-2'}, Checkbox({ label: 'Wednesday', checked: certainDaysState.wednesday, onChange: (v) => certainDaysState.wednesday.val = v, }), + ), + div( + {class: 'flex-row fx-gap-2 mb-2'}, + Checkbox({ label: 'Thursday', checked: certainDaysState.thursday, onChange: (v) => certainDaysState.thursday.val = v, }), - ), - div( - {class: 'flex-row fx-gap-2 mb-2'}, Checkbox({ label: 'Friday', checked: certainDaysState.friday, @@ -327,6 +320,11 @@ const ContabEditorPortal = ({sample, ...options}, expr) => { checked: certainDaysState.saturday, onChange: (v) => certainDaysState.saturday.val = v, }), + Checkbox({ + label: 'Sunday', + checked: certainDaysState.sunday, + onChange: (v) => certainDaysState.sunday.val = v, + }), ), div( {class: 'flex-row fx-gap-2'}, @@ -370,10 +368,17 @@ const ContabEditorPortal = ({sample, ...options}, expr) => { span({class: 'fx-flex'}, ''), div( {class: 'flex-column fx-gap-1 mt-3 text-secondary'}, - () => span({}, `Cron Expression: ${expr.val ?? ''}`), - () => span({}, `Next Run: ${(getValue(sample) ?? {})?.sample ?? ''}`), + () => span( + { class: mode.val === 'custom' ? 'hidden': '' }, + `Cron Expression: ${expr.val ?? ''}`, + ), + () => div( + { class: 'flex-column' }, + span('Next Runs:'), + (getValue(sample) ?? {})?.samples?.map(item => span({ class: 'text-caption' }, item)), + ), () => div( - {class: 'flex-row fx-gap-1 text-caption'}, + {class: `flex-row fx-gap-1 text-caption ${mode.val === 'custom' ? '': 'hidden'}`}, span({}, 'Learn more about'), Link({ open_new: true, diff --git a/testgen/ui/components/frontend/js/components/paginator.js b/testgen/ui/components/frontend/js/components/paginator.js index 602302b2..7799e7f2 100644 --- a/testgen/ui/components/frontend/js/components/paginator.js +++ b/testgen/ui/components/frontend/js/components/paginator.js @@ -21,7 +21,8 @@ const Paginator = (/** @type Properties */ props) => { } const { count, pageSize } = props; - const pageIndexState = van.state(getValue(props.pageIndex) ?? 0); + const pageIndexState = van.derive(() => getValue(props.pageIndex) ?? 0); + van.derive(() => { const onChange = props.onChange?.val ?? props.onChange ?? changePage; onChange(pageIndexState.val); diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js index 967aee4e..72bb11cc 100644 --- a/testgen/ui/components/frontend/js/components/select.js +++ b/testgen/ui/components/frontend/js/components/select.js @@ -1,5 +1,5 @@ /** - * @typedef Option + * @typedef SelectOption * @type {object} * @property {string} label * @property {string} value @@ -11,7 +11,7 @@ * @property {string?} id * @property {string} label * @property {string?} value - * @property {Array.
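
The crontab editor above now expects a list of upcoming run times under samples (replacing the single sample field) and renders one caption line per entry. A small sketch of that shape, with assumed example timestamps:

// Illustrative handling of the new CronSample.samples list.
function formatNextRuns(cronSample) {
    if (cronSample?.error) {
        return [`Error: ${cronSample.error}`];
    }
    return ['Next Runs:', ...(cronSample?.samples ?? [])];
}

console.log(formatNextRuns({ samples: ['2024-06-01 08:00', '2024-06-02 08:00'] }));
// -> ["Next Runs:", "2024-06-01 08:00", "2024-06-02 08:00"]

The paginator change in the same set of hunks is related: deriving pageIndexState from the incoming prop with van.derive, instead of snapshotting it once into van.state, keeps the component in sync when the parent later changes props.pageIndex.
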