From 76ad5441cb99ddba123f10d4dc9d73eda3e03441 Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Wed, 6 Aug 2025 15:18:12 -0400
Subject: [PATCH 01/48] feat(bigquery): Adding BigQuery to the connections
pages
---
pyproject.toml | 1 +
testgen/commands/run_launch_db_config.py | 1 +
testgen/common/database/database_service.py | 14 +-
.../flavor/bigquery_flavor_service.py | 17 +++
.../common/database/flavor/flavor_service.py | 6 +
testgen/common/models/connection.py | 3 +-
testgen/common/models/custom_types.py | 17 ++-
.../030_initialize_new_schema_structure.sql | 3 +-
.../dbupgrade/0148_incremental_upgrade.sql | 1 -
.../dbupgrade/0151_incremental_upgrade.sql | 3 +
.../data_chars/schema_ddf_query_bigquery.sql | 26 ++++
testgen/ui/assets/flavors/bigquery.svg | 26 ++++
.../frontend/js/components/connection_form.js | 121 ++++++++++++++++--
testgen/ui/services/database_service.py | 11 +-
testgen/ui/views/connections.py | 8 +-
15 files changed, 230 insertions(+), 28 deletions(-)
create mode 100644 testgen/common/database/flavor/bigquery_flavor_service.py
create mode 100644 testgen/template/dbupgrade/0151_incremental_upgrade.sql
create mode 100644 testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql
create mode 100644 testgen/ui/assets/flavors/bigquery.svg
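Note on the wiring below: the new BigqueryFlavorService hands SQLAlchemy two things, a "bigquery://<project_id>" URL and the parsed service-account key passed through as an engine argument. A minimal sketch of that flow, assuming sqlalchemy-bigquery's documented `credentials_info` keyword and a hypothetical, truncated key (a real key JSON is required to actually connect):

    from sqlalchemy import create_engine

    # Hypothetical, truncated key used only for illustration.
    service_account_key = {
        "type": "service_account",
        "project_id": "my-gcp-project",
    }

    engine = create_engine(
        # connection string as built by get_connection_string_from_fields()
        f"bigquery://{service_account_key['project_id']}",
        # engine argument as returned by get_engine_args()
        credentials_info=service_account_key,
    )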
diff --git a/pyproject.toml b/pyproject.toml
index b463aaa4..21b6dee0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
"sqlalchemy==1.4.46",
"databricks-sql-connector==2.9.3",
"snowflake-sqlalchemy==1.6.1",
+ "sqlalchemy-bigquery==1.14.1",
"pyodbc==5.0.0",
"psycopg2-binary==2.9.9",
"pycryptodome==3.21",
diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py
index 68f99336..cc248f0e 100644
--- a/testgen/commands/run_launch_db_config.py
+++ b/testgen/commands/run_launch_db_config.py
@@ -41,6 +41,7 @@ def _get_params_mapping() -> dict:
"PROJECT_HOST": settings.PROJECT_DATABASE_HOST,
"PROJECT_PW_ENCRYPTED": EncryptText(settings.PROJECT_DATABASE_PASSWORD),
"PROJECT_HTTP_PATH": "",
+ "PROJECT_SERVICE_ACCOUNT_KEY": "",
"PROJECT_SCHEMA": settings.PROJECT_DATABASE_SCHEMA,
"PROFILING_TABLE_SET": settings.DEFAULT_PROFILING_TABLE_SET,
"PROFILING_INCLUDE_MASK": settings.DEFAULT_PROFILING_INCLUDE_MASK,
diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py
index 75016501..94e81870 100644
--- a/testgen/common/database/database_service.py
+++ b/testgen/common/database/database_service.py
@@ -357,15 +357,17 @@ def _init_target_db_connection() -> Connection:
engine = engine_cache.target_db
if not engine:
- connection_string = flavor_service.get_connection_string()
- connect_args = flavor_service.get_connect_args()
-
try:
- engine: Engine = create_engine(connection_string, connect_args=connect_args)
- engine_cache.target_db = engine
-
+ engine: Engine = create_engine(
+ flavor_service.get_connection_string(),
+ connect_args=flavor_service.get_connect_args(),
+ **flavor_service.get_engine_args(),
+ )
except SQLAlchemyError as e:
raise ValueError(f"Failed to create engine for Target database '{flavor_service.dbname}' (User type = normal)") from e
+ else:
+ engine_cache.target_db = engine
+
connection: Connection = engine.connect()
diff --git a/testgen/common/database/flavor/bigquery_flavor_service.py b/testgen/common/database/flavor/bigquery_flavor_service.py
new file mode 100644
index 00000000..38e6d913
--- /dev/null
+++ b/testgen/common/database/flavor/bigquery_flavor_service.py
@@ -0,0 +1,17 @@
+from typing import Any
+
+from testgen.common.database.flavor.flavor_service import FlavorService
+
+
+class BigqueryFlavorService(FlavorService):
+ def get_connection_string_head(self):
+ return "bigquery://"
+
+ def get_connection_string_from_fields(self):
+ return f"bigquery://{self.service_account_key["project_id"] if self.service_account_key else ""}"
+
+ def get_connect_args(self) -> dict:
+ return {}
+
+ def get_engine_args(self) -> dict[str, Any]:
+ return {"credentials_info": self.service_account_key} if self.service_account_key else {}
diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py
index a257b2da..fe302dcb 100644
--- a/testgen/common/database/flavor/flavor_service.py
+++ b/testgen/common/database/flavor/flavor_service.py
@@ -21,6 +21,7 @@ class ConnectionParams(TypedDict):
private_key: bytes
private_key_passphrase: bytes
http_path: str
+ service_account_key: dict[str, Any]
class FlavorService:
@@ -37,6 +38,7 @@ class FlavorService:
private_key = None
private_key_passphrase = None
http_path = None
+ service_account_key = None
catalog = None
warehouse = None
@@ -53,6 +55,7 @@ def init(self, connection_params: ConnectionParams):
self.http_path = connection_params.get("http_path") or ""
self.catalog = connection_params.get("catalog") or ""
self.warehouse = connection_params.get("warehouse") or ""
+ self.service_account_key = connection_params.get("service_account_key", None)
password = connection_params.get("project_pw_encrypted", None)
if isinstance(password, memoryview) or isinstance(password, bytes):
@@ -75,6 +78,9 @@ def get_pre_connection_queries(self) -> list[tuple[str, dict | None]]:
def get_connect_args(self) -> dict:
return {"connect_timeout": 3600}
+ def get_engine_args(self) -> dict[str, Any]:
+ return {}
+
def get_concat_operator(self) -> str:
return "||"
diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py
index 660f51fd..1cef40c6 100644
--- a/testgen/common/models/connection.py
+++ b/testgen/common/models/connection.py
@@ -22,7 +22,7 @@
from testgen.common.database.database_service import get_flavor_service
from testgen.common.database.flavor.flavor_service import SQLFlavor
from testgen.common.models import get_current_session
-from testgen.common.models.custom_types import EncryptedBytea
+from testgen.common.models.custom_types import JSON_TYPE, EncryptedBytea, EncryptedJson
from testgen.common.models.entity import ENTITY_HASH_FUNCS, Entity, EntityMinimal
from testgen.common.models.table_group import TableGroup
from testgen.utils import is_uuid4
@@ -61,6 +61,7 @@ class Connection(Entity):
private_key_passphrase: str = Column(EncryptedBytea)
http_path: str = Column(String)
warehouse: str = Column(String)
+ service_account_key: JSON_TYPE = Column(EncryptedJson)
_get_by = "connection_id"
_default_order_by = (asc(func.lower(connection_name)),)
diff --git a/testgen/common/models/custom_types.py b/testgen/common/models/custom_types.py
index b4a34276..e68726ba 100644
--- a/testgen/common/models/custom_types.py
+++ b/testgen/common/models/custom_types.py
@@ -1,10 +1,14 @@
+import json
from datetime import UTC, datetime
+from types import NoneType
from sqlalchemy import Integer, String, TypeDecorator
from sqlalchemy.dialects import postgresql
from testgen.common.encrypt import DecryptText, EncryptText
+JSON_TYPE = str | int | float | list | dict | NoneType
+
class NullIfEmptyString(TypeDecorator):
impl = String
@@ -22,12 +26,12 @@ def process_bind_param(self, value: bool | str | None, _dialect) -> str | None:
if isinstance(value, bool):
return "Y" if value else "N"
return value
-
+
def process_result_value(self, value: str | None, _dialect) -> bool | None:
if isinstance(value, str):
return value == "Y"
return value
-
+
class ZeroIfEmptyInteger(TypeDecorator):
impl = Integer
@@ -54,3 +58,12 @@ def process_bind_param(self, value: str, _dialect) -> bytes:
def process_result_value(self, value: bytes, _dialect) -> str:
return DecryptText(value) if value is not None else value
+
+
+class EncryptedJson(EncryptedBytea):
+
+ def process_bind_param(self, value: JSON_TYPE, _dialect) -> bytes:
+ return None if value is None else super().process_bind_param(json.dumps(value), _dialect)
+
+ def process_result_value(self, value: bytes, _dialect) -> JSON_TYPE:
+ return None if value is None else json.loads(super().process_result_value(value, _dialect))
diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
index 09593c39..80d4d66f 100644
--- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
+++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
@@ -78,7 +78,8 @@ CREATE TABLE connections (
private_key BYTEA,
private_key_passphrase BYTEA,
http_path VARCHAR(200),
- warehouse VARCHAR(200)
+ warehouse VARCHAR(200),
+ service_account_key BYTEA
);
CREATE TABLE table_groups
diff --git a/testgen/template/dbupgrade/0148_incremental_upgrade.sql b/testgen/template/dbupgrade/0148_incremental_upgrade.sql
index b69d2b1d..20f3c53f 100644
--- a/testgen/template/dbupgrade/0148_incremental_upgrade.sql
+++ b/testgen/template/dbupgrade/0148_incremental_upgrade.sql
@@ -3,4 +3,3 @@ SET SEARCH_PATH TO {SCHEMA_NAME};
UPDATE test_definitions
SET id = gen_random_uuid()
WHERE id IS NULL;
-
\ No newline at end of file
diff --git a/testgen/template/dbupgrade/0151_incremental_upgrade.sql b/testgen/template/dbupgrade/0151_incremental_upgrade.sql
new file mode 100644
index 00000000..91277507
--- /dev/null
+++ b/testgen/template/dbupgrade/0151_incremental_upgrade.sql
@@ -0,0 +1,3 @@
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+ALTER TABLE connections ADD COLUMN service_account_key BYTEA;
diff --git a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql b/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql
new file mode 100644
index 00000000..f3a1d6fa
--- /dev/null
+++ b/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql
@@ -0,0 +1,26 @@
+SELECT '{PROJECT_CODE}' AS project_code,
+ CURRENT_TIMESTAMP() AS refresh_timestamp,
+ c.table_schema,
+ c.table_name,
+ c.column_name,
+ CASE
+ WHEN LOWER(c.data_type) LIKE 'timestamp%' THEN LOWER(c.data_type)
+ WHEN LOWER(c.data_type) = 'date' THEN 'date'
+ WHEN LOWER(c.data_type) = 'bool' THEN 'boolean'
+ ELSE LOWER(c.data_type)
+ END AS data_type,
+ NULL AS character_maximum_length,
+ c.ordinal_position,
+ CASE
+ WHEN REGEXP_CONTAINS(LOWER(c.data_type), r'(string|bytes)') THEN 'A'
+ WHEN LOWER(c.data_type) = 'bool' THEN 'B'
+ WHEN LOWER(c.data_type) IN ('date', 'datetime', 'timestamp') THEN 'D'
+ WHEN LOWER(c.data_type) = 'time' THEN 'T'
+ WHEN LOWER(c.data_type) IN ('int64', 'float64') THEN 'N'
+ WHEN REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') THEN 'N'
+ ELSE 'X'
+ END AS general_type,
+ NULL AS is_decimal
+FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` c
+WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA}
+ORDER BY c.table_schema, c.table_name, c.ordinal_position;
diff --git a/testgen/ui/assets/flavors/bigquery.svg b/testgen/ui/assets/flavors/bigquery.svg
new file mode 100644
index 00000000..8793b381
--- /dev/null
+++ b/testgen/ui/assets/flavors/bigquery.svg
@@ -0,0 +1,26 @@
+
+
diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js
index e16b29a1..d81f6683 100644
--- a/testgen/ui/components/frontend/js/components/connection_form.js
+++ b/testgen/ui/components/frontend/js/components/connection_form.js
@@ -1,7 +1,7 @@
/**
* @import { FileValue } from './file_input.js';
* @import { VanState } from '../van.min.js';
- *
+ *
* @typedef Flavor
* @type {object}
* @property {string} label
@@ -9,13 +9,13 @@
* @property {string} icon
* @property {string} flavor
* @property {string} connection_string
- *
+ *
* @typedef ConnectionStatus
* @type {object}
* @property {string} message
* @property {boolean} successful
* @property {string?} details
- *
+ *
* @typedef Connection
* @type {object}
* @property {string} connection_id
@@ -35,23 +35,24 @@
* @property {string?} http_path
* @property {string?} warehouse
* @property {ConnectionStatus?} status
- *
+ *
* @typedef FormState
* @type {object}
* @property {boolean} dirty
* @property {boolean} valid
- *
+ *
* @typedef FieldsCache
* @type {object}
* @property {FileValue} privateKey
- *
+ * @property {FileValue} serviceAccountKey
+ *
* @typedef Properties
* @type {object}
* @property {Connection} connection
* @property {Array.} flavors
* @property {boolean} disableFlavor
* @property {FileValue?} cachedPrivateKeyFile
- * @property {string?} dynamicConnectionUrl
+ * @property {FileValue?} cachedServiceAccountKeyFile
* @property {(c: Connection, state: FormState, cache?: FieldsCache) => void} onChange
*/
import van from '../van.min.js';
@@ -81,7 +82,7 @@ const defaultPorts = {
};
/**
- *
+ *
* @param {Properties} props
* @param {(any|undefined)} saveButton
* @returns {HTMLElement}
@@ -103,6 +104,7 @@ const ConnectionForm = (props, saveButton) => {
const connectionMaxThreads = van.state(connection?.max_threads ?? 4);
const connectionQueryChars = van.state(connection?.max_query_chars ?? 9000);
const privateKeyFile = van.state(getValue(props.cachedPrivateKeyFile) ?? null);
+ const serviceAccountKeyFile = van.state(getValue(props.cachedServiceAccountKeyFile) ?? null);
const updatedConnection = van.state({
project_code: connection.project_code,
@@ -120,6 +122,7 @@ const ConnectionForm = (props, saveButton) => {
http_path: connection?.http_path ?? '',
warehouse: connection?.warehouse ?? '',
url: connection?.url ?? '',
+ service_account_key: connection?.service_account_key ?? '',
sql_flavor_code: connectionFlavor.rawVal ?? '',
connection_name: connectionName.rawVal ?? '',
max_threads: connectionMaxThreads.rawVal ?? 4,
@@ -195,7 +198,6 @@ const ConnectionForm = (props, saveButton) => {
connection,
dynamicConnectionUrl,
),
-
snowflake: () => SnowflakeForm(
updatedConnection,
getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal),
@@ -218,6 +220,17 @@ const ConnectionForm = (props, saveButton) => {
connection,
dynamicConnectionUrl,
),
+ bigquery: () => BigqueryForm(
+ updatedConnection,
+ getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal),
+ (formValue, fileValue, isValid) => {
+ updatedConnection.val = {...updatedConnection.val, ...formValue};
+ serviceAccountKeyFile.val = fileValue;
+ setFieldValidity('bigquery_form', isValid);
+ },
+ connection,
+ getValue(props.cachedServiceAccountKeyFile) ?? null
+ ),
};
const setFieldValidity = (field, validity) => {
@@ -234,7 +247,7 @@ const ConnectionForm = (props, saveButton) => {
const selectedFlavorCode = connectionFlavor.val;
const previousFlavorCode = connectionFlavor.oldVal;
const updatedConnection_ = updatedConnection.rawVal;
-
+
const isCustomPort = updatedConnection_?.project_port !== defaultPorts[previousFlavorCode];
if (selectedFlavorCode !== previousFlavorCode && (!isCustomPort || !updatedConnection_?.project_port)) {
updatedConnection.val = {...updatedConnection_, project_port: defaultPorts[selectedFlavorCode]};
@@ -259,7 +272,11 @@ const ConnectionForm = (props, saveButton) => {
const fieldsValidity = validityPerField.val;
const isValid = Object.keys(fieldsValidity).length > 0 &&
Object.values(fieldsValidity).every(v => v);
- props.onChange?.(updatedConnection.val, { dirty: dirty.val, valid: isValid }, { privateKey: privateKeyFile.rawVal });
+ props.onChange?.(
+ updatedConnection.val,
+ { dirty: dirty.val, valid: isValid },
+ { privateKey: privateKeyFile.rawVal, serviceAccountKey: serviceAccountKeyFile.rawVal }
+ );
});
return div(
@@ -722,7 +739,7 @@ const DatabricksForm = (
* @param {VanState} connection
* @param {Flavor} flavor
* @param {boolean} maskPassword
- * @param {(params: Partial, isValid: boolean) => void} onChange
+ * @param {(params: Partial, fileValue: FileValue, isValid: boolean) => void} onChange
* @param {Connection?} originalConnection
* @param {string?} cachedFile
* @param {VanState} dynamicConnectionUrl
@@ -999,6 +1016,86 @@ const SnowflakeForm = (
);
};
+/**
+ * @param {VanState} connection
+ * @param {Flavor} flavor
+ * @param {(params: Partial, fileValue: FileValue, isValid: boolean) => void} onChange
+ * @param {Connection?} originalConnection
+ * @param {FileValue?} cachedFile
+ * @returns {HTMLElement}
+ */
+const BigqueryForm = (
+ connection,
+ flavor,
+ onChange,
+ originalConnection,
+ cachedFile,
+) => {
+ const isValid = van.state(false);
+ const serviceAccountKey = van.state(connection.rawVal.service_account_key ?? null);
+ const projectId = van.state("");
+ const serviceAccountKeyFileRaw = van.state(cachedFile);
+
+ van.derive(() => {
+ projectId.val = serviceAccountKey.val?.project_id ?? '';
+ isValid.val = !!projectId.val;
+ });
+
+ van.derive(() => {
+ onChange({ service_account_key: serviceAccountKey.val }, serviceAccountKeyFileRaw.val, isValid.val);
+ });
+
+ return div(
+ {class: 'flex-column fx-gap-3 fx-flex'},
+ div(
+ { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' },
+ Caption({content: 'Service Account Key', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }),
+
+ () => {
+ return div(
+ { class: 'flex-column fx-gap-3' },
+ FileInput({
+ name: 'service_account_key',
+ label: 'Upload service account key (.json)',
+ placeholder: (originalConnection?.connection_id && originalConnection?.service_account_key)
+ ? 'Drop file here or browse files to replace existing key'
+ : undefined,
+ value: serviceAccountKeyFileRaw,
+ onChange: (value, state) => {
+ let isFieldValid = state.valid;
+ try {
+ if (value?.content) {
+ serviceAccountKey.val = JSON.parse(atob(value.content.split(',')?.[1] ?? ''));
+ }
+ } catch (err) {
+ console.error(err);
+ isFieldValid = false;
+ }
+ serviceAccountKeyFileRaw.val = value;
+ },
+ validators: [
+ sizeLimit(20 * 1024),
+ ],
+ }),
+ );
+ },
+
+ div(
+ { class: 'flex-row fx-gap-3 fx-flex' },
+ Input({
+ name: 'project_id',
+ label: 'Project ID',
+ value: projectId,
+ height: 38,
+ class: 'fx-flex',
+ disabled: true,
+ }),
+ ),
+ ),
+ );
+};
+
function extractPrefix(url) {
const parts = (url ?? '').split('@');
if (!parts[0]) {
diff --git a/testgen/ui/services/database_service.py b/testgen/ui/services/database_service.py
index a094bc84..cf5c7280 100644
--- a/testgen/ui/services/database_service.py
+++ b/testgen/ui/services/database_service.py
@@ -53,12 +53,15 @@ def fetch_one_from_db(query: str, params: dict | None = None) -> RowMapping | No
return result._mapping if result else None
-def fetch_from_target_db(connection: Connection, query: str, params: dict | None = None) -> list[Row]:
+def fetch_from_target_db(connection: Connection, query: str, params: dict | None = None) -> list[Row]:
flavor_service = get_flavor_service(connection.sql_flavor)
flavor_service.init(connection.to_dict())
- connection_string = flavor_service.get_connection_string()
- connect_args = flavor_service.get_connect_args()
- engine = create_engine(connection_string, connect_args=connect_args)
+
+ engine = create_engine(
+ flavor_service.get_connection_string(),
+ connect_args=flavor_service.get_connect_args(),
+ **flavor_service.get_engine_args(),
+ )
with engine.connect() as connection:
cursor: CursorResult = connection.execute(text(query), params)
diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py
index ab52de96..cecf9197 100644
--- a/testgen/ui/views/connections.py
+++ b/testgen/ui/views/connections.py
@@ -162,7 +162,7 @@ def on_test_connection_clicked(updated_connection: dict) -> None:
message = "Error creating connection"
success = False
LOG.exception(message)
-
+
results = {
"success": success,
"message": message,
@@ -477,4 +477,10 @@ class ConnectionFlavor:
flavor="databricks",
icon=get_asset_data_url("flavors/databricks.svg"),
),
+ ConnectionFlavor(
+ label="BigQuery",
+ value="bigquery",
+ flavor="bigquery",
+ icon=get_asset_data_url("flavors/bigquery.svg"),
+ ),
]
From b9ef2b42851bc3b82ae0eb7e12780ed4a71b8292 Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Fri, 15 Aug 2025 12:11:17 -0400
Subject: [PATCH 02/48] feat(bigquery): Support BigQuery for Profiling
---
...roject_get_table_sample_count_bigquery.sql | 30 ++
.../project_profiling_query_bigquery.yaml | 267 ++++++++++++++++++
...ect_secondary_profiling_query_bigquery.sql | 52 ++++
.../profiling/templated_functions.yaml | 49 ++++
4 files changed, 398 insertions(+)
create mode 100644 testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml
create mode 100644 testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/profiling/templated_functions.yaml
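The new project_get_table_sample_count_bigquery.sql decides how many rows to profile per table. Its CASE logic, restated as a small Python sketch with hypothetical values (a result of -1 means "profile the full table"; 999000 is the hard cap used in the SQL):

    # Python restatement of the sample-size CASE logic in
    # project_get_table_sample_count_bigquery.sql; inputs are hypothetical.
    def sample_count(record_ct: int, sample_percent: float, min_sample_ct: int,
                     max_sample_ct: int = 999_000) -> int:
        calc = round(sample_percent * record_ct / 100.0)
        if record_ct <= min_sample_ct:
            return -1                 # table is small: profile every row
        if calc > max_sample_ct:
            return max_sample_ct      # cap very large samples
        if calc > min_sample_ct:
            return calc               # normal case: percentage of the table
        return min_sample_ct          # never sample fewer than the minimum

    print(sample_count(record_ct=2_000_000, sample_percent=30, min_sample_ct=15_000))  # 600000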
diff --git a/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql
new file mode 100644
index 00000000..4fdfcc6e
--- /dev/null
+++ b/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql
@@ -0,0 +1,30 @@
+WITH stats AS (
+ SELECT
+ COUNT(*) * 1.0 AS record_ct,
+ ROUND(CAST({PROFILE_SAMPLE_PERCENT} AS FLOAT64) * COUNT(*) * 1.0 / 100.0) AS calc_sample_ct,
+ CAST({PROFILE_SAMPLE_MIN_COUNT} AS FLOAT64) AS min_sample_ct,
+ CAST(999000 AS FLOAT64) AS max_sample_ct
+ FROM `{SAMPLING_TABLE}`
+)
+SELECT '{SAMPLING_TABLE}' AS schema_table,
+ CASE
+ WHEN record_ct <= min_sample_ct THEN -1
+ WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct
+ WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct
+ ELSE {PROFILE_SAMPLE_MIN_COUNT}
+ END AS sample_count,
+ CASE
+ WHEN record_ct <= min_sample_ct THEN 1
+ WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct
+ WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct
+ ELSE record_ct / min_sample_ct
+ END AS sample_ratio,
+ ROUND(
+ CASE
+ WHEN record_ct <= min_sample_ct THEN 100
+ WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct
+ WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct
+ ELSE 100.0 * min_sample_ct / record_ct
+ END,
+ 4) AS sample_percent_calc
+FROM stats;
diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml
new file mode 100644
index 00000000..dcadd458
--- /dev/null
+++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml
@@ -0,0 +1,267 @@
+---
+strTemplate01_sampling: |
+ WITH target_table AS (
+ SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` WHERE RAND() * 100 < {SAMPLE_PERCENT_CALC}
+ )
+ SELECT
+strTemplate01_else: |
+ WITH target_table AS (
+ SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}`
+ )
+ SELECT
+strTemplate02_all: |
+ {CONNECTION_ID} as connection_id,
+ '{PROJECT_CODE}' as project_code,
+ '{TABLE_GROUPS_ID}' as table_groups_id,
+ '{DATA_SCHEMA}' AS schema_name,
+ '{RUN_DATE}' AS run_date,
+ '{DATA_TABLE}' AS table_name,
+ {COL_POS} AS position,
+ '{COL_NAME_SANITIZED}' AS column_name,
+ '{COL_TYPE}' AS column_type,
+ '{COL_GEN_TYPE}' AS general_type,
+ COUNT(*) AS record_ct,
+ COUNT(`{COL_NAME}`) AS value_ct,
+ COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct,
+ SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct,
+strTemplate03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length,
+ MAX(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS max_length,
+ AVG(NULLIF(LENGTH(CAST(`{COL_NAME}` AS STRING)), 0)) AS avg_length,
+strTemplate03_else: NULL as min_length,
+ NULL as max_length,
+ NULL as avg_length,
+strTemplate04_A: SUM(
+ CASE
+ WHEN REGEXP_CONTAINS(TRIM(CAST(`{COL_NAME}` AS STRING)), r'^0(\.0*)?$') THEN 1
+ ELSE 0
+ END
+ ) AS zero_value_ct,
+strTemplate04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct,
+strTemplate04_else: NULL as zero_value_ct,
+strTemplate05_A: |
+ COUNT(
+ DISTINCT UPPER(
+ REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r"[ '\.,-]", "")
+ )
+ ) as distinct_std_value_ct,
+ SUM(CASE WHEN `{COL_NAME}` = '' THEN 1 ELSE 0 END) AS zero_length_ct,
+ SUM(CASE WHEN `{COL_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) AS lead_space_ct,
+ SUM(
+ CASE
+ WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE '"%"'
+ OR LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE "'%'" THEN 1
+ ELSE 0
+ END
+ ) AS quoted_value_ct,
+ SUM(
+ CASE
+ WHEN REGEXP_CONTAINS(CAST(`{COL_NAME}` AS STRING), r'.*[0-9].*') THEN 1
+ ELSE 0
+ END
+ ) AS includes_digit_ct,
+ SUM(
+ CASE
+ WHEN CAST(`{COL_NAME}` AS STRING) IN ('.', '?', ' ') THEN 1
+ WHEN REGEXP_CONTAINS(LOWER(CAST(`{COL_NAME}` AS STRING)), r'^\s*[-09xz]{2,}\s*$') THEN 1
+ WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('blank','error','missing','tbd',
+ 'n/a','#na','none','null','unknown') THEN 1
+ WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('(blank)','(error)','(missing)','(tbd)',
+ '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1
+ WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('[blank]','[error]','[missing]','[tbd]',
+ '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1
+ ELSE 0
+ END
+ ) AS filled_value_ct,
+ LEFT(MIN(NULLIF(`{COL_NAME}`, '')), 100) AS min_text,
+ LEFT(MAX(NULLIF(`{COL_NAME}`, '')), 100) AS max_text,
+ SUM( CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` <> LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS upper_case_ct,
+ SUM( CASE WHEN `{COL_NAME}` = LOWER(`{COL_NAME}`) AND `{COL_NAME}` <> UPPER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS lower_case_ct,
+ SUM( CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` = LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS non_alpha_ct,
+ COUNTIF(
+ TRANSLATE(
+ CAST(`{COL_NAME}` AS STRING),
+ CODE_POINTS_TO_STRING([160, 8201, 8203, 8204, 8205, 8206, 8207, 8239, 12288, 65279]),
+ REPEAT('X', 10)
+ ) <> CAST(`{COL_NAME}` AS STRING)
+ ) as non_printing_ct,
+ SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct,
+ SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct,
+ CASE
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'STREET_ADDR'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN `{COL_NAME}` IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'STATE_USA'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'PHONE_USA'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'EMAIL'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN TRANSLATE(`{COL_NAME}`, '012345678', '999999999') IN ('99999', '999999999', '99999-9999')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'ZIP_USA'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[\w\s\-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'FILE_NAME'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([0-9]{4}[- ]){3}[0-9]{4}$')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'CREDIT_CARD'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$')
+ AND NOT REGEXP_CONTAINS(`{COL_NAME}`, r'\s(and|but|or|yet)\s')
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'DELIMITED_DATA'
+ WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$')
+ AND CAST(SUBSTR(`{COL_NAME}`, 1, 3) AS INT64) NOT BETWEEN 734 AND 749
+ AND SUBSTR(`{COL_NAME}`, 1, 3) <> '666'
+ THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'SSN'
+ END AS std_pattern_match,
+strTemplate05_else: NULL as distinct_std_value_ct,
+ NULL as zero_length_ct,
+ NULL as lead_space_ct,
+ NULL as quoted_value_ct,
+ NULL as includes_digit_ct,
+ NULL as filled_value_ct,
+ NULL as min_text,
+ NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
+ NULL as non_printing_ct,
+ NULL as numeric_ct,
+ NULL as date_ct,
+ NULL as std_pattern_match,
+strTemplate06_A_patterns: |
+ (
+ SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_patterns
+ FROM (
+ SELECT CONCAT(CAST(ct AS STRING), ' | ', pattern) AS val,
+ ct
+ FROM (
+ SELECT pattern,
+ COUNT(*) AS ct
+ FROM (
+ SELECT REGEXP_REPLACE(
+ REGEXP_REPLACE(
+ REGEXP_REPLACE(CAST({COL_NAME} AS STRING), r'[a-z]', 'a'),
+ r'[A-Z]', 'A'),
+ r'[0-9]', 'N') AS pattern
+ FROM `target_table`
+ WHERE {COL_NAME} > ' '
+ AND (
+ SELECT MAX(LENGTH(CAST({COL_NAME} AS STRING)))
+ FROM `target_table`
+ ) BETWEEN 3 AND {PARM_MAX_PATTERN_LENGTH}
+ ) p
+ GROUP BY pattern
+ HAVING pattern > ' '
+ ORDER BY ct DESC
+ LIMIT 5
+ )
+ ) ps
+ ) as top_patterns,
+strTemplate06_else: NULL as top_patterns,
+strTemplate07_A_freq: |
+ (
+ SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_freq_values
+ FROM (
+ SELECT CONCAT(CAST(ct AS STRING), ' | ', CAST({COL_NAME} AS STRING)) AS val,
+ ct
+ FROM (
+ SELECT {COL_NAME},
+ COUNT(*) AS ct
+ FROM `target_table`
+ WHERE {COL_NAME} > ' '
+ GROUP BY {COL_NAME}
+ HAVING {COL_NAME} > ' '
+ ORDER BY ct DESC, {COL_NAME} DESC
+ LIMIT 10
+ )
+ ) ps
+ ) as top_freq_values,
+strTemplate07_else: NULL as top_freq_values,
+strTemplate08_N: MIN(`{COL_NAME}`) AS min_value,
+ MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0,
+ MAX(`{COL_NAME}`) AS max_value,
+ AVG(CAST(`{COL_NAME}` AS FLOAT64)) AS avg_value,
+ STDDEV(CAST(`{COL_NAME}` AS FLOAT64)) AS stdev_value,
+ MIN(pct_25) AS percentile_25,
+ MIN(pct_50) AS percentile_50,
+ MIN(pct_75) AS percentile_75,
+strTemplate08_else: NULL as min_value,
+ NULL as min_value_over_0,
+ NULL as max_value,
+ NULL as avg_value,
+ NULL as stdev_value,
+ NULL as percentile_25,
+ NULL as percentile_50,
+ NULL as percentile_75,
+strTemplate10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum,
+strTemplate10_else: NULL as fractional_sum,
+strTemplate11_D: |
+ MIN(`{COL_NAME}`) AS min_date, -- Other flavors have a minimum threshold of 0001-01-01, but BigQuery doesn't make it easy to do the same
+ MAX(`{COL_NAME}`) as max_date,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 12 THEN 1 END) AS before_1yr_date_ct,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 60 THEN 1 END) AS before_5yr_date_ct,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 240 THEN 1 END) AS before_20yr_date_ct,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 1200 THEN 1 END) AS before_100yr_date_ct,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY) BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY) BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct,
+ COUNT(CASE WHEN SAFE_CAST(DATE(`{COL_NAME}`) AS DATE) > SAFE_CAST(DATE('{RUN_DATE}') AS DATE) THEN 1 END) AS future_date_ct,
+ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), SAFE_CAST(DATE('{RUN_DATE}') AS DATE), MONTH) > 240 THEN 1 END) AS distant_future_date_ct,
+ COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present,
+ COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), WEEK)) AS date_weeks_present,
+ COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH)) AS date_months_present,
+
+strTemplate11_else: NULL as min_date,
+ NULL as max_date,
+ NULL as before_1yr_date_ct,
+ NULL as before_5yr_date_ct,
+ NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
+ NULL as within_1yr_date_ct,
+ NULL as within_1mo_date_ct,
+ NULL as future_date_ct,
+ NULL as distant_future_date_ct,
+ NULL as date_days_present,
+ NULL as date_weeks_present,
+ NULL as date_months_present,
+strTemplate12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct,
+strTemplate12_else: NULL as boolean_true_ct,
+strTemplate13_ALL: NULL AS datatype_suggestion,
+strTemplate14_A_do_patterns: |
+ (
+ SELECT
+ COUNT(DISTINCT REGEXP_REPLACE(
+ REGEXP_REPLACE(
+ REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r'[a-z]', 'a'),
+ r'[A-Z]', 'A'
+ ),
+ r'[0-9]', 'N'
+ )) AS pattern_ct
+ FROM `target_table`
+ WHERE `{COL_NAME}` > ' '
+ ) as distinct_pattern_ct,
+ SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct,
+ AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces,
+strTemplate14_A_no_patterns: NULL as distinct_pattern_ct,
+ SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct,
+ AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces,
+strTemplate14_else: NULL as distinct_pattern_ct,
+ NULL as embedded_space_ct,
+ NULL as avg_embedded_spaces,
+strTemplate15_ALL: NULL as functional_data_type,
+ NULL as functional_table_type,
+strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id "
+strTemplate98_sampling: ' FROM target_table'
+strTemplate98_else: ' FROM target_table'
+
+strTemplate99_N: |
+ ,
+ (SELECT
+ PERCENTILE_CONT(`{COL_NAME}`, 0.25) OVER() AS pct_25,
+ PERCENTILE_CONT(`{COL_NAME}`, 0.50) OVER() AS pct_50,
+ PERCENTILE_CONT(`{COL_NAME}`, 0.75) OVER() AS pct_75
+ FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile
+strTemplate99_N_sampling: |
+ ,
+ (SELECT
+ APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(25)] AS pct_25,
+ APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(50)] AS pct_50,
+ APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(75)] AS pct_75
+ FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile
+strTemplate99_else: ;
+strTemplate100_sampling: ' '
diff --git a/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql
new file mode 100644
index 00000000..4f2453b3
--- /dev/null
+++ b/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql
@@ -0,0 +1,52 @@
+WITH counts AS (
+ SELECT
+ `{COL_NAME}` AS col_val,
+ COUNT(*) AS ct
+ FROM `{DATA_SCHEMA}.{DATA_TABLE}`
+ WHERE `{COL_NAME}` > ' '
+ GROUP BY `{COL_NAME}`
+),
+ranked AS (
+ SELECT
+ col_val,
+ ct,
+ ROW_NUMBER() OVER (ORDER BY ct DESC, col_val ASC) AS rn
+ FROM counts
+),
+top10 AS (
+ -- top 10 formatted rows
+ SELECT
+ rn,
+ CONCAT('| ', CAST(col_val AS STRING), ' | ', CAST(ct AS STRING)) AS val
+ FROM ranked
+ WHERE rn <= 10
+ ORDER BY rn
+),
+others_agg AS (
+ SELECT
+ 11 AS rn,
+ CONCAT(
+ '| Other Values (',
+ CAST(COUNT(DISTINCT col_val) AS STRING),
+ ') | ',
+ CAST(SUM(ct) AS STRING)
+ ) AS val,
+ COUNT(*) AS other_row_count
+ FROM ranked
+ WHERE rn > 10
+),
+all_vals AS (
+ SELECT * FROM top10
+ UNION ALL
+ SELECT rn, val FROM others_agg WHERE other_row_count > 0
+)
+SELECT
+ '{PROJECT_CODE}' AS project_code,
+ '{DATA_SCHEMA}' AS schema_name,
+ '{RUN_DATE}' AS run_date,
+ '{DATA_TABLE}' AS table_name,
+ '{COL_NAME}' AS column_name,
+ (SELECT STRING_AGG(val, '\n' ORDER BY rn) FROM all_vals) AS top_freq_values,
+ (SELECT TO_HEX(MD5(STRING_AGG(CAST(col_val AS STRING), '|' ORDER BY col_val)))
+ FROM counts
+ ) AS distinct_value_hash;
diff --git a/testgen/template/flavors/bigquery/profiling/templated_functions.yaml b/testgen/template/flavors/bigquery/profiling/templated_functions.yaml
new file mode 100644
index 00000000..d6b91b0a
--- /dev/null
+++ b/testgen/template/flavors/bigquery/profiling/templated_functions.yaml
@@ -0,0 +1,49 @@
+IS_NUM: CASE
+ WHEN REGEXP_CONTAINS(CAST({$1} AS STRING),
+ r'^\s*[+-]?\$?\s*[0-9]+(,[0-9]{3})*(\.[0-9]*)?[%]?\s*$') THEN 1
+ ELSE 0
+ END
+
+IS_DATE: |
+ CASE
+ /* YYYY-MM-DD HH:MM:SS SSSSSS */
+ WHEN SAFE.PARSE_DATETIME('%F %H:%M:%S %6f', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* YYYY-MM-DD HH:MM:SS */
+ WHEN SAFE.PARSE_DATETIME('%F %H:%M:%S', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* YYYYMMDDHHMMSSSSSS */
+ WHEN SAFE.PARSE_DATETIME('%Y%m%d%H%M%S%6f', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* YYYYMMDDHHMMSS */
+ WHEN SAFE.PARSE_DATETIME('%Y%m%d%H%M%S', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* YYYYMMDD */
+ WHEN LENGTH(CAST({$1} AS STRING)) = 8 AND SAFE.PARSE_DATE('%Y%m%d', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* Exclude anything else long */
+ WHEN LENGTH(CAST({$1} AS STRING)) > 11 THEN 0
+
+ /* YYYY-MON-DD */
+ WHEN SAFE.PARSE_DATE('%Y-%b-%d', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* YYYY-MM-DD */
+ WHEN SAFE.PARSE_DATE('%F', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* MM/DD/YYYY */
+ WHEN SAFE.PARSE_DATE('%m/%d/%Y', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* MM/DD/YY */
+ WHEN SAFE.PARSE_DATE('%m/%d/%y', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* MM-DD-YYYY */
+ WHEN SAFE.PARSE_DATE('%m-%d-%Y', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* MM-DD-YY */
+ WHEN SAFE.PARSE_DATE('%m-%d-%y', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ /* DD-MON-YYYY */
+ WHEN SAFE.PARSE_DATE('%d-%b-%Y', CAST({$1} AS STRING)) IS NOT NULL THEN 1
+
+ ELSE 0
+ END
From 74a956954c7dcc017abcdfde48a70b5f4c547184 Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Tue, 19 Aug 2025 11:28:53 -0400
Subject: [PATCH 03/48] feat(bigquery): Support BigQuery for Test Generation
---
.../queries/execute_cat_tests_query.py | 5 +-
.../commands/queries/execute_tests_query.py | 6 +-
.../commands/queries/generate_tests_query.py | 2 +-
.../test_parameter_validation_query.py | 2 +
.../commands/run_test_parameter_validation.py | 3 +-
testgen/common/get_pipeline_parms.py | 1 +
.../050_populate_new_schema_metadata.sql | 752 +++++++++++++++++-
.../ex_cat_build_agg_table_tests.sql | 58 +-
.../data_chars/schema_ddf_query_bigquery.sql | 2 +-
.../ex_data_match_bigquery.sql | 46 ++
.../ex_relative_entropy_bigquery.sql | 54 ++
.../ex_table_changed_bigquery.sql | 30 +
.../ex_window_match_no_drops_bigquery.sql | 44 +
.../ex_window_match_same_bigquery.sql | 78 ++
.../gen_table_changed_test.sql | 161 ++++
.../project_profiling_query_bigquery.yaml | 2 +-
.../ex_get_project_column_list_generic.sql | 2 +-
testgen/ui/queries/source_data_queries.py | 11 +-
testgen/ui/queries/test_result_queries.py | 1 +
19 files changed, 1231 insertions(+), 29 deletions(-)
create mode 100644 testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql
create mode 100644 testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql
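Several changes below hinge on identifier quoting: BigQuery, like Databricks, quotes identifiers with backticks rather than double quotes, so ID_SEPARATOR (and the STRING vs. VARCHAR type name) must be flavor-aware. A minimal sketch of the quoting rule this patch encodes (the helper name is illustrative, not part of TestGen):

    # Illustrative helper: the flavor-aware identifier quoting behind ID_SEPARATOR.
    def quote_identifier(name: str, flavor: str) -> str:
        separator = "`" if flavor in ("databricks", "bigquery") else '"'
        return f"{separator}{name}{separator}"

    assert quote_identifier("order id", "bigquery") == "`order id`"
    assert quote_identifier("order id", "postgresql") == '"order id"'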
diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py
index 0a22ee8c..cb2ee3dd 100644
--- a/testgen/commands/queries/execute_cat_tests_query.py
+++ b/testgen/commands/queries/execute_cat_tests_query.py
@@ -47,7 +47,7 @@ def _get_rollup_scores_sql(self) -> CRollupScoresSQL:
self._rollup_scores_sql = CRollupScoresSQL(self.test_run_id, self.table_groups_id)
return self._rollup_scores_sql
-
+
def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_cat_tests", no_bind: bool = False) -> tuple[str, dict | None]:
query = read_template_sql_file(template_file_name, sub_directory)
params = {
@@ -58,7 +58,8 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_
"TEST_SUITE_ID": self.test_suite_id,
"TABLE_GROUPS_ID": self.table_groups_id,
"SQL_FLAVOR": self.flavor,
- "ID_SEPARATOR": "`" if self.flavor == "databricks" else '"',
+ "ID_SEPARATOR": "`" if self.flavor in ("databricks", "bigquery") else '"',
+ "VARCHAR_TYPE": "STRING" if self.flavor == "bigquery" else "VARCHAR(1000)",
"CONCAT_OPERATOR": self.concat_operator,
"SCHEMA_NAME": self.target_schema,
"TABLE_NAME": self.target_table,
diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py
index 93010829..c257cb1a 100644
--- a/testgen/commands/queries/execute_tests_query.py
+++ b/testgen/commands/queries/execute_tests_query.py
@@ -106,7 +106,7 @@ def _get_query(
"EXCEPTION_MESSAGE": self.exception_message,
"START_TIME": self.today,
"PROCESS_ID": self.process_id,
- "VARCHAR_TYPE": "STRING" if self.flavor == "databricks" else "VARCHAR",
+ "VARCHAR_TYPE": "STRING" if self.flavor in ("databricks", "bigquery") else "VARCHAR",
"NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(self.minutes_offset),
**{key.upper(): value or "" for key, value in self.test_params.items()},
}
@@ -126,7 +126,9 @@ def _get_query(
)
subset_condition = self.test_params["subset_condition"]
- params["SUBSET_DISPLAY"] = subset_condition.replace("'", "''") if subset_condition else ""
+ params["SUBSET_DISPLAY"] = subset_condition.replace(
+ "'", r"\'" if self.flavor == "bigquery" else "''"
+ ) if subset_condition else ""
query = replace_params(query, params)
diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py
index 4f887af4..16f075a3 100644
--- a/testgen/commands/queries/generate_tests_query.py
+++ b/testgen/commands/queries/generate_tests_query.py
@@ -47,7 +47,7 @@ def _get_params(self) -> dict:
"GENERATION_SET": self.generation_set,
"AS_OF_DATE": self.as_of_date,
"DATA_SCHEMA": self.data_schema,
- "ID_SEPARATOR": "`" if self.sql_flavor == "databricks" else '"',
+ "ID_SEPARATOR": "`" if self.sql_flavor in ("databricks", "bigquery") else '"',
}
def _get_query(self, template_file_name: str, sub_directory: str | None = "generation") -> tuple[str, dict]:
diff --git a/testgen/commands/queries/test_parameter_validation_query.py b/testgen/commands/queries/test_parameter_validation_query.py
index ec8cf408..7d96a5f9 100644
--- a/testgen/commands/queries/test_parameter_validation_query.py
+++ b/testgen/commands/queries/test_parameter_validation_query.py
@@ -13,6 +13,7 @@ class CTestParamValidationSQL:
test_ids: typing.ClassVar = []
exception_message = ""
flag_val = ""
+ tg_schema = ""
_use_clean = False
@@ -34,6 +35,7 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "valid
"CAT_TEST_IDS": tuple(self.test_ids or []),
"START_TIME": self.today,
"NOW_TIMESTAMP": date_service.get_now_as_string(),
+ "COLUMNS_TABLE": f"{self.tg_schema}.INFORMATION_SCHEMA.COLUMNS" if self.flavor == "bigquery" else "information_schema.columns"
}
query = replace_params(query, params)
return query, params
diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py
index db6ba728..f31be1ba 100644
--- a/testgen/commands/run_test_parameter_validation.py
+++ b/testgen/commands/run_test_parameter_validation.py
@@ -45,6 +45,7 @@ def run_parameter_validation_queries(
# Retrieve Current Project Column list
LOG.info("CurrentStep: Retrieve Current Columns for Validation")
+ clsExecute.tg_schema = params["table_group_schema"]
clsExecute.test_schemas = strSchemas
lstProjectTestColumns = fetch_dict_from_db(*clsExecute.GetProjectTestValidationColumns(), use_target_db=True)
@@ -99,7 +100,7 @@ def run_parameter_validation_queries(
clsExecute.message = f"Missing table: {table_name}"
clsExecute.test_ids = test_ids
execute_db_queries([clsExecute.FlagTestsWithFailedValidation()])
-
+
if invalid_tests:
clsExecute.message = "Invalid test: schema, table, or column not defined"
clsExecute.test_ids = invalid_tests
diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py
index 79f5c5ed..3c37aacf 100644
--- a/testgen/common/get_pipeline_parms.py
+++ b/testgen/common/get_pipeline_parms.py
@@ -34,6 +34,7 @@ class TestGenerationParams(BaseParams):
class TestExecutionParams(BaseParams):
test_suite_id: str
table_groups_id: str
+ table_group_schema: str
profiling_table_set: str
profiling_include_mask: str
profiling_exclude_mask: str
diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
index 47d0e9a9..df23d59d 100644
--- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
+++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
@@ -232,11 +232,23 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'),
('2410', 'Aggregate_Balance_Range', 'databricks', 'ex_aggregate_match_range_generic.sql'),
('2411', 'Dupe_Rows', 'databricks', 'ex_dupe_rows_generic.sql'),
+ ('2501', 'Combo_Match', 'bigquery', 'ex_data_match_bigquery.sql'),
+ ('2502', 'Aggregate_Minimum', 'bigquery', 'ex_aggregate_match_no_drops_generic.sql'),
+ ('2503', 'Distribution_Shift', 'bigquery', 'ex_relative_entropy_bigquery.sql'),
+ ('2504', 'CUSTOM', 'bigquery', 'ex_custom_query_generic.sql'),
+ ('2506', 'Aggregate_Balance', 'bigquery', 'ex_aggregate_match_same_generic.sql'),
+ ('2507', 'Timeframe_Combo_Gain', 'bigquery', 'ex_window_match_no_drops_bigquery.sql'),
+ ('2508', 'Timeframe_Combo_Match', 'bigquery', 'ex_window_match_same_bigquery.sql'),
+ ('2509', 'Aggregate_Balance_Percent', 'bigquery', 'ex_aggregate_match_percent_generic.sql'),
+ ('2510', 'Aggregate_Balance_Range', 'bigquery', 'ex_aggregate_match_range_generic.sql'),
+ ('2511', 'Dupe_Rows', 'bigquery', 'ex_dupe_rows_generic.sql'),
+
('2012', 'Table_Freshness', 'redshift', 'ex_table_changed_generic.sql'),
('2112', 'Table_Freshness', 'snowflake', 'ex_table_changed_generic.sql'),
('2212', 'Table_Freshness', 'mssql', 'ex_table_changed_mssql.sql'),
('2312', 'Table_Freshness', 'postgresql', 'ex_table_changed_generic.sql'),
- ('2412', 'Table_Freshness', 'databricks', 'ex_table_changed_generic.sql')
+ ('2412', 'Table_Freshness', 'databricks', 'ex_table_changed_generic.sql'),
+ ('2512', 'Table_Freshness', 'bigquery', 'ex_table_changed_bigquery.sql')
;
TRUNCATE TABLE cat_test_conditions;
@@ -464,7 +476,45 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', '
('2036', 'Valid_Characters', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
('5036', 'Valid_Characters', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
('3036', 'Valid_Characters', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6036', 'Valid_Characters', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}');
+ ('6036', 'Valid_Characters', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+
+ ('7001', 'Alpha_Trunc', 'bigquery', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
+ ('7002', 'Avg_Shift', 'bigquery', 'ROUND(ABS((AVG(SAFE_CAST({COLUMN_NAME} AS FLOAT64)) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})-1)*POW(STDDEV({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}-1)*POW({BASELINE_SD},2)) / NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0))),3)', '>=', '{THRESHOLD_VALUE}'),
+ ('7003', 'Condition_Flag', 'bigquery', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7004', 'Constant', 'bigquery', 'SUM(CASE WHEN {COLUMN_NAME} != {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7005', 'Daily_Record_Ct', 'bigquery', 'DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), DAY), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), DAY), DAY) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, DAY))', '<', '{THRESHOLD_VALUE}'),
+ ('7006', 'Dec_Trunc', 'bigquery', 'SUM(ROUND(ABS(MOD({COLUMN_NAME}, 1)), 5)) + 1', '<', '{THRESHOLD_VALUE}'),
+ ('7007', 'Distinct_Date_Ct', 'bigquery', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
+ ('7008', 'Distinct_Value_Ct', 'bigquery', 'COUNT(DISTINCT {COLUMN_NAME})', '!=', '{THRESHOLD_VALUE}'),
+ ('7009', 'Email_Format', 'bigquery', 'SUM(CASE WHEN NOT REGEXP_CONTAINS(CAST({COLUMN_NAME} AS STRING), r''^[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+[.])+[A-Za-z]{2,}$'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7010', 'Future_Date', 'bigquery', 'SUM(IF({COLUMN_NAME} > CAST(CAST(''{RUN_DATE}'' AS DATETIME) AS {COLUMN_TYPE}), 1, 0))', '>', '{THRESHOLD_VALUE}'),
+ ('7011', 'Future_Date_1Y', 'bigquery', 'SUM(IF({COLUMN_NAME} > CAST(DATETIME_ADD(CAST(''{RUN_DATE}'' AS DATETIME), INTERVAL 1 YEAR) AS {COLUMN_TYPE}), 1, 0))', '>', '{THRESHOLD_VALUE}'),
+ ('7012', 'Incr_Avg_Shift', 'bigquery', 'COALESCE(ABS(({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME}) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD}), 0)', '>=', '{THRESHOLD_VALUE}'),
+ ('7013', 'LOV_All', 'bigquery', 'STRING_AGG(DISTINCT CAST({COLUMN_NAME} AS STRING), ''|'' ORDER BY {COLUMN_NAME})', '!=', '{THRESHOLD_VALUE}'),
+ ('7014', 'LOV_Match', 'bigquery', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7015', 'Min_Date', 'bigquery', 'SUM(CASE WHEN {COLUMN_NAME} < SAFE_CAST(''{BASELINE_VALUE}'' AS {COLUMN_TYPE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7016', 'Min_Val', 'bigquery', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7017', 'Missing_Pct', 'bigquery', 'ABS(2.0 * ASIN(SQRT({BASELINE_VALUE_CT} / {BASELINE_CT})) - 2.0 * ASIN(SQRT(COUNT({COLUMN_NAME}) / NULLIF(COUNT(*),0))))', '>=', '{THRESHOLD_VALUE}'),
+ ('7018', 'Monthly_Rec_Ct', 'bigquery', 'DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), MONTH), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), MONTH), MONTH) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, MONTH))', '>', '{THRESHOLD_VALUE}'),
+ ('7019', 'Outlier_Pct_Above', 'bigquery', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT64) > {BASELINE_AVG} + 2*{BASELINE_SD} THEN 1 ELSE 0 END) / NULLIF(COUNT({COLUMN_NAME}),0)', '>', '{THRESHOLD_VALUE}'),
+ ('7020', 'Outlier_Pct_Below', 'bigquery', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT64) < {BASELINE_AVG} - 2*{BASELINE_SD} THEN 1 ELSE 0 END) / NULLIF(COUNT({COLUMN_NAME}),0)', '>', '{THRESHOLD_VALUE}'),
+ ('7021', 'Pattern_Match', 'bigquery', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CAST(REGEXP_CONTAINS(CAST(NULLIF({COLUMN_NAME}, '''') AS STRING), r''{BASELINE_VALUE}'') AS INT64))', '>', '{THRESHOLD_VALUE}'),
+ ('7022', 'Recency', 'bigquery', 'CAST((DATETIME_DIFF(DATETIME_TRUNC(CAST(CAST(''{RUN_DATE}'' AS DATETIME) AS {COLUMN_TYPE}), DAY), DATETIME_TRUNC(MAX({COLUMN_NAME}), DAY), DAY)) AS INT64)', '>', '{THRESHOLD_VALUE}'),
+ ('7023', 'Required', 'bigquery', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
+ ('7024', 'Row_Ct', 'bigquery', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
+ ('7025', 'Row_Ct_Pct', 'bigquery', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT}) / {BASELINE_CT}, 2))', '>', '{THRESHOLD_VALUE}'),
+ ('7026', 'Street_Addr_Pattern', 'bigquery', '100.0 * SUM(CAST(REGEXP_CONTAINS(CAST({COLUMN_NAME} AS STRING), r''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'') AS INT64)) / NULLIF(COUNT({COLUMN_NAME}),0)', '<', '{THRESHOLD_VALUE}'),
+ ('7027', 'US_State', 'bigquery', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7028', 'Unique', 'bigquery', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
+ ('7029', 'Unique_Pct', 'bigquery', 'ABS(2.0 * ASIN(SQRT({BASELINE_UNIQUE_CT}/{BASELINE_VALUE_CT})) - 2.0 * ASIN(SQRT(COUNT(DISTINCT {COLUMN_NAME}) / NULLIF(COUNT({COLUMN_NAME}),0))))', '>=', '{THRESHOLD_VALUE}'),
+ ('7036', 'Valid_Characters', 'bigquery', 'SUM(CASE WHEN REGEXP_REPLACE({COLUMN_NAME}, r''[\u00A0\u200B\uFEFF\u202F\u2009\u3000\u200C]'', ''X'') != {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE "''%''" OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7033', 'Valid_Month', 'bigquery', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7034', 'Valid_US_Zip', 'bigquery', 'SUM(CASE WHEN REGEXP_REPLACE({COLUMN_NAME}, r''[0-9]'', ''9'') NOT IN (''99999'',''999999999'',''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7035', 'Valid_US_Zip3', 'bigquery', 'SUM(CASE WHEN REGEXP_REPLACE({COLUMN_NAME}, r''[0-9]'', ''9'') != ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
+ ('7032', 'Variability_Decrease', 'bigquery', '100.0 * STDDEV(CAST({COLUMN_NAME} AS FLOAT64)) / {BASELINE_SD}', '<', '{THRESHOLD_VALUE}'),
+ ('7031', 'Variability_Increase', 'bigquery', '100.0 * STDDEV(CAST({COLUMN_NAME} AS FLOAT64)) / {BASELINE_SD}', '>', '{THRESHOLD_VALUE}'),
+ ('7030', 'Weekly_Rec_Ct', 'bigquery', 'DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), WEEK), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), WEEK), WEEK) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, WEEK))', '>', '{THRESHOLD_VALUE}');
+
TRUNCATE TABLE target_data_lookups;
@@ -1785,9 +1835,701 @@ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
''\ufeff'', ''\x65279'') as `{COLUMN_NAME}_content`,
COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
WHERE TRANSLATE(`{COLUMN_NAME}`, ''\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff'', ''XXXXXXXXXX'') <> `{COLUMN_NAME}`
-GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500')
-;
-
+GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500'),
+ ('1339', '1001', 'Profile Anomaly', 'Suggested_Type', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY record_ct DESC
+LIMIT 20;'),
+ ('1340', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE (
+ `{COLUMN_NAME}` IN (''.'',''?'','' '')
+ OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r''-{2,}'')
+ OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r''0{2,}'')
+ OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r''9{2,}'')
+ OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r''x{2,}'')
+ OR REGEXP_CONTAINS(LOWER(CAST(`{COLUMN_NAME}` AS STRING)), r''z{2,}'')
+ OR LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN (''blank'',''error'',''missing'',''tbd'',''n/a'',''#na'',''none'',''null'',''unknown'')
+ OR LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'',''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'')
+ OR LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'',''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'')
+ OR `{COLUMN_NAME}` = ''''
+ OR `{COLUMN_NAME}` IS NULL
+)
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`;'),
+ ('1341', '1003', 'Profile Anomaly', 'Invalid_Zip_USA', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), ''012345678'', ''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'')
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1342', '1004', 'Profile Anomaly', 'Multiple_Types_Minor', 'bigquery', NULL, 'SELECT DISTINCT column_name, columns.table_name,
+ CASE
+ WHEN LOWER(data_type) LIKE ''timestamp%'' THEN LOWER(data_type)
+ WHEN LOWER(data_type) LIKE ''date'' THEN LOWER(data_type)
+ WHEN LOWER(data_type) LIKE ''boolean'' THEN ''boolean''
+ WHEN data_type = ''TEXT'' THEN CONCAT(''varchar('', CAST(character_maximum_length AS STRING), '')'')
+ WHEN LOWER(data_type) LIKE ''char%'' THEN CONCAT(''char('', CAST(character_maximum_length AS STRING), '')'')
+ WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint''
+ WHEN LOWER(data_type) LIKE ''num%'' THEN CONCAT(''numeric('', CAST(numeric_precision AS STRING), '','', CAST(numeric_scale AS STRING), '')'')
+ ELSE data_type
+ END AS data_type
+FROM information_schema.columns
+JOIN information_schema.tables
+ ON columns.table_name = tables.table_name
+ AND columns.table_schema = tables.table_schema
+WHERE columns.table_schema = ''{TARGET_SCHEMA}''
+ AND columns.column_name = ''{COLUMN_NAME}''
+ AND tables.table_type = ''BASE TABLE''
+ORDER BY data_type, table_name;'),
+ ('1343', '1005', 'Profile Anomaly', 'Multiple_Types_Major', 'bigquery', NULL, 'SELECT DISTINCT column_name, columns.table_name,
+ CASE
+ WHEN LOWER(data_type) LIKE ''timestamp%'' THEN LOWER(data_type)
+ WHEN LOWER(data_type) LIKE ''date'' THEN LOWER(data_type)
+ WHEN LOWER(data_type) LIKE ''boolean'' THEN ''boolean''
+ WHEN data_type = ''TEXT'' THEN CONCAT(''varchar('', CAST(character_maximum_length AS STRING), '')'')
+ WHEN LOWER(data_type) LIKE ''char%'' THEN CONCAT(''char('', CAST(character_maximum_length AS STRING), '')'')
+ WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint''
+ WHEN LOWER(data_type) LIKE ''num%'' THEN CONCAT(''numeric('', CAST(numeric_precision AS STRING), '','', CAST(numeric_scale AS STRING), '')'')
+ ELSE data_type
+ END AS data_type
+FROM information_schema.columns
+JOIN information_schema.tables
+ ON columns.table_name = tables.table_name
+ AND columns.table_schema = tables.table_schema
+WHERE columns.table_schema = ''{TARGET_SCHEMA}''
+ AND columns.column_name = ''{COLUMN_NAME}''
+ AND tables.table_type = ''BASE TABLE''
+ORDER BY data_type, table_name;'),
+ ('1344', '1006', 'Profile Anomaly', 'No_Values', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`;'),
+ ('1345', '1007', 'Profile Anomaly', 'Column_Pattern_Mismatch', 'bigquery', NULL, '(
+ SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME},
+ (SELECT TRIM(SPLIT(''{DETAIL_EXPRESSION}'', ''|'')[SAFE_OFFSET(3)]) AS top_pattern) b
+ WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r''[a-z]'', ''a''), r''[A-Z]'', ''A''), r''[0-9]'', ''N'') = b.top_pattern
+ GROUP BY b.top_pattern, `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 5
+)
+UNION ALL
+(
+ SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME},
+ (SELECT TRIM(SPLIT(''{DETAIL_EXPRESSION}'', ''|'')[SAFE_OFFSET(5)]) AS top_pattern) b
+ WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r''[a-z]'', ''a''), r''[A-Z]'', ''A''), r''[0-9]'', ''N'') = b.top_pattern
+ GROUP BY b.top_pattern, `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 5
+)
+UNION ALL
+(
+ SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME},
+ (SELECT TRIM(SPLIT(''{DETAIL_EXPRESSION}'', ''|'')[SAFE_OFFSET(7)]) AS top_pattern) b
+ WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r''[a-z]'', ''a''), r''[A-Z]'', ''A''), r''[0-9]'', ''N'') = b.top_pattern
+ GROUP BY b.top_pattern, `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 5
+)
+UNION ALL
+(
+ SELECT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME},
+ (SELECT TRIM(SPLIT(''{DETAIL_EXPRESSION}'', ''|'')[SAFE_OFFSET(9)]) AS top_pattern) b
+ WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r''[a-z]'', ''a''), r''[A-Z]'', ''A''), r''[0-9]'', ''N'') = b.top_pattern
+ GROUP BY b.top_pattern, `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 5
+)
+ORDER BY top_pattern DESC, count DESC;'),
+ ('1346', '1008', 'Profile Anomaly', 'Table_Pattern_Mismatch', 'bigquery', NULL, 'SELECT DISTINCT column_name, columns.table_name
+FROM information_schema.columns
+JOIN information_schema.tables
+ ON columns.table_name = tables.table_name
+ AND columns.table_schema = tables.table_schema
+WHERE columns.table_schema = ''{TARGET_SCHEMA}''
+ AND columns.column_name = ''{COLUMN_NAME}''
+ AND UPPER(tables.table_type) = ''BASE TABLE''
+ORDER BY table_name;'),
+ ('1347', '1009', 'Profile Anomaly', 'Leading_Spaces', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''^\s'')
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`;'),
+ ('1348', '1010', 'Profile Anomaly', 'Quoted_Values', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = ''"'' OR LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = "''"
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1349', '1011', 'Profile Anomaly', 'Char_Column_Number_Values', 'bigquery', NULL, '(
+ SELECT ''Numeric'' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL
+ GROUP BY `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 10
+)
+UNION ALL
+(
+ SELECT ''Non-Numeric'' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL
+ GROUP BY `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 10
+)
+ORDER BY data_type, count DESC;'),
+ ('1350', '1012', 'Profile Anomaly', 'Char_Column_Date_Values', 'bigquery', NULL, '(
+ SELECT ''Date'' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NOT NULL
+ GROUP BY `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 10
+)
+UNION ALL
+(
+ SELECT ''Non-Date'' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NULL
+ GROUP BY `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 10
+)
+ORDER BY data_type, count DESC;'),
+ ('1353', '1015', 'Profile Anomaly', 'Boolean_Value_Mismatch', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY COUNT(*) DESC;'),
+ ('1354', '1016', 'Profile Anomaly', 'Potential_Duplicates', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+HAVING COUNT(*) > 1
+ORDER BY COUNT(*) DESC
+LIMIT 500;'),
+ ('1355', '1017', 'Profile Anomaly', 'Standardized_Value_Matches', 'bigquery', NULL, 'WITH cte AS (
+ SELECT UPPER(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r"[ ''\.\-\,]", '''')) AS possible_standard_value,
+ COUNT(DISTINCT `{COLUMN_NAME}`) AS cnt
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY possible_standard_value
+ HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1
+)
+SELECT DISTINCT a.`{COLUMN_NAME}`, b.possible_standard_value, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME} a
+JOIN cte b
+ ON UPPER(REGEXP_REPLACE(CAST(a.`{COLUMN_NAME}` AS STRING), r"[ ''\.\-\,]", '''')) = b.possible_standard_value
+GROUP BY a.`{COLUMN_NAME}`, b.possible_standard_value
+ORDER BY b.possible_standard_value ASC, count DESC
+LIMIT 500;'),
+ ('1356', '1018', 'Profile Anomaly', 'Unlikely_Date_Values', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, CAST(CAST(''{PROFILE_RUN_DATE}'' AS DATETIME) AS DATE) AS profile_run_date, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME} a
+WHERE (CAST(`{COLUMN_NAME}` AS DATE) < DATE ''1900-01-01'')
+ OR (CAST(`{COLUMN_NAME}` AS DATE) > DATE_ADD(CAST(CAST(''{PROFILE_RUN_DATE}'' AS DATETIME) AS DATE), INTERVAL 30 YEAR))
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC
+LIMIT 500;'),
+ ('1357', '1019', 'Profile Anomaly', 'Recency_One_Year', 'bigquery', NULL, 'created_in_ui'),
+ ('1358', '1020', 'Profile Anomaly', 'Recency_Six_Months', 'bigquery', NULL, 'created_in_ui'),
+ ('1359', '1021', 'Profile Anomaly', 'Unexpected US States', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC
+LIMIT 500;'),
+ ('1360', '1022', 'Profile Anomaly', 'Unexpected Emails', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC
+LIMIT 500;'),
+ ('1361', '1023', 'Profile Anomaly', 'Small_Numeric_Value_Ct', 'bigquery', NULL, '(
+ SELECT ''Numeric'' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL
+ GROUP BY `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 10
+)
+UNION ALL
+(
+ SELECT ''Non-Numeric'' AS data_type, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL
+ GROUP BY `{COLUMN_NAME}`
+ ORDER BY count DESC
+ LIMIT 10
+)
+ORDER BY data_type, count DESC;'),
+ ('1362', '1024', 'Profile Anomaly', 'Invalid_Zip3_USA', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), ''012345678'', ''999999999'') <> ''999''
+GROUP BY `{COLUMN_NAME}`
+ORDER BY count DESC, `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1363', '1025', 'Profile Anomaly', 'Delimited_Data_Embedded', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'')
+ AND NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''.*\s(and|but|or|yet)\s.*'')
+GROUP BY `{COLUMN_NAME}`
+ORDER BY COUNT(*) DESC
+LIMIT 500;'),
+ ('1364', '1004', 'Test Results', 'Alpha_Trunc', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, LENGTH(CAST(`{COLUMN_NAME}` AS STRING)) AS current_max_length, {THRESHOLD_VALUE} AS previous_max_length
+FROM {TARGET_SCHEMA}.{TABLE_NAME}, (
+ SELECT MAX(LENGTH(CAST(`{COLUMN_NAME}` AS STRING))) AS max_length
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+) a
+WHERE LENGTH(CAST(`{COLUMN_NAME}` AS STRING)) = a.max_length
+ AND a.max_length < {THRESHOLD_VALUE}
+LIMIT 500;'),
+ ('1365', '1005', 'Test Results', 'Avg_Shift', 'bigquery', NULL, 'SELECT AVG(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_average
+FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
+ ('1366', '1006', 'Test Results', 'Condition_Flag', 'bigquery', NULL, 'SELECT *
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE {CUSTOM_QUERY}
+LIMIT 500;'),
+ ('1367', '1007', 'Test Results', 'Constant', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE}
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1368', '1009', 'Test Results', 'Daily_Record_Ct', 'bigquery', NULL, 'WITH daterange AS (
+ SELECT day AS all_dates
+ FROM UNNEST(
+ GENERATE_DATE_ARRAY(
+ (SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM {TARGET_SCHEMA}.{TABLE_NAME}),
+ (SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM {TARGET_SCHEMA}.{TABLE_NAME})
+ )
+ ) AS day
+),
+existing_periods AS (
+ SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY period
+),
+p AS (
+ SELECT d.all_dates AS missing_period,
+ MAX(b.period) AS prior_available_date,
+ MIN(c.period) AS next_available_date
+ FROM daterange d
+ LEFT JOIN existing_periods a ON d.all_dates = a.period
+ LEFT JOIN existing_periods b ON b.period < d.all_dates
+ LEFT JOIN existing_periods c ON c.period > d.all_dates
+ WHERE a.period IS NULL
+ AND d.all_dates BETWEEN b.period AND c.period
+ GROUP BY d.all_dates
+)
+SELECT p.missing_period, p.prior_available_date, e.period_count AS prior_available_date_count, p.next_available_date, f.period_count AS next_available_date_count
+FROM p
+LEFT JOIN existing_periods e ON (p.prior_available_date = e.period)
+LEFT JOIN existing_periods f ON (p.next_available_date = f.period)
+ORDER BY p.missing_period
+LIMIT 500;'),
+ ('1369', '1011', 'Test Results', 'Dec_Trunc', 'bigquery', NULL, 'SELECT DISTINCT LENGTH(SPLIT(CAST(`{COLUMN_NAME}` AS STRING), ''.'')[SAFE_OFFSET(1)]) AS decimal_scale, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY decimal_scale
+LIMIT 500;'),
+ ('1370', '1012', 'Test Results', 'Distinct_Date_Ct', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE `{COLUMN_NAME}` IS NOT NULL
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC
+LIMIT 500;'),
+ ('1371', '1013', 'Test Results', 'Distinct_Value_Ct', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE `{COLUMN_NAME}` IS NOT NULL
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC
+LIMIT 500;'),
+ ('1372', '1014', 'Test Results', 'Email_Format', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$'')
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1373', '1015', 'Test Results', 'Future_Date', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE DATETIME_DIFF(`{COLUMN_NAME}`, CAST(CAST(''{TEST_DATE}'' AS DATETIME) AS {COLUMN_TYPE}), DAY) > {THRESHOLD_VALUE}
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1374', '1016', 'Test Results', 'Future_Date_1Y', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE DATETIME_DIFF(`{COLUMN_NAME}`, DATE_ADD(CAST(CAST(''{TEST_DATE}'' AS DATETIME) AS {COLUMN_TYPE}), INTERVAL 365 DAY), DAY) > {THRESHOLD_VALUE}
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1375', '1017', 'Test Results', 'Incr_Avg_Shift', 'bigquery', NULL, 'SELECT AVG(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_average,
+ SUM(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_sum,
+ NULLIF(CAST(COUNT(`{COLUMN_NAME}`) AS FLOAT64), 0) AS current_value_count
+FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
+ ('1376', '1018', 'Test Results', 'LOV_All', 'bigquery', NULL, 'SELECT lov
+FROM (
+ SELECT STRING_AGG(DISTINCT CAST(`{COLUMN_NAME}` AS STRING), ''|'' ORDER BY `{COLUMN_NAME}`) AS lov
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+)
+WHERE lov <> ''{THRESHOLD_VALUE}''
+LIMIT 500;'),
+ ('1377', '1019', 'Test Results', 'LOV_Match', 'bigquery', NULL, 'SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '''') AS `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN {BASELINE_VALUE}
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1378', '1020', 'Test Results', 'Min_Date', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE CAST(`{COLUMN_NAME}` AS DATE) < CAST(CAST(''{BASELINE_VALUE}'' AS DATETIME) AS DATE)
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1379', '1021', 'Test Results', 'Min_Val', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, (ABS(CAST(`{COLUMN_NAME}` AS NUMERIC)) - ABS({BASELINE_VALUE})) AS difference_from_baseline
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE CAST(`{COLUMN_NAME}` AS NUMERIC) < {BASELINE_VALUE}
+LIMIT 500;'),
+ ('1380', '1022', 'Test Results', 'Missing_Pct', 'bigquery', NULL, 'SELECT *
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE `{COLUMN_NAME}` IS NULL OR CAST(`{COLUMN_NAME}` AS STRING) = ''''
+LIMIT 10;'),
+ ('1381', '1023', 'Test Results', 'Monthly_Rec_Ct', 'bigquery', NULL, 'WITH daterange AS (
+ SELECT month AS all_dates
+ FROM UNNEST(
+ GENERATE_DATE_ARRAY(
+ DATE_TRUNC((SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM {TARGET_SCHEMA}.{TABLE_NAME}), MONTH),
+ DATE_TRUNC((SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM {TARGET_SCHEMA}.{TABLE_NAME}), MONTH),
+ INTERVAL 1 MONTH
+ )
+ ) AS month
+),
+existing_periods AS (
+ SELECT DISTINCT DATE_TRUNC(CAST(`{COLUMN_NAME}` AS DATE), MONTH) AS period, COUNT(1) AS period_count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY period
+),
+p AS (
+ SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month
+ FROM daterange d
+ LEFT JOIN existing_periods a ON d.all_dates = a.period
+ LEFT JOIN existing_periods b ON b.period < d.all_dates
+ LEFT JOIN existing_periods c ON c.period > d.all_dates
+ WHERE a.period IS NULL
+ AND d.all_dates BETWEEN b.period AND c.period
+ GROUP BY d.all_dates
+)
+SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count
+FROM p
+LEFT JOIN existing_periods e ON (p.prior_available_month = e.period)
+LEFT JOIN existing_periods f ON (p.next_available_month = f.period)
+ORDER BY p.missing_period;'),
+ ('1382', '1024', 'Test Results', 'Outlier_Pct_Above', 'bigquery', NULL, 'SELECT ({BASELINE_AVG} + (2 * {BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE CAST(`{COLUMN_NAME}` AS FLOAT64) > ({BASELINE_AVG} + (2 * {BASELINE_SD}))
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC;'),
+ ('1383', '1025', 'Test Results', 'Outlier_Pct_Below', 'bigquery', NULL, 'SELECT ({BASELINE_AVG} - (2 * {BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE CAST(`{COLUMN_NAME}` AS FLOAT64) < ({BASELINE_AVG} - (2 * {BASELINE_SD}))
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC;'),
+ ('1384', '1026', 'Test Results', 'Pattern_Match', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE NOT REGEXP_CONTAINS(NULLIF(CAST(`{COLUMN_NAME}` AS STRING), ''''), r''{BASELINE_VALUE}'')
+GROUP BY `{COLUMN_NAME}`;'),
+ ('1385', '1028', 'Test Results', 'Recency', 'bigquery', NULL, 'SELECT DISTINCT col AS latest_date_available, CAST(CAST(''{TEST_DATE}'' AS DATETIME) AS {COLUMN_TYPE}) AS test_run_date
+FROM (SELECT DATE_TRUNC(MAX(`{COLUMN_NAME}`), DAY) AS col FROM {TARGET_SCHEMA}.{TABLE_NAME})
+WHERE DATETIME_DIFF(CAST(CAST(''{TEST_DATE}'' AS DATETIME) AS {COLUMN_TYPE}), col, DAY) > {THRESHOLD_VALUE};'),
+ ('1386', '1030', 'Test Results', 'Required', 'bigquery', NULL, 'SELECT *
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE `{COLUMN_NAME}` IS NULL
+LIMIT 500;'),
+ ('1387', '1031', 'Test Results', 'Row_Ct', 'bigquery', NULL, 'WITH cte AS (
+ SELECT COUNT(*) AS current_count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+)
+SELECT current_count,
+ ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) / CAST({THRESHOLD_VALUE} AS FLOAT64), 2)) AS row_count_pct_decrease
+FROM cte
+WHERE current_count < {THRESHOLD_VALUE};'),
+ ('1388', '1032', 'Test Results', 'Row_Ct_Pct', 'bigquery', NULL, 'WITH cte AS (
+ SELECT COUNT(*) AS current_count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+)
+SELECT current_count, {BASELINE_CT} AS baseline_count,
+ ABS(ROUND(100 * (current_count - {BASELINE_CT}) / CAST({BASELINE_CT} AS FLOAT64), 2)) AS row_count_pct_difference
+FROM cte;'),
+ ('1389', '1033', 'Test Results', 'Street_Addr_Pattern', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''^[0-9]{1,5}[A-Za-z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[A-Za-z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'')
+GROUP BY `{COLUMN_NAME}`
+ORDER BY COUNT(*) DESC
+LIMIT 500;'),
+ ('1390', '1036', 'Test Results', 'US_State', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'')
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1391', '1034', 'Test Results', 'Unique', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+HAVING COUNT(*) > 1
+ORDER BY COUNT(*) DESC
+LIMIT 500;'),
+ ('1392', '1035', 'Test Results', 'Unique_Pct', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY COUNT(*) DESC
+LIMIT 500;'),
+ ('1393', '1037', 'Test Results', 'Weekly_Rec_Ct', 'bigquery', NULL, 'WITH daterange AS (
+ SELECT week_start AS all_dates
+ FROM UNNEST(
+ GENERATE_DATE_ARRAY(
+ DATE_TRUNC((SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM {TARGET_SCHEMA}.{TABLE_NAME}), WEEK),
+ DATE_TRUNC((SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM {TARGET_SCHEMA}.{TABLE_NAME}), WEEK),
+ INTERVAL 7 DAY
+ )
+ ) AS week_start
+),
+existing_periods AS (
+ SELECT DISTINCT DATE_TRUNC(CAST(`{COLUMN_NAME}` AS DATE), WEEK) AS period, COUNT(1) AS period_count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY period
+),
+p AS (
+ SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week
+ FROM daterange d
+ LEFT JOIN existing_periods a ON d.all_dates = a.period
+ LEFT JOIN existing_periods b ON b.period < d.all_dates
+ LEFT JOIN existing_periods c ON c.period > d.all_dates
+ WHERE a.period IS NULL
+ AND d.all_dates BETWEEN b.period AND c.period
+ GROUP BY d.all_dates
+)
+SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count
+FROM p
+LEFT JOIN existing_periods e ON (p.prior_available_week = e.period)
+LEFT JOIN existing_periods f ON (p.next_available_week = f.period)
+ORDER BY p.missing_period;'),
+ ('1394', '1040', 'Test Results', 'Variability_Increase', 'bigquery', NULL, 'SELECT STDDEV_POP(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_standard_deviation
+FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
+ ('1395', '1041', 'Test Results', 'Variability_Decrease', 'bigquery', NULL, 'SELECT STDDEV_POP(CAST(`{COLUMN_NAME}` AS FLOAT64)) AS current_standard_deviation
+FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
+ ('1396', '1027', 'Profile Anomaly', 'Variant_Coded_Values', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN (
+ SELECT TRIM(val) FROM UNNEST(SPLIT(SUBSTR(''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'')) AS val
+)
+GROUP BY `{COLUMN_NAME}`;'),
+ ('1397', '1043', 'Test Results', 'Valid_Characters', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''[\u00A0\u200B\uFEFF\u202F\u2001\u3000\u2004\u200C]'')
+ OR CAST(`{COLUMN_NAME}` AS STRING) LIKE '' %''
+ OR CAST(`{COLUMN_NAME}` AS STRING) LIKE ''\''''%''
+ OR CAST(`{COLUMN_NAME}` AS STRING) LIKE ''"%''
+GROUP BY `{COLUMN_NAME}`
+ORDER BY record_ct DESC
+LIMIT 20;'),
+ ('1398', '1044', 'Test Results', 'Valid_US_Zip', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), ''012345678'', ''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'')
+GROUP BY `{COLUMN_NAME}`
+ORDER BY record_ct DESC
+LIMIT 20;'),
+ ('1399', '1045', 'Test Results', 'Valid_US_Zip3', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), ''012345678'', ''999999999'') != ''999''
+GROUP BY `{COLUMN_NAME}`
+ORDER BY record_ct DESC
+LIMIT 20;'),
+ ('1400', '1500', 'Test Results', 'Aggregate_Balance', 'bigquery', NULL, 'SELECT *
+FROM (
+ SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM (
+ SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) a
+ GROUP BY {GROUPBY_NAMES}
+) s
+WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ORDER BY {GROUPBY_NAMES};'),
+ ('1401', '1501', 'Test Results', 'Aggregate_Minimum', 'bigquery', NULL, 'SELECT *
+FROM (
+ SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM (
+ SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) a
+ GROUP BY {GROUPBY_NAMES}
+) s
+WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ORDER BY {GROUPBY_NAMES};'),
+ ('1402', '1502', 'Test Results', 'Combo_Match', 'bigquery', NULL, 'SELECT *
+FROM (
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT DISTINCT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+) test
+ORDER BY {COLUMN_NAME_NO_QUOTES};'),
+ ('1403', '1503', 'Test Results', 'Distribution_Shift', 'bigquery', NULL, 'WITH latest_ver AS (
+ SELECT {CONCAT_COLUMNS} AS category,
+ CAST(COUNT(*) AS FLOAT64) / SUM(COUNT(*)) OVER() AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {CONCAT_COLUMNS}
+)
+SELECT *
+FROM latest_ver;'),
+ ('1404', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'bigquery', NULL, 'SELECT *
+FROM (
+ SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM (
+ SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) a
+ GROUP BY {GROUPBY_NAMES}
+) s
+WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ORDER BY {GROUPBY_NAMES};'),
+ ('1405', '1505', 'Test Results', 'Aggregate_Balance_Range', 'bigquery', NULL, 'SELECT *
+FROM (
+ SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM (
+ SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) a
+ GROUP BY {GROUPBY_NAMES}
+) s
+WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ORDER BY {GROUPBY_NAMES};'),
+ ('1406', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'bigquery', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES}
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL 2 * {WINDOW_DAYS} DAY)
+ AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL {WINDOW_DAYS} DAY)
+GROUP BY {COLUMN_NAME_NO_QUOTES}
+EXCEPT DISTINCT
+SELECT {COLUMN_NAME_NO_QUOTES}
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL {WINDOW_DAYS} DAY)
+GROUP BY {COLUMN_NAME_NO_QUOTES};'),
+ ('1407', '1509', 'Test Results', 'Timeframe_Combo_Match', 'bigquery', NULL, '(
+ SELECT ''Prior Timeframe'' AS missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL {WINDOW_DAYS} DAY)
+ EXCEPT DISTINCT
+ SELECT ''Prior Timeframe'' AS missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL 2 * {WINDOW_DAYS} DAY)
+ AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL {WINDOW_DAYS} DAY)
+)
+UNION ALL
+(
+ SELECT ''Latest Timeframe'' AS missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL 2 * {WINDOW_DAYS} DAY)
+ AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL {WINDOW_DAYS} DAY)
+ EXCEPT DISTINCT
+ SELECT ''Latest Timeframe'' AS missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}), INTERVAL {WINDOW_DAYS} DAY)
+);'),
+ ('1408', '1100', 'Profile Anomaly', 'Potential_PII', 'bigquery', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}` DESC
+LIMIT 500;'),
+ ('1409', '1510', 'Test Results', 'Dupe_Rows', 'bigquery', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE {SUBSET_CONDITION}
+GROUP BY {GROUPBY_NAMES}
+HAVING COUNT(*) > 1
+ORDER BY {GROUPBY_NAMES};'),
+ ('1410', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'bigquery', NULL, '(
+ SELECT ''Upper Case'' AS casing, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER(CAST(`{COLUMN_NAME}` AS STRING)) = CAST(`{COLUMN_NAME}` AS STRING)
+ GROUP BY `{COLUMN_NAME}`
+ LIMIT 20
+)
+UNION ALL
+(
+ SELECT ''Mixed Case'' AS casing, `{COLUMN_NAME}`, COUNT(*) AS count
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE CAST(`{COLUMN_NAME}` AS STRING) <> UPPER(CAST(`{COLUMN_NAME}` AS STRING))
+ AND CAST(`{COLUMN_NAME}` AS STRING) <> LOWER(CAST(`{COLUMN_NAME}` AS STRING))
+ GROUP BY `{COLUMN_NAME}`
+ LIMIT 20
+);'),
+ ('1411', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE CAST(`{COLUMN_NAME}` AS STRING) = UPPER(CAST(`{COLUMN_NAME}` AS STRING))
+ AND CAST(`{COLUMN_NAME}` AS STRING) = LOWER(CAST(`{COLUMN_NAME}` AS STRING))
+ AND CAST(`{COLUMN_NAME}` AS STRING) > ''''
+GROUP BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1412', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'bigquery', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE CAST(`{COLUMN_NAME}` AS STRING) < ''A''
+ AND SUBSTR(CAST(`{COLUMN_NAME}` AS STRING), 1, 1) NOT IN (''"'', '' '')
+ AND SUBSTR(CAST(`{COLUMN_NAME}` AS STRING), LENGTH(CAST(`{COLUMN_NAME}` AS STRING)), 1) <> ''\''''
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`
+LIMIT 500;'),
+ ('1413', '1031', 'Profile Anomaly', 'Non_Printing_CHRs', 'bigquery', NULL, 'SELECT REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r''['' || CHR(160) || CHR(8191) || CHR(8192) || CHR(8193) || CHR(8194) || CHR(8195) || CHR(8196) || CHR(8201) || CHR(8202) || CHR(12288) || CHR(65279) || '']'', '''') AS `{COLUMN_NAME}_content`,
+ COUNT(*) AS record_ct
+FROM {TARGET_SCHEMA}.{TABLE_NAME}
+WHERE REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r''['' || CHR(160) || CHR(8191) || CHR(8192) || CHR(8193) || CHR(8194) || CHR(8195) || CHR(8196) || CHR(8201) || CHR(8202) || CHR(12288) || CHR(65279) || '']'')
+GROUP BY `{COLUMN_NAME}`
+ORDER BY `{COLUMN_NAME}`
+LIMIT 500');
TRUNCATE TABLE variant_codings;
diff --git a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql
index 2f821506..f0b0ce9a 100644
--- a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql
+++ b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql
@@ -5,9 +5,35 @@ INSERT INTO working_agg_cat_tests
column_names, test_types, test_definition_ids,
test_actions, test_descriptions,
test_parms, test_measures, test_conditions)
+
+-- Column types from latest profile_results
+WITH column_types AS (
+ SELECT pr.table_groups_id,
+ pr.connection_id,
+ pr.schema_name,
+ pr.table_name,
+ pr.column_name,
+ pr.column_type
+ FROM profile_results pr
+ INNER JOIN (
+ SELECT table_groups_id,
+ connection_id,
+ schema_name,
+ table_name,
+ column_name,
+ MAX(run_date) AS max_run_date
+ FROM profile_results
+ GROUP BY table_groups_id, connection_id, schema_name, table_name, column_name
+ ) latest
+ ON pr.table_groups_id = latest.table_groups_id
+ AND pr.schema_name = latest.schema_name
+ AND pr.table_name = latest.table_name
+ AND pr.column_name = latest.column_name
+ AND pr.run_date = latest.max_run_date
+),
+
-- Test details from each test type
-WITH test_detail
- AS (
+test_detail AS (
SELECT t.test_suite_id,
'{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name,
'{RUN_DATE}'::TIMESTAMP as test_time,
@@ -29,9 +55,10 @@ WITH test_detail
-- Standard Measure start
'CAST(' ||
-- Nested parm replacements - part of query, not Python parms
- REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
+ REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
c.measure,
'{COLUMN_NAME}', '{ID_SEPARATOR}' || COALESCE(t.column_name, '') || '{ID_SEPARATOR}'),
+ '{COLUMN_TYPE}', COALESCE(ct.column_type, '')),
'{BASELINE_CT}', COALESCE(t.baseline_ct, '')),
'{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')),
'{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ),
@@ -40,16 +67,17 @@ WITH test_detail
'{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ),
'{BASELINE_SD}', COALESCE(t.baseline_sd, '') ),
'{CUSTOM_QUERY}', COALESCE(t.custom_query, '')),
- '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '') )
+ '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, ''))
-- Standard measure end with pipe delimiter
- || ' AS VARCHAR(1000) ) {CONCAT_OPERATOR} ''|'' ' as measure,
+ || ' AS {VARCHAR_TYPE}) {CONCAT_OPERATOR} ''|'' ' as measure,
-- Standard CASE for condition starts
'CASE WHEN ' ||
-- Nested parm replacements - standard
- REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
+ REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
c.measure || c.test_operator || c.test_condition,
'{COLUMN_NAME}', '{ID_SEPARATOR}' || COALESCE(t.column_name, '') || '{ID_SEPARATOR}'),
+ '{COLUMN_TYPE}', COALESCE(ct.column_type, '')),
'{BASELINE_CT}', COALESCE(t.baseline_ct, '')),
'{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')),
'{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ),
@@ -58,20 +86,28 @@ WITH test_detail
'{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ),
'{BASELINE_SD}', COALESCE(t.baseline_sd, '') ),
'{CUSTOM_QUERY}', COALESCE(t.custom_query, '')),
- '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '') )
+ '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, ''))
-- Standard case ends
|| ' THEN ''0,'' ELSE ''1,'' END' as condition
FROM test_definitions t
INNER JOIN cat_test_conditions c
ON (t.test_type = c.test_type
AND '{SQL_FLAVOR}' = c.sql_flavor)
+ INNER JOIN test_suites s
+ ON t.test_suite_id = s.id
+ LEFT JOIN column_types ct
+ ON s.table_groups_id = ct.table_groups_id
+ AND t.schema_name = ct.schema_name
+ AND t.table_name = ct.table_name
+ AND t.column_name = ct.column_name
WHERE t.test_suite_id = '{TEST_SUITE_ID}'
AND t.schema_name = '{SCHEMA_NAME}'
AND t.table_name = '{TABLE_NAME}'
AND COALESCE(t.test_active, 'Y') = 'Y'
),
-test_detail_split
- AS ( SELECT test_suite_id, schema_name, table_name, test_time,
+
+test_detail_split AS (
+ SELECT test_suite_id, schema_name, table_name, test_time,
column_name, test_type, test_definition_id, test_action, test_description,
parms, measure, condition,
SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name
@@ -79,7 +115,9 @@ test_detail_split
FLOOR( SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name
ORDER BY t.column_name ROWS UNBOUNDED PRECEDING )
/ {MAX_QUERY_CHARS} ) + 1 as query_split_no
- FROM test_detail t )
+ FROM test_detail t
+)
+
SELECT '{TEST_RUN_ID}' as test_run_id,
d.schema_name, d.table_name,
d.query_split_no as cat_sequence,
diff --git a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql b/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql
index f3a1d6fa..bfbeec21 100644
--- a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql
+++ b/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql
@@ -20,7 +20,7 @@ SELECT '{PROJECT_CODE}' AS project_code,
WHEN REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') THEN 'N'
ELSE 'X'
END AS general_type,
- NULL AS is_decimal
+ REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') AS is_decimal
FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` c
WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA}
ORDER BY c.table_schema, c.table_name, c.ordinal_position;
diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql
new file mode 100644
index 00000000..03ccee36
--- /dev/null
+++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql
@@ -0,0 +1,46 @@
+SELECT '{TEST_TYPE}' AS test_type,
+ '{TEST_DEFINITION_ID}' AS test_definition_id,
+ '{TEST_SUITE_ID}' AS test_suite_id,
+ '{TEST_RUN_ID}' AS test_run_id,
+ '{RUN_DATE}' AS test_time,
+ '{START_TIME}' AS starttime,
+ CURRENT_TIMESTAMP AS endtime,
+ '{SCHEMA_NAME}' AS schema_name,
+ '{TABLE_NAME}' AS table_name,
+ '{COLUMN_NAME_NO_QUOTES}' AS column_names,
+ '{SKIP_ERRORS}' AS threshold_value,
+ {SKIP_ERRORS} AS skip_errors,
+ '{INPUT_PARAMETERS}' AS input_parameters,
+ NULL as result_signal,
+ CASE WHEN COUNT(*) > {SKIP_ERRORS} THEN 0 ELSE 1 END AS result_code,
+ CASE
+ WHEN COUNT(*) > 0 THEN
+ CONCAT(
+ CAST(COUNT(*) AS STRING),
+ ' error(s) identified, ',
+ CASE
+ WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of '
+ ELSE 'within limit of '
+ END,
+ '{SKIP_ERRORS}.'
+ )
+ ELSE 'No errors found.'
+ END AS result_message,
+ COUNT(*) AS result_measure,
+ '{SUBSET_DISPLAY}' AS subset_condition,
+ NULL AS result_query
+FROM (
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+
+ EXCEPT DISTINCT
+
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM `{MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}`
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+) test;
diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql
new file mode 100644
index 00000000..0aee6ead
--- /dev/null
+++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql
@@ -0,0 +1,54 @@
+-- Relative Entropy: measured by Jensen-Shannon Divergence
+-- Smoothed and normalized version of KL divergence,
+-- with scores between 0 (identical) and 1 (maximally different),
+-- when using the base-2 logarithm. Formula is:
+-- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m), where m = (p + q) / 2
+-- Log base 2 of x = LN(x)/LN(2)
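+-- Example with hypothetical category shares: p = (0.5, 0.5) vs q = (0.9, 0.1) gives m = (0.7, 0.3)
+-- and a divergence of roughly 0.147, a modest shift well below the maximum of 1.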
+WITH latest_ver AS (
+ SELECT {CONCAT_COLUMNS} AS category,
+ CAST(COUNT(*) AS FLOAT64) / CAST(SUM(COUNT(*)) OVER () AS FLOAT64) AS pct_of_total
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}` v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+),
+older_ver AS (
+ SELECT {CONCAT_MATCH_GROUPBY} AS category,
+ CAST(COUNT(*) AS FLOAT64) / CAST(SUM(COUNT(*)) OVER () AS FLOAT64) AS pct_of_total
+  FROM `{MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}` v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+),
+dataset AS (
+ SELECT COALESCE(l.category, o.category) AS category,
+ COALESCE(o.pct_of_total, 0.0000001) AS old_pct,
+ COALESCE(l.pct_of_total, 0.0000001) AS new_pct,
+ (COALESCE(o.pct_of_total, 0.0000001) + COALESCE(l.pct_of_total, 0.0000001)) / 2.0 AS avg_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON l.category = o.category
+)
+SELECT '{TEST_TYPE}' AS test_type,
+ '{TEST_DEFINITION_ID}' AS test_definition_id,
+ '{TEST_SUITE_ID}' AS test_suite_id,
+ '{TEST_RUN_ID}' AS test_run_id,
+ '{RUN_DATE}' AS test_time,
+ '{START_TIME}' AS starttime,
+ CURRENT_TIMESTAMP AS endtime,
+ '{SCHEMA_NAME}' AS schema_name,
+ '{TABLE_NAME}' AS table_name,
+ '{COLUMN_NAME_NO_QUOTES}' AS column_names,
+ -- '{GROUPBY_NAMES}' as column_names,
+ '{THRESHOLD_VALUE}' AS threshold_value,
+ NULL AS skip_errors,
+ '{INPUT_PARAMETERS}' AS input_parameters,
+ NULL as result_signal,
+ CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END AS result_code,
+ CONCAT('Divergence Level: ', CAST(js_divergence AS STRING), ', Threshold: {THRESHOLD_VALUE}.') AS result_message,
+ js_divergence AS result_measure,
+ '{SUBSET_DISPLAY}' AS subset_condition,
+ NULL AS result_query
+FROM (
+ SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2)))
+ + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) AS js_divergence
+ FROM dataset
+) rslt;
diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql
new file mode 100644
index 00000000..70d97b32
--- /dev/null
+++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql
@@ -0,0 +1,30 @@
+SELECT '{TEST_TYPE}' AS test_type,
+ '{TEST_DEFINITION_ID}' AS test_definition_id,
+ '{TEST_SUITE_ID}' AS test_suite_id,
+ '{TEST_RUN_ID}' AS test_run_id,
+ '{RUN_DATE}' AS test_time,
+ '{START_TIME}' AS starttime,
+ CURRENT_TIMESTAMP AS endtime,
+ '{SCHEMA_NAME}' AS schema_name,
+ '{TABLE_NAME}' AS table_name,
+ '{COLUMN_NAME_NO_QUOTES}' AS column_names,
+ '{SKIP_ERRORS}' AS threshold_value,
+ {SKIP_ERRORS} AS skip_errors,
+ '{INPUT_PARAMETERS}' AS input_parameters,
+ fingerprint AS result_signal,
+ /* Fails if table is the same */
+ CASE WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 END AS result_code,
+ CASE
+ WHEN fingerprint = '{BASELINE_VALUE}' THEN 'No table change detected.'
+ ELSE 'Table change detected.'
+ END AS result_message,
+ CASE
+ WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1
+ END AS result_measure,
+ '{SUBSET_DISPLAY}' AS subset_condition,
+ NULL AS result_query
+FROM (
+ SELECT {CUSTOM_QUERY} AS fingerprint
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+) test;
diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql
new file mode 100644
index 00000000..5ba04cfd
--- /dev/null
+++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql
@@ -0,0 +1,44 @@
+SELECT
+ '{TEST_TYPE}' AS test_type,
+ '{TEST_DEFINITION_ID}' AS test_definition_id,
+ '{TEST_SUITE_ID}' AS test_suite_id,
+ '{TEST_RUN_ID}' AS test_run_id,
+ '{RUN_DATE}' AS test_time,
+ '{START_TIME}' AS starttime,
+ CURRENT_TIMESTAMP AS endtime,
+ '{SCHEMA_NAME}' AS schema_name,
+ '{TABLE_NAME}' AS table_name,
+ '{COLUMN_NAME_NO_QUOTES}' AS column_names,
+ '{SKIP_ERRORS}' AS threshold_value,
+ {SKIP_ERRORS} AS skip_errors,
+ '{INPUT_PARAMETERS}' AS input_parameters,
+ NULL as result_signal,
+ CASE WHEN COUNT(*) > {SKIP_ERRORS} THEN 0 ELSE 1 END AS result_code,
+ CASE
+ WHEN COUNT(*) > 0 THEN CONCAT(
+ CAST(COUNT(*) AS STRING), ' error(s) identified, ',
+ CASE
+ WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of '
+ ELSE 'within limit of '
+ END,
+ '{SKIP_ERRORS}.'
+ )
+ ELSE 'No errors found.'
+ END AS result_message,
+ COUNT(*) AS result_measure,
+ '{SUBSET_DISPLAY}' AS subset_condition,
+ NULL AS result_query
+ FROM (
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY)
+ AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY)
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ EXCEPT DISTINCT
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY)
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ ) test;
diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql
new file mode 100644
index 00000000..c16c158e
--- /dev/null
+++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql
@@ -0,0 +1,78 @@
+SELECT '{TEST_TYPE}' AS test_type,
+ '{TEST_DEFINITION_ID}' AS test_definition_id,
+ '{TEST_SUITE_ID}' AS test_suite_id,
+ '{TEST_RUN_ID}' AS test_run_id,
+ '{RUN_DATE}' AS test_time,
+ '{START_TIME}' AS starttime,
+ CURRENT_TIMESTAMP AS endtime,
+ '{SCHEMA_NAME}' AS schema_name,
+ '{TABLE_NAME}' AS table_name,
+ '{COLUMN_NAME_NO_QUOTES}' AS column_names,
+ '{SKIP_ERRORS}' AS threshold_value,
+ {SKIP_ERRORS} AS skip_errors,
+ '{INPUT_PARAMETERS}' AS input_parameters,
+ NULL as result_signal,
+ CASE WHEN COUNT(*) > {SKIP_ERRORS} THEN 0 ELSE 1 END AS result_code,
+ CASE
+ WHEN COUNT(*) > 0 THEN
+ CONCAT(
+ CAST(COUNT(*) AS STRING),
+ ' error(s) identified, ',
+ CASE
+ WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of '
+ ELSE 'within limit of '
+ END,
+ '{SKIP_ERRORS}.'
+ )
+ ELSE 'No errors found.'
+ END AS result_message,
+ COUNT(*) AS result_measure,
+ '{SUBSET_DISPLAY}' AS subset_condition,
+ NULL AS result_query
+FROM (
+  -- Values present in the latest timeframe but missing from the prior timeframe
+ (
+ SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_ADD(
+ (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`),
+ INTERVAL -{WINDOW_DAYS} DAY
+ )
+ EXCEPT DISTINCT
+ SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_ADD(
+ (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`),
+ INTERVAL -2 * {WINDOW_DAYS} DAY
+ )
+ AND {WINDOW_DATE_COLUMN} < DATE_ADD(
+ (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`),
+ INTERVAL -{WINDOW_DAYS} DAY
+ )
+ )
+ UNION ALL
+  -- Values present in the prior timeframe but missing from the latest timeframe
+ (
+ SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_ADD(
+ (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`),
+ INTERVAL -2 * {WINDOW_DAYS} DAY
+ )
+ AND {WINDOW_DATE_COLUMN} < DATE_ADD(
+ (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`),
+ INTERVAL -{WINDOW_DAYS} DAY
+ )
+ EXCEPT DISTINCT
+ SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME}
+ FROM `{SCHEMA_NAME}.{TABLE_NAME}`
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATE_ADD(
+ (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{SCHEMA_NAME}.{TABLE_NAME}`),
+ INTERVAL -{WINDOW_DAYS} DAY
+ )
+ )
+) test;
diff --git a/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql
new file mode 100644
index 00000000..da6811be
--- /dev/null
+++ b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql
@@ -0,0 +1,161 @@
+INSERT INTO test_definitions (table_groups_id, profile_run_id, test_type, test_suite_id,
+ schema_name, table_name,
+ skip_errors, test_active, last_auto_gen_date, profiling_as_of_date,
+ lock_refresh, history_calculation, history_lookback, custom_query )
+WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date
+ FROM profile_results p
+ INNER JOIN profiling_runs r
+ ON (p.profile_run_id = r.id)
+ INNER JOIN test_suites ts
+ ON p.project_code = ts.project_code
+ AND p.connection_id = ts.connection_id
+ WHERE p.project_code = '{PROJECT_CODE}'
+ AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID
+ AND ts.id = '{TEST_SUITE_ID}'
+ AND p.run_date::DATE <= '{AS_OF_DATE}'
+ GROUP BY r.table_groups_id),
+curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
+ distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct
+ FROM last_run lr
+ INNER JOIN profile_results p
+ ON (lr.table_groups_id = p.table_groups_id
+ AND lr.last_run_date = p.run_date) ),
+locked AS (SELECT schema_name, table_name
+ FROM test_definitions
+ WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID
+ AND test_suite_id = '{TEST_SUITE_ID}'
+ AND test_type = 'Table_Freshness'
+ AND lock_refresh = 'Y'),
+-- IDs - TOP 2
+id_cols
+ AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
+ distinct_value_ct,
+ ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
+ ORDER BY
+ CASE
+ WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1
+ WHEN functional_data_type = 'ID-Secondary' THEN 2
+ ELSE 3
+ END, distinct_value_ct DESC, column_name) AS rank
+ FROM curprof
+ WHERE general_type IN ('A', 'D', 'N')
+ AND functional_data_type ILIKE 'ID%'),
+-- Process Date - TOP 1
+process_date_cols
+ AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
+ distinct_value_ct,
+ ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
+ ORDER BY
+ CASE
+ WHEN column_name ILIKE '%mod%' THEN 1
+ WHEN column_name ILIKE '%up%' THEN 1
+ WHEN column_name ILIKE '%cr%' THEN 2
+ WHEN column_name ILIKE '%in%' THEN 2
+ END , distinct_value_ct DESC, column_name) AS rank
+ FROM curprof
+ WHERE general_type IN ('A', 'D', 'N')
+ AND functional_data_type ILIKE 'process%'),
+-- Transaction Date - TOP 1
+tran_date_cols
+ AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
+ distinct_value_ct,
+ ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
+ ORDER BY
+ distinct_value_ct DESC, column_name) AS rank
+ FROM curprof
+          WHERE general_type IN ('A', 'D', 'N')
+            AND (functional_data_type ILIKE 'transactional date%'
+             OR functional_data_type ILIKE 'period%'
+             OR functional_data_type = 'timestamp') ),
+
+-- Numeric Measures
+numeric_cols
+ AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
+/*
+ -- Subscores
+ distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score,
+ (max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS range_score,
+ LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2)) AS nontriviality_score,
+ stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS variability_score,
+ 1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)) AS null_penalty,
+*/
+ -- Weighted score
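+                   -- weights (0.25 cardinality, 0.15 range, 0.10 nontriviality, 0.40 variability, 0.10 null penalty) sum to 1.0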
+ (
+ 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) +
+ 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) +
+ 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) +
+ 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) +
+ 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)))
+ ) AS change_detection_score
+ FROM curprof
+ WHERE general_type = 'N'
+ AND (functional_data_type ILIKE 'Measure%' OR functional_data_type IN ('Sequence', 'Constant'))
+ ),
+numeric_cols_ranked
+ AS ( SELECT *,
+ ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
+ ORDER BY change_detection_score DESC, column_name) as rank
+ FROM numeric_cols
+ WHERE change_detection_score IS NOT NULL),
+combined
+ AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order
+ FROM id_cols
+ WHERE rank <= 2
+ UNION ALL
+ SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order
+ FROM process_date_cols
+ WHERE rank = 1
+ UNION ALL
+ SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order
+ FROM tran_date_cols
+ WHERE rank = 1
+ UNION ALL
+ SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order
+ FROM numeric_cols_ranked
+ WHERE rank = 1 ),
+newtests AS (
+ SELECT profile_run_id, schema_name, table_name,
+ 'CAST(COUNT(*) AS STRING) || "|" || ' ||
+ STRING_AGG(
+ REPLACE(
+ CASE
+ WHEN general_type = 'D' THEN
+ 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(COUNT(DISTINCT @@@) AS STRING)'
+ WHEN general_type = 'A' THEN
+ 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(COUNT(DISTINCT @@@) AS STRING) || "|" || CAST(SUM(LENGTH(@@@)) AS STRING)'
+ WHEN general_type = 'N' THEN
+ 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(SUM(@@@) AS STRING) || "|" || CAST(ROUND(AVG(@@@), 5) AS STRING) || "|" || CAST(ROUND(STDDEV(CAST(@@@ AS FLOAT64)), 5) AS STRING)'
+ END,
+ '@@@', '`' || column_name || '`'),
+ ' || "|" || '
+ ORDER BY element_type, fingerprint_order, column_name
+ ) as fingerprint
+ FROM combined
+ GROUP BY profile_run_id, schema_name, table_name
+)
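+-- e.g., with a hypothetical DATE column `updated_at` selected above, its fingerprint piece expands to
+--   CAST(MIN(`updated_at`) AS STRING) || "|" || CAST(MAX(`updated_at`) AS STRING) || "|" || CAST(COUNT(DISTINCT `updated_at`) AS STRING)
+-- and the full custom_query prefixes CAST(COUNT(*) AS STRING), joining each column's piece with "|".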
+SELECT '{TABLE_GROUPS_ID}'::UUID as table_groups_id,
+ n.profile_run_id,
+ 'Table_Freshness' AS test_type,
+ '{TEST_SUITE_ID}' AS test_suite_id,
+ n.schema_name, n.table_name,
+ 0 as skip_errors, 'Y' as test_active,
+
+ '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date,
+ '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date,
+ 'N' as lock_refresh,
+ 'Value' as history_calculation,
+ 1 as history_lookback,
+ fingerprint as custom_query
+FROM newtests n
+INNER JOIN test_types t
+ ON ('Table_Freshness' = t.test_type
+ AND 'Y' = t.active)
+LEFT JOIN generation_sets s
+ ON (t.test_type = s.test_type
+ AND '{GENERATION_SET}' = s.generation_set)
+LEFT JOIN locked l
+ ON (n.schema_name = l.schema_name
+ AND n.table_name = l.table_name)
+WHERE (s.generation_set IS NOT NULL
+ OR '{GENERATION_SET}' = '')
+ AND l.schema_name IS NULL;
diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml
index dcadd458..41e4ea46 100644
--- a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml
+++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml
@@ -189,7 +189,7 @@ strTemplate08_else: NULL as min_value,
NULL as percentile_25,
NULL as percentile_50,
NULL as percentile_75,
-strTemplate10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum,
+strTemplate10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum,
strTemplate10_else: NULL as fractional_sum,
strTemplate11_D: |
  MIN(`{COL_NAME}`) AS min_date, -- Other flavors have a minimum threshold of 0001-01-01, but BigQuery doesn't make it easy to do the same
diff --git a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql b/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql
index eacffa61..3e04d094 100644
--- a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql
+++ b/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql
@@ -1,3 +1,3 @@
select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns
-from information_schema.columns
+from {COLUMNS_TABLE}
where table_schema in ({TEST_SCHEMAS});
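
This change swaps the hard-coded information_schema.columns for a {COLUMNS_TABLE} token, so flavors such as BigQuery can point the validation query at their own INFORMATION_SCHEMA path. A minimal sketch of the substitution, assuming a simple brace-token replacement like the replace_params helper used elsewhere in this patch (project and dataset names are hypothetical):

    # Sketch only: brace-token substitution standing in for the replace_params helper.
    template = (
        "select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns\n"
        "from {COLUMNS_TABLE}\n"
        "where table_schema in ({TEST_SCHEMAS});"
    )

    def fill(sql: str, params: dict[str, str]) -> str:
        for key, value in params.items():
            sql = sql.replace("{" + key + "}", value)
        return sql

    # Hypothetical BigQuery values; other flavors keep pointing at information_schema.columns.
    print(fill(template, {
        "COLUMNS_TABLE": "`my_project`.`my_dataset`.INFORMATION_SCHEMA.COLUMNS",
        "TEST_SCHEMAS": "'my_dataset'",
    }))
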
diff --git a/testgen/ui/queries/source_data_queries.py b/testgen/ui/queries/source_data_queries.py
index d632457b..9cda0223 100644
--- a/testgen/ui/queries/source_data_queries.py
+++ b/testgen/ui/queries/source_data_queries.py
@@ -62,7 +62,7 @@ def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str]
lookup_query = replace_params(lookup_query, params)
lookup_query = replace_templated_functions(lookup_query, lookup_data.sql_flavor)
return lookup_query
-
+
@st.cache_data(show_spinner=False)
def get_hygiene_issue_source_data(
@@ -98,7 +98,7 @@ def get_test_issue_source_query(issue_data: dict) -> str:
lookup_data = _get_lookup_data(issue_data["table_groups_id"], issue_data["test_type_id"], "Test Results")
if not lookup_data or not lookup_data.lookup_query:
return None
-
+
test_definition = TestDefinition.get(issue_data["test_definition_id_current"])
if not test_definition:
return None
@@ -107,6 +107,7 @@ def get_test_issue_source_query(issue_data: dict) -> str:
"TARGET_SCHEMA": issue_data["schema_name"],
"TABLE_NAME": issue_data["table_name"],
"COLUMN_NAME": issue_data["column_names"],
+ "COLUMN_TYPE": issue_data["column_type"],
"TEST_DATE": str(issue_data["test_date"]),
"CUSTOM_QUERY": test_definition.custom_query,
"BASELINE_VALUE": test_definition.baseline_value,
@@ -146,7 +147,7 @@ def get_test_issue_source_data(
test_definition = TestDefinition.get(issue_data["test_definition_id_current"])
if not test_definition:
return "NA", "Test definition no longer exists.", None, None
-
+
lookup_query = get_test_issue_source_query(issue_data)
if not lookup_query:
return "NA", "Source data lookup is not available for this test.", None, None
@@ -189,7 +190,7 @@ def get_test_issue_source_data_custom(
test_definition = TestDefinition.get(issue_data["test_definition_id_current"])
if not test_definition:
return "NA", "Test definition no longer exists.", None, None
-
+
lookup_query = get_test_issue_source_query_custom(issue_data)
if not lookup_query:
return "NA", "Source data lookup is not available for this test.", None, None
@@ -249,7 +250,7 @@ def _get_lookup_data_custom(
) -> LookupData | None:
result = fetch_one_from_db(
"""
- SELECT
+ SELECT
d.custom_query as lookup_query
FROM test_definitions d
WHERE d.id = :test_definition_id;
diff --git a/testgen/ui/queries/test_result_queries.py b/testgen/ui/queries/test_result_queries.py
index 52a51767..f11abea6 100644
--- a/testgen/ui/queries/test_result_queries.py
+++ b/testgen/ui/queries/test_result_queries.py
@@ -70,6 +70,7 @@ def get_test_results(
-- These are used in the PDF report
tt.threshold_description, tt.usage_notes, r.test_time,
dcc.description as column_description,
+ dcc.column_type as column_type,
COALESCE(dcc.critical_data_element, dtc.critical_data_element) as critical_data_element,
COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source,
COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system,
From 8a171f2113d762f83a02bfd0d7d2da5dee48e830 Mon Sep 17 00:00:00 2001
From: Diogo Basto
Date: Mon, 15 Sep 2025 14:48:47 +0100
Subject: [PATCH 04/48] TG-920
---
testgen/__main__.py | 11 +
testgen/commands/run_launch_db_config.py | 2 +
.../commands/run_test_metadata_exporter.py | 22 +
testgen/commands/run_upgrade_db_config.py | 11 +
testgen/common/read_yaml_metadata_records.py | 251 +++
.../030_initialize_new_schema_structure.sql | 4 +-
.../050_populate_new_schema_metadata.sql | 1770 -----------------
.../055_recreate_metadata_constraints.sql | 17 +
..._anomaly_types_Boolean_Value_Mismatch.yaml | 65 +
...anomaly_types_Char_Column_Date_Values.yaml | 64 +
...nomaly_types_Char_Column_Number_Units.yaml | 18 +
...omaly_types_Char_Column_Number_Values.yaml | 64 +
...anomaly_types_Column_Pattern_Mismatch.yaml | 71 +
...anomaly_types_Delimited_Data_Embedded.yaml | 58 +
...ile_anomaly_types_Inconsistent_Casing.yaml | 89 +
...rofile_anomaly_types_Invalid_Zip3_USA.yaml | 62 +
...profile_anomaly_types_Invalid_Zip_USA.yaml | 58 +
.../profile_anomaly_types_Leading_Spaces.yaml | 59 +
...le_anomaly_types_Multiple_Types_Major.yaml | 58 +
...le_anomaly_types_Multiple_Types_Minor.yaml | 58 +
.../profile_anomaly_types_No_Values.yaml | 61 +
..._anomaly_types_Non_Alpha_Name_Address.yaml | 69 +
...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 69 +
...file_anomaly_types_Non_Printing_Chars.yaml | 125 ++
...ile_anomaly_types_Non_Standard_Blanks.yaml | 59 +
...le_anomaly_types_Potential_Duplicates.yaml | 60 +
.../profile_anomaly_types_Potential_PII.yaml | 59 +
.../profile_anomaly_types_Quoted_Values.yaml | 59 +
...rofile_anomaly_types_Recency_One_Year.yaml | 58 +
...file_anomaly_types_Recency_Six_Months.yaml | 58 +
...nomaly_types_Small Divergent Value Ct.yaml | 64 +
..._anomaly_types_Small Missing Value Ct.yaml | 67 +
..._anomaly_types_Small_Numeric_Value_Ct.yaml | 61 +
...maly_types_Standardized_Value_Matches.yaml | 60 +
.../profile_anomaly_types_Suggested_Type.yaml | 60 +
..._anomaly_types_Table_Pattern_Mismatch.yaml | 71 +
...ofile_anomaly_types_Unexpected Emails.yaml | 59 +
...le_anomaly_types_Unexpected US States.yaml | 61 +
...le_anomaly_types_Unlikely_Date_Values.yaml | 61 +
...le_anomaly_types_Variant_Coded_Values.yaml | 61 +
.../test_types_Aggregate_Balance.yaml | 180 ++
.../test_types_Aggregate_Balance_Percent.yaml | 190 ++
.../test_types_Aggregate_Balance_Range.yaml | 190 ++
.../test_types_Aggregate_Minimum.yaml | 180 ++
.../test_types_Alpha_Trunc.yaml | 127 ++
.../test_types_Avg_Shift.yaml | 128 ++
.../dbsetup_test_types/test_types_CUSTOM.yaml | 61 +
.../test_types_Combo_Match.yaml | 165 ++
.../test_types_Condition_Flag.yaml | 128 ++
.../test_types_Constant.yaml | 127 ++
.../test_types_Daily_Record_Ct.yaml | 171 ++
.../test_types_Dec_Trunc.yaml | 130 ++
.../test_types_Distinct_Date_Ct.yaml | 128 ++
.../test_types_Distinct_Value_Ct.yaml | 127 ++
.../test_types_Distribution_Shift.yaml | 191 ++
.../test_types_Dupe_Rows.yaml | 125 ++
.../test_types_Email_Format.yaml | 127 ++
.../test_types_Future_Date.yaml | 126 ++
.../test_types_Future_Date_1Y.yaml | 127 ++
.../test_types_Incr_Avg_Shift.yaml | 128 ++
.../test_types_LOV_All.yaml | 125 ++
.../test_types_LOV_Match.yaml | 127 ++
.../test_types_Min_Date.yaml | 127 ++
.../test_types_Min_Val.yaml | 127 ++
.../test_types_Missing_Pct.yaml | 128 ++
.../test_types_Monthly_Rec_Ct.yaml | 168 ++
.../test_types_Outlier_Pct_Above.yaml | 132 ++
.../test_types_Outlier_Pct_Below.yaml | 132 ++
.../test_types_Pattern_Match.yaml | 127 ++
.../test_types_Recency.yaml | 128 ++
.../test_types_Required.yaml | 126 ++
.../dbsetup_test_types/test_types_Row_Ct.yaml | 126 ++
.../test_types_Row_Ct_Pct.yaml | 127 ++
.../test_types_Street_Addr_Pattern.yaml | 128 ++
.../test_types_Table_Freshness.yaml | 61 +
.../test_types_Timeframe_Combo_Gain.yaml | 137 ++
.../test_types_Timeframe_Combo_Match.yaml | 219 ++
.../test_types_US_State.yaml | 128 ++
.../dbsetup_test_types/test_types_Unique.yaml | 128 ++
.../test_types_Unique_Pct.yaml | 128 ++
.../test_types_Valid_Characters.yaml | 128 ++
.../test_types_Valid_Month.yaml | 87 +
.../test_types_Valid_US_Zip.yaml | 126 ++
.../test_types_Valid_US_Zip3.yaml | 127 ++
.../test_types_Variability_Decrease.yaml | 132 ++
.../test_types_Variability_Increase.yaml | 136 ++
.../test_types_Weekly_Rec_Ct.yaml | 168 ++
.../dbupgrade/0151_incremental_upgrade.sql | 5 +
88 files changed, 8737 insertions(+), 1771 deletions(-)
create mode 100644 testgen/commands/run_test_metadata_exporter.py
create mode 100644 testgen/common/read_yaml_metadata_records.py
create mode 100644 testgen/template/dbsetup/055_recreate_metadata_constraints.sql
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
create mode 100644 testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Constant.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Recency.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Required.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_US_State.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Unique.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
create mode 100644 testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
create mode 100644 testgen/template/dbupgrade/0151_incremental_upgrade.sql
diff --git a/testgen/__main__.py b/testgen/__main__.py
index 6362a868..942b02a4 100644
--- a/testgen/__main__.py
+++ b/testgen/__main__.py
@@ -31,6 +31,7 @@
from testgen.commands.run_observability_exporter import run_observability_exporter
from testgen.commands.run_profiling_bridge import run_profiling_queries
from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment
+from testgen.commands.run_test_metadata_exporter import run_test_metadata_exporter
from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config
from testgen.common import (
configure_logging,
@@ -503,6 +504,16 @@ def export_data(configuration: Configuration, project_key: str, test_suite_key:
click.echo("\nexport-observability completed successfully.\n")
+@cli.command("export-test-metadata", help="Exports current test metadata records to yaml files.")
+@pass_configuration
+def export_test_metadata(configuration: Configuration):
+ click.echo("export-test-metadata")
+ LOG.info("CurrentStep: Main Program - Test Metadata Export")
+ run_test_metadata_exporter()
+ LOG.info("CurrentStep: Main Program - Test Metadata Export - DONE")
+ click.echo("\nexport-test-metadata completed successfully.\n")
+
+
@cli.command("list-test-types", help="Lists all available TestGen test types.")
@click.option("-d", "--display", help="Show command output in the terminal.", is_flag=True, default=False)
@pass_configuration
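
With the command registered above, the export can be triggered from the TestGen CLI; per the exporter added later in this patch, it writes one YAML file per parent record into the current working directory:

    testgen export-test-metadata
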
diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py
index 68f99336..6899f0dc 100644
--- a/testgen/commands/run_launch_db_config.py
+++ b/testgen/commands/run_launch_db_config.py
@@ -10,6 +10,7 @@
from testgen.common.models.scores import ScoreDefinition
from testgen.common.models.table_group import TableGroup
from testgen.common.read_file import get_template_files
+from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml
LOG = logging.getLogger("testgen")
@@ -85,6 +86,7 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) ->
password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
user_type="schema_admin",
)
+ import_metadata_records_from_yaml(params_mapping)
ScoreDefinition.from_table_group(
TableGroup(
diff --git a/testgen/commands/run_test_metadata_exporter.py b/testgen/commands/run_test_metadata_exporter.py
new file mode 100644
index 00000000..30e3188c
--- /dev/null
+++ b/testgen/commands/run_test_metadata_exporter.py
@@ -0,0 +1,22 @@
+import logging
+
+from testgen import settings
+from testgen.common.credentials import get_tg_schema
+from testgen.common.models import with_database_session
+from testgen.common.read_yaml_metadata_records import export_metadata_records_to_yaml
+
+LOG = logging.getLogger("testgen")
+
+
+def _get_params_mapping() -> dict:
+ return {
+ "SCHEMA_NAME": get_tg_schema(),
+ "TESTGEN_ADMIN_USER": settings.DATABASE_ADMIN_USER,
+ "TESTGEN_ADMIN_PASSWORD": settings.DATABASE_ADMIN_PASSWORD,
+ "OBSERVABILITY_URL": settings.OBSERVABILITY_API_URL,
+ }
+
+
+@with_database_session
+def run_test_metadata_exporter() -> None:
+ export_metadata_records_to_yaml(_get_params_mapping())
diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py
index 5d532120..95ec4bc0 100644
--- a/testgen/commands/run_upgrade_db_config.py
+++ b/testgen/commands/run_upgrade_db_config.py
@@ -5,6 +5,7 @@
from testgen.common.credentials import get_tg_schema
from testgen.common.database.database_service import replace_params
from testgen.common.read_file import get_template_files
+from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml
LOG = logging.getLogger("testgen")
@@ -96,6 +97,16 @@ def _refresh_static_metadata(params_mapping):
password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
user_type="schema_admin",
)
+ import_metadata_records_from_yaml(params_mapping)
+
+ strQueryMetadataConstraints = read_template_sql_file("055_recreate_metadata_constraints.sql", "dbsetup")
+ strQueryMetadataConstraints = replace_params(strQueryMetadataConstraints, params_mapping)
+ execute_db_queries(
+ [(strQueryMetadataConstraints, None)],
+ user_override=params_mapping["TESTGEN_ADMIN_USER"],
+ password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
+ user_type="schema_admin",
+ )
def _update_revision_number(params_mapping, latest_prefix_applied):
diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py
new file mode 100644
index 00000000..2602d147
--- /dev/null
+++ b/testgen/common/read_yaml_metadata_records.py
@@ -0,0 +1,251 @@
+__all__ = ["import_metadata_records_from_yaml", "export_metadata_records_to_yaml"]
+
+import logging
+from importlib.resources import as_file
+
+from yaml import SafeDumper, safe_dump, safe_load
+
+from testgen.common.database.database_service import execute_db_queries, fetch_from_db_threaded
+from testgen.common.read_file import get_template_files
+
+LOG = logging.getLogger("testgen")
+
+TEST_TYPES_PARENT_TABLE = "test_types"
+TEST_TYPES_PARENT_KEY = "test_type"
+TEST_TYPES_CHILD_TABLES = ["cat_test_conditions", "target_data_lookups", "test_templates"]
+
+# Fallback PKs
+TEST_TYPES_DEFAULT_PK = {
+ "target_data_lookups": ["test_id", "sql_flavor", "error_type"],
+ "test_templates": ["test_type", "sql_flavor"],
+ "cat_test_conditions": ["test_type", "sql_flavor"],
+}
+
+# child_col → parent_col for filtering
+TEST_TYPES_PARENT_CHILD_COLUMN_MAP = {
+ "cat_test_conditions": {
+ "test_type": "test_type",
+ },
+ "target_data_lookups": {
+ "test_type": "test_type",
+ "test_id": "id",
+ },
+ "test_templates": {
+ "test_type": "test_type",
+ },
+}
+
+# Columns to treat as literal blocks (embedded special chars)
+TEST_TYPES_LITERAL_FIELDS = {
+ "test_types": [
+ "test_description",
+ "except_message",
+ "measure_uom_description",
+ "selection_criteria",
+ "dq_score_prevalence_formula",
+ "column_name_prompt",
+ "column_name_help",
+ "default_parm_values",
+ "default_parm_prompts",
+ "default_parm_help",
+ "threshold_description",
+ "usage_notes",
+ ],
+ "cat_test_conditions": [
+ "measure",
+ "test_condition",
+ ],
+ "target_data_lookups": [
+ "lookup_query",
+ ],
+}
+
+
+ANOMALY_TYPES_PARENT_TABLE = "profile_anomaly_types"
+ANOMALY_TYPES_PARENT_KEY = "anomaly_type"
+ANOMALY_TYPES_CHILD_TABLES = ["target_data_lookups"]
+
+# Fallback PKs
+ANOMALY_TYPES_DEFAULT_PK = {
+ "target_data_lookups": ["test_id", "sql_flavor", "error_type"],
+}
+
+# child_col → parent_col for filtering
+ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP = {
+ "target_data_lookups": {
+ "test_type": "anomaly_type",
+ "test_id": "id",
+ },
+}
+
+# Columns to treat as literal blocks (embedded special chars)
+ANOMALY_TYPES_LITERAL_FIELDS = {
+ "profile_anomaly_types": [
+ "anomaly_description",
+ "anomaly_criteria",
+ "detail_expression",
+ "suggested_action",
+ "dq_score_prevalence_formula",
+ ],
+ "target_data_lookups": [
+ "lookup_query",
+ ],
+}
+
+
+
+class LiteralString(str):
+ pass
+
+def _add_literal_representer():
+ def _literal_representer(dumper, data):
+ # emit this string with | style
+ return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+ SafeDumper.add_representer(LiteralString, _literal_representer)
+
+
+def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]]):
+ queries = []
+
+ parent = data.get(parent_table)
+ if not isinstance(parent, dict):
+ raise TypeError(f"YAML key '{parent_table}' must be a dict")
+
+ for table_name in child_tables:
+ records = parent.pop(table_name, [])
+ if not isinstance(records, list):
+ raise TypeError(f"YAML key '{table_name}' under parent must be a list")
+
+ mapping = parent_child_column_map.get(table_name, {})
+
+ pk_cols = default_pk.get(table_name) or [parent_key]
+
+ for record in records:
+ for child_col, parent_col in mapping.items():
+ record.setdefault(child_col, parent.get(parent_col))
+
+ columns = list(record.keys())
+
+ insert_cols = ", ".join(columns)
+ insert_vals = ", ".join(f":{c}" for c in columns)
+ update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c not in pk_cols)
+ bound_values = {c: record[c] for c in columns}
+
+ sql = f"""
+ INSERT INTO {params_mapping["SCHEMA_NAME"]}.{table_name} ({insert_cols})
+ VALUES ({insert_vals})
+ ON CONFLICT ({', '.join(pk_cols)}) DO UPDATE
+ SET {update_stmt};
+ """
+ queries.append((sql, bound_values))
+
+ columns = list(parent.keys())
+
+ insert_cols = ", ".join(columns)
+ insert_vals = ", ".join(f":{c}" for c in columns)
+ update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c != parent_key)
+ bound_values = {c: parent[c] for c in columns}
+ parent_insert_query = f"""
+ INSERT INTO {params_mapping["SCHEMA_NAME"]}.{parent_table} ({insert_cols})
+ VALUES ({insert_vals})
+ ON CONFLICT ({parent_key}) DO UPDATE
+ SET {update_stmt};
+ """
+
+ queries = [(parent_insert_query, bound_values), *queries]
+
+ execute_db_queries(
+ queries,
+ user_override=params_mapping["TESTGEN_ADMIN_USER"],
+ password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
+ user_type="schema_admin",
+ )
+ return
+
+def import_metadata_records_from_yaml(params_mapping) -> None:
+ files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory="dbsetup_test_types"), key=lambda key: str(key))
+ for yaml_file in files:
+ with as_file(yaml_file) as f:
+ with f.open("r") as file:
+ data = safe_load(file)
+ _process_yaml_for_import(
+ params_mapping,
+ data,
+ TEST_TYPES_PARENT_TABLE,
+ TEST_TYPES_PARENT_KEY,
+ TEST_TYPES_CHILD_TABLES,
+ TEST_TYPES_DEFAULT_PK,
+ TEST_TYPES_PARENT_CHILD_COLUMN_MAP,
+ )
+ files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory="dbsetup_anomaly_types"), key=lambda key: str(key))
+ for yaml_file in files:
+ with as_file(yaml_file) as f:
+ with f.open("r") as file:
+ LOG.info(f"Importing {yaml_file}")
+ data = safe_load(file)
+ _process_yaml_for_import(
+ params_mapping,
+ data,
+ ANOMALY_TYPES_PARENT_TABLE,
+ ANOMALY_TYPES_PARENT_KEY,
+ ANOMALY_TYPES_CHILD_TABLES,
+ ANOMALY_TYPES_DEFAULT_PK,
+ ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP,
+ )
+ return
+
+
+def _process_records_for_export(params_mapping: dict, parent_table:str, parent_key:str, child_tables:list[str], parent_child_column_map:dict[str, dict[str,str]], literal_fields:dict[str, list[str]]) -> None:
+ def wrap_literal(table_name, recs):
+ for rec in recs:
+ for fld in literal_fields.get(table_name, []):
+ val = rec.get(fld)
+ if isinstance(val, str) and val != "":
+ rec[fld] = LiteralString(val)
+
+ fetch_parent_query = f"SELECT * FROM {params_mapping['SCHEMA_NAME']}.{parent_table};"
+ parent_records, parent_columns, _ = fetch_from_db_threaded(
+ [(fetch_parent_query, None)],
+ )
+ for parent_record in parent_records:
+ parent_record_dict = dict(zip(parent_columns, parent_record, strict=False))
+ for child_name in child_tables:
+ child_key = next(key for key, value in parent_child_column_map[child_name].items() if value==parent_key)
+ fetch_children_query = f"SELECT * FROM {params_mapping['SCHEMA_NAME']}.{child_name} WHERE {child_key} = '{parent_record_dict[parent_key]}';"
+ child_records, child_columns, _ = fetch_from_db_threaded(
+ [(fetch_children_query, None)],
+ )
+ child_records_dict = []
+ for child_record in child_records:
+ child_records_dict.append(dict(zip(child_columns, child_record, strict=False)))
+ LOG.debug(child_records_dict)
+ wrap_literal(child_name, child_records_dict)
+ parent_record_dict[child_name] = child_records_dict
+
+ wrap_literal(parent_table, [parent_record_dict])
+ payload = {parent_table: parent_record_dict}
+ out_file = f"{parent_table}_{parent_record_dict[parent_key]}.yaml"
+ LOG.info(f"Exporting {out_file}")
+ with open(out_file, "w") as f:
+ safe_dump(payload, f, sort_keys=False)
+
+
+def export_metadata_records_to_yaml(params_mapping: dict) -> None:
+ _add_literal_representer()
+ _process_records_for_export(
+ params_mapping,
+ TEST_TYPES_PARENT_TABLE,
+ TEST_TYPES_PARENT_KEY,
+ TEST_TYPES_CHILD_TABLES,
+ TEST_TYPES_PARENT_CHILD_COLUMN_MAP,
+ TEST_TYPES_LITERAL_FIELDS,
+ )
+ _process_records_for_export(
+ params_mapping,
+ ANOMALY_TYPES_PARENT_TABLE,
+ ANOMALY_TYPES_PARENT_KEY,
+ ANOMALY_TYPES_CHILD_TABLES,
+ ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP,
+ ANOMALY_TYPES_LITERAL_FIELDS,
+ )
+ return
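
For reference, the LiteralString/SafeDumper representer registered in this module is what keeps multi-line SQL fields readable in the exported files: wrapped values are emitted as YAML literal blocks. A standalone sketch with a hypothetical record:

    # Sketch only: same representer pattern as above, applied to a hypothetical record.
    from yaml import SafeDumper, safe_dump

    class LiteralString(str):
        pass

    def _literal_representer(dumper, data):
        # emit this string with | (literal block) style
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")

    SafeDumper.add_representer(LiteralString, _literal_representer)

    record = {
        "test_type": "Table_Freshness",
        "lookup_query": LiteralString("SELECT *\nFROM {TARGET_SCHEMA}.{TABLE_NAME}\nLIMIT 500"),
    }
    # lookup_query comes out as a "lookup_query: |-" block spanning three lines
    print(safe_dump(record, sort_keys=False))
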
diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
index 09593c39..146e1c06 100644
--- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
+++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
@@ -619,7 +619,9 @@ CREATE TABLE target_data_lookups (
sql_flavor VARCHAR(20) NOT NULL,
lookup_type VARCHAR(10),
lookup_query VARCHAR,
- error_type VARCHAR(30) NOT NULL
+ error_type VARCHAR(30) NOT NULL,
+ CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk
+ PRIMARY KEY (test_id, sql_flavor, error_type)
);
CREATE TABLE variant_codings (
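
Worth noting: the composite primary key added here is what the YAML importer in this patch relies on for its ON CONFLICT clause. A sketch of the upsert it would generate for a target_data_lookups record (schema name and values are hypothetical):

    # Sketch only: the shape of the statement _process_yaml_for_import builds, which
    # depends on the (test_id, sql_flavor, error_type) primary key defined above.
    schema = "testgen"  # hypothetical schema name
    record = {
        "test_id": "1004",
        "sql_flavor": "bigquery",
        "error_type": "Test Results",
        "lookup_query": "SELECT 1",
    }
    pk_cols = ["test_id", "sql_flavor", "error_type"]

    cols = list(record)
    insert_cols = ", ".join(cols)
    insert_vals = ", ".join(":" + c for c in cols)
    update_stmt = ", ".join(c + "=EXCLUDED." + c for c in cols if c not in pk_cols)
    print(
        f"INSERT INTO {schema}.target_data_lookups ({insert_cols})\n"
        f"VALUES ({insert_vals})\n"
        f"ON CONFLICT ({', '.join(pk_cols)}) DO UPDATE\n"
        f"SET {update_stmt};"
    )
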
diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
index 47d0e9a9..4b1c20a7 100644
--- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
+++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
@@ -12,156 +12,8 @@ ALTER TABLE cat_test_conditions DROP CONSTRAINT cat_test_conditions_cat_tests_te
TRUNCATE TABLE profile_anomaly_types;
-INSERT INTO profile_anomaly_types
- (id, anomaly_type, data_object, anomaly_name, anomaly_description, anomaly_criteria, detail_expression, issue_likelihood, suggested_action, dq_score_prevalence_formula, dq_score_risk_factor, dq_dimension)
-VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%char%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighten controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.', NULL, NULL, NULL),
- ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.zero_length_ct > 0 OR (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))))', '''Dummy Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Completeness'),
- ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.functional_data_type = ''Zip'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR EXISTS (SELECT 1 FROM UNNEST(STRING_TO_ARRAY(p.top_patterns, '' | '')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0 AND val NOT IN (''NNNNN'',''NNNNN-NNNN'',''NNNNNNNNN'')))', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type ELSE '''' END || CASE WHEN p.general_type = ''A'' THEN ''Patterns: '' || (SELECT string_agg(val, '','') FROM UNNEST(STRING_TO_ARRAY(top_patterns, '' | '')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0) || '', Dummy Values: '' || p.filled_value_ct::VARCHAR ELSE '''' END', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0', 'Validity'),
- ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.', NULL, NULL, 'Consistency'),
- ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.', NULL, NULL, 'Consistency'),
- ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Dummy: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33', 'Completeness'),
- ('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A''
- AND functional_data_type NOT ILIKE ''Measurement%'' AND functional_data_type NOT IN (''Category'', ''Code'')
- AND p.max_length > 3
- AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct)
- AND p.distinct_pattern_ct BETWEEN 2 AND 4
- AND STRPOS(p.top_patterns, ''N'') > 0
- AND (
- ( (STRPOS(p.top_patterns, ''A'') > 0 OR STRPOS(p.top_patterns, ''a'') > 0)
- AND SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.05)
- OR
- SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.1
- )', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.', '(p.record_ct - SPLIT_PART(p.top_patterns, ''|'', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
- ('1008', 'Table_Pattern_Mismatch', 'Multi-Col', 'Pattern Inconsistency Across Tables', 'Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic.', 'p.general_type = ''A''
- AND functional_data_type NOT ILIKE ''Measurement%'' AND functional_data_type NOT IN (''Category'', ''Code'')
- AND p.max_length > 3
- AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct)
- AND m.max_pattern_ct = 1
- AND m.column_ct > 1
- AND SPLIT_PART(p.top_patterns, ''|'', 2) <> SPLIT_PART(m.very_top_pattern, ''|'', 2)
- AND SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, ''|'', 1)::NUMERIC < 0.1', '''Patterns: '' || SPLIT_PART(p.top_patterns, ''|'', 2) || '', '' || SPLIT_PART(ltrim(m.very_top_pattern, ''0''), ''|'', 2)', 'Likely', 'Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent.', NULL, NULL, 'Validity'),
- ('1009', 'Leading_Spaces', 'Column', 'Leading Spaces Found in Column Values', 'Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.lead_space_ct > 0', '''Cases Found: '' || p.lead_space_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.lead_space_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
- ('1010', 'Quoted_Values', 'Column', 'Quoted Values Found in Column Values', 'Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.quoted_value_ct > 0', '''Cases Found: '' || p.quoted_value_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.quoted_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
- ('1011', 'Char_Column_Number_Values', 'Column', 'Character Column with Mostly Numeric Values', 'This column is defined as alpha, but more than 95% of its values are numeric. Numbers in alpha columns won''t sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve.', 'p.general_type = ''A''
- AND p.column_name NOT ILIKE ''%zip%''
- AND p.functional_data_type NOT ILIKE ''id%''
- AND p.functional_data_type NOT ILIKE ''Period%''
- AND p.value_ct > p.numeric_ct
- AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
- ('1012', 'Char_Column_Date_Values', 'Column', 'Character Column with Mostly Date Values', 'This column is defined as alpha, but more than 95% of its values are dates. Dates in alpha columns might not sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve. ', 'p.general_type = ''A''
- AND p.value_ct > p.date_ct
- AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.', 'p.date_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
- ('1013', 'Small Missing Value Ct', 'Column', 'Small Percentage of Missing Values Found', 'Under 3% of values in this column were found to be null, zero-length or dummy values, but values are not universally present. This could indicate unexpected missing values in a required column.', '(p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END
- )::FLOAT / p.record_ct::FLOAT > 0.97
- AND (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END
- ) < p.record_ct', '(p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END
- ))::VARCHAR(20) ||
- '' of '' || p.record_ct::VARCHAR(20) || '' blank values: '' ||
- ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN (''Phone'', ''Zip''))) THEN p.filled_value_ct ELSE 0 END
- ))::NUMERIC(18, 5)
- / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.', '(p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Completeness'),
- ('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.', 'functional_data_type <> ''Boolean'' AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT /
- p.value_ct::FLOAT) > 97::FLOAT
- AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT /
- NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT
- / NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40)
- || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.', '(p.record_ct - fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Validity'),
- ('1015', 'Boolean_Value_Mismatch', 'Column', 'Unexpected Boolean Values Found', 'This column appears to contain boolean (True/False) data, but unexpected values were found. This could indicate inconsistent coding for the same intended values, potentially leading to downstream errors or inconsistent business logic. ', '(distinct_value_ct > 1 AND
- ((lower(top_freq_values) ILIKE ''| true |%'' OR lower(top_freq_values) ILIKE ''| false |%'') AND NOT (lower(top_freq_values) ILIKE ''%| true |%'' AND lower(top_freq_values) ILIKE ''%| false |%''))
- OR ((lower(top_freq_values) ILIKE ''| yes |%'' OR lower(top_freq_values) ILIKE ''| no |%'' ) AND NOT (lower(top_freq_values) ILIKE ''%| yes |%'' AND lower(top_freq_values) ILIKE ''%| no |%'')) )', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text
- ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', NULL, '0.66', 'Validity'),
- ('1016', 'Potential_Duplicates', 'Column', 'Potential Duplicate Values Found', 'This column is largely unique, but some duplicate values are present. This pattern is uncommon and could indicate inadvertant duplication. ', 'p.distinct_value_ct > 1000
- AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', '(p.value_ct - p.distinct_value_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Uniqueness'),
- ('1017', 'Standardized_Value_Matches', 'Column', 'Similar Values Match When Standardized', 'When column values are standardized (removing spaces, single-quotes, periods and dashes), matching values are found in other records. This may indicate that formats should be further standardized to allow consistent comparisons for merges, joins and roll-ups. It could also indicate the presence of unintended duplicates.', 'p.general_type = ''A'' AND p.distinct_std_value_ct <> p.distinct_value_ct AND p.functional_data_type NOT LIKE ''Person%Name'' ', '''Distinct Values: '' || p.distinct_value_ct::VARCHAR
- || '', Standardized: '' || p.distinct_std_value_ct::VARCHAR', 'Likely', 'Review standardized vs. raw data values for all matches. Correct data if values should be consistent.', '(p.distinct_value_ct - p.distinct_std_value_ct)::FLOAT/NULLIF(p.value_ct, 0)', '0.66', 'Uniqueness'),
- ('1018', 'Unlikely_Date_Values', 'Column', 'Unlikely Dates out of Typical Range', 'Some date values in this column are earlier than 1900-01-01 or later than 30 years after Profiling date.', 'p.general_type = ''D''
- AND (p.min_date BETWEEN ''0001-01-02''::DATE AND ''1900-01-01''::DATE
- OR p.max_date > CURRENT_DATE + INTERVAL ''30 year'')', '''Date Range: '' || p.min_date::VARCHAR || '' thru '' || p.max_date::VARCHAR', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.', '(COALESCE(p.before_100yr_date_ct,0)+COALESCE(p.distant_future_date_ct, 0))::FLOAT/NULLIF(p.record_ct, 0)', '0.66', 'Accuracy'),
- ('1019', 'Recency_One_Year', 'Dates', 'Recency - No Table Dates within 1 Year', 'Among all date columns present in the table, none fall inside of one year from Profile date.', 'MAX(p.max_date) < CURRENT_DATE - INTERVAL ''1 year''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL, 'Timeliness'),
- ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. ', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL, 'Timeliness'),
- ('1021', 'Unexpected US States', 'Column', 'Unexpected Column Contains US States', 'This column is not labeled as a state, but contains mostly US State abbreviations. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''STATE_USA''
- AND p.distinct_value_ct > 5
- AND NOT (p.column_name = ''st'' OR p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'' OR p.column_name ILIKE ''st_%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN '', Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.', NULL, '0.33', 'Consistency'),
- ('1022', 'Unexpected Emails', 'Column', 'Unexpected Column Contains Emails', 'This column is not labeled as email, but contains mostly email addresses. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''EMAIL''
- AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.', NULL, '0.33', 'Consistency'),
- ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'A small fraction (under 3%) of values in this column were found to be numeric. They could be erroneous.', 'p.general_type = ''A''
- AND p.numeric_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT < 0.03
- AND p.numeric_ct > 0', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
- ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1
- AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'')
- AND SPLIT_PART(p.top_patterns, '' | '', 2) = ''NNN''
- AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.', '(NULLIF(p.record_ct, 0)::INT - SPLIT_PART(p.top_patterns, '' | '', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1', 'Validity'),
- ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.', NULL, '0.66', 'Validity'),
- ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.', NULL, '0.33', 'Consistency'),
- ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.', NULL, NULL, 'Consistency'),
- ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.', NULL, 'CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN 1 WHEN ''B'' THEN 0.66 WHEN ''C'' THEN 0.33 END', 'Validity'),
- ('1028', 'Inconsistent_Casing', 'Column', 'Inconsistent Casing', 'Casing is inconsistent for a column representing an entity name or address elements. Mixed-Case and All-Upper-Case values were found in the same column.', 'mixed_case_ct > 0 AND upper_case_ct > 0 AND functional_data_type IN (''Address'', ''City'', ''Entity Name'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Mixed-Case: '' || p.mixed_case_ct::VARCHAR || '', All-Upper-Case: '' || p.upper_case_ct::VARCHAR || '' for Semantic Data Type: '' || p.functional_data_type || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Review your source data and follow-up with data owners to determine whether consistent casing should be applied at the source. If source data corrections are not possible, consider standardizing the column upon ingestion to ensure consistent casing.', 'LEAST(p.mixed_case_ct, p.upper_case_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity'),
- ('1029', 'Non_Alpha_Name_Address', 'Column', 'Non-Alpha Name or Address', 'Entirely non-alphabetic values were found in a column representing an entity name or address element.', 'non_alpha_ct - zero_length_ct > 0 AND functional_data_type IN (''Address'', ''City'', ''Entity Name'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Non-Alpha Values: '' || (non_alpha_ct - zero_length_ct)::VARCHAR || '', Semantic Type: '' || p.functional_data_type || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Non-alphabetic values are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider assigning the processed value to null to reflect that data is missing.', '(non_alpha_ct - zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity'),
- ('1030', 'Non_Alpha_Prefixed_Name', 'Column', 'Non-Alpha Prefixed Name', 'Non-alphabetic characters were found at the start of a column representing an entity name.', 'min_text < ''A'' AND LEFT(min_text, 1) NOT IN (''"'', '' '') AND RIGHT(min_text, 1) <> '''''''' AND functional_data_type IN (''City'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Minimum Value: '' || min_text', 'Definite', 'Values starting with a non-alphabetic character are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. It could also indicate flagging or coding of some kind that can be broken out in a separate column in processed data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider applying corrections directly to processed data where possible.', '0.25', '1.0', 'Validity'),
- ('1031', 'Non_Printing_Chars', 'Column', 'Non-Printing Characters', 'Non-printing characters were found embedded in a text column.', 'non_printing_ct > 0', '''Non-Printing Chars: '' || non_printing_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Embedded non-printing characters are typically stripped from data. They affect filters and aggregations, and may cause problems for downstream users who don''t recognize their presence. Review your source data and follow-up with data owners to determine whether this data can be corrected upstream. If not, strip these characters from processed data.', 'non_printing_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity')
-;
-
-
TRUNCATE TABLE test_types;
-
-INSERT INTO test_types
- (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, dq_score_prevalence_formula, dq_score_risk_factor, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active, result_visualization, result_visualization_params)
-VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', '{VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) ) /NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', 'FLOOR(0.95 * max_length::FLOAT)', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the defined threshold, initially 95% of the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y', 'line_chart', NULL),
- ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself.', 'Y', 'line_chart', NULL),
- ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y', 'line_chart', NULL),
- ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_DAYS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. ', 'Y', 'line_chart', NULL),
- ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum > 0 AND functional_table_type LIKE''%cumulative%''', '1', '1.0', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y', 'line_chart', NULL),
- ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%''', '(({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/NULLIF({PRO_RECORD_CT}::FLOAT, 0))/NULLIF({PRO_RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y', 'line_chart', NULL),
- ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 50 AND functional_data_type IN (''Code'', ''Category'', ''Attribute'', ''Description'') AND NOT coalesce(top_freq_values,'''') > ''''', 'ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DISTINCT_VALUE_CT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.', 'Y', 'line_chart', NULL),
- ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y', 'line_chart', NULL),
- ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y', 'line_chart', NULL),
- ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y', 'line_chart', NULL),
- ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '{RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y', 'line_chart', NULL),
- ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, '1', '1.0', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y', 'line_chart', NULL),
- ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'functional_data_type IN (''Boolean'', ''Code'', ''Category'') AND top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND value_ct > 5', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y', 'line_chart', NULL),
- ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates do not change.', 'Y', 'line_chart', NULL),
- ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N'' AND functional_data_type ILIKE ''Measure%'' AND min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.', 'Y', 'line_chart', NULL),
- ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y', 'line_chart', NULL),
- ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%'' AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_MONTHS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records.', 'Y', 'line_chart', NULL),
- ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'functional_data_type = ''Measurement'' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y', 'line_chart', NULL),
- ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'functional_data_type = ''Measurement'' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y', 'line_chart', NULL),
- ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, '(functional_data_type IN (''Attribute'', ''DateTime Stamp'', ''Phone'') OR functional_data_type ILIKE ''ID%'' OR functional_data_type ILIKE ''Period%'') AND fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''([*+\-%_])'', ''[\1]'', ''g''), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.', 'Y', 'line_chart', NULL),
- ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', '(ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF(''DAY'', ''{MIN_DATE}'', ''{MAX_DATE}''))::FLOAT)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed.', 'Y', 'line_chart', NULL),
- ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct AND record_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y', 'line_chart', NULL),
- ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', '({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y', 'line_chart', NULL),
- ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y', 'line_chart', NULL),
- ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10 AND functional_data_type NOT ILIKE ''Measurement%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y', 'line_chart', NULL),
- ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that do not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y', 'line_chart', NULL),
- ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%'' AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_WEEKS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y', 'line_chart', NULL),
- ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y', 'line_chart', NULL),
- ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. ', 'Y', 'line_chart', NULL),
- ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N', 'line_chart', NULL),
- ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N', 'line_chart', NULL),
- ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y', 'line_chart', NULL),
- ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y', 'line_chart', NULL),
- ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y', 'line_chart', NULL),
-
- ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({THRESHOLD_VALUE}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y', 'line_chart', NULL),
- ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', '(100.0 - {RESULT_MEASURE}::FLOAT)/100.0', '1.0', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y', 'line_chart', NULL),
-
- ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up.', 'Y', 'line_chart', NULL),
-
- ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.', 'Y', 'line_chart', NULL),
- ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y', 'line_chart', NULL),
- ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y', 'line_chart', NULL),
- ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, '1', '0.75', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y', 'line_chart', NULL),
- ('1504', 'Aggregate_Balance_Percent', 'Aggregate Balance Percent', 'Aggregate measure per group within percent of reference', 'Tests that the aggregate measure for each set of column values falls within a percent range above or below the measure for reference dataset', 'Aggregate measure per set of column values is outside percent range of reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Percent,Upper Tolerance Percent', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a percent|Allowable tolerance above the reference measure expressed as a percent', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerance you set -- that the sum of a measure or count of a value remains sufficiently consistent between categories. You could use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 5% below to 10% above the prior month. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.', 'Y', 'line_chart', NULL),
- ('1505', 'Aggregate_Balance_Range', 'Aggregate Balance Range', 'Aggregate measure per group within hard range of reference', 'Tests that the aggregate measure for each set of column values falls within a hard range above or below the measure for reference dataset', 'Aggregate measure per set of column values is outside expected range of reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Constant,Upper Tolerance Constant', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a constant value|Allowable tolerance above the reference measure expressed as a constant value', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerances you define as specific values above or below the aggregate measure for the same categories in the reference dataset -- that the sum of a measure or count of a value remains sufficiently consistent between categories. For instance, you can use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 10000 dollars above or below the prior week. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.', 'Y', 'line_chart', NULL),
- ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y', 'line_chart', NULL),
- ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y', 'line_chart', NULL),
- ('1510', 'Dupe_Rows', 'Duplicate Rows', 'Rows are not duplicated in table', 'Tests for the absence of duplicate rows based on unique combination of column values', 'Column value combinations are duplicated in the table.', 'Duplicate records', NULL, NULL, '(({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'null', 'null', 'groupby_names', NULL, 'Columns to Compare', 'List of columns in the table that define a duplicate record when the combination of values is repeated on multiple rows', 'Fail', 'QUERY', 'table', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate value combinations', 'This test verifies that combinations of values are not repeated within the table. By default when auto-generated, the test considers all columns to protect against duplication of entire rows. If you know the minimum columns that should constitute a unique record, such as a set of ID''s, you should use those to make the test as sensitive as possible. Alternatively, if you know of columns you can always exclude, such as file_date or refresh_snapshot_id, remove them to tighten the test somewhat.', 'Y', 'line_chart', NULL),
- ('1511', 'Table_Freshness', 'Table Freshness', 'Stale Table Not Updated', 'Confirms whether table has been updated based on data fingerprint', 'Table has not been updated.', 'Was Change Detected', NULL, 'TEMPLATE', '(({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.0', 'null', 'null', 'history_calculation,history_lookback,subset_condition,custom_query', NULL, 'History Aggregate,History Lookback,Record Subset Condition,Fingerprint Expression', 'Aggregate calculation to be performed on the N lookback results|Last N tests to use for history aggregate calculation|Condition defining a subset of records in main table|String expression combining key column measures into a distinct representation of table state', 'Log', 'QUERY', 'table', 'Recency', 'Recency', 'Most recent prior table fingerprint', 'This test compares the current table fingerprint, calculated signature of column contents, to confirm that the table has been updated. The table fingerprint is derived from a set of values and aggregates from columns most likely to change. This test allows you to track the schedule and frequency of updates and refreshes to the table.', 'Y', 'binary_chart', '{"legend":{"labels":{"0":"Stale","1":"Updated"}}}')
-;
-
-
TRUNCATE TABLE generation_sets;
INSERT INTO generation_sets (generation_set, test_type)
@@ -176,1619 +28,10 @@ VALUES ('Monitor', 'Recency'),
TRUNCATE TABLE test_templates;
-INSERT INTO test_templates (id, test_type, sql_flavor, template_name)
-VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'),
- ('2002', 'Aggregate_Minimum', 'redshift', 'ex_aggregate_match_no_drops_generic.sql'),
- ('2003', 'Distribution_Shift', 'redshift', 'ex_relative_entropy_generic.sql'),
- ('2004', 'CUSTOM', 'redshift', 'ex_custom_query_generic.sql'),
- ('2006', 'Aggregate_Balance', 'redshift', 'ex_aggregate_match_same_generic.sql'),
- ('2007', 'Timeframe_Combo_Gain', 'redshift', 'ex_window_match_no_drops_generic.sql'),
- ('2008', 'Timeframe_Combo_Match', 'redshift', 'ex_window_match_same_generic.sql'),
- ('2009', 'Aggregate_Balance_Percent', 'redshift', 'ex_aggregate_match_percent_generic.sql'),
- ('2010', 'Aggregate_Balance_Range', 'redshift', 'ex_aggregate_match_range_generic.sql'),
- ('2011', 'Dupe_Rows', 'redshift', 'ex_dupe_rows_generic.sql'),
-
- ('2101', 'Combo_Match', 'snowflake', 'ex_data_match_generic.sql'),
- ('2102', 'Aggregate_Minimum', 'snowflake', 'ex_aggregate_match_no_drops_generic.sql'),
- ('2103', 'Distribution_Shift', 'snowflake', 'ex_relative_entropy_generic.sql'),
- ('2104', 'CUSTOM', 'snowflake', 'ex_custom_query_generic.sql'),
- ('2106', 'Aggregate_Balance', 'snowflake', 'ex_aggregate_match_same_generic.sql'),
- ('2107', 'Timeframe_Combo_Gain', 'snowflake', 'ex_window_match_no_drops_generic.sql'),
- ('2108', 'Timeframe_Combo_Match', 'snowflake', 'ex_window_match_same_generic.sql'),
- ('2109', 'Aggregate_Balance_Percent', 'snowflake', 'ex_aggregate_match_percent_generic.sql'),
- ('2110', 'Aggregate_Balance_Range', 'snowflake', 'ex_aggregate_match_range_generic.sql'),
- ('2111', 'Dupe_Rows', 'snowflake', 'ex_dupe_rows_generic.sql'),
-
- ('2201', 'Combo_Match', 'mssql', 'ex_data_match_generic.sql'),
- ('2202', 'Aggregate_Minimum', 'mssql', 'ex_aggregate_match_no_drops_generic.sql'),
- ('2203', 'Distribution_Shift', 'mssql', 'ex_relative_entropy_mssql.sql'),
- ('2204', 'CUSTOM', 'mssql', 'ex_custom_query_generic.sql'),
- ('2206', 'Aggregate_Balance', 'mssql', 'ex_aggregate_match_same_generic.sql'),
- ('2207', 'Timeframe_Combo_Gain', 'mssql', 'ex_window_match_no_drops_generic.sql'),
- ('2208', 'Timeframe_Combo_Match', 'mssql', 'ex_window_match_same_generic.sql'),
- ('2209', 'Aggregate_Balance_Percent', 'mssql', 'ex_aggregate_match_percent_generic.sql'),
- ('2210', 'Aggregate_Balance_Range', 'mssql', 'ex_aggregate_match_range_generic.sql'),
- ('2211', 'Dupe_Rows', 'mssql', 'ex_dupe_rows_generic.sql'),
-
- ('2301', 'Combo_Match', 'postgresql', 'ex_data_match_generic.sql'),
- ('2302', 'Aggregate_Minimum', 'postgresql', 'ex_aggregate_match_no_drops_generic.sql'),
- ('2303', 'Distribution_Shift', 'postgresql', 'ex_relative_entropy_generic.sql'),
- ('2304', 'CUSTOM', 'postgresql', 'ex_custom_query_generic.sql'),
- ('2306', 'Aggregate_Balance', 'postgresql', 'ex_aggregate_match_same_generic.sql'),
- ('2307', 'Timeframe_Combo_Gain', 'postgresql', 'ex_window_match_no_drops_postgresql.sql'),
- ('2308', 'Timeframe_Combo_Match', 'postgresql', 'ex_window_match_same_postgresql.sql'),
- ('2309', 'Aggregate_Balance_Percent', 'postgresql', 'ex_aggregate_match_percent_generic.sql'),
- ('2310', 'Aggregate_Balance_Range', 'postgresql', 'ex_aggregate_match_range_generic.sql'),
- ('2311', 'Dupe_Rows', 'postgresql', 'ex_dupe_rows_generic.sql'),
-
- ('2401', 'Combo_Match', 'databricks', 'ex_data_match_generic.sql'),
- ('2402', 'Aggregate_Minimum', 'databricks', 'ex_aggregate_match_no_drops_generic.sql'),
- ('2403', 'Distribution_Shift', 'databricks', 'ex_relative_entropy_generic.sql'),
- ('2404', 'CUSTOM', 'databricks', 'ex_custom_query_generic.sql'),
- ('2406', 'Aggregate_Balance', 'databricks', 'ex_aggregate_match_same_generic.sql'),
- ('2407', 'Timeframe_Combo_Gain', 'databricks', 'ex_window_match_no_drops_databricks.sql'),
- ('2408', 'Timeframe_Combo_Match', 'databricks', 'ex_window_match_same_databricks.sql'),
- ('2409', 'Aggregate_Balance_Percent', 'databricks', 'ex_aggregate_match_percent_generic.sql'),
- ('2410', 'Aggregate_Balance_Range', 'databricks', 'ex_aggregate_match_range_generic.sql'),
- ('2411', 'Dupe_Rows', 'databricks', 'ex_dupe_rows_generic.sql'),
-
- ('2012', 'Table_Freshness', 'redshift', 'ex_table_changed_generic.sql'),
- ('2112', 'Table_Freshness', 'snowflake', 'ex_table_changed_generic.sql'),
- ('2212', 'Table_Freshness', 'mssql', 'ex_table_changed_mssql.sql'),
- ('2312', 'Table_Freshness', 'postgresql', 'ex_table_changed_generic.sql'),
- ('2412', 'Table_Freshness', 'databricks', 'ex_table_changed_generic.sql')
-;
-
TRUNCATE TABLE cat_test_conditions;
-INSERT INTO cat_test_conditions (id, test_type, sql_flavor, measure, test_operator, test_condition)
-VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
- ('1002', 'Avg_Shift', 'redshift', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'),
- ('1003', 'Condition_Flag', 'redshift', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1004', 'Constant', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1005', 'Daily_Record_Ct', 'redshift', 'DATEDIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('1006', 'Dec_Trunc', 'redshift', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'),
- ('1007', 'Distinct_Date_Ct', 'redshift', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('1008', 'Distinct_Value_Ct', 'redshift', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('1009', 'Email_Format', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1010', 'Future_Date', 'redshift', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'),
- ('1011', 'Future_Date_1Y', 'redshift', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'),
- ('1012', 'Incr_Avg_Shift', 'redshift', 'NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'),
- ('1013', 'LOV_All', 'redshift', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('1014', 'LOV_Match', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1015', 'Min_Date', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1016', 'Min_Val', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1017', 'Missing_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('1018', 'Monthly_Rec_Ct', 'redshift', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'),
- ('1019', 'Outlier_Pct_Above', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('1020', 'Outlier_Pct_Below', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('1021', 'Pattern_Match', 'redshift', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM((NULLIF({COLUMN_NAME}, '''') SIMILAR TO ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'),
- ('1022', 'Recency', 'redshift', 'DATEDIFF(''D'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'),
- ('1023', 'Required', 'redshift', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'),
- ('1024', 'Row_Ct', 'redshift', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
- ('1025', 'Row_Ct_Pct', 'redshift', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'),
- ('1026', 'Street_Addr_Pattern', 'redshift', '100.0*SUM(({COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('1027', 'US_State', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('1028', 'Unique', 'redshift', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('1029', 'Unique_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('1030', 'Weekly_Rec_Ct', 'redshift', 'MAX(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'),
- ('2001', 'Alpha_Trunc', 'snowflake', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
- ('2002', 'Avg_Shift', 'snowflake', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV({COLUMN_NAME}::FLOAT),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'),
- ('2003', 'Condition_Flag', 'snowflake', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2004', 'Constant', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2005', 'Daily_Record_Ct', 'snowflake', 'DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('2006', 'Dec_Trunc', 'snowflake', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'),
- ('2007', 'Distinct_Date_Ct', 'snowflake', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('2008', 'Distinct_Value_Ct', 'snowflake', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('2009', 'Email_Format', 'snowflake', 'SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2010', 'Future_Date', 'snowflake', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'),
- ('2011', 'Future_Date_1Y', 'snowflake', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'),
- ('2012', 'Incr_Avg_Shift', 'snowflake', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'),
- ('2013', 'LOV_All', 'snowflake', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('2014', 'LOV_Match', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2015', 'Min_Date', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2016', 'Min_Val', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2017', 'Missing_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('2018', 'Monthly_Rec_Ct', 'snowflake', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'),
- ('2019', 'Outlier_Pct_Above', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('2020', 'Outlier_Pct_Below', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('2021', 'Pattern_Match', 'snowflake', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''''), ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'),
- ('2022', 'Recency', 'snowflake', 'DATEDIFF(''D'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'),
- ('2023', 'Required', 'snowflake', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'),
- ('2024', 'Row_Ct', 'snowflake', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
- ('2025', 'Row_Ct_Pct', 'snowflake', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'),
- ('2026', 'Street_Addr_Pattern', 'snowflake', '100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$''))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('2027', 'US_State', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2028', 'Unique', 'snowflake', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('2029', 'Unique_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('2030', 'Weekly_Rec_Ct', 'snowflake', 'MAX(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, ''1800-01-01''::DATE, {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'),
- ('3001', 'Alpha_Trunc', 'mssql', 'MAX(LEN({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
- ('3002', 'Avg_Shift', 'mssql', 'ABS( (AVG(CAST({COLUMN_NAME} AS FLOAT)) - CAST({BASELINE_AVG} as FLOAT)) / SQRT(((COUNT({COLUMN_NAME})-1)*POWER(STDEV(CAST({COLUMN_NAME} AS FLOAT)), 2) + ({BASELINE_VALUE_CT}-1) * POWER(CAST({BASELINE_SD} as FLOAT), 2)) /NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0) ))', '>=', '{THRESHOLD_VALUE}'),
- ('3003', 'Condition_Flag', 'mssql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3004', 'Constant', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3005', 'Daily_Record_Ct', 'mssql', 'DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('3006', 'Dec_Trunc', 'mssql', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'),
- ('3007', 'Distinct_Date_Ct', 'mssql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('3008', 'Distinct_Value_Ct', 'mssql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('3009', 'Email_Format', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} NOT LIKE ''[A-Za-z0-9._''''%+-]%@[A-Za-z0-9.-]%.[A-Za-z][A-Za-z]%'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3010', 'Future_Date', 'mssql', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CONVERT(DATE, ''{RUN_DATE}'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3011', 'Future_Date_1Y', 'mssql', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, ''{RUN_DATE}'')) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3012', 'Incr_Avg_Shift', 'mssql', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS FLOAT) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'),
- ('3013', 'LOV_All', 'mssql', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('3014', 'LOV_Match', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3015', 'Min_Date', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3016', 'Min_Val', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3017', 'Missing_Pct', 'mssql', 'ABS( 2.0 * ASIN( SQRT( CAST({BASELINE_VALUE_CT} AS FLOAT) / CAST({BASELINE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT(*), 0) AS FLOAT) )) )', '>=', '{THRESHOLD_VALUE}'),
- ('3018', 'Monthly_Rec_Ct', 'mssql', '(MAX(DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}''AS DATE))) - MIN(DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}''AS DATE)))', '>', '{THRESHOLD_VALUE}'),
- ('3019', 'Outlier_Pct_Above', 'mssql', 'CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '>', '{THRESHOLD_VALUE}'),
- ('3020', 'Outlier_Pct_Below', 'mssql', 'CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS FLOAT) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '>', '{THRESHOLD_VALUE}'),
- ('3021', 'Pattern_Match', 'mssql', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - CAST(SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') LIKE ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END) AS BIGINT)', '>', '{THRESHOLD_VALUE}'),
- ('3022', 'Recency', 'mssql', 'DATEDIFF(day, MAX({COLUMN_NAME}), CAST(''{RUN_DATE}''AS DATE))', '>', '{THRESHOLD_VALUE}'),
- ('3023', 'Required', 'mssql', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'),
- ('3024', 'Row_Ct', 'mssql', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
- ('3025', 'Row_Ct_Pct', 'mssql', 'ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT} ) AS FLOAT)/ CAST({BASELINE_CT} AS FLOAT), 2))', '>', '{THRESHOLD_VALUE}'),
- ('3026', 'Street_Addr_Pattern', 'mssql', 'CAST(100.0*SUM(CASE WHEN UPPER({COLUMN_NAME}) LIKE ''[1-9]% [A-Z]% %'' AND CHARINDEX('' '', {COLUMN_NAME}) BETWEEN 2 AND 6 THEN 1 ELSE 0 END) as FLOAT) /CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '<', '{THRESHOLD_VALUE}'),
- ('3027', 'US_State', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3028', 'Unique', 'mssql', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('3029', 'Unique_Pct', 'mssql', 'ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS FLOAT) / CAST({BASELINE_VALUE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS FLOAT) )) )', '>=', '{THRESHOLD_VALUE}'),
- ('3030', 'Weekly_Rec_Ct', 'mssql', 'MAX(DATEDIFF(week, CAST(''1800-01-01'' AS DATE), {COLUMN_NAME})) - MIN(DATEDIFF(week, CAST(''1800-01-01'' AS DATE), {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, CAST(''1800-01-01'' AS DATE), {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'),
- ('4001', 'Alpha_Trunc', 'postgresql', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
- ('4002', 'Avg_Shift', 'postgresql', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'),
- ('4003', 'Condition_Flag', 'postgresql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4004', 'Constant', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4005', 'Daily_Record_Ct', 'postgresql', '<%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('4006', 'Dec_Trunc', 'postgresql', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'),
- ('4007', 'Distinct_Date_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('4008', 'Distinct_Value_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('4009', 'Email_Format', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4010', 'Future_Date', 'postgresql', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'),
- ('4011', 'Future_Date_1Y', 'postgresql', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'),
- ('4012', 'Incr_Avg_Shift', 'postgresql', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'),
- ('4013', 'LOV_All', 'postgresql', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('4014', 'LOV_Match', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4015', 'Min_Date', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4016', 'Min_Val', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4017', 'Missing_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('4018', 'Monthly_Rec_Ct', 'postgresql', '(MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>)', '>', '{THRESHOLD_VALUE}'),
- ('4019', 'Outlier_Pct_Above', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('4020', 'Outlier_Pct_Below', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('4021', 'Pattern_Match', 'postgresql', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') ~ ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4022', 'Recency', 'postgresql', '<%DATEDIFF_DAY;MAX({COLUMN_NAME});''{RUN_DATE}''::DATE%>', '>', '{THRESHOLD_VALUE}'),
- ('4023', 'Required', 'postgresql', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('4024', 'Row_Ct', 'postgresql', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
- ('4025', 'Row_Ct_Pct', 'postgresql', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::DECIMAL(18,4) / {BASELINE_CT}::DECIMAL(18,4), 2))', '>', '{THRESHOLD_VALUE}'),
- ('4026', 'Street_Addr_Pattern', 'postgresql', '100.0*SUM(CASE WHEN {COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'' THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('4027', 'US_State', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4028', 'Unique', 'postgresql', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('4029', 'Unique_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('4030', 'Weekly_Rec_Ct', 'postgresql', 'MAX(<%DATEDIFF_WEEK;''1800-01-01''::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;''1800-01-01''::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;''1800-01-01''::DATE;{COLUMN_NAME}%>)', '>', '{THRESHOLD_VALUE}'),
-
- ('1031', 'Variability_Increase', 'redshift', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('1032', 'Variability_Decrease', 'redshift', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('2031', 'Variability_Increase', 'snowflake', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('2032', 'Variability_Decrease', 'snowflake', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('3031', 'Variability_Increase', 'mssql', '100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT)', '>', '{THRESHOLD_VALUE}'),
- ('3032', 'Variability_Decrease', 'mssql', '100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT)', '<', '{THRESHOLD_VALUE}'),
- ('4031', 'Variability_Increase', 'postgresql', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('4032', 'Variability_Decrease', 'postgresql', '100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('6031', 'Variability_Increase', 'databricks', '100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('6032', 'Variability_Decrease', 'databricks', '100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT', '<', '{THRESHOLD_VALUE}'),
-
- ('5001', 'Alpha_Trunc', 'trino', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
- ('5002', 'Avg_Shift', 'trino', 'ABS( (CAST(AVG({COLUMN_NAME} AS REAL)) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*STDDEV({COLUMN_NAME})^2 + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * CAST({BASELINE_SD} AS REAL)^2) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) ))', '>=', '{THRESHOLD_VALUE}'),
- ('5003', 'Condition_Flag', 'trino', 'SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5004', 'Constant', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5005', 'Daily_Record_Ct', 'trino', 'DATE_DIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('5006', 'Dec_Trunc', 'trino', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'),
- ('5007', 'Distinct_Date_Ct', 'trino', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('5008', 'Distinct_Value_Ct', 'trino', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('5009', 'Email_Format', 'trino', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') != TRUE THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5010', 'Future_Date', 'trino', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST(''{RUN_DATE}'' AS DATE) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5011', 'Future_Date_1Y', 'trino', 'SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE(''{RUN_DATE}'') + interval ''365'' day ) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5012', 'Incr_Avg_Shift', 'trino', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'),
- ('5013', 'LOV_All', 'trino', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('5014', 'LOV_Match', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5015', 'Min_Date', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} < CAST(''{BASELINE_VALUE}'' AS DATE) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5016', 'Min_Val', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5017', 'Missing_Pct', 'trino', 'ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) )))', '>=', '{THRESHOLD_VALUE}'),
- ('5018', 'Monthly_Rec_Ct', 'trino', '(MAX(DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) - MIN(DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE)))', '>', '{THRESHOLD_VALUE}'),
- ('5019', 'Outlier_Pct_Above', 'trino', 'CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)', '>', '{THRESHOLD_VALUE}'),
- ('5020', 'Outlier_Pct_Below', 'trino', 'CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS REAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)', '>', '{THRESHOLD_VALUE}'),
- ('5021', 'Pattern_Match', 'trino', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF({COLUMN_NAME}, '''') , ''{BASELINE_VALUE}'') = TRUE THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5022', 'Recency', 'trino', 'DATE_DIFF(''day'', MAX({COLUMN_NAME}), CAST(''{RUN_DATE}'' AS DATE))', '>', '{THRESHOLD_VALUE}'),
- ('5023', 'Required', 'trino', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('5024', 'Row_Ct', 'trino', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
- ('5025', 'Row_Ct_Pct', 'trino', 'ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT}) AS DECIMAL(18,4)) /CAST( {BASELINE_CT} AS DECIMAL(18,4) ), 2))', '>', '{THRESHOLD_VALUE}'),
- ('5026', 'Street_Addr_Pattern', 'trino', 'CAST(100.0*SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'') = TRUE THEN 1 ELSE 0 END) AS REAL )/ CAST(COUNT({COLUMN_NAME}) AS REAL)', '<', '{THRESHOLD_VALUE}'),
- ('5027', 'US_State', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5028', 'Unique', 'trino', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('5029', 'Unique_Pct', 'trino', 'ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) )))', '>=', '{THRESHOLD_VALUE}'),
- ('5030', 'Weekly_Rec_Ct', 'trino', 'MAX(DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), {COLUMN_NAME})) - MIN(DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), {COLUMN_NAME})) +1 - COUNT(DISTINCT DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'),
- ('5031', 'Variability_Increase', 'trino', '100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD}', '>', '{THRESHOLD_VALUE}'),
- ('5032', 'Variability_Decrease', 'trino', '100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'),
-
- ('6001', 'Alpha_Trunc', 'databricks', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'),
- ('6002', 'Avg_Shift', 'databricks', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV_SAMP({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'),
- ('6003', 'Condition_Flag', 'databricks', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6004', 'Constant', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6005', 'Daily_Record_Ct', 'databricks', '<%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('6006', 'Dec_Trunc', 'databricks', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'),
- ('6007', 'Distinct_Date_Ct', 'databricks', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'),
- ('6008', 'Distinct_Value_Ct', 'databricks', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('6009', 'Email_Format', 'databricks', 'SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6010', 'Future_Date', 'databricks', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ''{RUN_DATE}''::DATE)))', '>', '{THRESHOLD_VALUE}'),
- ('6011', 'Future_Date_1Y', 'databricks', 'SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - (''{RUN_DATE}''::DATE+365))))', '>', '{THRESHOLD_VALUE}'),
- ('6012', 'Incr_Avg_Shift', 'databricks', 'COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)', '>=', '{THRESHOLD_VALUE}'),
- ('6013', 'LOV_All', 'databricks', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'),
- ('6014', 'LOV_Match', 'databricks', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6015', 'Min_Date', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6016', 'Min_Val', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6017', 'Missing_Pct', 'databricks', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('6018', 'Monthly_Rec_Ct', 'databricks', '(MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>)', '>', '{THRESHOLD_VALUE}'),
- ('6019', 'Outlier_Pct_Above', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('6020', 'Outlier_Pct_Below', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'),
- ('6021', 'Pattern_Match', 'databricks', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::STRING, ''''), ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'),
- ('6022', 'Recency', 'databricks', '<%DATEDIFF_DAY;MAX({COLUMN_NAME});''{RUN_DATE}''::DATE%>', '>', '{THRESHOLD_VALUE}'),
- ('6023', 'Required', 'databricks', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'),
- ('6024', 'Row_Ct', 'databricks', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'),
- ('6025', 'Row_Ct_Pct', 'databricks', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'),
- ('6026', 'Street_Addr_Pattern', 'databricks', '100.0*SUM((regexp_like({COLUMN_NAME}::STRING, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$''))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'),
- ('6027', 'US_State', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6028', 'Unique', 'databricks', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'),
- ('6029', 'Unique_Pct', 'databricks', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'),
- ('6030', 'Weekly_Rec_Ct', 'databricks', 'CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC(''week'', {COLUMN_NAME})) AS INT)', '>', '{THRESHOLD_VALUE}'),
-
- ('1033', 'Valid_Month', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2033', 'Valid_Month', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3033', 'Valid_Month', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4033', 'Valid_Month', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5033', 'Valid_Month', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6033', 'Valid_Month', 'databricks', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
-
- ('1034', 'Valid_US_Zip', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4034', 'Valid_US_Zip', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2034', 'Valid_US_Zip', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5034', 'Valid_US_Zip', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3034', 'Valid_US_Zip', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6034', 'Valid_US_Zip', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
-
- ('1035', 'Valid_US_Zip3', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4035', 'Valid_US_Zip3', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2035', 'Valid_US_Zip3', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5035', 'Valid_US_Zip3', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3035', 'Valid_US_Zip3', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6035', 'Valid_US_Zip3', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
-
- ('1036', 'Valid_Characters', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('4036', 'Valid_Characters', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('2036', 'Valid_Characters', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('5036', 'Valid_Characters', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('3036', 'Valid_Characters', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'),
- ('6036', 'Valid_Characters', 'databricks', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}');
-
TRUNCATE TABLE target_data_lookups;
-INSERT INTO target_data_lookups
-(id, test_id, error_type, test_type, sql_flavor, lookup_type, lookup_query)
-VALUES
- ('1001', '1004', 'Test Results', 'Alpha_Trunc', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'),
- ('1002', '1005', 'Test Results', 'Avg_Shift', 'redshift', NULL, 'SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1003', '1006', 'Test Results', 'Condition_Flag', 'redshift', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'),
- ('1004', '1007', 'Test Results', 'Constant', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1005', '1009', 'Test Results', 'Daily_Record_Ct', 'redshift', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;'),
- ('1006', '1011', 'Test Results', 'Dec_Trunc', 'redshift', NULL, 'SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;'),
- ('1007', '1012', 'Test Results', 'Distinct_Date_Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1008', '1013', 'Test Results', 'Distinct_Value_Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1009', '1014', 'Test Results', 'Email_Format', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1010', '1015', 'Test Results', 'Future_Date', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1011', '1016', 'Test Results', 'Future_Date_1Y', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1012', '1017', 'Test Results', 'Incr_Avg_Shift', 'redshift', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1013', '1018', 'Test Results', 'LOV_All', 'redshift', NULL, 'SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> ''{THRESHOLD_VALUE}'' LIMIT 500;'),
- ('1014', '1019', 'Test Results', 'LOV_Match', 'redshift', NULL, 'SELECT DISTINCT NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1015', '1020', 'Test Results', 'Min_Date', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1016', '1021', 'Test Results', 'Min_Val', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;'),
- ('1017', '1022', 'Test Results', 'Missing_Pct', 'redshift', NULL, 'SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '''' ;'),
- ('1018', '1023', 'Test Results', 'Monthly_Rec_Ct', 'redshift', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''month'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''month'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'),
- ('1019', '1024', 'Test Results', 'Outlier_Pct_Above', 'redshift', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1020', '1025', 'Test Results', 'Outlier_Pct_Below', 'redshift', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1021', '1026', 'Test Results', 'Pattern_Match', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT SIMILAR TO ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'),
- ('1022', '1028', 'Test Results', 'Recency', 'redshift', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF(''D'', col, ''{TEST_DATE}''::DATE) > {THRESHOLD_VALUE};'),
- ('1023', '1030', 'Test Results', 'Required', 'redshift', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'),
- ('1024', '1031', 'Test Results', 'Row_Ct', 'redshift', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'),
- ('1025', '1032', 'Test Results', 'Row_Ct_Pct', 'redshift', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;'),
- ('1026', '1033', 'Test Results', 'Street_Addr_Pattern', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1027', '1036', 'Test Results', 'US_State', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1028', '1034', 'Test Results', 'Unique', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1029', '1035', 'Test Results', 'Unique_Pct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1030', '1037', 'Test Results', 'Weekly_Rec_Ct', 'redshift', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''week'',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL ''1 week'' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''week'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'),
- ('1031', '1040', 'Test Results', 'Variability_Increase', 'redshift', NULL, 'SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1032', '1041', 'Test Results', 'Variability_Decrease', 'redshift', NULL, 'SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
-
- ('1033', '1001', 'Profile Anomaly' , 'Suggested_Type', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
- ('1034', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1035', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1036', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type, table_name;'),
- ('1037', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type, table_name;'),
- ('1038', '1006', 'Profile Anomaly' , 'No_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1039', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;' ),
- ('1040', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'redshift', NULL, 'SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type;' ),
- ('1041', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1042', '1010', 'Profile Anomaly' , 'Quoted_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ),
- ('1043', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ),
- ('1044', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ),
- ('1045', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1046', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1047', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1048', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ),
- ('1049', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'redshift', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ),
- ('1050', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < ''1900-01-01''::DATE) OR ("{COLUMN_NAME}" > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1051', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'redshift', NULL, 'created_in_ui' ),
- ('1052', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'redshift', NULL, 'created_in_ui' ),
- ('1053', '1021', 'Profile Anomaly' , 'Unexpected US States', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1054', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1055', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ),
- ('1056', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'),
- ('1057', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\\s(and|but|or|yet)\\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ),
-
- ('1058', '1001', 'Profile Anomaly' , 'Suggested_Type', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'),
- ('1059', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1060', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1061', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1062', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1063', '1006', 'Profile Anomaly' , 'No_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1064', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ),
- ('1065', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'postgresql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY columns.table_name;' ),
- ('1066', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1067', '1010', 'Profile Anomaly' , 'Quoted_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ),
- ('1068', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ),
- ('1069', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;' ),
- ('1070', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1071', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1072', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1073', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ),
- ('1074', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'postgresql', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ),
- ('1075', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < ''1900-01-01''::DATE) OR ("{COLUMN_NAME}" > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1076', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'postgresql', NULL, 'created_in_ui' ),
- ('1077', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'postgresql', NULL, 'created_in_ui' ),
- ('1078', '1021', 'Profile Anomaly' , 'Unexpected US States', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1079', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1080', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ),
- ('1081', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'),
- ('1082', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\s(and|but|or|yet)\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ),
-
-
- ('1083', '1004', 'Test Results', 'Alpha_Trunc', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'),
- ('1084', '1005', 'Test Results', 'Avg_Shift', 'postgresql', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1085', '1006', 'Test Results', 'Condition_Flag', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'),
- ('1086', '1007', 'Test Results', 'Constant', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1087', '1009', 'Test Results', 'Daily_Record_Ct', 'postgresql', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL ''1 day'') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500;'),
- ('1088', '1011', 'Test Results', 'Dec_Trunc', 'postgresql', NULL, 'SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, ''.'', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;'),
- ('1089', '1012', 'Test Results', 'Distinct_Date_Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1090', '1013', 'Test Results', 'Distinct_Value_Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1091', '1014', 'Test Results', 'Email_Format', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1092', '1015', 'Test Results', 'Future_Date', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1093', '1016', 'Test Results', 'Future_Date_1Y', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1094', '1017', 'Test Results', 'Incr_Avg_Shift', 'postgresql', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1095', '1018', 'Test Results', 'LOV_All', 'postgresql', NULL, 'SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", ''|'' ORDER BY "{COLUMN_NAME}" ASC) FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", ''|'' ORDER BY "{COLUMN_NAME}" ASC) <> ''{THRESHOLD_VALUE}'' LIMIT 500;'),
- ('1096', '1019', 'Test Results', 'LOV_Match', 'postgresql', NULL, 'SELECT DISTINCT NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1097', '1020', 'Test Results', 'Min_Date', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1098', '1021', 'Test Results', 'Min_Val', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;'),
- ('1099', '1022', 'Test Results', 'Missing_Pct', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '''' LIMIT 10;'),
- ('1100', '1023', 'Test Results', 'Monthly_Rec_Ct', 'postgresql', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''month'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL ''1 month'') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''month'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'),
- ('1101', '1024', 'Test Results', 'Outlier_Pct_Above', 'postgresql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1102', '1025', 'Test Results', 'Outlier_Pct_Below', 'postgresql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1103', '1026', 'Test Results', 'Pattern_Match', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT SIMILAR TO ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'),
- ('1104', '1028', 'Test Results', 'Recency', 'postgresql', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE <%DATEDIFF_DAY;col;''{TEST_DATE}''::DATE%> > {THRESHOLD_VALUE};'),
- ('1105', '1030', 'Test Results', 'Required', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'),
- ('1106', '1031', 'Test Results', 'Row_Ct', 'postgresql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'),
- ('1107', '1032', 'Test Results', 'Row_Ct_Pct', 'postgresql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte;'),
- ('1108', '1033', 'Test Results', 'Street_Addr_Pattern', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1109', '1036', 'Test Results', 'US_State', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1110', '1034', 'Test Results', 'Unique', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1111', '1035', 'Test Results', 'Unique_Pct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1112', '1037', 'Test Results', 'Weekly_Rec_Ct', 'postgresql', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''week'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL ''1 week'' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''week'' , MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC(''week'', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''week'', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;'),
- ('1113', '1040', 'Test Results', 'Variability_Increase', 'postgresql', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1114', '1041', 'Test Results', 'Variability_Decrease', 'postgresql', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
-
- ('1115', '1001', 'Profile Anomaly' , 'Suggested_Type', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
- ('1116', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'') OR "{COLUMN_NAME}" LIKE '' '' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1117', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1118', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1119', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1120', '1006', 'Profile Anomaly' , 'No_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1121', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'mssql', NULL, 'WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX(''| ''+ TRIM(value) + '' |'', ''| '' + ''{DETAIL_EXPRESSION}'' + '' |'' ) ASC) as row_num FROM STRING_SPLIT(''{DETAIL_EXPRESSION}'', ''|'') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;' ),
- ('1122', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name;' ),
- ('1123', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1124', '1010', 'Profile Anomaly' , 'Quoted_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE ''"%"'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1125', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ),
- ('1126', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ),
- ('1127', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1128', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1129', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1130', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;' ),
- ('1131', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'mssql', NULL, 'WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC;' ),
- ('1132', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", CAST( ''{PROFILE_RUN_DATE}'' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST(''1900-01-01'' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST(''{PROFILE_RUN_DATE}'' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ),
- ('1133', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'mssql', NULL, 'created_in_ui' ),
- ('1134', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'mssql', NULL, 'created_in_ui' ),
- ('1135', '1021', 'Profile Anomaly' , 'Unexpected US States', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ),
- ('1136', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ),
- ('1137', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ),
- ('1138', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";'),
- ('1139', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE ''%,%,%,%'' OR "{COLUMN_NAME}" LIKE ''%|%|%|%'' OR "{COLUMN_NAME}" LIKE ''%^%^%^%'' OR "{COLUMN_NAME}" LIKE ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' ) AND NOT ( "{COLUMN_NAME}" LIKE ''% and %'' OR "{COLUMN_NAME}" LIKE ''% but %'' OR "{COLUMN_NAME}" LIKE ''% or %'' OR "{COLUMN_NAME}" LIKE ''% yet %'' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '','', '''')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '' '', '''')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
-
- ('1140', '1004', 'Test Results', 'Alpha_Trunc', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;'),
- ('1141', '1005', 'Test Results', 'Avg_Shift', 'mssql', NULL, 'SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1142', '1006', 'Test Results', 'Condition_Flag', 'mssql', NULL, 'SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY};'),
- ('1143', '1007', 'Test Results', 'Constant', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";'),
- ('1144', '1009', 'Test Results', 'Daily_Record_Ct', 'mssql', NULL, 'WITH
- Pass0 as (select 1 as C union all select 1), --2 rows
- Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
- Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
- Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
- Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
- All_Nums as (select row_number() over(order by C) as Number from Pass4),
- tally as (SELECT Number FROM All_Nums WHERE Number <= 45000),
-
- date_range as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period,
- CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period,
- DATEDIFF(DAY,
- CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE),
- CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME} ),
- check_periods as ( SELECT d.min_period, d.max_period, t.number,
- DATEADD(DAY, -(t.number - 1), d.max_period) AS check_period
- FROM date_range d
- INNER JOIN tally t
- ON (d.period_ct >= t.number) ),
- data_by_period as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- GROUP BY CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) ),
- data_by_prd_with_prior_next as (SELECT check_period,
- RANK() OVER (ORDER BY check_period DESC) as ranked,
- ISNULL(d.record_ct, 0) as record_ct,
- ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct,
- ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct
- FROM check_periods c
- LEFT JOIN data_by_period d
- ON (c.check_period = d.data_period) )
-SELECT check_period, record_ct,
- CASE
- WHEN record_ct = 0 THEN ''MISSING''
- ELSE ''Present''
- END as status
- FROM data_by_prd_with_prior_next
- WHERE record_ct = 0
- OR last_record_ct = 0
- OR next_record_ct = 0
-ORDER BY check_period DESC;'),
- ('1145', '1011', 'Test Results', 'Dec_Trunc', 'mssql', NULL, 'WITH CTE AS ( SELECT LEN(SUBSTRING(CAST(ABS("{COLUMN_NAME}") % 1 AS VARCHAR) , 3, LEN("{COLUMN_NAME}"))) AS decimal_scale FROM {TARGET_SCHEMA}.{TABLE_NAME} ) SELECT DISTINCT TOP 500 decimal_scale,COUNT(*) AS count FROM cte GROUP BY decimal_scale ORDER BY COUNT(*) DESC; '),
- ('1146', '1012', 'Test Results', 'Distinct_Date_Ct', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1147', '1013', 'Test Results', 'Distinct_Value_Ct', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1148', '1014', 'Test Results', 'Email_Format', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT LIKE ''%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%'' GROUP BY "{COLUMN_NAME}";'),
- ('1149', '1015', 'Test Results', 'Future_Date', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, ''{TEST_DATE}'') GROUP BY "{COLUMN_NAME}";'),
- ('1150', '1016', 'Test Results', 'Future_Date_1Y', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, ''{TEST_DATE}'')) GROUP BY "{COLUMN_NAME}";'),
- ('1151', '1017', 'Test Results', 'Incr_Avg_Shift', 'mssql', NULL, 'SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_sum, NULLIF(CAST(COUNT("{COLUMN_NAME}") AS FLOAT), 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1152', '1018', 'Test Results', 'LOV_All', 'mssql', NULL, 'WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT STRING_AGG( "{COLUMN_NAME}", ''|'' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> ''{THRESHOLD_VALUE}'';'),
- ('1153', '1019', 'Test Results', 'LOV_Match', 'mssql', NULL, 'SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ;'),
- ('1154', '1020', 'Test Results', 'Min_Date', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST(''{BASELINE_VALUE}'' AS DATE) GROUP BY "{COLUMN_NAME}";'),
- ('1155', '1021', 'Test Results', 'Min_Val', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE};'),
- ('1156', '1022', 'Test Results', 'Missing_Pct', 'mssql', NULL, 'SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = '''';'),
- ('1157', '1023', 'Test Results', 'Monthly_Rec_Ct', 'mssql', NULL, 'WITH
- Pass0 as (select 1 as C union all select 1), --2 rows
- Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
- Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
- Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
- Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
- All_Nums as (select row_number() over(order by C) as Number from Pass4),
- tally as (SELECT Number FROM All_Nums WHERE Number <= 45000),
-
- date_range as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period,
- CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period,
- DATEDIFF(MONTH,
- CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE),
- CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME} ),
- check_periods as ( SELECT d.min_period, d.max_period, t.number,
- DATEADD(MONTH, -(t.number - 1), d.max_period) AS check_period
- FROM date_range d
- INNER JOIN tally t
- ON (d.period_ct >= t.number) ),
- data_by_period as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- GROUP BY CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) ),
- data_by_prd_with_prior_next as (SELECT check_period,
- RANK() OVER (ORDER BY check_period DESC) as ranked,
- ISNULL(d.record_ct, 0) as record_ct,
- ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct,
- ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct
- FROM check_periods c
- LEFT JOIN data_by_period d
- ON (c.check_period = d.data_period) )
-SELECT check_period, record_ct,
- CASE
- WHEN record_ct = 0 THEN ''MISSING''
- ELSE ''Present''
- END as status
- FROM data_by_prd_with_prior_next
- WHERE record_ct = 0
- OR last_record_ct = 0
- OR next_record_ct = 0
-ORDER BY check_period DESC;'),
- ('1158', '1024', 'Test Results', 'Outlier_Pct_Above', 'mssql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1159', '1025', 'Test Results', 'Outlier_Pct_Below', 'mssql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1160', '1026', 'Test Results', 'Pattern_Match', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT LIKE ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'),
- ('1161', '1028', 'Test Results', 'Recency', 'mssql', NULL, 'SELECT DISTINCT col AS latest_date_available, CAST(''{TEST_DATE}'' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE DATEDIFF(day, col, CAST(''{TEST_DATE}'' AS DATE)) > {THRESHOLD_VALUE};'),
- ('1162', '1030', 'Test Results', 'Required', 'mssql', NULL, 'SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL;'),
- ('1163', '1031', 'Test Results', 'Row_Ct', 'mssql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(CAST(100 * (current_count - {THRESHOLD_VALUE}) AS NUMERIC) / CAST({THRESHOLD_VALUE} AS NUMERIC) ,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'),
- ('1164', '1032', 'Test Results', 'Row_Ct_Pct', 'mssql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(CAST(100 * (current_count - {BASELINE_CT}) AS NUMERIC) / CAST({BASELINE_CT} AS NUMERIC) ,2)) AS row_count_pct_difference FROM cte;'),
- ('1165', '1033', 'Test Results', 'Street_Addr_Pattern', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE UPPER("{COLUMN_NAME}") NOT LIKE ''[1-9]% [A-Z]% %'' AND CHARINDEX('' '', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;'),
- ('1166', '1036', 'Test Results', 'US_State', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}";'),
- ('1167', '1034', 'Test Results', 'Unique', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC;'),
- ('1168', '1035', 'Test Results', 'Unique_Pct', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;'),
- ('1169', '1037', 'Test Results', 'Weekly_Rec_Ct', 'mssql', NULL, 'WITH
- Pass0 as (select 1 as C union all select 1), --2 rows
- Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
- Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
- Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
- Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
- All_Nums as (select row_number() over(order by C) as Number from Pass4),
- tally as (SELECT Number FROM All_Nums WHERE Number <= 45000),
-
- date_range as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period,
- CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period,
- DATEDIFF(WEEK,
- CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE),
- CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME} ),
- check_periods as ( SELECT d.min_period, d.max_period, t.number,
- DATEADD(WEEK, -(t.number - 1), d.max_period) AS check_period
- FROM date_range d
- INNER JOIN tally t
- ON (d.period_ct >= t.number) ),
- data_by_period as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- GROUP BY CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) ),
- data_by_prd_with_prior_next as (SELECT check_period,
- RANK() OVER (ORDER BY check_period DESC) as ranked,
- ISNULL(d.record_ct, 0) as record_ct,
- ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct,
- ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct
- FROM check_periods c
- LEFT JOIN data_by_period d
- ON (c.check_period = d.data_period) )
-SELECT check_period, record_ct,
- CASE
- WHEN record_ct = 0 THEN ''MISSING''
- ELSE ''Present''
- END as status
- FROM data_by_prd_with_prior_next
- WHERE record_ct = 0
- OR last_record_ct = 0
- OR next_record_ct = 0
-ORDER BY check_period DESC;'),
- ('1170', '1040', 'Test Results', 'Variability_Increase', 'mssql', NULL, 'SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1171', '1041', 'Test Results', 'Variability_Decrease', 'mssql', NULL, 'SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
-
- ('1172', '1001', 'Profile Anomaly' , 'Suggested_Type', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
- ('1173', '1002', 'Profile Anomaly', 'Non_Standard_Blanks', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1174', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1175', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1176', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1177', '1006', 'Profile Anomaly' , 'No_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1178', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC;' ),
- ('1179', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ),
- ('1180', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ),
- ('1181', '1010', 'Profile Anomaly' , 'Quoted_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ),
- ('1182', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ),
- ('1183', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ),
- ('1184', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1185', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1186', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ),
- ('1187', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ),
- ('1188', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'snowflake', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", '' '''',.-'', '''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ),
- ('1189', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < ''1900-01-01''::DATE) OR ("{COLUMN_NAME}" > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1190', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'snowflake', NULL, 'created_in_ui' ),
- ('1191', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'snowflake', NULL, 'created_in_ui' ),
- ('1192', '1021', 'Profile Anomaly' , 'Unexpected US States', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1193', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ),
- ('1194', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ),
- ('1195', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'),
- ('1196', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ),
-
- ('1197', '1004', 'Test Results', 'Alpha_Trunc', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'),
- ('1198', '1005', 'Test Results', 'Avg_Shift', 'snowflake', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1199', '1006', 'Test Results', 'Condition_Flag', 'snowflake', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'),
- ('1200', '1007', 'Test Results', 'Constant', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1201', '1009', 'Test Results', 'Daily_Record_Ct', 'snowflake', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500;'),
- ('1202', '1011', 'Test Results', 'Dec_Trunc', 'snowflake', NULL, 'SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, ''.'', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;'),
- ('1203', '1012', 'Test Results', 'Distinct_Date_Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1204', '1013', 'Test Results', 'Distinct_Value_Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1205', '1014', 'Test Results', 'Email_Format', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1206', '1015', 'Test Results', 'Future_Date', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1207', '1016', 'Test Results', 'Future_Date_1Y', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1208', '1017', 'Test Results', 'Incr_Avg_Shift', 'snowflake', NULL, 'SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1209', '1018', 'Test Results', 'LOV_All', 'snowflake', NULL, 'SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", ''|'') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> ''{THRESHOLD_VALUE}'' LIMIT 500;'),
- ('1210', '1019', 'Test Results', 'LOV_Match', 'snowflake', NULL, 'SELECT DISTINCT NULLIF("{COLUMN_NAME}", '''') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1211', '1020', 'Test Results', 'Min_Date', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1212', '1021', 'Test Results', 'Min_Val', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;'),
- ('1213', '1022', 'Test Results', 'Missing_Pct', 'snowflake', NULL, 'SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '''' ;'),
- ('1214', '1023', 'Test Results', 'Monthly_Rec_Ct', 'snowflake', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''month'', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''month'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''month'',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period;'),
- ('1215', '1024', 'Test Results', 'Outlier_Pct_Above', 'snowflake', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1216', '1025', 'Test Results', 'Outlier_Pct_Below', 'snowflake', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1217', '1026', 'Test Results', 'Pattern_Match', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''''),''{BASELINE_VALUE}'') != 1 GROUP BY "{COLUMN_NAME}";'),
- ('1218', '1028', 'Test Results', 'Recency', 'snowflake', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF(''D'', col, ''{TEST_DATE}''::DATE) > {THRESHOLD_VALUE};'),
- ('1219', '1030', 'Test Results', 'Required', 'snowflake', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'),
- ('1220', '1031', 'Test Results', 'Row_Ct', 'snowflake', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'),
- ('1221', '1032', 'Test Results', 'Row_Ct_Pct', 'snowflake', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;'),
- ('1222', '1033', 'Test Results', 'Street_Addr_Pattern', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1223', '1036', 'Test Results', 'US_State', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY "{COLUMN_NAME}" LIMIT 500;'),
- ('1224', '1034', 'Test Results', 'Unique', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1225', '1035', 'Test Results', 'Unique_Pct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;'),
- ('1226', '1037', 'Test Results', 'Weekly_Rec_Ct', 'snowflake', NULL, 'WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC(''week'',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL ''1 week'' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC(''week'', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC(''week'',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period;'),
- ('1227', '1040', 'Test Results', 'Variability_Increase', 'snowflake', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1228', '1041', 'Test Results', 'Variability_Decrease', 'snowflake', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
-
- ('1229', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'redshift', NULL, 'WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING (''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
- ('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', POSITION('':'', ''{DETAIL_EXPRESSION}'') + 2), ''|''))) ) GROUP BY "{COLUMN_NAME}";'),
- ('1231', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'mssql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', CHARINDEX('':'', ''{DETAIL_EXPRESSION}'') + 2, 999), ''|'')) GROUP BY "{COLUMN_NAME}";'),
- ('1232', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING(''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'')) GROUP BY "{COLUMN_NAME}";'),
-
- ('1233', '1043', 'Test Results', 'Valid_Characters', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC;'),
- ('1234', '1043', 'Test Results', 'Valid_Characters', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC LIMIT 20;'),
 - ('1235', '1043', 'Test Results', 'Valid_Characters', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8203) + NCHAR(65279) + NCHAR(8239) + NCHAR(8201) + NCHAR(12288) + NCHAR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC;'),
- ('1236', '1043', 'Test Results', 'Valid_Characters', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), ''XXXXXXX'') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' ORDER BY record_ct DESC;'),
-
- ('1237', '1044', 'Test Results', 'Valid_US_Zip', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
- ('1238', '1044', 'Test Results', 'Valid_US_Zip', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'),
- ('1239', '1044', 'Test Results', 'Valid_US_Zip', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
- ('1240', '1044', 'Test Results', 'Valid_US_Zip', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
-
- ('1241', '1045', 'Test Results', 'Valid_US_Zip3', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
 - ('1242', '1045', 'Test Results', 'Valid_US_Zip3', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'),
 - ('1243', '1045', 'Test Results', 'Valid_US_Zip3', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
 - ('1244', '1045', 'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
-
- ('1245', '1500', 'Test Results', 'Aggregate_Balance', 'redshift', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1246', '1500', 'Test Results', 'Aggregate_Balance', 'snowflake', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1247', '1500', 'Test Results', 'Aggregate_Balance', 'mssql', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1248', '1500', 'Test Results', 'Aggregate_Balance', 'postgresql', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1249', '1501', 'Test Results', 'Aggregate_Minimum', 'redshift', NULL, 'SELECT *
-FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1250', '1501', 'Test Results', 'Aggregate_Minimum', 'snowflake', NULL, 'SELECT *
-FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1251', '1501', 'Test Results', 'Aggregate_Minimum', 'mssql', NULL, 'SELECT *
-FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1252', '1501', 'Test Results', 'Aggregate_Minimum', 'postgresql', NULL, 'SELECT *
-FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1253', '1502', 'Test Results', 'Combo_Match', 'redshift', NULL, 'SELECT *
- FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES}
- {HAVING_CONDITION}
- EXCEPT
- SELECT {MATCH_GROUPBY_NAMES}
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION}
- ) test
-ORDER BY {COLUMN_NAME_NO_QUOTES};'),
- ('1254', '1502', 'Test Results', 'Combo_Match', 'snowflake', NULL, 'SELECT *
- FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES}
- {HAVING_CONDITION}
- EXCEPT
- SELECT {MATCH_GROUPBY_NAMES}
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION}
- ) test
-ORDER BY {COLUMN_NAME_NO_QUOTES};'),
- ('1255', '1502', 'Test Results', 'Combo_Match', 'mssql', NULL, 'SELECT *
- FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES}
- {HAVING_CONDITION}
- EXCEPT
- SELECT {MATCH_GROUPBY_NAMES}
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION}
- ) test
-ORDER BY {COLUMN_NAME_NO_QUOTES};'),
- ('1256', '1502', 'Test Results', 'Combo_Match', 'postgresql', NULL, 'SELECT *
- FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES}
- {HAVING_CONDITION}
- EXCEPT
- SELECT {MATCH_GROUPBY_NAMES}
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION}
- ) test
-ORDER BY {COLUMN_NAME_NO_QUOTES};'),
- ('1257', '1503', 'Test Results', 'Distribution_Shift', 'redshift', NULL, 'WITH latest_ver
- AS ( SELECT {CONCAT_COLUMNS} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES} ),
-older_ver
- AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES} )
-SELECT COALESCE(l.category, o.category) AS category,
- o.pct_of_total AS old_pct,
- l.pct_of_total AS new_pct
- FROM latest_ver l
-FULL JOIN older_ver o
- ON (l.category = o.category)
-ORDER BY COALESCE(l.category, o.category)'),
- ('1258', '1503', 'Test Results', 'Distribution_Shift', 'snowflake', NULL, 'WITH latest_ver
- AS ( SELECT {CONCAT_COLUMNS} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES} ),
-older_ver
- AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES} )
-SELECT COALESCE(l.category, o.category) AS category,
- o.pct_of_total AS old_pct,
- l.pct_of_total AS new_pct
- FROM latest_ver l
-FULL JOIN older_ver o
- ON (l.category = o.category)
-ORDER BY COALESCE(l.category, o.category)'),
- ('1259', '1503', 'Test Results', 'Distribution_Shift', 'mssql', NULL, 'WITH latest_ver
- AS ( SELECT {CONCAT_COLUMNS} as category,
- CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES} ),
-older_ver
- AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
- FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES} )
-SELECT COALESCE(l.category, o.category) AS category,
- o.pct_of_total AS old_pct,
- l.pct_of_total AS new_pct
- FROM latest_ver l
-FULL JOIN older_ver o
- ON (l.category = o.category)
-ORDER BY COALESCE(l.category, o.category)'),
- ('1260', '1503', 'Test Results', 'Distribution_Shift', 'postgresql', NULL, 'WITH latest_ver
- AS ( SELECT {CONCAT_COLUMNS} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES} ),
-older_ver
- AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES} )
-SELECT COALESCE(l.category, o.category) AS category,
- o.pct_of_total AS old_pct,
- l.pct_of_total AS new_pct
- FROM latest_ver l
-FULL JOIN older_ver o
- ON (l.category = o.category)
-ORDER BY COALESCE(l.category, o.category)'),
-
- ('1245', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'redshift', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
- ORDER BY {GROUPBY_NAMES};'),
- ('1246', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'snowflake', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
- ORDER BY {GROUPBY_NAMES};'),
- ('1247', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'mssql', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
- ORDER BY {GROUPBY_NAMES};'),
- ('1248', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'postgresql', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
- ORDER BY {GROUPBY_NAMES};'),
-
- ('1245', '1505', 'Test Results', 'Aggregate_Balance_Range', 'redshift', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
- ORDER BY {GROUPBY_NAMES};'),
- ('1246', '1505', 'Test Results', 'Aggregate_Balance_Range', 'snowflake', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
- ORDER BY {GROUPBY_NAMES};'),
- ('1247', '1505', 'Test Results', 'Aggregate_Balance_Range', 'mssql', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
- ORDER BY {GROUPBY_NAMES};'),
- ('1248', '1505', 'Test Results', 'Aggregate_Balance_Range', 'postgresql', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
- ORDER BY {GROUPBY_NAMES};'),
-
- ('1261', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'redshift', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-GROUP BY {COLUMN_NAME_NO_QUOTES}
- EXCEPT
-SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-GROUP BY {COLUMN_NAME_NO_QUOTES}'),
- ('1262', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'snowflake', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-GROUP BY {COLUMN_NAME_NO_QUOTES}
- EXCEPT
-SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-GROUP BY {COLUMN_NAME_NO_QUOTES}'),
- ('1263', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'mssql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
-GROUP BY {COLUMN_NAME_NO_QUOTES}
- EXCEPT
-SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
-GROUP BY {COLUMN_NAME_NO_QUOTES}'),
- ('1264', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'postgresql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-GROUP BY {COLUMN_NAME_NO_QUOTES}
- EXCEPT
-SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-GROUP BY {COLUMN_NAME_NO_QUOTES}'),
- ('1265', '1509', 'Test Results', 'Timeframe_Combo_Match', 'redshift', NULL, ' (
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-EXCEPT
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)
-UNION ALL
-(
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
- EXCEPT
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)'),
- ('1266', '1509', 'Test Results', 'Timeframe_Combo_Match', 'snowflake', NULL, ' (
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-EXCEPT
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)
-UNION ALL
-(
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
- EXCEPT
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)'),
- ('1267', '1509', 'Test Results', 'Timeframe_Combo_Match', 'mssql', NULL, ' (
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
-EXCEPT
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
-)
-UNION ALL
-(
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- EXCEPT
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
-)'),
- ('1268', '1509', 'Test Results', 'Timeframe_Combo_Match', 'postgresql', NULL, ' (
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-EXCEPT
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)
-UNION ALL
-(
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
- EXCEPT
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)'),
- ('1269', '1100', 'Profile Anomaly', 'Potential_PII', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1270', '1100', 'Profile Anomaly', 'Potential_PII', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
- ('1271', '1100', 'Profile Anomaly', 'Potential_PII', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'),
- ('1272', '1100', 'Profile Anomaly', 'Potential_PII', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
-
- ('1273', '1001', 'Profile Anomaly' , 'Suggested_Type', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
- ('1274', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;'),
- ('1275', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1276', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS STRING) || '','' || CAST(numeric_scale AS STRING) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1277', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS STRING) || '','' || CAST(numeric_scale AS STRING) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
- ('1278', '1006', 'Profile Anomaly' , 'No_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ),
- ('1279', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ),
- ('1280', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ),
- ('1281', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ),
- ('1282', '1010', 'Profile Anomaly' , 'Quoted_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE ''"%"'' OR `{COLUMN_NAME}` ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;' ),
- ('1283', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
- ('1284', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
- ('1285', '1013', 'Profile Anomaly', 'Small Missing Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;'),
- ('1286', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;' ),
- ('1287', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;' ),
- ('1288', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;' ),
- ('1289', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'databricks', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, '' '''',.-'', '''')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ),
- ('1290', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE (`{COLUMN_NAME}` < ''1900-01-01''::DATE) OR (`{COLUMN_NAME}` > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ),
- ('1291', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'databricks', NULL, 'created_in_ui' ),
- ('1292', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'databricks', NULL, 'created_in_ui' ),
- ('1293', '1021', 'Profile Anomaly' , 'Unexpected US States', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ),
- ('1294', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ),
- ('1295', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
- ('1296', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') <> ''999'' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500;'),
- ('1297', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;' ),
-
- ('1298', '1004', 'Test Results', 'Alpha_Trunc', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'),
- ('1299', '1005', 'Test Results', 'Avg_Shift', 'databricks', NULL, 'SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1300', '1006', 'Test Results', 'Condition_Flag', 'databricks', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;'),
- ('1301', '1007', 'Test Results', 'Constant', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1302', '1009', 'Test Results', 'Daily_Record_Ct', 'databricks', NULL, 'WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM {TARGET_SCHEMA}.{TABLE_NAME}), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500;'),
- ('1303', '1011', 'Test Results', 'Dec_Trunc', 'databricks', NULL, 'SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, ''.'', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;'),
- ('1304', '1012', 'Test Results', 'Distinct_Date_Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'),
- ('1305', '1013', 'Test Results', 'Distinct_Value_Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'),
- ('1306', '1014', 'Test Results', 'Email_Format', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1307', '1015', 'Test Results', 'Future_Date', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1308', '1016', 'Test Results', 'Future_Date_1Y', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1309', '1017', 'Test Results', 'Incr_Avg_Shift', 'databricks', NULL, 'SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1310', '1018', 'Test Results', 'LOV_All', 'databricks', NULL, 'SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), ''|'') AS aggregated_values FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), ''|'') <> ''{THRESHOLD_VALUE}'' LIMIT 500;'),
- ('1311', '1019', 'Test Results', 'LOV_Match', 'databricks', NULL, 'SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '''') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1312', '1020', 'Test Results', 'Min_Date', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1313', '1021', 'Test Results', 'Min_Val', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500;'),
- ('1314', '1022', 'Test Results', 'Missing_Pct', 'databricks', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '''' LIMIT 10;'),
- ('1315', '1023', 'Test Results', 'Monthly_Rec_Ct', 'databricks', NULL, 'WITH daterange AS( SELECT explode( sequence( date_trunc(''month'', (SELECT MIN(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc(''month'', (SELECT MAX(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc(''month'', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc(''month'', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period;'),
- ('1316', '1024', 'Test Results', 'Outlier_Pct_Above', 'databricks', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;'),
- ('1317', '1025', 'Test Results', 'Outlier_Pct_Below', 'databricks', NULL, 'SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;'),
- ('1318', '1026', 'Test Results', 'Pattern_Match', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''''),''{BASELINE_VALUE}'') != 1 GROUP BY `{COLUMN_NAME}`;'),
- ('1319', '1028', 'Test Results', 'Recency', 'databricks', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE ABS(<%DATEDIFF_DAY;col;''{TEST_DATE}''::DATE%>) > {THRESHOLD_VALUE};'),
- ('1320', '1030', 'Test Results', 'Required', 'databricks', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL LIMIT 500;'),
- ('1321', '1031', 'Test Results', 'Row_Ct', 'databricks', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'),
- ('1322', '1032', 'Test Results', 'Row_Ct_Pct', 'databricks', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;'),
- ('1323', '1033', 'Test Results', 'Street_Addr_Pattern', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;'),
- ('1324', '1036', 'Test Results', 'US_State', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
- ('1325', '1034', 'Test Results', 'Unique', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;'),
- ('1326', '1035', 'Test Results', 'Unique_Pct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;'),
- ('1327', '1037', 'Test Results', 'Weekly_Rec_Ct', 'databricks', NULL, 'WITH daterange AS( SELECT explode(sequence( date_trunc(''week'', (SELECT min(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc(''week'', (SELECT max(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc(''week'', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc(''week'', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period;'),
- ('1328', '1040', 'Test Results', 'Variability_Increase', 'databricks', NULL, 'SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
- ('1329', '1041', 'Test Results', 'Variability_Decrease', 'databricks', NULL, 'SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
-
- ('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', INSTR(''{DETAIL_EXPRESSION}'', '':'') + 2), ''\\|'')) AS value)) GROUP BY `{COLUMN_NAME}`;'),
- ('1330', '1043', 'Test Results', 'Valid_Characters', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`, ''.*[[:cntrl:]].*'') OR `{COLUMN_NAME}`::STRING LIKE '' %'' OR `{COLUMN_NAME}`::STRING LIKE ''''''%'''''' OR `{COLUMN_NAME}`::STRING LIKE ''"%"'' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
- ('1331', '1044', 'Test Results', 'Valid_US_Zip', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
- ('1332', '1045', 'Test Results', 'Valid_US_Zip3', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') <> ''999'' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
-
- ('1333', '1500', 'Test Results', 'Aggregate_Balance', 'databricks', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1334', '1501', 'Test Results', 'Aggregate_Minimum', 'databricks', NULL, 'SELECT *
-FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
-ORDER BY {GROUPBY_NAMES};'),
- ('1335', '1502', 'Test Results', 'Combo_Match', 'databricks', NULL, 'SELECT *
- FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES}
- {HAVING_CONDITION}
- EXCEPT
- SELECT {MATCH_GROUPBY_NAMES}
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION}
- ) test
-ORDER BY {COLUMN_NAME_NO_QUOTES};'),
- ('1336', '1503', 'Test Results', 'Distribution_Shift', 'databricks', NULL, 'WITH latest_ver
- AS ( SELECT {CONCAT_COLUMNS} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
- WHERE {SUBSET_CONDITION}
- GROUP BY {COLUMN_NAME_NO_QUOTES} ),
-older_ver
- AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
- FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES} )
-SELECT COALESCE(l.category, o.category) AS category,
- o.pct_of_total AS old_pct,
- l.pct_of_total AS new_pct
- FROM latest_ver l
-FULL JOIN older_ver o
- ON (l.category = o.category)
-ORDER BY COALESCE(l.category, o.category)'),
- ('1248', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'databricks', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
- ORDER BY {GROUPBY_NAMES};'),
- ('1245', '1505', 'Test Results', 'Aggregate_Balance_Range', 'databricks', NULL, 'SELECT *
- FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
- FROM
- ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- {HAVING_CONDITION}
- UNION ALL
- SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
- FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
- WHERE {MATCH_SUBSET_CONDITION}
- GROUP BY {MATCH_GROUPBY_NAMES}
- {MATCH_HAVING_CONDITION} ) a
- GROUP BY {GROUPBY_NAMES} ) s
- WHERE (total IS NOT NULL AND match_total IS NULL)
- OR (total IS NULL AND match_total IS NOT NULL)
- OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
- ORDER BY {GROUPBY_NAMES};'),
- ('1337', '1509', 'Test Results', 'Timeframe_Combo_Match', 'databricks', NULL, ' (
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-EXCEPT
-SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)
-UNION ALL
-(
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
- EXCEPT
-SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME}
-FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
-)'),
- ('1338', '1100', 'Profile Anomaly', 'Potential_PII', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'),
-
- ('1253', '1510', 'Test Results', 'Dupe_Rows', 'redshift', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- HAVING COUNT(*) > 1
-ORDER BY {GROUPBY_NAMES}'),
- ('1254', '1510', 'Test Results', 'Dupe_Rows', 'snowflake', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- HAVING COUNT(*) > 1
-ORDER BY {GROUPBY_NAMES}'),
- ('1255', '1510', 'Test Results', 'Dupe_Rows', 'mssql', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- HAVING COUNT(*) > 1
-ORDER BY {GROUPBY_NAMES}'),
- ('1256', '1510', 'Test Results', 'Dupe_Rows', 'postgresql', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- HAVING COUNT(*) > 1
-ORDER BY {GROUPBY_NAMES}'),
- ('1257', '1510', 'Test Results', 'Dupe_Rows', 'databricks', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
- FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE {SUBSET_CONDITION}
- GROUP BY {GROUPBY_NAMES}
- HAVING COUNT(*) > 1
-ORDER BY {GROUPBY_NAMES}'),
- ('1258', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'redshift', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" LIMIT 20)
-UNION ALL
-(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
-GROUP BY "{COLUMN_NAME}" LIMIT 20)'),
- ('1259', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'postgresql', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" LIMIT 20)
-UNION ALL
-(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
-GROUP BY "{COLUMN_NAME}" LIMIT 20)'),
- ('1260', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'mssql', NULL, 'SELECT TOP 20 ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}"
-UNION ALL
-SELECT TOP 20 ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
-GROUP BY "{COLUMN_NAME}"'),
- ('1261', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'snowflake', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" LIMIT 20)
-UNION ALL
-(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
-GROUP BY "{COLUMN_NAME}" LIMIT 20)'),
- ('1262', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'databricks', NULL, '(SELECT ''Upper Case'' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}`
-GROUP BY `{COLUMN_NAME}` LIMIT 20)
-UNION ALL
-(SELECT ''Mixed Case'' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
-WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`)
-GROUP BY `{COLUMN_NAME}` LIMIT 20)'),
- ('1263', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''''
-GROUP BY "{COLUMN_NAME}" LIMIT 500'),
- ('1264', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''''
-GROUP BY "{COLUMN_NAME}" LIMIT 500'),
- ('1265', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''''
-GROUP BY "{COLUMN_NAME}"'),
- ('1266', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''''
-GROUP BY "{COLUMN_NAME}" LIMIT 500'),
- ('1267', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
- WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > ''''
-GROUP BY `{COLUMN_NAME}` LIMIT 500'),
- ('1268', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> ''''''''
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
- ('1269', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> ''''''''
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
- ('1270', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> ''''''''
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"'),
- ('1271', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
-WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> ''''''''
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
- ('1272', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
-WHERE `{COLUMN_NAME}` < ''A'' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN (''"'', '' '') AND RIGHT(`{COLUMN_NAME}`, 1) <> ''''''''
-GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500'),
- ('1273', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'redshift', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
- CHR(160), ''\x160''),
- CHR(8201), ''\x8201''),
- CHR(8203), ''\x8203''),
- CHR(8204), ''\x8204''),
- CHR(8205), ''\x8205''),
- CHR(8206), ''\x8206''),
- CHR(8207), ''\x8207''),
- CHR(8239), ''\x8239''),
- CHR(12288), ''\x12288''),
- CHR(65279), ''\x65279'') as "{COLUMN_NAME}_content",
- COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
- ('1274', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'postgresql', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
- CHR(160), ''\x160''),
- CHR(8201), ''\x8201''),
- CHR(8203), ''\x8203''),
- CHR(8204), ''\x8204''),
- CHR(8205), ''\x8205''),
- CHR(8206), ''\x8206''),
- CHR(8207), ''\x8207''),
- CHR(8239), ''\x8239''),
- CHR(12288), ''\x12288''),
- CHR(65279), ''\x65279'') as "{COLUMN_NAME}_content",
- COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
- ('1275', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'mssql', NULL, 'SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
- NCHAR(160), ''\x160''),
- NCHAR(8201), ''\x8201''),
- NCHAR(8203), ''\x8203''),
- NCHAR(8204), ''\x8204''),
- NCHAR(8205), ''\x8205''),
- NCHAR(8206), ''\x8206''),
- NCHAR(8207), ''\x8207''),
- NCHAR(8239), ''\x8239''),
- NCHAR(12288), ''\x12288''),
- NCHAR(65279), ''\x65279'') AS "{COLUMN_NAME}_content",
- COUNT(*) AS record_ct
-FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"'),
- ('1276', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'snowflake', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
- CHR(160), ''\x160''),
- CHR(8201), ''\x8201''),
- CHR(8203), ''\x8203''),
- CHR(8204), ''\x8204''),
- CHR(8205), ''\x8205''),
- CHR(8206), ''\x8206''),
- CHR(8207), ''\x8207''),
- CHR(8239), ''\x8239''),
- CHR(12288), ''\x12288''),
- CHR(65279), ''\x65279'') as "{COLUMN_NAME}_content",
- COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}"
-GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'),
- ('1277', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'databricks', NULL, 'SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`{COLUMN_NAME}`,
- ''\u00a0'', ''\x160''),
- ''\u2009'', ''\x8201''),
- ''\u200b'', ''\x8203''),
- ''\u200c'', ''\x8204''),
- ''\u200d'', ''\x8205''),
- ''\u200e'', ''\x8206''),
- ''\u200f'', ''\x8207''),
- ''\u202f'', ''\x8239''),
- ''\u3000'', ''\x12288''),
- ''\ufeff'', ''\x65279'') as `{COLUMN_NAME}_content`,
- COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
- WHERE TRANSLATE(`{COLUMN_NAME}`, ''\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff'', ''XXXXXXXXXX'') <> `{COLUMN_NAME}`
-GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500')
-;
-
-
TRUNCATE TABLE variant_codings;
INSERT INTO variant_codings (value_type, check_values)
@@ -2091,16 +334,3 @@ VALUES ('measure', 'meter|m|metre'),
('pharma','priority review|pr|priority assessment'),
('pharma','tentative approval|ta|conditional approval'),
('pharma','off-label use|off-label|olu|unapproved use|unapproved');
-
--- Replace constraints
-ALTER TABLE test_templates
- ADD CONSTRAINT test_templates_test_types_test_type_fk
- FOREIGN KEY (test_type) REFERENCES test_types;
-
-ALTER TABLE test_results
- ADD CONSTRAINT test_results_test_types_test_type_fk
- FOREIGN KEY (test_type) REFERENCES test_types;
-
-ALTER TABLE cat_test_conditions
- ADD CONSTRAINT cat_test_conditions_cat_tests_test_type_fk
- FOREIGN KEY (test_type) REFERENCES test_types;
diff --git a/testgen/template/dbsetup/055_recreate_metadata_constraints.sql b/testgen/template/dbsetup/055_recreate_metadata_constraints.sql
new file mode 100644
index 00000000..2967dcd4
--- /dev/null
+++ b/testgen/template/dbsetup/055_recreate_metadata_constraints.sql
@@ -0,0 +1,17 @@
+-- ==============================================================================
+-- | This recreates the constraints for the test metadata tables after they are imported from YAML
+-- ==============================================================================
+
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+ALTER TABLE test_templates
+ ADD CONSTRAINT test_templates_test_types_test_type_fk
+ FOREIGN KEY (test_type) REFERENCES test_types;
+
+ALTER TABLE test_results
+ ADD CONSTRAINT test_results_test_types_test_type_fk
+ FOREIGN KEY (test_type) REFERENCES test_types;
+
+ALTER TABLE cat_test_conditions
+ ADD CONSTRAINT cat_test_conditions_cat_tests_test_type_fk
+ FOREIGN KEY (test_type) REFERENCES test_types;
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
new file mode 100644
index 00000000..9aa56cd7
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
@@ -0,0 +1,65 @@
+profile_anomaly_types:
+ id: '1015'
+ anomaly_type: Boolean_Value_Mismatch
+ data_object: Column
+ anomaly_name: Unexpected Boolean Values Found
+ anomaly_description: "This column appears to contain boolean (True/False) data,\
+ \ but unexpected values were found. This could indicate inconsistent coding for\
+ \ the same intended values, potentially leading to downstream errors or inconsistent\
+ \ business logic. "
+ anomaly_criteria: "(distinct_value_ct > 1 AND\n\t\t ((lower(top_freq_values)\
+ \ ILIKE '| true |%' OR lower(top_freq_values) ILIKE '| false |%') AND NOT (lower(top_freq_values)\
+ \ ILIKE '%| true |%' AND lower(top_freq_values) ILIKE '%| false |%'))\n\t\t OR\
+ \ ((lower(top_freq_values) ILIKE '| yes |%' OR lower(top_freq_values) ILIKE '|\
+ \ no |%' ) AND NOT (lower(top_freq_values) ILIKE '%| yes |%' AND lower(top_freq_values)\
+ \ ILIKE '%| no |%')) )"
+ detail_expression: |-
+ CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text
+ ELSE 'Top Freq: ' || p.top_freq_values END
+ issue_likelihood: Likely
+  suggested_action: "Review your source data and follow up with data owners to determine\
+ \ whether this data needs to be corrected. "
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1047'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1072'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1129'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1186'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1287'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
new file mode 100644
index 00000000..6e9ce327
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
@@ -0,0 +1,64 @@
+profile_anomaly_types:
+ id: '1012'
+ anomaly_type: Char_Column_Date_Values
+ data_object: Column
+ anomaly_name: Character Column with Mostly Date Values
+ anomaly_description: "This column is defined as alpha, but more than 95% of its\
+ \ values are dates. Dates in alpha columns might not sort correctly, and might\
+ \ contradict user expectations downstream. It's also possible that more than one\
+ \ type of information is stored in the column, making it harder to retrieve. \
+ \ "
+ anomaly_criteria: |-
+ p.general_type = 'A'
+ AND p.value_ct > p.date_ct
+ AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)
+ detail_expression: |-
+ ' Date Ct: ' || p.date_ct || ' of ' || p.value_ct || ' (Date Percent: ' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || ' )'::VARCHAR(200)
+ issue_likelihood: Likely
+ suggested_action: |-
+    Review your source data and ingestion process. Consider whether it might be better to store the date values in a date or datetime column. If the alpha data is also significant, you could store it in a different column.
+ dq_score_prevalence_formula: |-
+ p.date_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1044'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1069'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1126'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1183'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1284'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml
new file mode 100644
index 00000000..da49a9c1
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Units.yaml
@@ -0,0 +1,18 @@
+profile_anomaly_types:
+ id: '1026'
+ anomaly_type: Char_Column_Number_Units
+ data_object: Column
+ anomaly_name: Character Column with Numbers and Units
+ anomaly_description: |-
+ This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won't sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.
+ anomaly_criteria: |-
+ p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ '(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$'
+ detail_expression: |-
+ 'Top Freq: ' || p.top_freq_values
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Consistency
+ target_data_lookups: []
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
new file mode 100644
index 00000000..52730c32
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
@@ -0,0 +1,64 @@
+profile_anomaly_types:
+ id: '1011'
+ anomaly_type: Char_Column_Number_Values
+ data_object: Column
+ anomaly_name: Character Column with Mostly Numeric Values
+ anomaly_description: |-
+ This column is defined as alpha, but more than 95% of its values are numeric. Numbers in alpha columns won't sort correctly, and might contradict user expectations downstream. It's also possible that more than one type of information is stored in the column, making it harder to retrieve.
+ anomaly_criteria: |-
+ p.general_type = 'A'
+ AND p.column_name NOT ILIKE '%zip%'
+ AND p.functional_data_type NOT ILIKE 'id%'
+ AND p.functional_data_type NOT ILIKE 'Period%'
+ AND p.value_ct > p.numeric_ct
+ AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)
+ detail_expression: |-
+ 'Numeric Ct: ' || p.numeric_ct || ' of ' || p.value_ct || ' (Numeric Percent: ' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || ' )'::VARCHAR(200)
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.
+ dq_score_prevalence_formula: |-
+ p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1043'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1068'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1125'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1182'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1283'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
new file mode 100644
index 00000000..d5389cbd
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
@@ -0,0 +1,71 @@
+profile_anomaly_types:
+ id: '1007'
+ anomaly_type: Column_Pattern_Mismatch
+ data_object: Column
+ anomaly_name: Pattern Inconsistency Within Column
+ anomaly_description: "Alpha-numeric string data within this column conforms to 2-4\
+ \ different patterns, with 95% matching the first pattern. This could indicate\
+ \ data errors in the remaining values. "
+ anomaly_criteria: |-
+ p.general_type = 'A'
+ AND functional_data_type NOT ILIKE 'Measurement%' AND functional_data_type NOT IN ('Category', 'Code')
+ AND p.max_length > 3
+ AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct)
+ AND p.distinct_pattern_ct BETWEEN 2 AND 4
+ AND STRPOS(p.top_patterns, 'N') > 0
+ AND (
+ ( (STRPOS(p.top_patterns, 'A') > 0 OR STRPOS(p.top_patterns, 'a') > 0)
+ AND SPLIT_PART(p.top_patterns, '|', 3)::NUMERIC / SPLIT_PART(p.top_patterns, '|', 1)::NUMERIC < 0.05)
+ OR
+ SPLIT_PART(p.top_patterns, '|', 3)::NUMERIC / SPLIT_PART(p.top_patterns, '|', 1)::NUMERIC < 0.1
+ )
+ detail_expression: |-
+ 'Patterns: ' || p.top_patterns
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review the values for any data that doesn't conform to the most common pattern and correct any data errors.
+ dq_score_prevalence_formula: |-
+ (p.record_ct - SPLIT_PART(p.top_patterns, '|', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1039'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
+ - id: '1064'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
+ - id: '1121'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
+ - id: '1178'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
+ - id: '1279'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
new file mode 100644
index 00000000..066ec529
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
@@ -0,0 +1,58 @@
+profile_anomaly_types:
+ id: '1025'
+ anomaly_type: Delimited_Data_Embedded
+ data_object: Column
+ anomaly_name: Delimited Data Embedded in Column
+ anomaly_description: |-
+ Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.
+ anomaly_criteria: |-
+ p.std_pattern_match = 'DELIMITED_DATA'
+ detail_expression: |-
+ CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text ELSE 'Top Freq: ' || p.top_freq_values END
+ issue_likelihood: Likely
+ suggested_action: |-
+    Review your source data and follow up with data consumers to determine the most useful representation of this data.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1057'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1082'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1139'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1196'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1297'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
new file mode 100644
index 00000000..8995cbd3
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
@@ -0,0 +1,89 @@
+profile_anomaly_types:
+ id: '1028'
+ anomaly_type: Inconsistent_Casing
+ data_object: Column
+ anomaly_name: Inconsistent Casing
+ anomaly_description: |-
+ Casing is inconsistent for a column representing an entity name or address elements. Mixed-Case and All-Upper-Case values were found in the same column.
+ anomaly_criteria: |-
+ mixed_case_ct > 0 AND upper_case_ct > 0 AND functional_data_type IN ('Address', 'City', 'Entity Name', 'Person Given Name', 'Person Last Name', 'Person Full Name')
+ detail_expression: |-
+ 'Mixed-Case: ' || p.mixed_case_ct::VARCHAR || ', All-Upper-Case: ' || p.upper_case_ct::VARCHAR || ' for Semantic Data Type: ' || p.functional_data_type || ', Records: ' || p.record_ct::VARCHAR
+ issue_likelihood: Definite
+ suggested_action: |-
+    Review your source data and follow up with data owners to determine whether consistent casing should be applied at the source. If source data corrections are not possible, consider standardizing the column upon ingestion to ensure consistent casing.
+ dq_score_prevalence_formula: |-
+ LEAST(p.mixed_case_ct, p.upper_case_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '1.0'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1258'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ UNION ALL
+ (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ error_type: Profile Anomaly
+ - id: '1259'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ UNION ALL
+ (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ error_type: Profile Anomaly
+ - id: '1260'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}"
+ UNION ALL
+ SELECT TOP 20 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
+ GROUP BY "{COLUMN_NAME}"
+ error_type: Profile Anomaly
+ - id: '1261'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ UNION ALL
+ (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ error_type: Profile Anomaly
+ - id: '1262'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ (SELECT 'Upper Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}`
+ GROUP BY `{COLUMN_NAME}` LIMIT 20)
+ UNION ALL
+ (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`)
+ GROUP BY `{COLUMN_NAME}` LIMIT 20)
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
new file mode 100644
index 00000000..869819ac
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
@@ -0,0 +1,62 @@
+profile_anomaly_types:
+ id: '1024'
+ anomaly_type: Invalid_Zip3_USA
+ data_object: Column
+ anomaly_name: Invalid USA ZIP-3 Format
+ anomaly_description: |-
+ The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.
+ anomaly_criteria: |-
+ p.distinct_pattern_ct > 1
+ AND (p.column_name ilike '%zip%' OR p.column_name ILIKE '%postal%')
+ AND SPLIT_PART(p.top_patterns, ' | ', 2) = 'NNN'
+ AND SPLIT_PART(p.top_patterns, ' | ', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50
+ detail_expression: |-
+ 'Pattern: ' || p.top_patterns
+ issue_likelihood: Definite
+ suggested_action: |-
+ Review your source data, ingestion process, and any processing steps that update this column.
+ dq_score_prevalence_formula: |-
+ (NULLIF(p.record_ct, 0)::INT - SPLIT_PART(p.top_patterns, ' | ', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '1'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1056'
+ test_id: '1024'
+ test_type: Invalid_Zip3_USA
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1081'
+ test_id: '1024'
+ test_type: Invalid_Zip3_USA
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1138'
+ test_id: '1024'
+ test_type: Invalid_Zip3_USA
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1195'
+ test_id: '1024'
+ test_type: Invalid_Zip3_USA
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1296'
+ test_id: '1024'
+ test_type: Invalid_Zip3_USA
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
new file mode 100644
index 00000000..0a0aa5d0
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
@@ -0,0 +1,58 @@
+profile_anomaly_types:
+ id: '1003'
+ anomaly_type: Invalid_Zip_USA
+ data_object: Column
+ anomaly_name: Invalid USA Zip Code Format
+ anomaly_description: |-
+ Some values present do not conform with the expected format of USA Zip Codes.
+ anomaly_criteria: |-
+ p.functional_data_type = 'Zip' AND (p.general_type <> 'A' OR p.filled_value_ct > 0 OR EXISTS (SELECT 1 FROM UNNEST(STRING_TO_ARRAY(p.top_patterns, ' | ')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0 AND val NOT IN ('NNNNN','NNNNN-NNNN','NNNNNNNNN')))
+ detail_expression: |-
+ CASE WHEN p.general_type = 'N' THEN 'Type: ' || p.column_type ELSE '' END || CASE WHEN p.general_type = 'A' THEN 'Patterns: ' || (SELECT string_agg(val, ',') FROM UNNEST(STRING_TO_ARRAY(top_patterns, ' | ')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0) || ', Dummy Values: ' || p.filled_value_ct::VARCHAR ELSE '' END
+ issue_likelihood: Definite
+ suggested_action: |-
+ Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: '1.0'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1035'
+ test_id: '1003'
+ test_type: Invalid_Zip_USA
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1060'
+ test_id: '1003'
+ test_type: Invalid_Zip_USA
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1117'
+ test_id: '1003'
+ test_type: Invalid_Zip_USA
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1174'
+ test_id: '1003'
+ test_type: Invalid_Zip_USA
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1275'
+ test_id: '1003'
+ test_type: Invalid_Zip_USA
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
new file mode 100644
index 00000000..7bb29073
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
@@ -0,0 +1,59 @@
+profile_anomaly_types:
+ id: '1009'
+ anomaly_type: Leading_Spaces
+ data_object: Column
+ anomaly_name: Leading Spaces Found in Column Values
+ anomaly_description: |-
+ Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.
+ anomaly_criteria: |-
+ p.lead_space_ct > 0
+ detail_expression: |-
+ 'Cases Found: ' || p.lead_space_ct::VARCHAR(10)
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review your source data, ingestion process, and any processing steps that update this column.
+ dq_score_prevalence_formula: |-
+ p.lead_space_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1041'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1066'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1123'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1180'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1281'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
new file mode 100644
index 00000000..529ce1f7
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
@@ -0,0 +1,58 @@
+profile_anomaly_types:
+ id: '1005'
+ anomaly_type: Multiple_Types_Major
+ data_object: Multi-Col
+ anomaly_name: Multiple Data Types per Column Name - Major
+ anomaly_description: |-
+ Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.
+ anomaly_criteria: |-
+ m.general_type_ct > 1
+ detail_expression: |-
+ 'Found ' || m.column_ct::VARCHAR || ' columns, ' || m.type_ct::VARCHAR(10) || ' types, ' || m.min_type || ' to ' || m.max_type
+ issue_likelihood: Likely
+ suggested_action: |-
+ Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren't led astray.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: Consistency
+ target_data_lookups:
+ - id: '1037'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1062'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1119'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1176'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1277'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
new file mode 100644
index 00000000..aacc90a7
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
@@ -0,0 +1,58 @@
+profile_anomaly_types:
+ id: '1004'
+ anomaly_type: Multiple_Types_Minor
+ data_object: Multi-Col
+ anomaly_name: Multiple Data Types per Column Name - Minor
+ anomaly_description: |-
+ Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.
+ anomaly_criteria: |-
+ m.general_type_ct = 1 AND m.type_ct > 1
+ detail_expression: |-
+ 'Found ' || m.column_ct::VARCHAR || ' columns, ' || m.type_ct::VARCHAR(10) || ' types, ' || m.min_type || ' to ' || m.max_type
+ issue_likelihood: Possible
+ suggested_action: |-
+ Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and ensure that data is consistent between tables.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: Consistency
+ target_data_lookups:
+ - id: '1036'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1061'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1118'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1175'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1276'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
new file mode 100644
index 00000000..9b130d57
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
@@ -0,0 +1,61 @@
+profile_anomaly_types:
+ id: '1006'
+ anomaly_type: No_Values
+ data_object: Column
+ anomaly_name: No Column Values Present
+ anomaly_description: "This column is present in the table, but no values have been\
+ \ ingested or assigned in any records. This could indicate missing data or a processing\
+ \ error. Note that this considers dummy values and zero-length values as missing\
+ \ data. "
+ anomaly_criteria: |-
+ (p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct
+ detail_expression: |-
+ 'Null: ' || p.null_value_ct::VARCHAR(10) || ', Dummy: ' || p.filled_value_ct::VARCHAR(10) || ', Zero Len: ' || p.zero_length_ct::VARCHAR(10)
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data, ingestion process, and any processing steps that update this column.
+ dq_score_prevalence_formula: |-
+ 1.0
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Completeness
+ target_data_lookups:
+ - id: '1038'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1063'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1120'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1177'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1278'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
new file mode 100644
index 00000000..ea507508
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
@@ -0,0 +1,69 @@
+profile_anomaly_types:
+ id: '1029'
+ anomaly_type: Non_Alpha_Name_Address
+ data_object: Column
+ anomaly_name: Non-Alpha Name or Address
+ anomaly_description: |-
+ Entirely non-alphabetic values were found in a column representing an entity name or address element.
+ anomaly_criteria: |-
+ non_alpha_ct - zero_length_ct > 0 AND functional_data_type IN ('Address', 'City', 'Entity Name', 'Person Given Name', 'Person Last Name', 'Person Full Name')
+ detail_expression: |-
+ 'Non-Alpha Values: ' || (non_alpha_ct - zero_length_ct)::VARCHAR || ', Semantic Type: ' || p.functional_data_type || ', Records: ' || p.record_ct::VARCHAR
+ issue_likelihood: Definite
+ suggested_action: |-
+ Non-alphabetic values are highly likely to be invalid for this kind of column. This may indicate a file format change, an error in an ingestion process, or incorrect source data. Review your pipeline process and source data to determine the root cause. If this data accurately reflects source data, and upstream corrections are not possible, consider assigning the processed value to null to reflect that data is missing.
+ dq_score_prevalence_formula: |-
+ (non_alpha_ct - zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '1.0'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1263'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
+ GROUP BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1264'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
+ GROUP BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1265'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
+ GROUP BY "{COLUMN_NAME}"
+ error_type: Profile Anomaly
+ - id: '1266'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
+ GROUP BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1267'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
+ WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > ''
+ GROUP BY `{COLUMN_NAME}` LIMIT 500
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
new file mode 100644
index 00000000..453e86b2
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
@@ -0,0 +1,69 @@
+profile_anomaly_types:
+ id: '1030'
+ anomaly_type: Non_Alpha_Prefixed_Name
+ data_object: Column
+ anomaly_name: Non-Alpha Prefixed Name
+ anomaly_description: |-
+ Non-alphabetic characters were found at the start of a column representing an entity name.
+ anomaly_criteria: |-
+ min_text < 'A' AND LEFT(min_text, 1) NOT IN ('"', ' ') AND RIGHT(min_text, 1) <> '''' AND functional_data_type IN ('City', 'Person Given Name', 'Person Last Name', 'Person Full Name')
+ detail_expression: |-
+ 'Minimum Value: ' || min_text
+ issue_likelihood: Definite
+ suggested_action: |-
+ Values starting with a non-alphabetic character are highly likely to be invalid for this kind of column. This may indicate a file format change, an error in an ingestion process, or incorrect source data. It could also indicate flagging or coding of some kind that can be broken out into a separate column in processed data. Review your pipeline process and source data to determine the root cause. If this data accurately reflects source data, and upstream corrections are not possible, consider applying corrections directly to processed data where possible.
+ dq_score_prevalence_formula: |-
+ 0.25
+ dq_score_risk_factor: '1.0'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1268'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1269'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1270'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
+ error_type: Profile Anomaly
+ - id: '1271'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1272'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
+ WHERE `{COLUMN_NAME}` < 'A' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN ('"', ' ') AND RIGHT(`{COLUMN_NAME}`, 1) <> ''''
+ GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
new file mode 100644
index 00000000..1ca207a3
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
@@ -0,0 +1,125 @@
+profile_anomaly_types:
+ id: '1031'
+ anomaly_type: Non_Printing_Chars
+ data_object: Column
+ anomaly_name: Non-Printing Characters
+ anomaly_description: |-
+ Non-printing characters were found embedded in a text column.
+ anomaly_criteria: |-
+ non_printing_ct > 0
+ detail_expression: |-
+ 'Non-Printing Chars: ' || non_printing_ct::VARCHAR || ', Records: ' || p.record_ct::VARCHAR
+ issue_likelihood: Definite
+ suggested_action: |-
+ Embedded non-printing characters are typically stripped from data. They affect filters and aggregations, and may cause problems for downstream users who don't recognize their presence. Review your source data and follow up with data owners to determine whether this data can be corrected upstream. If not, strip these characters from processed data.
+ dq_score_prevalence_formula: |-
+ non_printing_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '1.0'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1273'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ CHR(160), '\x160'),
+ CHR(8201), '\x8201'),
+ CHR(8203), '\x8203'),
+ CHR(8204), '\x8204'),
+ CHR(8205), '\x8205'),
+ CHR(8206), '\x8206'),
+ CHR(8207), '\x8207'),
+ CHR(8239), '\x8239'),
+ CHR(12288), '\x12288'),
+ CHR(65279), '\x65279') as "{COLUMN_NAME}_content",
+ COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1274'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ CHR(160), '\x160'),
+ CHR(8201), '\x8201'),
+ CHR(8203), '\x8203'),
+ CHR(8204), '\x8204'),
+ CHR(8205), '\x8205'),
+ CHR(8206), '\x8206'),
+ CHR(8207), '\x8207'),
+ CHR(8239), '\x8239'),
+ CHR(12288), '\x12288'),
+ CHR(65279), '\x65279') as "{COLUMN_NAME}_content",
+ COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1275'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ NCHAR(160), '\x160'),
+ NCHAR(8201), '\x8201'),
+ NCHAR(8203), '\x8203'),
+ NCHAR(8204), '\x8204'),
+ NCHAR(8205), '\x8205'),
+ NCHAR(8206), '\x8206'),
+ NCHAR(8207), '\x8207'),
+ NCHAR(8239), '\x8239'),
+ NCHAR(12288), '\x12288'),
+ NCHAR(65279), '\x65279') AS "{COLUMN_NAME}_content",
+ COUNT(*) AS record_ct
+ FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
+ error_type: Profile Anomaly
+ - id: '1276'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ CHR(160), '\x160'),
+ CHR(8201), '\x8201'),
+ CHR(8203), '\x8203'),
+ CHR(8204), '\x8204'),
+ CHR(8205), '\x8205'),
+ CHR(8206), '\x8206'),
+ CHR(8207), '\x8207'),
+ CHR(8239), '\x8239'),
+ CHR(12288), '\x12288'),
+ CHR(65279), '\x65279') as "{COLUMN_NAME}_content",
+ COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1277'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`{COLUMN_NAME}`,
+ '\u00a0', '\x160'),
+ '\u2009', '\x8201'),
+ '\u200b', '\x8203'),
+ '\u200c', '\x8204'),
+ '\u200d', '\x8205'),
+ '\u200e', '\x8206'),
+ '\u200f', '\x8207'),
+ '\u202f', '\x8239'),
+ '\u3000', '\x12288'),
+ '\ufeff', '\x65279') as `{COLUMN_NAME}_content`,
+ COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
+ WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}`
+ GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
new file mode 100644
index 00000000..73eb6c03
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
@@ -0,0 +1,59 @@
+profile_anomaly_types:
+ id: '1002'
+ anomaly_type: Non_Standard_Blanks
+ data_object: Column
+ anomaly_name: Non-Standard Blank Values
+ anomaly_description: |-
+ Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA" or "ERROR".
+ anomaly_criteria: |-
+ (p.zero_length_ct > 0 OR (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))))
+ detail_expression: |-
+ 'Dummy Values: ' || p.filled_value_ct::VARCHAR || ', Empty String: ' || p.zero_length_ct::VARCHAR || ', Null: ' || p.null_value_ct::VARCHAR || ', Records: ' || p.record_ct::VARCHAR
+ issue_likelihood: Definite
+ suggested_action: |-
+ Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.
+ dq_score_prevalence_formula: |-
+ p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '1.0'
+ dq_dimension: Completeness
+ target_data_lookups:
+ - id: '1034'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1059'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1116'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1173'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1274'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
new file mode 100644
index 00000000..bc9fe8c5
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
@@ -0,0 +1,60 @@
+profile_anomaly_types:
+ id: '1016'
+ anomaly_type: Potential_Duplicates
+ data_object: Column
+ anomaly_name: Potential Duplicate Values Found
+ anomaly_description: "This column is largely unique, but some duplicate values are\
+ \ present. This pattern is uncommon and could indicate inadvertent duplication. "
+ anomaly_criteria: |-
+ p.distinct_value_ct > 1000
+ AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4
+ detail_expression: |-
+ 'Top Freq: ' || p.top_freq_values
+ issue_likelihood: Possible
+ suggested_action: "Review your source data and follow-up with data owners to determine\
+ \ whether this data needs to be corrected. "
+ dq_score_prevalence_formula: |-
+ (p.value_ct - p.distinct_value_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Uniqueness
+ target_data_lookups:
+ - id: '1048'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1073'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1130'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1187'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1288'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
new file mode 100644
index 00000000..d160615d
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
@@ -0,0 +1,59 @@
+profile_anomaly_types:
+ id: '1100'
+ anomaly_type: Potential_PII
+ data_object: Column
+ anomaly_name: Personally Identifiable Information
+ anomaly_description: |-
+ This column contains data that could be Personally Identifiable Information (PII).
+ anomaly_criteria: |-
+ p.pii_flag > ''
+ detail_expression: |-
+ 'Risk: ' || CASE LEFT(p.pii_flag, 1) WHEN 'A' THEN 'HIGH' WHEN 'B' THEN 'MODERATE' WHEN 'C' THEN 'LOW' END || ', PII Type: ' || SUBSTRING(p.pii_flag, 3)
+ issue_likelihood: Potential PII
+ suggested_action: |-
+ PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: CASE LEFT(p.pii_flag, 1) WHEN 'A' THEN 1 WHEN 'B' THEN 0.66
+ WHEN 'C' THEN 0.33 END
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1269'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1270'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1271'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Profile Anomaly
+ - id: '1272'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1338'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
new file mode 100644
index 00000000..4c9542c3
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
@@ -0,0 +1,59 @@
+profile_anomaly_types:
+ id: '1010'
+ anomaly_type: Quoted_Values
+ data_object: Column
+ anomaly_name: Quoted Values Found in Column Values
+ anomaly_description: |-
+ Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.
+ anomaly_criteria: |-
+ p.quoted_value_ct > 0
+ detail_expression: |-
+ 'Cases Found: ' || p.quoted_value_ct::VARCHAR(10)
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review your source data, ingestion process, and any processing steps that update this column.
+ dq_score_prevalence_formula: |-
+ p.quoted_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1042'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1067'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1124'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1181'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1282'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
new file mode 100644
index 00000000..cfefa973
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
@@ -0,0 +1,58 @@
+profile_anomaly_types:
+ id: '1019'
+ anomaly_type: Recency_One_Year
+ data_object: Dates
+ anomaly_name: Recency - No Table Dates within 1 Year
+ anomaly_description: |-
+ Among all date columns present in the table, none falls within one year of the Profile date.
+ anomaly_criteria: |-
+ MAX(p.max_date) < CURRENT_DATE - INTERVAL '1 year'
+ detail_expression: |-
+ 'Most Recent Date: ' || MAX(p.max_date)::VARCHAR
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and follow up with data owners to determine whether dates in the table should be more recent.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: Timeliness
+ target_data_lookups:
+ - id: '1051'
+ test_id: '1019'
+ test_type: Recency_One_Year
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1076'
+ test_id: '1019'
+ test_type: Recency_One_Year
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1133'
+ test_id: '1019'
+ test_type: Recency_One_Year
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1291'
+ test_id: '1019'
+ test_type: Recency_One_Year
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1190'
+ test_id: '1019'
+ test_type: Recency_One_Year
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
new file mode 100644
index 00000000..7388aba5
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
@@ -0,0 +1,58 @@
+profile_anomaly_types:
+ id: '1020'
+ anomaly_type: Recency_Six_Months
+ data_object: Dates
+ anomaly_name: Recency - No Table Dates within 6 Months
+ anomaly_description: "Among all date columns present in the table, the most recent\
+ \ date falls 6 months to 1 year back from Profile date. "
+ anomaly_criteria: |-
+ MAX(p.max_date) >= CURRENT_DATE - INTERVAL '1 year' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL '6 months'
+ detail_expression: |-
+ 'Most Recent Date: ' || MAX(p.max_date)::VARCHAR
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and follow up with data owners to determine whether dates in the table should be more recent.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: Timeliness
+ target_data_lookups:
+ - id: '1052'
+ test_id: '1020'
+ test_type: Recency_Six_Months
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1077'
+ test_id: '1020'
+ test_type: Recency_Six_Months
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1134'
+ test_id: '1020'
+ test_type: Recency_Six_Months
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1292'
+ test_id: '1020'
+ test_type: Recency_Six_Months
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
+ - id: '1191'
+ test_id: '1020'
+ test_type: Recency_Six_Months
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml
new file mode 100644
index 00000000..798d99a9
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml
@@ -0,0 +1,64 @@
+profile_anomaly_types:
+ id: '1014'
+ anomaly_type: Small Divergent Value Ct
+ data_object: Column
+ anomaly_name: Small Percentage of Divergent Values Found
+ anomaly_description: |-
+ Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.
+ anomaly_criteria: |-
+ functional_data_type <> 'Boolean' AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT /
+ p.value_ct::FLOAT) > 97::FLOAT
+ AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT /
+ NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT
+ detail_expression: |-
+ 'Single Value Pct: ' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT
+ / NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40)
+ || ', Value | Freq: ' || top_freq_values
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and follow up with data owners to determine whether this data needs to be corrected.
+ dq_score_prevalence_formula: |-
+ (p.record_ct - fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1046'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1071'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1128'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1185'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
+ - id: '1286'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml
new file mode 100644
index 00000000..5a3fc09a
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml
@@ -0,0 +1,67 @@
+profile_anomaly_types:
+ id: '1013'
+ anomaly_type: Small Missing Value Ct
+ data_object: Column
+ anomaly_name: Small Percentage of Missing Values Found
+ anomaly_description: |-
+ Under 3% of values in this column were found to be null, zero-length or dummy values, but values are not universally present. This could indicate unexpected missing values in a required column.
+ anomaly_criteria: |-
+ (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END
+ )::FLOAT / p.record_ct::FLOAT > 0.97
+ AND (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END
+ ) < p.record_ct
+ detail_expression: |-
+ (p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END
+ ))::VARCHAR(20) ||
+ ' of ' || p.record_ct::VARCHAR(20) || ' blank values: ' ||
+ ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - CASE WHEN (p.filled_value_ct > 0 AND (p.numeric_ct <> p.value_ct OR functional_data_type IN ('Phone', 'Zip'))) THEN p.filled_value_ct ELSE 0 END
+ ))::NUMERIC(18, 5)
+ / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || '%'
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and follow up with data owners to determine whether this data needs to be corrected, supplemented, or excluded.
+ dq_score_prevalence_formula: |-
+ (p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Completeness
+ target_data_lookups:
+ - id: '1045'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1070'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1127'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1184'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1285'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
new file mode 100644
index 00000000..ef3d5d28
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
@@ -0,0 +1,61 @@
+profile_anomaly_types:
+ id: '1023'
+ anomaly_type: Small_Numeric_Value_Ct
+ data_object: Column
+ anomaly_name: Unexpected Numeric Values Found
+ anomaly_description: |-
+ A small fraction (under 3%) of values in this column were found to be numeric. They could be erroneous.
+ anomaly_criteria: |-
+ p.general_type = 'A'
+ AND p.numeric_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT < 0.03
+ AND p.numeric_ct > 0
+ detail_expression: |-
+ 'Numeric Ct: ' || p.numeric_ct || ' of ' || p.value_ct || ' (Numeric Percent: ' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || ' )'::VARCHAR(200)
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review your source data and follow up with data owners to determine whether numeric values are invalid entries here.
+ dq_score_prevalence_formula: |-
+ p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1055'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1080'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1137'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1194'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
+ - id: '1295'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
new file mode 100644
index 00000000..63498405
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
@@ -0,0 +1,60 @@
+profile_anomaly_types:
+ id: '1017'
+ anomaly_type: Standardized_Value_Matches
+ data_object: Column
+ anomaly_name: Similar Values Match When Standardized
+ anomaly_description: |-
+ When column values are standardized (removing spaces, single-quotes, periods and dashes), matching values are found in other records. This may indicate that formats should be further standardized to allow consistent comparisons for merges, joins and roll-ups. It could also indicate the presence of unintended duplicates.
+ anomaly_criteria: "p.general_type = 'A' AND p.distinct_std_value_ct <> p.distinct_value_ct\
+ \ AND p.functional_data_type NOT LIKE 'Person%Name' "
+ detail_expression: |-
+ 'Distinct Values: ' || p.distinct_value_ct::VARCHAR
+ || ', Standardized: ' || p.distinct_std_value_ct::VARCHAR
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review standardized vs. raw data values for all matches. Correct data if values should be consistent.
+ dq_score_prevalence_formula: |-
+ (p.distinct_value_ct - p.distinct_std_value_ct)::FLOAT/NULLIF(p.value_ct, 0)
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Uniqueness
+ target_data_lookups:
+ - id: '1049'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1074'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1131'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC;
+ error_type: Profile Anomaly
+ - id: '1188'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1289'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ error_type: Profile Anomaly
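A quick way to read the standardization used by these lookups: the TRANSLATE call strips spaces, single quotes, commas, periods and dashes before upper-casing, so superficially different spellings collapse to one comparison key. A minimal sketch with hypothetical literals (runnable on Redshift or PostgreSQL, not part of the seed data):

    -- Both inputs standardize to 'USA', so the lookup above would group them together.
    SELECT UPPER(TRANSLATE('U.S.A.', ' '',.-', '')) AS standardized_a,  -- 'USA'
           UPPER(TRANSLATE('usa',    ' '',.-', '')) AS standardized_b;  -- 'USA'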
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
new file mode 100644
index 00000000..36a2e3fa
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
@@ -0,0 +1,60 @@
+profile_anomaly_types:
+ id: '1001'
+ anomaly_type: Suggested_Type
+ data_object: Column
+ anomaly_name: Suggested Data Type
+  anomaly_description: |-
+    All data stored as text meets the criteria for a more suitable type.
+ anomaly_criteria: |-
+    (functional_data_type NOT IN ('Boolean', 'Flag') ) AND (column_type ILIKE '%char%' OR column_type ILIKE 'text') AND NOT (datatype_suggestion ILIKE '%char%' OR datatype_suggestion ILIKE 'text')
+ detail_expression: |-
+ p.datatype_suggestion::VARCHAR(200)
+ issue_likelihood: Likely
+ suggested_action: |-
+    Consider changing the column data type to tighten controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: null
+ target_data_lookups:
+ - id: '1033'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Profile Anomaly
+ - id: '1058'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;
+ error_type: Profile Anomaly
+ - id: '1115'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Profile Anomaly
+ - id: '1172'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Profile Anomaly
+ - id: '1273'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
new file mode 100644
index 00000000..dfc98e26
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
@@ -0,0 +1,71 @@
+profile_anomaly_types:
+ id: '1008'
+ anomaly_type: Table_Pattern_Mismatch
+ data_object: Multi-Col
+ anomaly_name: Pattern Inconsistency Across Tables
+ anomaly_description: |-
+ Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic.
+ anomaly_criteria: |-
+ p.general_type = 'A'
+ AND functional_data_type NOT ILIKE 'Measurement%' AND functional_data_type NOT IN ('Category', 'Code')
+ AND p.max_length > 3
+ AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct)
+ AND m.max_pattern_ct = 1
+ AND m.column_ct > 1
+ AND SPLIT_PART(p.top_patterns, '|', 2) <> SPLIT_PART(m.very_top_pattern, '|', 2)
+ AND SPLIT_PART(p.top_patterns, '|', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, '|', 1)::NUMERIC < 0.1
+ detail_expression: |-
+ 'Patterns: ' || SPLIT_PART(p.top_patterns, '|', 2) || ', ' || SPLIT_PART(ltrim(m.very_top_pattern, '0'), '|', 2)
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: Validity
+ target_data_lookups:
+ - id: '1040'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type;
+ error_type: Profile Anomaly
+ - id: '1065'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name;
+ error_type: Profile Anomaly
+ - id: '1122'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name;
+ error_type: Profile Anomaly
+ - id: '1179'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: "SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns\
+ \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\
+ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\
+ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\
+ \ TABLE' ORDER BY table_name; "
+ error_type: Profile Anomaly
+ - id: '1280'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: "SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns\
+ \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\
+ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\
+ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\
+ \ TABLE' ORDER BY table_name; "
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml
new file mode 100644
index 00000000..6aac3fd8
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml
@@ -0,0 +1,59 @@
+profile_anomaly_types:
+ id: '1022'
+ anomaly_type: Unexpected Emails
+ data_object: Column
+ anomaly_name: Unexpected Column Contains Emails
+ anomaly_description: |-
+ This column is not labeled as email, but contains mostly email addresses. This could indicate shifted or switched source data columns.
+ anomaly_criteria: |-
+ p.std_pattern_match = 'EMAIL'
+ AND NOT (p.column_name ILIKE '%email%' OR p.column_name ILIKE '%addr%')
+ detail_expression: |-
+ 'Value Range: ' || p.min_text || ' thru ' || max_text
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Consistency
+ target_data_lookups:
+ - id: '1054'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1079'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1136'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Profile Anomaly
+ - id: '1193'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1294'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml
new file mode 100644
index 00000000..81a28c35
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml
@@ -0,0 +1,61 @@
+profile_anomaly_types:
+ id: '1021'
+ anomaly_type: Unexpected US States
+ data_object: Column
+ anomaly_name: Unexpected Column Contains US States
+ anomaly_description: |-
+ This column is not labeled as a state, but contains mostly US State abbreviations. This could indicate shifted or switched source data columns.
+ anomaly_criteria: |-
+ p.std_pattern_match = 'STATE_USA'
+ AND p.distinct_value_ct > 5
+ AND NOT (p.column_name = 'st' OR p.column_name ILIKE '%state%' OR p.column_name ILIKE '%_st' OR p.column_name ILIKE 'st_%')
+ detail_expression: "'Value Range: ' || p.min_text || ' thru ' || max_text || CASE\
+ \ WHEN p.top_freq_values > '' THEN ', Top Freq Values: ' || REPLACE(p.top_freq_values,\
+ \ CHR(10), ' ; ') ELSE '' END "
+ issue_likelihood: Possible
+ suggested_action: |-
+ Review your source data and follow-up with data owners to determine whether column should be populated with US states.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: '0.33'
+ dq_dimension: Consistency
+ target_data_lookups:
+ - id: '1053'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1078'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1135'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Profile Anomaly
+ - id: '1192'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1293'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
new file mode 100644
index 00000000..c42f354b
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
@@ -0,0 +1,61 @@
+profile_anomaly_types:
+ id: '1018'
+ anomaly_type: Unlikely_Date_Values
+ data_object: Column
+ anomaly_name: Unlikely Dates out of Typical Range
+ anomaly_description: |-
+ Some date values in this column are earlier than 1900-01-01 or later than 30 years after Profiling date.
+ anomaly_criteria: |-
+ p.general_type = 'D'
+ AND (p.min_date BETWEEN '0001-01-02'::DATE AND '1900-01-01'::DATE
+ OR p.max_date > CURRENT_DATE + INTERVAL '30 year')
+ detail_expression: |-
+ 'Date Range: ' || p.min_date::VARCHAR || ' thru ' || p.max_date::VARCHAR
+ issue_likelihood: Likely
+ suggested_action: |-
+ Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.
+ dq_score_prevalence_formula: |-
+ (COALESCE(p.before_100yr_date_ct,0)+COALESCE(p.distant_future_date_ct, 0))::FLOAT/NULLIF(p.record_ct, 0)
+ dq_score_risk_factor: '0.66'
+ dq_dimension: Accuracy
+ target_data_lookups:
+ - id: '1050'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1075'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1132'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Profile Anomaly
+ - id: '1189'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1290'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
new file mode 100644
index 00000000..adab3e19
--- /dev/null
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
@@ -0,0 +1,61 @@
+profile_anomaly_types:
+ id: '1027'
+ anomaly_type: Variant_Coded_Values
+ data_object: Variant
+ anomaly_name: Variant Codings for Same Values
+  anomaly_description: |-
+    This column contains more than one common variant representing a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and errors for downstream data users and create multiple versions of the truth.
+ anomaly_criteria: |-
+ p.distinct_value_ct <= 20
+ detail_expression: |-
+ 'Variants Found: ' || intersect_list
+ issue_likelihood: Definite
+ suggested_action: |-
+ Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.
+ dq_score_prevalence_formula: null
+ dq_score_risk_factor: null
+ dq_dimension: Consistency
+ target_data_lookups:
+ - id: '1229'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1230'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1231'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', CHARINDEX(':', '{DETAIL_EXPRESSION}') + 2, 999), '|')) GROUP BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1232'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
+ - id: '1230'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
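For intuition, this is how the PostgreSQL lookup above unpacks the detail expression. Assuming a hypothetical detail expression of 'Variants Found: yes|y|true' (the real value comes from the profiling run), the substring/split pair yields the list the column is matched against:

    -- STRPOS locates the ':', +2 skips the ': ' separator, STRING_TO_ARRAY splits on '|'.
    SELECT STRING_TO_ARRAY(
             SUBSTRING('Variants Found: yes|y|true',
                       STRPOS('Variants Found: yes|y|true', ':') + 2),
             '|') AS variant_list;  -- {yes,y,true}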
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
new file mode 100644
index 00000000..0e04bdf0
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
@@ -0,0 +1,180 @@
+test_types:
+ id: '1500'
+ test_type: Aggregate_Balance
+ test_name_short: Aggregate Balance
+ test_name_long: Aggregate values per group match reference
+ test_description: |-
+ Tests for exact match in aggregate values for each set of column values vs. reference dataset
+ except_message: |-
+ Aggregate measure per set of column values does not exactly match reference dataset.
+ measure_uom: Mismatched measures
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Aggregate Expression
+ column_name_help: |-
+ Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`
+ default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition
+ default_parm_values: null
+ default_parm_prompts: |-
+ Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition
+ default_parm_help: |-
+    Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of group totals not matching aggregate value
+ usage_notes: |-
+ This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It's ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn't changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it's built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1245'
+ test_id: '1500'
+ test_type: Aggregate_Balance
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1246'
+ test_id: '1500'
+ test_type: Aggregate_Balance
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1247'
+ test_id: '1500'
+ test_type: Aggregate_Balance
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1248'
+ test_id: '1500'
+ test_type: Aggregate_Balance
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1333'
+ test_id: '1500'
+ test_type: Aggregate_Balance
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ test_templates:
+ - id: '2006'
+ test_type: Aggregate_Balance
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_same_generic.sql
+ - id: '2106'
+ test_type: Aggregate_Balance
+ sql_flavor: snowflake
+ template_name: ex_aggregate_match_same_generic.sql
+ - id: '2206'
+ test_type: Aggregate_Balance
+ sql_flavor: mssql
+ template_name: ex_aggregate_match_same_generic.sql
+ - id: '2306'
+ test_type: Aggregate_Balance
+ sql_flavor: postgresql
+ template_name: ex_aggregate_match_same_generic.sql
+ - id: '2406'
+ test_type: Aggregate_Balance
+ sql_flavor: databricks
+ template_name: ex_aggregate_match_same_generic.sql
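To make the placeholders concrete, this is roughly how the PostgreSQL lookup renders for a hypothetical check that per-region sales totals match between a raw table and a curated rollup (all object names and the trivial subset conditions are illustrative, not part of the seed data):

    SELECT *
      FROM ( SELECT region, SUM(total) AS total, SUM(match_total) AS match_total
               FROM ( SELECT region, SUM(sales_amt) AS total, NULL AS match_total
                        FROM raw.orders
                       WHERE 1 = 1
                       GROUP BY region
                      UNION ALL
                      SELECT region, NULL AS total, SUM(sales_amt) AS match_total
                        FROM curated.orders_by_region
                       WHERE 1 = 1
                       GROUP BY region ) a
              GROUP BY region ) s
     WHERE total <> match_total
        OR (total IS NOT NULL AND match_total IS NULL)
        OR (total IS NULL AND match_total IS NOT NULL)
     ORDER BY region;
    -- Any region returned either has a mismatched total or exists in only one of the two tables.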
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
new file mode 100644
index 00000000..8d7236ef
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
@@ -0,0 +1,190 @@
+test_types:
+ id: '1504'
+ test_type: Aggregate_Balance_Percent
+ test_name_short: Aggregate Balance Percent
+ test_name_long: Aggregate measure per group within percent of reference
+ test_description: |-
+    Tests that the aggregate measure for each set of column values falls within a percent range above or below the measure for the reference dataset
+ except_message: |-
+ Aggregate measure per set of column values is outside percent range of reference dataset.
+ measure_uom: Mismatched measures
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Aggregate Expression
+ column_name_help: |-
+ Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`
+ default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance
+ default_parm_values: null
+ default_parm_prompts: |-
+ Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Percent,Upper Tolerance Percent
+ default_parm_help: |-
+    Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a percent|Allowable tolerance above the reference measure expressed as a percent
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of group totals not matching aggregate value
+ usage_notes: |-
+    This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerance you set -- that the sum of a measure or count of a value remains sufficiently consistent between categories. You could use this test to compare sales per product from one month to the next, when you want to be alerted if the difference for any product falls outside the range defined as 5% below to 10% above the prior month. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1245'
+ test_id: '1504'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1246'
+ test_id: '1504'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1247'
+ test_id: '1504'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1248'
+ test_id: '1504'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1248'
+ test_id: '1504'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ test_templates:
+ - id: '2009'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_percent_generic.sql
+ - id: '2109'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: snowflake
+ template_name: ex_aggregate_match_percent_generic.sql
+ - id: '2209'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: mssql
+ template_name: ex_aggregate_match_percent_generic.sql
+ - id: '2309'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: postgresql
+ template_name: ex_aggregate_match_percent_generic.sql
+ - id: '2409'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: databricks
+ template_name: ex_aggregate_match_percent_generic.sql
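The tolerance check is easiest to read with numbers plugged in. The bounds are computed as match_total * (1 + tolerance/100.0), which implies the lower tolerance should be supplied as a negative percent (an assumption inferred from the expression, worth confirming against the UI prompt). With a reference total of 200 and tolerances of -5 and 10:

    -- lower bound: 200 * (1 + (-5)/100.0) = 190
    -- upper bound: 200 * (1 +  10/100.0) = 220
    -- a group fails when its total lands outside [190, 220], or when either side is missing
    SELECT 200 * (1 + (-5)/100.0) AS lower_bound,  -- 190.0
           200 * (1 +  10/100.0)  AS upper_bound;  -- 220.0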
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
new file mode 100644
index 00000000..0f5b5c43
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
@@ -0,0 +1,190 @@
+test_types:
+ id: '1505'
+ test_type: Aggregate_Balance_Range
+ test_name_short: Aggregate Balance Range
+ test_name_long: Aggregate measure per group within hard range of reference
+ test_description: |-
+    Tests that the aggregate measure for each set of column values falls within a hard range above or below the measure for the reference dataset
+ except_message: |-
+ Aggregate measure per set of column values is outside expected range of reference dataset.
+ measure_uom: Mismatched measures
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Aggregate Expression
+ column_name_help: |-
+ Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`
+ default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance
+ default_parm_values: null
+ default_parm_prompts: |-
+ Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Constant,Upper Tolerance Constant
+ default_parm_help: |-
+    Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a constant value|Allowable tolerance above the reference measure expressed as a constant value
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of group totals not matching aggregate value
+ usage_notes: |-
+    This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerances you define as specific values above or below the aggregate measure for the same categories in the reference dataset -- that the sum of a measure or count of a value remains sufficiently consistent between categories. For instance, you can use this test to compare sales per product from one week to the next, when you want to be alerted if the difference for any product falls outside the range defined as 10,000 dollars above or below the prior week. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1245'
+ test_id: '1505'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1246'
+ test_id: '1505'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1247'
+ test_id: '1505'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1248'
+ test_id: '1505'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1245'
+ test_id: '1505'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ test_templates:
+ - id: '2010'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_range_generic.sql
+ - id: '2110'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: snowflake
+ template_name: ex_aggregate_match_range_generic.sql
+ - id: '2210'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: mssql
+ template_name: ex_aggregate_match_range_generic.sql
+ - id: '2310'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: postgresql
+ template_name: ex_aggregate_match_range_generic.sql
+ - id: '2410'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: databricks
+ template_name: ex_aggregate_match_range_generic.sql
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
new file mode 100644
index 00000000..09bbc2a2
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
@@ -0,0 +1,180 @@
+test_types:
+ id: '1501'
+ test_type: Aggregate_Minimum
+ test_name_short: Aggregate Minimum
+ test_name_long: Aggregate values per group are at or above reference
+ test_description: |-
+ Tests that aggregate values for each set of column values are at least the same as reference dataset
+ except_message: |-
+ Aggregate measure per set of column values is not at least the same as reference dataset.
+ measure_uom: Mismatched measures
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Aggregate Expression
+ column_name_help: |-
+ Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`
+ default_parm_columns: subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition
+ default_parm_values: null
+ default_parm_prompts: |-
+ Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition
+ default_parm_help: |-
+ Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of group totals below aggregate value
+ usage_notes: |-
+    This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful for comparing an older and a newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test).
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1249'
+ test_id: '1501'
+ test_type: Aggregate_Minimum
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1250'
+ test_id: '1501'
+ test_type: Aggregate_Minimum
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1251'
+ test_id: '1501'
+ test_type: Aggregate_Minimum
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1252'
+ test_id: '1501'
+ test_type: Aggregate_Minimum
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ - id: '1334'
+ test_id: '1501'
+ test_type: Aggregate_Minimum
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
+ test_templates:
+ - id: '2002'
+ test_type: Aggregate_Minimum
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_no_drops_generic.sql
+ - id: '2102'
+ test_type: Aggregate_Minimum
+ sql_flavor: snowflake
+ template_name: ex_aggregate_match_no_drops_generic.sql
+ - id: '2202'
+ test_type: Aggregate_Minimum
+ sql_flavor: mssql
+ template_name: ex_aggregate_match_no_drops_generic.sql
+ - id: '2302'
+ test_type: Aggregate_Minimum
+ sql_flavor: postgresql
+ template_name: ex_aggregate_match_no_drops_generic.sql
+ - id: '2402'
+ test_type: Aggregate_Minimum
+ sql_flavor: databricks
+ template_name: ex_aggregate_match_no_drops_generic.sql
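As an illustration of the Aggregate_Minimum lookup pattern above, here is a minimal sketch with the placeholders filled in, assuming hypothetical tables warehouse.sales_current and warehouse.sales_prior rolled up by a region column (schema, table, and column names are illustrative only and not part of this patch):

-- Flag any region whose current SUM(amount) fell below the prior snapshot.
SELECT *
FROM ( SELECT region, SUM(total) AS total, SUM(match_total) AS match_total
       FROM ( SELECT region, SUM(amount) AS total, NULL AS match_total
              FROM warehouse.sales_current
              GROUP BY region
              UNION ALL
              SELECT region, NULL AS total, SUM(amount) AS match_total
              FROM warehouse.sales_prior
              GROUP BY region ) a
       GROUP BY region ) s
WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY region;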
diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
new file mode 100644
index 00000000..e59479d9
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1004'
+ test_type: Alpha_Trunc
+ test_name_short: Alpha Truncation
+ test_name_long: Maximum character count consistent
+ test_description: |-
+ Tests that the maximum count of characters in a column value has not dropped vs. baseline data
+ except_message: |-
+ Maximum length of values has dropped from prior expected length.
+ measure_uom: Values over max
+ measure_uom_description: null
+ selection_criteria: |-
+ general_type ='A' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE '%window%' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( 'Constant', 'Boolean') ) AND NOT ( fn_charcount(top_patterns, E' \| ' ) = 1 AND fn_charcount(top_patterns, E' \| ' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, '|' , 2), 'N' , '' ) > ''))
+ dq_score_prevalence_formula: |-
+ {VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) ) /NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ FLOOR(0.95 * max_length::FLOAT)
+ default_parm_prompts: |-
+ Maximum String Length at Baseline
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Maximum length expected
+ usage_notes: |-
+ Alpha Truncation tests that the longest text value in a column hasn't become shorter than the defined threshold, initially 95% of the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1001'
+ test_type: Alpha_Trunc
+ sql_flavor: redshift
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2001'
+ test_type: Alpha_Trunc
+ sql_flavor: snowflake
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3001'
+ test_type: Alpha_Trunc
+ sql_flavor: mssql
+ measure: |-
+ MAX(LEN({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4001'
+ test_type: Alpha_Trunc
+ sql_flavor: postgresql
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5001'
+ test_type: Alpha_Trunc
+ sql_flavor: trino
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6001'
+ test_type: Alpha_Trunc
+ sql_flavor: databricks
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1001'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1083'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1140'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;
+ error_type: Test Results
+ - id: '1197'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1298'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ error_type: Test Results
+ test_templates: []
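For reference, the PostgreSQL-flavor Alpha_Trunc lookup above with its placeholders filled in, assuming a hypothetical warehouse.customers table whose email column had a baseline-derived threshold of 60 characters (names and the threshold value are illustrative only):

-- Show the longest current values when the maximum length has dropped below 60.
SELECT DISTINCT "email",
       LENGTH("email") AS current_max_length,
       60 AS previous_max_length
FROM warehouse.customers,
     (SELECT MAX(LENGTH("email")) AS max_length FROM warehouse.customers) a
WHERE LENGTH("email") = a.max_length
  AND a.max_length < 60
LIMIT 500;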
diff --git a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
new file mode 100644
index 00000000..bdc7adfc
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1005'
+ test_type: Avg_Shift
+ test_name_short: Average Shift
+ test_name_long: Column mean is consistent with reference
+ test_description: |-
+ Tests for statistically-significant shift in mean value for column from average calculated at baseline.
+ except_message: |-
+ Standardized difference between averages is over the selected threshold level.
+ measure_uom: Difference Measure
+ measure_uom_description: |-
+ Cohen's D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)
+ selection_criteria: |-
+ general_type='N' AND distinct_value_ct > 10 AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%'
+ dq_score_prevalence_formula: |-
+ 2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value_ct,baseline_avg,baseline_sd,threshold_value
+ default_parm_values: |-
+ value_ct,avg_value,stdev_value,0.5::VARCHAR
+ default_parm_prompts: "Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold\
+ \ Difference Measure "
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Standardized Difference Measure
+ usage_notes: |-
+ Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen's D, a statistical technique to identify significant shifts in a value. Cohen's D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it's reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1002'
+ test_type: Avg_Shift
+ sql_flavor: redshift
+ measure: |-
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2002'
+ test_type: Avg_Shift
+ sql_flavor: snowflake
+ measure: |-
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV({COLUMN_NAME}::FLOAT),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3002'
+ test_type: Avg_Shift
+ sql_flavor: mssql
+ measure: |-
+ ABS( (AVG(CAST({COLUMN_NAME} AS FLOAT)) - CAST({BASELINE_AVG} as FLOAT)) / SQRT(((COUNT({COLUMN_NAME})-1)*POWER(STDEV(CAST({COLUMN_NAME} AS FLOAT)), 2) + ({BASELINE_VALUE_CT}-1) * POWER(CAST({BASELINE_SD} as FLOAT), 2)) /NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4002'
+ test_type: Avg_Shift
+ sql_flavor: postgresql
+ measure: |-
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5002'
+ test_type: Avg_Shift
+ sql_flavor: trino
+ measure: |-
+      ABS( (CAST(AVG({COLUMN_NAME}) AS REAL) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*POWER(STDDEV({COLUMN_NAME}),2) + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * POWER(CAST({BASELINE_SD} AS REAL),2)) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6002'
+ test_type: Avg_Shift
+ sql_flavor: databricks
+ measure: |-
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV_SAMP({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1002'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1084'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1141'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1198'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1299'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ test_templates: []
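To make the Cohen's D measure above concrete, a worked example with illustrative numbers: a hypothetical baseline of n=1000, mean=100, sd=10 and a current sample of n=1000, mean=105, sd=10 (none of these values come from the patch):

-- Standardized difference using the same pooled-deviation formula as the measure.
SELECT ABS( (105.0 - 100.0)
          / SQRT( ((1000.0 - 1) * POWER(10.0, 2) + (1000.0 - 1) * POWER(10.0, 2))
                  / NULLIF(1000.0 + 1000.0, 0) ) ) AS difference_measure;
-- Returns roughly 0.50, a moderate shift that meets the default 0.5 warning threshold.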
diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
new file mode 100644
index 00000000..6d5454ec
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
@@ -0,0 +1,61 @@
+test_types:
+ id: '1008'
+ test_type: CUSTOM
+ test_name_short: Custom Test
+ test_name_long: Custom-defined business rule
+ test_description: |-
+ Custom SQL Query Test
+ except_message: |-
+ Errors were detected according to test definition.
+ measure_uom: Errors found
+ measure_uom_description: |-
+ Count of errors identified by query
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Test Focus
+ column_name_help: |-
+ Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.
+ default_parm_columns: custom_query
+ default_parm_values: null
+ default_parm_prompts: |-
+ Custom SQL Query Returning Error Records
+ default_parm_help: |-
+    Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: custom
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of errors found by custom query
+ usage_notes: |-
+    This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might return errors in individual rows identified by joining tables, or it might return a single row based on a multi-column aggregate condition when an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow up.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups: []
+ test_templates:
+ - id: '2004'
+ test_type: CUSTOM
+ sql_flavor: redshift
+ template_name: ex_custom_query_generic.sql
+ - id: '2104'
+ test_type: CUSTOM
+ sql_flavor: snowflake
+ template_name: ex_custom_query_generic.sql
+ - id: '2204'
+ test_type: CUSTOM
+ sql_flavor: mssql
+ template_name: ex_custom_query_generic.sql
+ - id: '2304'
+ test_type: CUSTOM
+ sql_flavor: postgresql
+ template_name: ex_custom_query_generic.sql
+ - id: '2404'
+ test_type: CUSTOM
+ sql_flavor: databricks
+ template_name: ex_custom_query_generic.sql
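As a hedged example of the kind of user-defined query this test type wraps, assuming hypothetical orders and order_lines tables (the {DATA_SCHEMA} placeholder is the one described in the help text above; table and column names are illustrative):

-- Return one row per order whose header total does not match the sum of its lines;
-- the test passes only when no rows are returned.
SELECT o.order_id, o.order_total, SUM(l.line_amount) AS detail_total
FROM {DATA_SCHEMA}.orders o
JOIN {DATA_SCHEMA}.order_lines l ON l.order_id = o.order_id
GROUP BY o.order_id, o.order_total
HAVING o.order_total <> SUM(l.line_amount);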
diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
new file mode 100644
index 00000000..94d27b92
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
@@ -0,0 +1,165 @@
+test_types:
+ id: '1502'
+ test_type: Combo_Match
+ test_name_short: Reference Match
+ test_name_long: Column values or combinations found in reference
+ test_description: |-
+ Tests for the presence of one or a set of column values in a reference table
+ except_message: |-
+ Column value combinations are not found in reference table values.
+ measure_uom: Missing values
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Categorical Column List
+ column_name_help: |-
+    Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous measurements here. Do not use numeric values unless they represent discrete categories.
+ default_parm_columns: subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition
+ default_parm_values: null
+ default_parm_prompts: |-
+ Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition
+ default_parm_help: |-
+ Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of non-matching value combinations
+ usage_notes: |-
+ This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1253'
+ test_id: '1502'
+ test_type: Combo_Match
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) test
+ ORDER BY {COLUMN_NAME_NO_QUOTES};
+ error_type: Test Results
+ - id: '1254'
+ test_id: '1502'
+ test_type: Combo_Match
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) test
+ ORDER BY {COLUMN_NAME_NO_QUOTES};
+ error_type: Test Results
+ - id: '1255'
+ test_id: '1502'
+ test_type: Combo_Match
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) test
+ ORDER BY {COLUMN_NAME_NO_QUOTES};
+ error_type: Test Results
+ - id: '1256'
+ test_id: '1502'
+ test_type: Combo_Match
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) test
+ ORDER BY {COLUMN_NAME_NO_QUOTES};
+ error_type: Test Results
+ - id: '1335'
+ test_id: '1502'
+ test_type: Combo_Match
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) test
+ ORDER BY {COLUMN_NAME_NO_QUOTES};
+ error_type: Test Results
+ test_templates:
+ - id: '2001'
+ test_type: Combo_Match
+ sql_flavor: redshift
+ template_name: ex_data_match_generic.sql
+ - id: '2101'
+ test_type: Combo_Match
+ sql_flavor: snowflake
+ template_name: ex_data_match_generic.sql
+ - id: '2201'
+ test_type: Combo_Match
+ sql_flavor: mssql
+ template_name: ex_data_match_generic.sql
+ - id: '2301'
+ test_type: Combo_Match
+ sql_flavor: postgresql
+ template_name: ex_data_match_generic.sql
+ - id: '2401'
+ test_type: Combo_Match
+ sql_flavor: databricks
+ template_name: ex_data_match_generic.sql
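For illustration, the EXCEPT-based Combo_Match lookup above with its placeholders filled in, assuming a hypothetical warehouse.order_items table validated against warehouse.valid_product_variants (names are illustrative; the optional WHERE and HAVING conditions are omitted):

-- List product/size/color combinations sold but absent from the reference table.
SELECT *
FROM ( SELECT product_code, size_code, color_code
       FROM warehouse.order_items
       GROUP BY product_code, size_code, color_code
       EXCEPT
       SELECT product_code, size_code, color_code
       FROM warehouse.valid_product_variants
       GROUP BY product_code, size_code, color_code ) test
ORDER BY product_code, size_code, color_code;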
diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
new file mode 100644
index 00000000..69bc6af2
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1006'
+ test_type: Condition_Flag
+ test_name_short: Custom Condition
+ test_name_long: Column values match pre-defined condition
+ test_description: |-
+ Tests that each record in the table matches a pre-defined, custom condition
+ except_message: |-
+ Value(s) found not matching defined condition.
+ measure_uom: Values Failing
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Test Focus
+ column_name_help: |-
+ Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.
+ default_parm_columns: threshold_value,custom_query
+ default_parm_values: null
+ default_parm_prompts: |-
+ Threshold Error Count,Custom SQL Expression (TRUE on error)
+ default_parm_help: |-
+    The number of errors that are acceptable before the test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.
+ default_severity: Fail
+ run_type: CAT
+ test_scope: custom
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Count of records that don't meet test condition
+ usage_notes: |-
+ Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1003'
+ test_type: Condition_Flag
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2003'
+ test_type: Condition_Flag
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3003'
+ test_type: Condition_Flag
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4003'
+ test_type: Condition_Flag
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5003'
+ test_type: Condition_Flag
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6003'
+ test_type: Condition_Flag
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1003'
+ test_id: '1006'
+ test_type: Condition_Flag
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
+ error_type: Test Results
+ - id: '1085'
+ test_id: '1006'
+ test_type: Condition_Flag
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
+ error_type: Test Results
+ - id: '1142'
+ test_id: '1006'
+ test_type: Condition_Flag
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY};
+ error_type: Test Results
+ - id: '1199'
+ test_id: '1006'
+ test_type: Condition_Flag
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
+ error_type: Test Results
+ - id: '1300'
+ test_id: '1006'
+ test_type: Condition_Flag
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
+ error_type: Test Results
+ test_templates: []
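A minimal sketch of how the CASE-based measure above behaves once a condition is supplied, assuming a hypothetical warehouse.shipments table (the condition and all names are illustrative only):

-- The custom expression evaluates TRUE on error; here it flags over-shipment.
SELECT SUM(CASE WHEN quantity_shipped > quantity_ordered THEN 1 ELSE 0 END) AS values_failing
FROM warehouse.shipments;
-- The test fails when values_failing exceeds the configured threshold error count.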
diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
new file mode 100644
index 00000000..4d41239c
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1007'
+ test_type: Constant
+ test_name_short: Constant Match
+ test_name_long: All column values match constant value
+ test_description: |-
+ Tests that all values in the column match the constant value identified in baseline data
+ except_message: |-
+ A constant value is expected for this column.
+ measure_uom: Mismatched values
+ measure_uom_description: null
+ selection_criteria: |-
+ TEMPLATE
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value,threshold_value
+ default_parm_values: null
+ default_parm_prompts: |-
+ Constant Value at Baseline,Threshold Error Count
+ default_parm_help: |-
+    The single, unchanging value of the column, per baseline|The number of errors that are acceptable before the test fails.
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Count of records with unexpected values
+ usage_notes: |-
+ Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1004'
+ test_type: Constant
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2004'
+ test_type: Constant
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3004'
+ test_type: Constant
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4004'
+ test_type: Constant
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5004'
+ test_type: Constant
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6004'
+ test_type: Constant
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1004'
+ test_id: '1007'
+ test_type: Constant
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1086'
+ test_id: '1007'
+ test_type: Constant
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1143'
+ test_id: '1007'
+ test_type: Constant
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1200'
+ test_id: '1007'
+ test_type: Constant
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1301'
+ test_id: '1007'
+ test_type: Constant
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
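For reference, the Constant Match measure above with illustrative values substituted, assuming a hypothetical warehouse.invoices table whose currency_code column profiled as the constant 'USD':

-- Count rows that no longer match the baseline constant.
SELECT SUM(CASE WHEN currency_code <> 'USD' THEN 1 ELSE 0 END) AS mismatched_values
FROM warehouse.invoices;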
diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
new file mode 100644
index 00000000..cbe772c2
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
@@ -0,0 +1,171 @@
+test_types:
+ id: '1009'
+ test_type: Daily_Record_Ct
+ test_name_short: Daily Records
+ test_name_long: All dates present within date range
+ test_description: |-
+ Tests for presence of every calendar date within min/max date range, per baseline data
+ except_message: |-
+ Not every date value between min and max dates is present, unlike at baseline.
+ measure_uom: Missing dates
+ measure_uom_description: null
+ selection_criteria: |-
+ general_type= 'D' AND date_days_present > 21 AND date_days_present - (DATEDIFF('day', '1800-01-05'::DATE, max_date) - DATEDIFF('day', '1800-01-05'::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_DAYS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Threshold Missing Calendar Days
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Completeness
+ health_dimension: Volume
+ threshold_description: |-
+ Missing calendar days within min/max range
+ usage_notes: "Daily Records tests that at least one record is present for every\
+ \ day within the minimum and maximum date range for the column. The test is relevant\
+ \ for transactional data, where you would expect at least one transaction to be\
+ \ recorded each day. A failure here would suggest missing records for the number\
+ \ of days identified without data. You can adjust the threshold to accept a number\
+ \ of days that you know legitimately have no records. "
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1005'
+ test_type: Daily_Record_Ct
+ sql_flavor: redshift
+ measure: |-
+ DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2005'
+ test_type: Daily_Record_Ct
+ sql_flavor: snowflake
+ measure: |-
+ DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+    test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3005'
+ test_type: Daily_Record_Ct
+ sql_flavor: mssql
+ measure: |-
+ DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+    test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4005'
+ test_type: Daily_Record_Ct
+ sql_flavor: postgresql
+ measure: |-
+ <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5005'
+ test_type: Daily_Record_Ct
+ sql_flavor: trino
+ measure: |-
+ DATE_DIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6005'
+ test_type: Daily_Record_Ct
+ sql_flavor: databricks
+ measure: |-
+ <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})
+    test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1005'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;
+ error_type: Test Results
+ - id: '1087'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500;
+ error_type: Test Results
+ - id: '1144'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH
+ Pass0 as (select 1 as C union all select 1), --2 rows
+ Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
+ Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
+ Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
+ Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
+ All_Nums as (select row_number() over(order by C) as Number from Pass4),
+ tally as (SELECT Number FROM All_Nums WHERE Number <= 45000),
+
+ date_range as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period,
+ CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period,
+ DATEDIFF(DAY,
+ CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MIN("{COLUMN_NAME}")), 0) AS DATE),
+ CAST(DATEADD(DAY, DATEDIFF(DAY, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} ),
+ check_periods as ( SELECT d.min_period, d.max_period, t.number,
+ DATEADD(DAY, -(t.number - 1), d.max_period) AS check_period
+ FROM date_range d
+ INNER JOIN tally t
+ ON (d.period_ct >= t.number) ),
+ data_by_period as (SELECT CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY CAST(DATEADD(DAY, DATEDIFF(DAY, 0, "{COLUMN_NAME}"), 0) AS DATE) ),
+ data_by_prd_with_prior_next as (SELECT check_period,
+ RANK() OVER (ORDER BY check_period DESC) as ranked,
+ ISNULL(d.record_ct, 0) as record_ct,
+ ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct,
+ ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct
+ FROM check_periods c
+ LEFT JOIN data_by_period d
+ ON (c.check_period = d.data_period) )
+ SELECT check_period, record_ct,
+ CASE
+ WHEN record_ct = 0 THEN 'MISSING'
+ ELSE 'Present'
+ END as status
+ FROM data_by_prd_with_prior_next
+ WHERE record_ct = 0
+ OR last_record_ct = 0
+ OR next_record_ct = 0
+ ORDER BY check_period DESC;
+ error_type: Test Results
+ - id: '1201'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500;
+ error_type: Test Results
+ - id: '1302'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM {TARGET_SCHEMA}.{TABLE_NAME}), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500;
+ error_type: Test Results
+ test_templates: []
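To make the measure above concrete, a minimal instantiation of the Redshift-flavor condition, assuming a hypothetical warehouse.transactions table with a DATE column named transaction_date (names are illustrative only):

-- Calendar days spanned by the column, minus the distinct days actually present.
SELECT DATEDIFF('DAY', MIN(transaction_date), MAX(transaction_date)) + 1
         - COUNT(DISTINCT transaction_date) AS missing_dates
FROM warehouse.transactions;
-- The test fails when missing_dates exceeds the threshold, which defaults to 0.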
diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
new file mode 100644
index 00000000..0f20c746
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
@@ -0,0 +1,130 @@
+test_types:
+ id: '1011'
+ test_type: Dec_Trunc
+ test_name_short: Decimal Truncation
+ test_name_long: Sum of fractional values at or above reference
+ test_description: |-
+ Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline
+ except_message: |-
+    The sum of fractional values is under baseline, which may indicate decimal truncation.
+ measure_uom: Fractional sum
+ measure_uom_description: |-
+ The sum of all decimal values from all data for this column
+ selection_criteria: |-
+    fractional_sum > 0 AND functional_table_type LIKE '%cumulative%'
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ ROUND(fractional_sum, 0)
+ default_parm_prompts: |-
+ Sum of Fractional Values at Baseline
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Minimum expected sum of all fractional values
+ usage_notes: |-
+ Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1006'
+ test_type: Dec_Trunc
+ sql_flavor: redshift
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2006'
+ test_type: Dec_Trunc
+ sql_flavor: snowflake
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3006'
+ test_type: Dec_Trunc
+ sql_flavor: mssql
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4006'
+ test_type: Dec_Trunc
+ sql_flavor: postgresql
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5006'
+ test_type: Dec_Trunc
+ sql_flavor: trino
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6006'
+ test_type: Dec_Trunc
+ sql_flavor: databricks
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1006'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;
+ error_type: Test Results
+ - id: '1088'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
+ error_type: Test Results
+ - id: '1145'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: "WITH CTE AS ( SELECT LEN(SUBSTRING(CAST(ABS(\"{COLUMN_NAME}\")\
+ \ % 1 AS VARCHAR) , 3, LEN(\"{COLUMN_NAME}\"))) AS decimal_scale FROM {TARGET_SCHEMA}.{TABLE_NAME}\
+ \ ) SELECT DISTINCT TOP 500 decimal_scale,COUNT(*) AS count FROM cte GROUP BY\
+ \ decimal_scale ORDER BY COUNT(*) DESC; "
+ error_type: Test Results
+ - id: '1202'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
+ error_type: Test Results
+ - id: '1303'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
+ error_type: Test Results
+ test_templates: []
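As an illustration of the fractional-sum measure above, assuming a hypothetical warehouse.order_lines table with a numeric unit_price column (names are illustrative only):

-- Sum of the fractional parts (rounded to 5 places), plus 1, as in the measure.
SELECT SUM(ROUND(ABS((unit_price % 1)), 5)) + 1 AS fractional_sum
FROM warehouse.order_lines;
-- If unit_price were silently truncated to whole numbers, fractional_sum would
-- collapse toward 1 and fall below the baseline threshold.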
diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
new file mode 100644
index 00000000..339d8f62
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1012'
+ test_type: Distinct_Date_Ct
+ test_name_short: Date Count
+ test_name_long: Count of distinct dates at or above reference
+ test_description: |-
+ Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data
+ except_message: |-
+ Drop in count of unique dates recorded in column.
+ measure_uom: Unique dates
+ measure_uom_description: |-
+ Count of unique dates in transactional date column
+ selection_criteria: |-
+ functional_data_type ILIKE 'Transactional Date%' AND date_days_present > 1 AND functional_table_type ILIKE '%cumulative%'
+ dq_score_prevalence_formula: |-
+ (({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/NULLIF({PRO_RECORD_CT}::FLOAT, 0))/NULLIF({PRO_RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value,threshold_value
+ default_parm_values: |-
+ date_days_present,date_days_present
+ default_parm_prompts: |-
+ Distinct Date Count at Baseline,Min Expected Date Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Timeliness
+ health_dimension: Recency
+ threshold_description: |-
+ Minimum distinct date count expected
+ usage_notes: |-
+ Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: redshift
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: snowflake
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: mssql
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: postgresql
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: trino
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: databricks
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1007'
+ test_id: '1012'
+ test_type: Distinct_Date_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1089'
+ test_id: '1012'
+ test_type: Distinct_Date_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1146'
+ test_id: '1012'
+ test_type: Distinct_Date_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1203'
+ test_id: '1012'
+ test_type: Distinct_Date_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1304'
+ test_id: '1012'
+ test_type: Distinct_Date_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
new file mode 100644
index 00000000..95bc9080
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1013'
+ test_type: Distinct_Value_Ct
+ test_name_short: Value Count
+ test_name_long: Count of distinct values has not dropped
+ test_description: |-
+ Tests that the count of unique values in the column has not changed from baseline.
+ except_message: |-
+ Count of unique values in column has changed from baseline.
+ measure_uom: Unique Values
+ measure_uom_description: null
+ selection_criteria: |-
+ distinct_value_ct between 2 and 10 AND value_ct > 50 AND functional_data_type IN ('Code', 'Category', 'Attribute', 'Description') AND NOT coalesce(top_freq_values,'') > ''
+ dq_score_prevalence_formula: |-
+ ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DISTINCT_VALUE_CT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value_ct,threshold_value
+ default_parm_values: |-
+ distinct_value_ct,distinct_value_ct
+ default_parm_prompts: |-
+ Distinct Value Count at Baseline,Min Expected Value Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected distinct value count
+ usage_notes: |-
+    Value Count tests that the count of unique values present in the column has not changed since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a fixed number of distinct values to be present. A failure here would indicate missing records or a change in categories or value assignment.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: redshift
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: snowflake
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: mssql
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: postgresql
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: trino
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: databricks
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1008'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1090'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1147'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1204'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1305'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
new file mode 100644
index 00000000..52e3cfc5
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
@@ -0,0 +1,191 @@
+test_types:
+ id: '1503'
+ test_type: Distribution_Shift
+ test_name_short: Distribution Shift
+ test_name_long: Probability distribution consistent with reference
+ test_description: |-
+    Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using the Jensen-Shannon Divergence test
+ except_message: |-
+ Divergence between two distributions exceeds specified threshold.
+ measure_uom: Divergence level (0-1)
+ measure_uom_description: |-
+ Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: |-
+ Categorical Column List
+ column_name_help: |-
+    Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.
+ default_parm_columns: subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition
+ default_parm_values: null
+ default_parm_prompts: |-
+ Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition
+ default_parm_help: |-
+ Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL
+ default_severity: Warning
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected maximum divergence level between 0 and 1
+ usage_notes: |-
+ This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1257'
+ test_id: '1503'
+ test_type: Distribution_Shift
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH latest_ver
+ AS ( SELECT {CONCAT_COLUMNS} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES} ),
+ older_ver
+ AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES} )
+ SELECT COALESCE(l.category, o.category) AS category,
+ o.pct_of_total AS old_pct,
+ l.pct_of_total AS new_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON (l.category = o.category)
+ ORDER BY COALESCE(l.category, o.category)
+ error_type: Test Results
+ - id: '1258'
+ test_id: '1503'
+ test_type: Distribution_Shift
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH latest_ver
+ AS ( SELECT {CONCAT_COLUMNS} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES} ),
+ older_ver
+ AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES} )
+ SELECT COALESCE(l.category, o.category) AS category,
+ o.pct_of_total AS old_pct,
+ l.pct_of_total AS new_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON (l.category = o.category)
+ ORDER BY COALESCE(l.category, o.category)
+ error_type: Test Results
+ - id: '1259'
+ test_id: '1503'
+ test_type: Distribution_Shift
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH latest_ver
+ AS ( SELECT {CONCAT_COLUMNS} as category,
+ CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES} ),
+ older_ver
+ AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
+ CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
+ FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES} )
+ SELECT COALESCE(l.category, o.category) AS category,
+ o.pct_of_total AS old_pct,
+ l.pct_of_total AS new_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON (l.category = o.category)
+ ORDER BY COALESCE(l.category, o.category)
+ error_type: Test Results
+ - id: '1260'
+ test_id: '1503'
+ test_type: Distribution_Shift
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH latest_ver
+ AS ( SELECT {CONCAT_COLUMNS} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES} ),
+ older_ver
+ AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES} )
+ SELECT COALESCE(l.category, o.category) AS category,
+ o.pct_of_total AS old_pct,
+ l.pct_of_total AS new_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON (l.category = o.category)
+ ORDER BY COALESCE(l.category, o.category)
+ error_type: Test Results
+ - id: '1336'
+ test_id: '1503'
+ test_type: Distribution_Shift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH latest_ver
+ AS ( SELECT {CONCAT_COLUMNS} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES} ),
+ older_ver
+ AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES} )
+ SELECT COALESCE(l.category, o.category) AS category,
+ o.pct_of_total AS old_pct,
+ l.pct_of_total AS new_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON (l.category = o.category)
+ ORDER BY COALESCE(l.category, o.category)
+ error_type: Test Results
+ test_templates:
+ - id: '2003'
+ test_type: Distribution_Shift
+ sql_flavor: redshift
+ template_name: ex_relative_entropy_generic.sql
+ - id: '2103'
+ test_type: Distribution_Shift
+ sql_flavor: snowflake
+ template_name: ex_relative_entropy_generic.sql
+ - id: '2203'
+ test_type: Distribution_Shift
+ sql_flavor: mssql
+ template_name: ex_relative_entropy_mssql.sql
+ - id: '2303'
+ test_type: Distribution_Shift
+ sql_flavor: postgresql
+ template_name: ex_relative_entropy_generic.sql
+ - id: '2403'
+ test_type: Distribution_Shift
+ sql_flavor: databricks
+ template_name: ex_relative_entropy_generic.sql
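The lookup queries above only assemble the two per-category proportion distributions; the divergence itself comes from the ex_relative_entropy templates listed just above. As a rough sketch of the underlying measure only, not of the project's template SQL, Jensen-Shannon divergence over two aligned proportion vectors can be computed as follows in Python; the old_pct/new_pct names echo the lookup output and the sample numbers are illustrative.

    import math

    def jensen_shannon_divergence(old_pct, new_pct):
        # old_pct / new_pct: per-category proportions aligned by category, with 0.0 where a
        # category is missing on one side (the NULLs a FULL JOIN would produce).
        def kl(p, q):
            return sum(pi * math.log2(pi / qi) for pi, qi in zip(p, q) if pi > 0)
        midpoint = [(p + q) / 2 for p, q in zip(old_pct, new_pct)]
        return 0.5 * kl(old_pct, midpoint) + 0.5 * kl(new_pct, midpoint)

    print(jensen_shannon_divergence([0.5, 0.3, 0.2], [0.5, 0.3, 0.2]))  # 0.0 -- identical distributions
    print(jensen_shannon_divergence([1.0, 0.0], [0.0, 1.0]))            # 1.0 -- completely disjoint

With log base 2 the score is bounded between 0 and 1, matching the 0-to-1 divergence level this test type reports.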
diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
new file mode 100644
index 00000000..a929a661
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
@@ -0,0 +1,123 @@
+test_types:
+ id: '1510'
+ test_type: Dupe_Rows
+ test_name_short: Duplicate Rows
+ test_name_long: Rows are not duplicated in table
+ test_description: |-
+ Tests for the absence of duplicate rows based on unique combination of column values
+ except_message: |-
+ Column value combinations are duplicated in the table.
+ measure_uom: Duplicate records
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ (({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+  column_name_prompt: null
+  column_name_help: null
+ default_parm_columns: groupby_names
+ default_parm_values: null
+ default_parm_prompts: |-
+ Columns to Compare
+ default_parm_help: |-
+ List of columns in the table that define a duplicate record when the combination of values is repeated on multiple rows
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: table
+ dq_dimension: Uniqueness
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of duplicate value combinations
+ usage_notes: |-
+    This test verifies that combinations of values are not repeated within the table. By default when auto-generated, the test considers all columns to protect against duplication of entire rows. If you know the minimum columns that should constitute a unique record, such as a set of IDs, you should use those to make the test as sensitive as possible. Alternatively, if you know of columns you can always exclude, such as file_date or refresh_snapshot_id, remove them to tighten the test somewhat.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1253'
+ test_id: '1510'
+ test_type: Dupe_Rows
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ HAVING COUNT(*) > 1
+ ORDER BY {GROUPBY_NAMES}
+ error_type: Test Results
+ - id: '1254'
+ test_id: '1510'
+ test_type: Dupe_Rows
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ HAVING COUNT(*) > 1
+ ORDER BY {GROUPBY_NAMES}
+ error_type: Test Results
+ - id: '1255'
+ test_id: '1510'
+ test_type: Dupe_Rows
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ HAVING COUNT(*) > 1
+ ORDER BY {GROUPBY_NAMES}
+ error_type: Test Results
+ - id: '1256'
+ test_id: '1510'
+ test_type: Dupe_Rows
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ HAVING COUNT(*) > 1
+ ORDER BY {GROUPBY_NAMES}
+ error_type: Test Results
+ - id: '1257'
+ test_id: '1510'
+ test_type: Dupe_Rows
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ HAVING COUNT(*) > 1
+ ORDER BY {GROUPBY_NAMES}
+ error_type: Test Results
+ test_templates:
+ - id: '2011'
+ test_type: Dupe_Rows
+ sql_flavor: redshift
+ template_name: ex_dupe_rows_generic.sql
+ - id: '2111'
+ test_type: Dupe_Rows
+ sql_flavor: snowflake
+ template_name: ex_dupe_rows_generic.sql
+ - id: '2211'
+ test_type: Dupe_Rows
+ sql_flavor: mssql
+ template_name: ex_dupe_rows_generic.sql
+ - id: '2311'
+ test_type: Dupe_Rows
+ sql_flavor: postgresql
+ template_name: ex_dupe_rows_generic.sql
+ - id: '2411'
+ test_type: Dupe_Rows
+ sql_flavor: databricks
+ template_name: ex_dupe_rows_generic.sql
diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
new file mode 100644
index 00000000..c32ab45b
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1014'
+ test_type: Email_Format
+ test_name_short: Email Format
+ test_name_long: Email is correctly formatted
+ test_description: |-
+ Tests that non-blank, non-empty email addresses match the standard format
+ except_message: |-
+ Invalid email address formats found.
+ measure_uom: Invalid emails
+ measure_uom_description: |-
+ Number of emails that do not match standard format
+ selection_criteria: |-
+ std_pattern_match='EMAIL'
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Maximum Invalid Email Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of invalid email addresses
+ usage_notes: null
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1009'
+ test_type: Email_Format
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2009'
+ test_type: Email_Format
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3009'
+ test_type: Email_Format
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} NOT LIKE '[A-Za-z0-9._''%+-]%@[A-Za-z0-9.-]%.[A-Za-z][A-Za-z]%' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4009'
+ test_type: Email_Format
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5009'
+ test_type: Email_Format
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') != TRUE THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6009'
+ test_type: Email_Format
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1009'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1091'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1148'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1205'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1306'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
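Every flavor's condition here reduces to the same check: count values that fail a basic e-mail regex (MSSQL approximates it with LIKE). A minimal sketch of what the regex-based measures test, applying the same pattern with Python's re module to a few made-up sample values:

    import re

    # Same pattern used by the redshift/postgresql/snowflake/databricks measures
    # (an apostrophe is allowed in the local part).
    EMAIL_RE = re.compile(r"^[A-Za-z0-9._'%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")

    samples = ["jane.doe@example.com", "o'neil+dq@example.co.uk", "not-an-email", "user@host"]
    invalid_ct = sum(1 for value in samples if not EMAIL_RE.match(value))
    print(invalid_ct)  # 2 -- "not-an-email" and "user@host" fail the format check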
diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
new file mode 100644
index 00000000..aa1c8270
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
@@ -0,0 +1,126 @@
+test_types:
+ id: '1015'
+ test_type: Future_Date
+ test_name_short: Past Dates
+ test_name_long: Latest date is prior to test run date
+ test_description: |-
+ Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data
+ except_message: |-
+ Future date found when absent in baseline data.
+ measure_uom: Future dates
+ measure_uom_description: null
+ selection_criteria: |-
+    general_type='D' AND future_date_ct = 0
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Maximum Future Date Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Timeliness
+ health_dimension: Recency
+ threshold_description: |-
+ Expected count of future dates
+ usage_notes: null
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1010'
+ test_type: Future_Date
+ sql_flavor: redshift
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2010'
+ test_type: Future_Date
+ sql_flavor: snowflake
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3010'
+ test_type: Future_Date
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CONVERT(DATE, '{RUN_DATE}') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4010'
+ test_type: Future_Date
+ sql_flavor: postgresql
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5010'
+ test_type: Future_Date
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST('{RUN_DATE}' AS DATE) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6010'
+ test_type: Future_Date
+ sql_flavor: databricks
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1010'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1092'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1149'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1206'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1307'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
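The SUM(GREATEST(0, SIGN(column - run_date))) expression used by most flavors is a branch-free way of counting rows dated after the run date: SIGN yields -1, 0 or 1 and GREATEST clamps past and same-day values to zero. Note that the MSSQL and Trino conditions use >=, so they also count dates equal to the run date. A small sketch of the SIGN/GREATEST arithmetic, with made-up dates:

    from datetime import date

    run_date = date(2024, 1, 15)  # stands in for '{RUN_DATE}'
    values = [date(2023, 12, 31), date(2024, 1, 15), date(2024, 2, 1)]

    def sign(n: int) -> int:
        return (n > 0) - (n < 0)

    # SUM(GREATEST(0, SIGN(col - run_date))): each strictly-future date adds 1, everything else adds 0.
    future_ct = sum(max(0, sign((d - run_date).days)) for d in values)
    print(future_ct)  # 1 -- only 2024-02-01 falls after the run date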
diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
new file mode 100644
index 00000000..d9be7bbe
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1016'
+ test_type: Future_Date_1Y
+ test_name_short: Future Year
+ test_name_long: Future dates within year of test run date
+ test_description: |-
+ Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data
+ except_message: |-
+    Future date beyond one year found when absent in baseline data.
+ measure_uom: Future dates post 1 year
+ measure_uom_description: null
+ selection_criteria: |-
+    general_type='D' AND future_date_ct > 0 AND max_date <= '{AS_OF_DATE}'::DATE + INTERVAL '365 DAYS'
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Maximum Post 1-Year Future Date Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Timeliness
+ health_dimension: Recency
+ threshold_description: |-
+ Expected count of future dates beyond one year
+ usage_notes: |-
+ Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1011'
+ test_type: Future_Date_1Y
+ sql_flavor: redshift
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2011'
+ test_type: Future_Date_1Y
+ sql_flavor: snowflake
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3011'
+ test_type: Future_Date_1Y
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{RUN_DATE}')) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4011'
+ test_type: Future_Date_1Y
+ sql_flavor: postgresql
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5011'
+ test_type: Future_Date_1Y
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE('{RUN_DATE}') + interval '365' day ) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6011'
+ test_type: Future_Date_1Y
+ sql_flavor: databricks
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1011'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1093'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1150'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1207'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1308'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
new file mode 100644
index 00000000..f4051df5
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1017'
+ test_type: Incr_Avg_Shift
+ test_name_short: New Shift
+ test_name_long: New record mean is consistent with reference
+ test_description: |-
+ Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.
+ except_message: |-
+ Significant shift in average of new values vs. baseline avg
+ measure_uom: Z-score of mean shift
+ measure_uom_description: |-
+ Absolute Z-score (number of SD's outside mean) of prior avg - incremental avg
+ selection_criteria: |-
+ general_type='N' AND distinct_value_ct > 10 AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%'
+ dq_score_prevalence_formula: |-
+ {RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value
+ default_parm_values: |-
+ value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2
+ default_parm_prompts: |-
+ Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Maximum Z-Score (number of SD's beyond mean) expected
+ usage_notes: |-
+ This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: redshift
+ measure: |-
+ NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: snowflake
+ measure: |-
+ COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: mssql
+ measure: |-
+ COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS FLOAT) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: postgresql
+ measure: |-
+ COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: trino
+ measure: |-
+ COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: databricks
+ measure: |-
+ COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1012'
+ test_id: '1017'
+ test_type: Incr_Avg_Shift
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1094'
+ test_id: '1017'
+ test_type: Incr_Avg_Shift
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1151'
+ test_id: '1017'
+ test_type: Incr_Avg_Shift
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_sum, NULLIF(CAST(COUNT("{COLUMN_NAME}") AS FLOAT), 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1208'
+ test_id: '1017'
+ test_type: Incr_Avg_Shift
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1309'
+ test_id: '1017'
+ test_type: Incr_Avg_Shift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ test_templates: []
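The condition backs the mean of only the post-baseline records out of the current totals and expresses its distance from the baseline mean in baseline standard deviations. A minimal sketch of that arithmetic with made-up numbers; the variable names mirror the baseline_* parameters and the columns returned by the lookup query:

    # Profile values captured at baseline.
    baseline_value_ct = 1000
    baseline_sum = 52_000.0      # avg_value * value_ct at baseline
    baseline_avg = 52.0
    baseline_sd = 8.0

    # Current totals, as the lookup query returns them (SUM/COUNT over the whole column).
    current_sum = 63_500.0
    current_value_ct = 1200

    # Mean of just the records added since baseline.
    incremental_avg = (current_sum - baseline_sum) / (current_value_ct - baseline_value_ct)

    # Z-score of the shift; the test fails when it reaches the threshold (default 2).
    z = abs(baseline_avg - incremental_avg) / baseline_sd
    print(round(incremental_avg, 2), round(z, 2))  # 57.5 0.69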
diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
new file mode 100644
index 00000000..54fa704b
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
@@ -0,0 +1,125 @@
+test_types:
+ id: '1018'
+ test_type: LOV_All
+ test_name_short: Value Match All
+ test_name_long: List of expected values all present in column
+ test_description: |-
+ Tests that all values match a pipe-delimited list of expected values and that all expected values are present
+ except_message: |-
+ Column values found don't exactly match the expected list of values
+ measure_uom: Values found
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: null
+ default_parm_prompts: |-
+ List of Expected Values
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+    List of values expected, in form ('Val1','Val2')
+ usage_notes: |-
+ This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1013'
+ test_type: LOV_All
+ sql_flavor: redshift
+ measure: |-
+ LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2013'
+ test_type: LOV_All
+ sql_flavor: snowflake
+ measure: |-
+ LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3013'
+ test_type: LOV_All
+ sql_flavor: mssql
+ measure: |-
+ STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4013'
+ test_type: LOV_All
+ sql_flavor: postgresql
+ measure: |-
+ STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6013'
+ test_type: LOV_All
+ sql_flavor: databricks
+ measure: |-
+ STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5013'
+ test_type: LOV_All
+ sql_flavor: trino
+ measure: |-
+ LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1013'
+ test_id: '1018'
+ test_type: LOV_All
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
+ error_type: Test Results
+ - id: '1095'
+ test_id: '1018'
+ test_type: LOV_All
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT 500;
+ error_type: Test Results
+ - id: '1152'
+ test_id: '1018'
+ test_type: LOV_All
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}';
+ error_type: Test Results
+ - id: '1209'
+ test_id: '1018'
+ test_type: LOV_All
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
+ error_type: Test Results
+ - id: '1310'
+ test_id: '1018'
+ test_type: LOV_All
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT 500;
+ error_type: Test Results
+ test_templates: []
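Because the measure flattens the column's distinct values into one ordered, pipe-delimited string and compares it verbatim to the expected list, the test fails in both directions: when an unexpected value shows up and when an expected value is absent. A small sketch of that two-way check; the category values and expected string are made up:

    expected = "Active|Closed|Pending"   # expected values, pipe-delimited and sorted, as the measure compares them

    def lov_all_measure(values):
        # Mirrors LISTAGG(DISTINCT col, '|') WITHIN GROUP (ORDER BY col): distinct, sorted, joined.
        return "|".join(sorted(set(values)))

    print(lov_all_measure(["Active", "Closed", "Pending", "Active"]) == expected)  # True  -> test passes
    print(lov_all_measure(["Active", "Closed", "Pending", "Voided"]) == expected)  # False -> unexpected value
    print(lov_all_measure(["Active", "Closed"]) == expected)                       # False -> expected value missing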
diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
new file mode 100644
index 00000000..033c290a
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1019'
+ test_type: LOV_Match
+ test_name_short: Value Match
+ test_name_long: All column values present in expected list
+ test_description: |-
+ Tests that all values in the column match the list-of-values identified in baseline data.
+ except_message: |-
+ Values not matching expected List-of-Values from baseline.
+ measure_uom: Non-matching records
+ measure_uom_description: null
+ selection_criteria: |-
+ functional_data_type IN ('Boolean', 'Code', 'Category') AND top_freq_values > '' AND distinct_value_ct BETWEEN 2 and 10 AND value_ct > 5
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value,threshold_value
+ default_parm_values: |-
+ '(' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, '|' , 2) > '' THEN ',''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, '|' , 2), '''' , '''''' ) ) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 4) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 4), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 6) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 6), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 8) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 8), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 10) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 10), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 12) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 12), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 14) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 14), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 16) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 16), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 18) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 18), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 20) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 20), '''' , '''''' )) || '''' ELSE '' END, 2, 999) || ')',0
+ default_parm_prompts: |-
+ List of Expected Values,Threshold Error Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+    List of values expected, in form ('Val1','Val2')
+ usage_notes: |-
+ This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1014'
+ test_type: LOV_Match
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2014'
+ test_type: LOV_Match
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3014'
+ test_type: LOV_Match
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4014'
+ test_type: LOV_Match
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5014'
+ test_type: LOV_Match
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6014'
+ test_type: LOV_Match
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1014'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1096'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1153'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ;
+ error_type: Test Results
+ - id: '1210'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1311'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
new file mode 100644
index 00000000..0a5874dc
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1020'
+ test_type: Min_Date
+ test_name_short: Minimum Date
+ test_name_long: All dates on or after set minimum
+ test_description: |-
+ Tests that the earliest date referenced in the column is no earlier than baseline data
+ except_message: |-
+ The earliest date value found is before the earliest value at baseline.
+ measure_uom: Dates prior to limit
+ measure_uom_description: null
+ selection_criteria: |-
+    general_type='D' AND min_date IS NOT NULL AND distinct_value_ct > 1
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value,threshold_value
+ default_parm_values: |-
+ min_date,0
+ default_parm_prompts: |-
+ Minimum Date at Baseline,Threshold Error Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of dates prior to minimum
+ usage_notes: |-
+    This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It's appropriate where new records are added with more recent dates, but old dates do not change.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1015'
+ test_type: Min_Date
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2015'
+ test_type: Min_Date
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3015'
+ test_type: Min_Date
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4015'
+ test_type: Min_Date
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5015'
+ test_type: Min_Date
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < CAST('{BASELINE_VALUE}' AS DATE) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6015'
+ test_type: Min_Date
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1015'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1097'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1154'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1211'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1312'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
new file mode 100644
index 00000000..effe37b6
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1021'
+ test_type: Min_Val
+ test_name_short: Minimum Value
+ test_name_long: All values at or above set minimum
+ test_description: |-
+ Tests that the minimum value present in the column is no lower than the minimum value in baseline data
+ except_message: |-
+ Minimum column value less than baseline.
+ measure_uom: Values under limit
+ measure_uom_description: null
+ selection_criteria: |-
+ general_type='N' AND functional_data_type ILIKE 'Measure%' AND min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value,threshold_value
+ default_parm_values: |-
+ min_value,0
+ default_parm_prompts: |-
+ Minimum Value at Baseline,Threshold Error Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of values under limit
+ usage_notes: |-
+    This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1016'
+ test_type: Min_Val
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2016'
+ test_type: Min_Val
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3016'
+ test_type: Min_Val
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4016'
+ test_type: Min_Val
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5016'
+ test_type: Min_Val
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6016'
+ test_type: Min_Val
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1016'
+ test_id: '1021'
+ test_type: Min_Val
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1098'
+ test_id: '1021'
+ test_type: Min_Val
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1155'
+ test_id: '1021'
+ test_type: Min_Val
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE};
+ error_type: Test Results
+ - id: '1212'
+ test_id: '1021'
+ test_type: Min_Val
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1313'
+ test_id: '1021'
+ test_type: Min_Val
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500;
+ error_type: Test Results
+ test_templates: []
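
The Min_Val entry above illustrates the general CAT pattern: the flavor-specific measure counts violating rows, the operator/condition compares that count to the threshold parameter, and dq_score_prevalence_formula turns the excess into a 0-1 share of records used for scoring. A minimal Python sketch of that arithmetic — illustrative only, not code from this repo, and the function name is made up:

    # Illustrative sketch: how a CAT test like Min_Val turns a measured
    # violation count into a pass/fail flag and a 0-1 prevalence score.

    def evaluate_min_val(violation_ct: int, threshold_ct: int, record_ct: int):
        # cat_test_conditions: measure > threshold  =>  test fails
        failed = violation_ct > threshold_ct
        # dq_score_prevalence_formula: (RESULT_MEASURE - THRESHOLD_VALUE) / RECORD_CT,
        # floored at zero here for clarity
        prevalence = max(0.0, (violation_ct - threshold_ct) / record_ct) if record_ct else 0.0
        return failed, prevalence

    # e.g. 12 rows below the baseline minimum, threshold 0, 1,000 rows scanned
    print(evaluate_min_val(12, 0, 1000))   # (True, 0.012)
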
diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
new file mode 100644
index 00000000..ba7b6fb7
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1022'
+ test_type: Missing_Pct
+ test_name_short: Percent Missing
+ test_name_long: Consistent ratio of missing values
+ test_description: |-
+ Tests for statistically-significant shift in percentage of missing values in column vs. baseline data
+ except_message: |-
+ Significant shift in percent of missing values vs. baseline.
+ measure_uom: Difference measure
+ measure_uom_description: |-
+ Cohen's H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)
+ selection_criteria: |-
+ record_ct <> value_ct
+ dq_score_prevalence_formula: |-
+ 2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_ct,baseline_value_ct,threshold_value
+ default_parm_values: |-
+ record_ct,value_ct,2::VARCHAR(10)
+ default_parm_prompts: |-
+ Baseline Record Count,Baseline Value Count,Standardized Difference Measure
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Completeness
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected maximum Cohen's H Difference
+ usage_notes: |-
+ This test uses Cohen's H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1017'
+ test_type: Missing_Pct
+ sql_flavor: redshift
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2017'
+ test_type: Missing_Pct
+ sql_flavor: snowflake
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3017'
+ test_type: Missing_Pct
+ sql_flavor: mssql
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT( CAST({BASELINE_VALUE_CT} AS FLOAT) / CAST({BASELINE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT(*), 0) AS FLOAT) )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4017'
+ test_type: Missing_Pct
+ sql_flavor: postgresql
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5017'
+ test_type: Missing_Pct
+ sql_flavor: trino
+ measure: |-
+ ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) )))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6017'
+ test_type: Missing_Pct
+ sql_flavor: databricks
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1017'
+ test_id: '1022'
+ test_type: Missing_Pct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
+ error_type: Test Results
+ - id: '1099'
+ test_id: '1022'
+ test_type: Missing_Pct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT 10;
+ error_type: Test Results
+ - id: '1156'
+ test_id: '1022'
+ test_type: Missing_Pct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = '';
+ error_type: Test Results
+ - id: '1213'
+ test_id: '1022'
+ test_type: Missing_Pct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
+ error_type: Test Results
+ - id: '1314'
+ test_id: '1022'
+ test_type: Missing_Pct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT 10;
+ error_type: Test Results
+ test_templates: []
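
For the Missing_Pct math: the measure above is Cohen's h — the difference between the baseline and current fill ratios on an arcsine-transformed scale — and the prevalence formula converts |h| into a two-sided tail probability via a normal CDF (the fn_normal_cdf helper it references, approximated here with math.erf). A rough Python restatement, illustrative only and not code from this repo:

    import math

    # Cohen's h: compare baseline fill ratio to current fill ratio on an
    # arcsine-transformed scale, mirroring the `measure` SQL above.
    def cohens_h(baseline_value_ct, baseline_ct, value_ct, record_ct):
        phi1 = 2.0 * math.asin(math.sqrt(baseline_value_ct / baseline_ct))
        phi2 = 2.0 * math.asin(math.sqrt(value_ct / record_ct))
        return abs(phi1 - phi2)

    # dq_score_prevalence_formula: 2 * (1 - NormalCDF(|h| / 2))
    def prevalence(h):
        normal_cdf = 0.5 * (1.0 + math.erf((h / 2.0) / math.sqrt(2.0)))
        return 2.0 * (1.0 - normal_cdf)

    h = cohens_h(980, 1000, 890, 1000)   # missing rate drifted from 2% to 11%
    print(round(h, 3), round(prevalence(h), 3))   # h is roughly 0.39, between "small" and "moderate"
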
diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
new file mode 100644
index 00000000..65f92f3b
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
@@ -0,0 +1,168 @@
+test_types:
+ id: '1023'
+ test_type: Monthly_Rec_Ct
+ test_name_short: Monthly Records
+ test_name_long: At least one date per month present within date range
+ test_description: |-
+ Tests for presence of at least one date per calendar month within min/max date range, per baseline data
+ except_message: |-
+ At least one date per month expected in min/max date range.
+ measure_uom: Missing months
+ measure_uom_description: |-
+ Calendar months without date values present
+ selection_criteria: |-
+ functional_data_type ILIKE 'Transactional Date%' AND date_days_present > 1 AND functional_table_type ILIKE '%cumulative%' AND date_months_present > 2 AND date_months_present - (datediff( 'MON' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_MONTHS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Threshold Count of Months without Dates
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Completeness
+ health_dimension: Volume
+ threshold_description: |-
+ Expected maximum count of calendar months without dates present
+ usage_notes: |-
+ Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: redshift
+ measure: |-
+ (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: snowflake
+ measure: |-
+ (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: mssql
+ measure: |-
+ (MAX(DATEDIFF(month, {COLUMN_NAME}, CAST('{RUN_DATE}'AS DATE))) - MIN(DATEDIFF(month, {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, CAST('{RUN_DATE}'AS DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: postgresql
+ measure: |-
+ (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: trino
+ measure: |-
+ (MAX(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) - MIN(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: databricks
+ measure: |-
+ (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1018'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
+ - id: '1100'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
+ - id: '1157'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH
+ Pass0 as (select 1 as C union all select 1), --2 rows
+ Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
+ Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
+ Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
+ Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
+ All_Nums as (select row_number() over(order by C) as Number from Pass4),
+ tally as (SELECT Number FROM All_Nums WHERE Number <= 45000),
+
+ date_range as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period,
+ CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period,
+ DATEDIFF(MONTH,
+ CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MIN("{COLUMN_NAME}")), 0) AS DATE),
+ CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} ),
+ check_periods as ( SELECT d.min_period, d.max_period, t.number,
+ DATEADD(MONTH, -(t.number - 1), d.max_period) AS check_period
+ FROM date_range d
+ INNER JOIN tally t
+ ON (d.period_ct >= t.number) ),
+ data_by_period as (SELECT CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY CAST(DATEADD(MONTH, DATEDIFF(MONTH, 0, "{COLUMN_NAME}"), 0) AS DATE) ),
+ data_by_prd_with_prior_next as (SELECT check_period,
+ RANK() OVER (ORDER BY check_period DESC) as ranked,
+ ISNULL(d.record_ct, 0) as record_ct,
+ ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct,
+ ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct
+ FROM check_periods c
+ LEFT JOIN data_by_period d
+ ON (c.check_period = d.data_period) )
+ SELECT check_period, record_ct,
+ CASE
+ WHEN record_ct = 0 THEN 'MISSING'
+ ELSE 'Present'
+ END as status
+ FROM data_by_prd_with_prior_next
+ WHERE record_ct = 0
+ OR last_record_ct = 0
+ OR next_record_ct = 0
+ ORDER BY check_period DESC;
+ error_type: Test Results
+ - id: '1214'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period;
+ error_type: Test Results
+ - id: '1315'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period;
+ error_type: Test Results
+ test_templates: []
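
A plain-language restatement of the Monthly_Rec_Ct measure above: it spans the minimum and maximum month offsets from the run date and subtracts the number of distinct months actually present, leaving the count of calendar months with no rows at all. Illustrative Python sketch, not code from this repo:

    from datetime import date

    # Measure: (max month offset - min month offset + 1) - count of distinct months present
    def months_between(d: date, run_date: date) -> int:
        return (run_date.year - d.year) * 12 + (run_date.month - d.month)

    def missing_month_ct(dates: list[date], run_date: date) -> int:
        offsets = {months_between(d, run_date) for d in dates}
        span = max(offsets) - min(offsets) + 1    # calendar months in the min/max range
        return span - len(offsets)                # months with no rows at all

    dates = [date(2025, 1, 15), date(2025, 2, 3), date(2025, 4, 28)]  # March absent
    print(missing_month_ct(dates, date(2025, 8, 6)))    # 1
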
diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
new file mode 100644
index 00000000..736de800
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
@@ -0,0 +1,132 @@
+test_types:
+ id: '1024'
+ test_type: Outlier_Pct_Above
+ test_name_short: Outliers Above
+ test_name_long: Consistent outlier counts over 2 SD above mean
+ test_description: |-
+ Tests that percent of outliers over 2 SD above Mean doesn't exceed threshold
+ except_message: |-
+ Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.
+ measure_uom: Pct records over limit
+ measure_uom_description: null
+ selection_criteria: |-
+ functional_data_type = 'Measurement' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE '%latitude%' AND column_name NOT ilike '%longitude%'
+ dq_score_prevalence_formula: |-
+ GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_avg,baseline_sd,threshold_value
+ default_parm_values: |-
+ avg_value,stdev_value,0.05
+ default_parm_prompts: |-
+ Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected maximum pct records over upper 2 SD limit
+ usage_notes: "This test counts the number of data points that may be considered\
+ \ as outliers, determined by whether their value exceeds 2 standard deviations\
+ \ above the mean at baseline. Assuming a normal distribution, a small percentage\
+ \ (defaulted to 5%) of outliers is expected. The actual number may vary for different\
+ \ distributions. The expected threshold reflects the maximum percentage of outliers\
+ \ you expect to see. This test uses the baseline mean rather than the mean for\
+ \ the latest dataset to capture systemic shift as well as individual outliers. "
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: mssql
+ measure: |-
+ CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: trino
+ measure: |-
+ CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1019'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1101'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1158'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1215'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1316'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
new file mode 100644
index 00000000..22559430
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
@@ -0,0 +1,132 @@
+test_types:
+ id: '1025'
+ test_type: Outlier_Pct_Below
+ test_name_short: Outliers Below
+ test_name_long: Consistent outlier counts under 2 SD below mean
+ test_description: |-
+ Tests that percent of outliers over 2 SD below Mean doesn't exceed threshold
+ except_message: |-
+ Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.
+ measure_uom: Pct records under limit
+ measure_uom_description: null
+ selection_criteria: |-
+ functional_data_type = 'Measurement' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE '%latitude%' AND column_name NOT ilike '%longitude%'
+ dq_score_prevalence_formula: |-
+ GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_avg,baseline_sd,threshold_value
+ default_parm_values: |-
+ avg_value,stdev_value,0.05
+ default_parm_prompts: |-
+ Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected maximum pct records over lower 2 SD limit
+ usage_notes: "This test counts the number of data points that may be considered\
+ \ as outliers, determined by whether their value exceeds 2 standard deviations\
+ \ below the mean at baseline. Assuming a normal distribution, a small percentage\
+ \ (defaulted to 5%) of outliers is expected. The actual number may vary for different\
+ \ distributions. The expected threshold reflects the maximum percentage of outliers\
+ \ you expect to see. This test uses the baseline mean rather than the mean for\
+ \ the latest dataset to capture systemic shift as well as individual outliers. "
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: mssql
+ measure: |-
+ CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS FLOAT) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: trino
+ measure: |-
+ CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS REAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1020'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1102'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1159'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1216'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
+ - id: '1317'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;
+ error_type: Test Results
+ test_templates: []
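
The two outlier test types above share the same arithmetic: the share of values falling beyond the baseline mean plus or minus two baseline standard deviations, failed when it exceeds the threshold percent (default 0.05). Illustrative Python sketch with made-up numbers, not code from this repo:

    # Share of values beyond baseline mean +/- 2 baseline standard deviations,
    # mirroring the Outlier_Pct_Above / Outlier_Pct_Below measures.
    def outlier_pcts(values, baseline_avg, baseline_sd):
        upper = baseline_avg + 2.0 * baseline_sd
        lower = baseline_avg - 2.0 * baseline_sd
        n = len(values)
        pct_above = sum(v > upper for v in values) / n
        pct_below = sum(v < lower for v in values) / n
        return pct_above, pct_below

    values = [48, 50, 52, 51, 49, 95, 47, 50, 3, 50]
    above, below = outlier_pcts(values, baseline_avg=50.0, baseline_sd=2.0)
    print(above, below)      # 0.1 and 0.1 -- both exceed the default 0.05 threshold
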
diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
new file mode 100644
index 00000000..835e5258
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1026'
+ test_type: Pattern_Match
+ test_name_short: Pattern Match
+ test_name_long: Column values match alpha-numeric pattern
+ test_description: |-
+ Tests that all values in the column match the same alpha-numeric pattern identified in baseline data
+ except_message: |-
+ Alpha values do not match consistent pattern in baseline.
+ measure_uom: Pattern Mismatches
+ measure_uom_description: null
+ selection_criteria: |-
+ (functional_data_type IN ('Attribute', 'DateTime Stamp', 'Phone') OR functional_data_type ILIKE 'ID%' OR functional_data_type ILIKE 'Period%') AND fn_charcount(top_patterns, E' \| ' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, '|' , 2), 'N' , '' ) > '' AND distinct_value_ct > 10
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value,threshold_value
+ default_parm_values: |-
+ TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, ' | ', 2), '([*+\-%_])', '[\1]', 'g'), 'A', '[A-Z]'), 'N', '[0-9]'), 'a', '[a-z]')),0
+ default_parm_prompts: |-
+ Pattern at Baseline,Threshold Error Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of pattern mismatches
+ usage_notes: |-
+ This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1021'
+ test_type: Pattern_Match
+ sql_flavor: redshift
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM((NULLIF({COLUMN_NAME}, '') SIMILAR TO '{BASELINE_VALUE}')::BIGINT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2021'
+ test_type: Pattern_Match
+ sql_flavor: snowflake
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''), '{BASELINE_VALUE}')::BIGINT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3021'
+ test_type: Pattern_Match
+ sql_flavor: mssql
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - CAST(SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') LIKE '{BASELINE_VALUE}' THEN 1 ELSE 0 END) AS BIGINT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4021'
+ test_type: Pattern_Match
+ sql_flavor: postgresql
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') ~ '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5021'
+ test_type: Pattern_Match
+ sql_flavor: trino
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF({COLUMN_NAME}, '') , '{BASELINE_VALUE}') = TRUE THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6021'
+ test_type: Pattern_Match
+ sql_flavor: databricks
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::STRING, ''), '{BASELINE_VALUE}')::BIGINT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1021'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1103'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1160'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1217'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1318'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}`;
+ error_type: Test Results
+ test_templates: []
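
Context for the Pattern_Match defaults: default_parm_values above derives the baseline regex from the profiled top_patterns string by escaping wildcard-like characters and mapping the A/N/a placeholders to character classes; the measure then counts non-empty values that fail the pattern. A rough Python equivalent of that transformation — illustrative only, not code from this repo:

    import re

    # Turn a profiled pattern like 'AANNNN' into a regex, as the SQL expression does.
    def pattern_to_regex(pattern: str) -> str:
        out = re.sub(r'([*+\-%_])', r'[\1]', pattern)   # escape wildcard-like characters
        out = out.replace('A', '[A-Z]').replace('N', '[0-9]').replace('a', '[a-z]')
        return out

    # Count non-empty values that do not match the baseline pattern.
    def mismatch_ct(values, regex):
        compiled = re.compile(regex)
        return sum(1 for v in values if v and not compiled.fullmatch(v))

    regex = pattern_to_regex('AANNNN')          # e.g. profiled from values like 'CA1234'
    print(regex)                                # [A-Z][A-Z][0-9][0-9][0-9][0-9]
    print(mismatch_ct(['CA1234', 'ny0042', 'TX99'], regex))   # 2
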
diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml
new file mode 100644
index 00000000..ebafb9a1
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1028'
+ test_type: Recency
+ test_name_short: Recency
+ test_name_long: Latest date within expected range of test date
+ test_description: |-
+ Tests that the latest date in column is within a set number of days of the test date
+ except_message: |-
+ Most recent date value not within expected days of test date.
+ measure_uom: Days before test
+ measure_uom_description: |-
+ Number of days that most recent date precedes the date of test
+ selection_criteria: |-
+ general_type= 'D' AND max_date <= run_date AND NOT column_name IN ( 'filedate' , 'file_date' ) AND NOT functional_data_type IN ('Future Date', 'Schedule Date') AND DATEDIFF( 'DAY' , max_date, run_date) <= 62
+ dq_score_prevalence_formula: |-
+ (ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF('DAY', '{MIN_DATE}', '{MAX_DATE}'))::FLOAT)/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ CASE WHEN DATEDIFF( 'DAY' , max_date, run_date) <= 3 THEN DATEDIFF('DAY', max_date, run_date) + 3 WHEN DATEDIFF('DAY', max_date, run_date) <= 7 then DATEDIFF('DAY', max_date, run_date) + 7 WHEN DATEDIFF( 'DAY' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( 'DAY' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( 'DAY' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( 'DAY' , max_date, run_date)::FLOAT / 30.0) * 30 END
+ default_parm_prompts: |-
+ Threshold Maximum Days before Test
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Timeliness
+ health_dimension: Recency
+ threshold_description: |-
+ Expected maximum count of days preceding test date
+ usage_notes: |-
+ This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1022'
+ test_type: Recency
+ sql_flavor: redshift
+ measure: |-
+ DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2022'
+ test_type: Recency
+ sql_flavor: snowflake
+ measure: |-
+ DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3022'
+ test_type: Recency
+ sql_flavor: mssql
+ measure: |-
+ DATEDIFF(day, MAX({COLUMN_NAME}), CAST('{RUN_DATE}'AS DATE))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4022'
+ test_type: Recency
+ sql_flavor: postgresql
+ measure: |-
+ <%DATEDIFF_DAY;MAX({COLUMN_NAME});'{RUN_DATE}'::DATE%>
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5022'
+ test_type: Recency
+ sql_flavor: trino
+ measure: |-
+ DATE_DIFF('day', MAX({COLUMN_NAME}), CAST('{RUN_DATE}' AS DATE))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6022'
+ test_type: Recency
+ sql_flavor: databricks
+ measure: |-
+ <%DATEDIFF_DAY;MAX({COLUMN_NAME});'{RUN_DATE}'::DATE%>
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1022'
+ test_id: '1028'
+ test_type: Recency
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1104'
+ test_id: '1028'
+ test_type: Recency
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1161'
+ test_id: '1028'
+ test_type: Recency
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1218'
+ test_id: '1028'
+ test_type: Recency
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1319'
+ test_id: '1028'
+ test_type: Recency
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE};
+ error_type: Test Results
+ test_templates: []
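
The Recency default_parm_values CASE expression above rounds the currently observed staleness up to the next plausible delivery window: a few days, a week, whole weeks up to a month, whole months beyond that. Illustrative Python restatement, not code from this repo:

    import math

    # Round the observed staleness (days behind the run date) up to a refresh window,
    # matching the CASE expression in default_parm_values.
    def default_recency_threshold(days_behind: int) -> int:
        if days_behind <= 3:
            return days_behind + 3
        if days_behind <= 7:
            return days_behind + 7
        if days_behind <= 31:
            return math.ceil(days_behind / 7.0) * 7      # whole weeks
        return math.ceil(days_behind / 30.0) * 30        # whole months

    for d in (1, 5, 20, 75):
        print(d, default_recency_threshold(d))   # 1->4, 5->12, 20->21, 75->90
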
diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml
new file mode 100644
index 00000000..27200ce5
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml
@@ -0,0 +1,126 @@
+test_types:
+ id: '1030'
+ test_type: Required
+ test_name_short: Required Entry
+ test_name_long: Required non-null value present
+ test_description: |-
+ Tests that a non-null value is present in each record for the column, consistent with baseline data
+ except_message: |-
+ Every record for this column is expected to be filled, but some are missing.
+ measure_uom: Missing values
+ measure_uom_description: null
+ selection_criteria: |-
+ record_ct = value_ct AND record_ct > 10
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Threshold Missing Value Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Completeness
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of missing values
+ usage_notes: null
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1023'
+ test_type: Required
+ sql_flavor: redshift
+ measure: |-
+ COUNT(*) - COUNT( {COLUMN_NAME} )
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2023'
+ test_type: Required
+ sql_flavor: snowflake
+ measure: |-
+ COUNT(*) - COUNT( {COLUMN_NAME} )
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3023'
+ test_type: Required
+ sql_flavor: mssql
+ measure: |-
+ COUNT(*) - COUNT( {COLUMN_NAME} )
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4023'
+ test_type: Required
+ sql_flavor: postgresql
+ measure: |-
+ COUNT(*) - COUNT({COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5023'
+ test_type: Required
+ sql_flavor: trino
+ measure: |-
+ COUNT(*) - COUNT({COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6023'
+ test_type: Required
+ sql_flavor: databricks
+ measure: |-
+ COUNT(*) - COUNT( {COLUMN_NAME} )
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1023'
+ test_id: '1030'
+ test_type: Required
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
+ error_type: Test Results
+ - id: '1105'
+ test_id: '1030'
+ test_type: Required
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
+ error_type: Test Results
+ - id: '1162'
+ test_id: '1030'
+ test_type: Required
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL;
+ error_type: Test Results
+ - id: '1219'
+ test_id: '1030'
+ test_type: Required
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
+ error_type: Test Results
+ - id: '1320'
+ test_id: '1030'
+ test_type: Required
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
new file mode 100644
index 00000000..e2470d79
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
@@ -0,0 +1,126 @@
+test_types:
+ id: '1031'
+ test_type: Row_Ct
+ test_name_short: Row Count
+ test_name_long: Number of rows is at or above threshold
+ test_description: |-
+ Tests that the count of records has not decreased from the baseline count.
+ except_message: |-
+ Row count less than baseline count.
+ measure_uom: Row count
+ measure_uom_description: null
+ selection_criteria: |-
+ TEMPLATE
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({THRESHOLD_VALUE}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: null
+ default_parm_prompts: |-
+ Threshold Minimum Record Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: table
+ dq_dimension: Completeness
+ health_dimension: Volume
+ threshold_description: |-
+ Expected minimum row count
+ usage_notes: |-
+ Because this tests the row count against a constant minimum threshold, it's appropriate for any dataset, as long as the number of rows doesn't radically change from refresh to refresh. But it's not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1024'
+ test_type: Row_Ct
+ sql_flavor: redshift
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2024'
+ test_type: Row_Ct
+ sql_flavor: snowflake
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3024'
+ test_type: Row_Ct
+ sql_flavor: mssql
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4024'
+ test_type: Row_Ct
+ sql_flavor: postgresql
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5024'
+ test_type: Row_Ct
+ sql_flavor: trino
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6024'
+ test_type: Row_Ct
+ sql_flavor: databricks
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1024'
+ test_id: '1031'
+ test_type: Row_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1106'
+ test_id: '1031'
+ test_type: Row_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1163'
+ test_id: '1031'
+ test_type: Row_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(CAST(100 * (current_count - {THRESHOLD_VALUE}) AS NUMERIC) / CAST({THRESHOLD_VALUE} AS NUMERIC) ,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1220'
+ test_id: '1031'
+ test_type: Row_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ error_type: Test Results
+ - id: '1321'
+ test_id: '1031'
+ test_type: Row_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
new file mode 100644
index 00000000..c3c687bd
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1032'
+ test_type: Row_Ct_Pct
+ test_name_short: Row Range
+ test_name_long: Number of rows within percent range of threshold
+ test_description: |-
+ Tests that the count of records is within a percentage above or below the baseline count.
+ except_message: |-
+ Row Count is outside of threshold percent of baseline count.
+ measure_uom: Percent of baseline
+ measure_uom_description: |-
+ Row count percent above or below baseline
+ selection_criteria: |-
+ TEMPLATE
+ dq_score_prevalence_formula: |-
+ (100.0 - {RESULT_MEASURE}::FLOAT)/100.0
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_ct,threshold_value
+ default_parm_values: null
+ default_parm_prompts: |-
+ Baseline Record Count,Threshold Pct Above or Below Baseline
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: table
+ dq_dimension: Completeness
+ health_dimension: Volume
+ threshold_description: |-
+ Expected percent window below or above baseline
+ usage_notes: |-
+ This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1025'
+ test_type: Row_Ct_Pct
+ sql_flavor: redshift
+ measure: |-
+ ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2025'
+ test_type: Row_Ct_Pct
+ sql_flavor: snowflake
+ measure: |-
+ ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3025'
+ test_type: Row_Ct_Pct
+ sql_flavor: mssql
+ measure: |-
+ ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT} ) AS FLOAT)/ CAST({BASELINE_CT} AS FLOAT), 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4025'
+ test_type: Row_Ct_Pct
+ sql_flavor: postgresql
+ measure: |-
+ ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::DECIMAL(18,4) / {BASELINE_CT}::DECIMAL(18,4), 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5025'
+ test_type: Row_Ct_Pct
+ sql_flavor: trino
+ measure: |-
+ ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT}) AS DECIMAL(18,4)) /CAST( {BASELINE_CT} AS DECIMAL(18,4) ), 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6025'
+ test_type: Row_Ct_Pct
+ sql_flavor: databricks
+ measure: |-
+ ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1025'
+ test_id: '1032'
+ test_type: Row_Ct_Pct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
+ error_type: Test Results
+ - id: '1107'
+ test_id: '1032'
+ test_type: Row_Ct_Pct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte;
+ error_type: Test Results
+ - id: '1164'
+ test_id: '1032'
+ test_type: Row_Ct_Pct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(CAST(100 * (current_count - {BASELINE_CT}) AS NUMERIC) / CAST({BASELINE_CT} AS NUMERIC) ,2)) AS row_count_pct_difference FROM cte;
+ error_type: Test Results
+ - id: '1221'
+ test_id: '1032'
+ test_type: Row_Ct_Pct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
+ error_type: Test Results
+ - id: '1322'
+ test_id: '1032'
+ test_type: Row_Ct_Pct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
+ error_type: Test Results
+ test_templates: []
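
Taken together, the two row-count test types above differ only in their reference point: Row Count compares against a fixed minimum, while Row Range measures the absolute percent drift from a baseline count (the same percentage the lookup queries report). Illustrative Python sketch with made-up numbers, not code from this repo:

    # Row_Ct fails when the count drops below a fixed minimum; Row_Ct_Pct fails
    # when the absolute percent drift from the baseline exceeds the threshold.
    def row_ct_failed(current_ct: int, min_threshold: int) -> bool:
        return current_ct < min_threshold

    def row_ct_pct_measure(current_ct: int, baseline_ct: int) -> float:
        # Matches the measure SQL: ABS(ROUND(100 * (count - baseline) / baseline, 2))
        return abs(round(100.0 * (current_ct - baseline_ct) / baseline_ct, 2))

    print(row_ct_failed(9_500, 10_000))            # True  -- below the fixed minimum
    print(row_ct_pct_measure(9_500, 10_000))       # 5.0   -- fails a 3% threshold, passes 10%
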
diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
new file mode 100644
index 00000000..c1775d5a
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1033'
+ test_type: Street_Addr_Pattern
+ test_name_short: Street Address
+ test_name_long: Enough street address entries match defined pattern
+ test_description: |-
+ Tests for percent of records matching standard street address pattern.
+ except_message: |-
+ Percent of values matching standard street address format is under expected threshold.
+ measure_uom: Percent matches
+ measure_uom_description: |-
+ Percent of records that match street address pattern
+ selection_criteria: |-
+ (std_pattern_match='STREET_ADDR') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)
+ dq_score_prevalence_formula: |-
+ ({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 75
+ default_parm_prompts: |-
+ Threshold Pct that Match Address Pattern
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected percent of records that match standard street address pattern
+ usage_notes: |-
+ The street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: redshift
+ measure: |-
+ 100.0*SUM(({COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: snowflake
+ measure: |-
+ 100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: mssql
+ measure: |-
+ CAST(100.0*SUM(CASE WHEN UPPER({COLUMN_NAME}) LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', {COLUMN_NAME}) BETWEEN 2 AND 6 THEN 1 ELSE 0 END) as FLOAT) /CAST(COUNT({COLUMN_NAME}) AS FLOAT)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: postgresql
+ measure: |-
+ 100.0*SUM(CASE WHEN {COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: trino
+ measure: |-
+ CAST(100.0*SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') = TRUE THEN 1 ELSE 0 END) AS REAL )/ CAST(COUNT({COLUMN_NAME}) AS REAL)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: databricks
+ measure: |-
+ 100.0*SUM((regexp_like({COLUMN_NAME}::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1026'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1108'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1165'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Test Results
+ - id: '1222'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1323'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
+ error_type: Test Results
+ test_templates: []
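To show how the CAT measure above is evaluated, here is a hedged sketch with an assumed column street_address on sales.customers and the default 75 percent threshold; the flavor-specific regex is unchanged:

-- Hypothetical rendering of the Street_Addr_Pattern measure (PostgreSQL flavor; table and column names are assumptions)
SELECT 100.0 * SUM(CASE WHEN street_address ~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'
                        THEN 1 ELSE 0 END)::FLOAT
       / NULLIF(COUNT(street_address), 0)::FLOAT AS pct_matching  -- the test fails when pct_matching < 75
FROM sales.customers;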
diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
new file mode 100644
index 00000000..08e74413
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
@@ -0,0 +1,61 @@
+test_types:
+ id: '1511'
+ test_type: Table_Freshness
+ test_name_short: Table Freshness
+ test_name_long: Stale Table Not Updated
+ test_description: |-
+ Confirms whether table has been updated based on data fingerprint
+ except_message: |-
+ Table has not been updated.
+ measure_uom: Was Change Detected
+ measure_uom_description: null
+ selection_criteria: |-
+ TEMPLATE
+ dq_score_prevalence_formula: |-
+ (({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.0'
+ column_name_prompt: |-
+ null
+ column_name_help: |-
+ null
+ default_parm_columns: history_calculation,history_lookback,subset_condition,custom_query
+ default_parm_values: null
+ default_parm_prompts: |-
+ History Aggregate,History Lookback,Record Subset Condition,Fingerprint Expression
+ default_parm_help: |-
+ Aggregate calculation to be performed on the N lookback results|Last N tests to use for history aggregate calculation|Condition defining a subset of records in main table|String expression combining key column measures into a distinct representation of table state
+ default_severity: Log
+ run_type: QUERY
+ test_scope: table
+ dq_dimension: Recency
+ health_dimension: Recency
+ threshold_description: |-
+ Most recent prior table fingerprint
+ usage_notes: |-
+    This test compares the current table fingerprint, a calculated signature of column contents, to the most recent prior fingerprint to confirm whether the table has been updated. The table fingerprint is derived from a set of values and aggregates from columns most likely to change. This test allows you to track the schedule and frequency of updates and refreshes to the table.
+ active: Y
+ result_visualization: binary_chart
+ result_visualization_params: '{"legend":{"labels":{"0":"Stale","1":"Updated"}}}'
+ cat_test_conditions: []
+ target_data_lookups: []
+ test_templates:
+ - id: '2012'
+ test_type: Table_Freshness
+ sql_flavor: redshift
+ template_name: ex_table_changed_generic.sql
+ - id: '2112'
+ test_type: Table_Freshness
+ sql_flavor: snowflake
+ template_name: ex_table_changed_generic.sql
+ - id: '2212'
+ test_type: Table_Freshness
+ sql_flavor: mssql
+ template_name: ex_table_changed_mssql.sql
+ - id: '2312'
+ test_type: Table_Freshness
+ sql_flavor: postgresql
+ template_name: ex_table_changed_generic.sql
+ - id: '2412'
+ test_type: Table_Freshness
+ sql_flavor: databricks
+ template_name: ex_table_changed_generic.sql
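The fingerprint expression this test expects is user-supplied; as an illustration only (table and column names are assumptions), it might concatenate a few volatile aggregates into a single string:

-- Hypothetical fingerprint expression for Table_Freshness (assumed columns: order_id, amount, updated_at)
SELECT CAST(COUNT(*) AS VARCHAR)
       || '|' || CAST(MAX(order_id)   AS VARCHAR)
       || '|' || CAST(SUM(amount)     AS VARCHAR)
       || '|' || CAST(MAX(updated_at) AS VARCHAR) AS table_fingerprint
FROM sales.orders;
-- The test compares this value to the fingerprint from the prior run; an unchanged value means a stale table.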
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
new file mode 100644
index 00000000..2ae08ca0
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
@@ -0,0 +1,137 @@
+test_types:
+ id: '1508'
+ test_type: Timeframe_Combo_Gain
+ test_name_short: Timeframe No Drops
+ test_name_long: Latest timeframe has at least all value combinations from prior
+ period
+ test_description: |-
+    Tests that column values in the most recent time window include at least the same values as the prior time window
+ except_message: |-
+ Column values in most recent time-window don't include all values in prior window.
+ measure_uom: Mismatched values
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Categorical Column List
+ column_name_help: |-
+ Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.
+ default_parm_columns: window_date_column,window_days,subset_condition
+ default_parm_values: null
+ default_parm_prompts: |-
+ Date Column for Time Windows,Time Window in Days,Record Subset Condition
+ default_parm_help: |-
+ The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of missing value combinations
+ usage_notes: |-
+ This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1261'
+ test_id: '1508'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ EXCEPT
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ error_type: Test Results
+ - id: '1262'
+ test_id: '1508'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ EXCEPT
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ error_type: Test Results
+ - id: '1263'
+ test_id: '1508'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ EXCEPT
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ error_type: Test Results
+ - id: '1264'
+ test_id: '1508'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ EXCEPT
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ error_type: Test Results
+ test_templates:
+ - id: '2007'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: redshift
+ template_name: ex_window_match_no_drops_generic.sql
+ - id: '2107'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: snowflake
+ template_name: ex_window_match_no_drops_generic.sql
+ - id: '2207'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: mssql
+ template_name: ex_window_match_no_drops_generic.sql
+ - id: '2307'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: postgresql
+ template_name: ex_window_match_no_drops_postgresql.sql
+ - id: '2407'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: databricks
+ template_name: ex_window_match_no_drops_databricks.sql
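To make the window arithmetic concrete, here is a hedged rendering of the generic lookup with assumed placeholders (DATE column order_date, 7-day windows, subset condition 1=1); the EXCEPT returns combinations present in the prior window but missing from the latest one:

-- Hypothetical rendering of the Timeframe_Combo_Gain lookup; table, columns, and window length are assumptions
SELECT product_code, region                                               -- prior 7-day window
FROM sales.orders
WHERE 1=1
  AND order_date >= (SELECT MAX(order_date) FROM sales.orders) - 14
  AND order_date <  (SELECT MAX(order_date) FROM sales.orders) - 7
GROUP BY product_code, region
EXCEPT
SELECT product_code, region                                               -- latest 7-day window
FROM sales.orders
WHERE 1=1
  AND order_date >= (SELECT MAX(order_date) FROM sales.orders) - 7
GROUP BY product_code, region;
-- Any rows returned are combinations that disappeared in the latest window, which fails the test.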
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
new file mode 100644
index 00000000..ae338117
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
@@ -0,0 +1,219 @@
+test_types:
+ id: '1509'
+ test_type: Timeframe_Combo_Match
+ test_name_short: Timeframe Match
+ test_name_long: Column value combinations from latest timeframe same as prior period
+ test_description: |-
+    Tests that the same column values are present in the most recent time window as in the prior time window
+ except_message: |-
+ Column values don't match in most recent time-windows.
+ measure_uom: Mismatched values
+ measure_uom_description: null
+ selection_criteria: null
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: |-
+ Categorical Column List
+ column_name_help: |-
+ Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.
+ default_parm_columns: window_date_column,window_days,subset_condition
+ default_parm_values: null
+ default_parm_prompts: |-
+ Date Column for Time Windows,Time Window in Days,Record Subset Condition
+ default_parm_help: null
+ default_severity: Fail
+ run_type: QUERY
+ test_scope: referential
+ dq_dimension: Consistency
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected count of non-matching value combinations
+ usage_notes: |-
+ This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions: []
+ target_data_lookups:
+ - id: '1265'
+ test_id: '1509'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |2-
+ (
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ UNION ALL
+ (
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ error_type: Test Results
+ - id: '1266'
+ test_id: '1509'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |2-
+ (
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ UNION ALL
+ (
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ error_type: Test Results
+ - id: '1267'
+ test_id: '1509'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |2-
+ (
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ EXCEPT
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ )
+ UNION ALL
+ (
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ EXCEPT
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ )
+ error_type: Test Results
+ - id: '1268'
+ test_id: '1509'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |2-
+ (
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ UNION ALL
+ (
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ error_type: Test Results
+ - id: '1337'
+ test_id: '1509'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |2-
+ (
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ UNION ALL
+ (
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ error_type: Test Results
+ test_templates:
+ - id: '2008'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: redshift
+ template_name: ex_window_match_same_generic.sql
+ - id: '2108'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: snowflake
+ template_name: ex_window_match_same_generic.sql
+ - id: '2208'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: mssql
+ template_name: ex_window_match_same_generic.sql
+ - id: '2308'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: postgresql
+ template_name: ex_window_match_same_postgresql.sql
+ - id: '2408'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: databricks
+ template_name: ex_window_match_same_databricks.sql
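The Match variant checks both directions. A runnable toy sketch of the same logic (window contents are invented) shows the labeled symmetric difference the lookup produces:

-- Toy illustration of the Timeframe_Combo_Match lookup logic; the two windows hold made-up values
WITH prior_window  AS (SELECT code FROM (VALUES ('A'), ('B'), ('C')) AS t(code)),
     latest_window AS (SELECT code FROM (VALUES ('B'), ('C'), ('D')) AS t(code))
(SELECT 'Prior Timeframe'  AS missing_from, code FROM latest_window   -- new values, absent from the prior window
 EXCEPT
 SELECT 'Prior Timeframe'  AS missing_from, code FROM prior_window)
UNION ALL
(SELECT 'Latest Timeframe' AS missing_from, code FROM prior_window    -- dropped values, absent from the latest window
 EXCEPT
 SELECT 'Latest Timeframe' AS missing_from, code FROM latest_window);
-- Returns ('Prior Timeframe', 'D') and ('Latest Timeframe', 'A'): 'D' is new, 'A' was dropped.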
diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml
new file mode 100644
index 00000000..d6d9dd8e
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1036'
+ test_type: US_State
+ test_name_short: US State
+ test_name_long: Column value is two-letter US state code
+ test_description: |-
+ Tests that the recorded column value is a valid US state.
+ except_message: |-
+ Column Value is not a valid US state.
+ measure_uom: Not US States
+ measure_uom_description: |-
+    Values that do not match 2-character US state abbreviations.
+ selection_criteria: |-
+ general_type= 'A' AND column_name ILIKE '%state%' AND distinct_value_ct < 70 AND max_length = 2
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Threshold Count not Matching State Abbreviations
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of values that are not US state abbreviations
+ usage_notes: |-
+ This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1027'
+ test_type: US_State
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2027'
+ test_type: US_State
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3027'
+ test_type: US_State
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4027'
+ test_type: US_State
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5027'
+ test_type: US_State
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6027'
+ test_type: US_State
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1027'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1109'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1166'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
+ - id: '1223'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
+ - id: '1324'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml
new file mode 100644
index 00000000..013e2d88
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1034'
+ test_type: Unique
+ test_name_short: Unique Values
+ test_name_long: Each column value is unique
+ test_description: |-
+ Tests that no values for the column are repeated in multiple records.
+ except_message: |-
+ Column values should be unique per row.
+ measure_uom: Duplicate values
+ measure_uom_description: |-
+ Count of non-unique values
+ selection_criteria: |-
+ record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Threshold Duplicate Value Count
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Uniqueness
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of duplicate values
+ usage_notes: |-
+    This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It's also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1028'
+ test_type: Unique
+ sql_flavor: redshift
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2028'
+ test_type: Unique
+ sql_flavor: snowflake
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3028'
+ test_type: Unique
+ sql_flavor: mssql
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4028'
+ test_type: Unique
+ sql_flavor: postgresql
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5028'
+ test_type: Unique
+ sql_flavor: trino
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6028'
+ test_type: Unique
+ sql_flavor: databricks
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1028'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1110'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1167'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC;
+ error_type: Test Results
+ - id: '1224'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1325'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
new file mode 100644
index 00000000..0e74a8dd
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1035'
+ test_type: Unique_Pct
+ test_name_short: Percent Unique
+ test_name_long: Consistent ratio of unique values
+ test_description: |-
+ Tests for statistically-significant shift in percentage of unique values vs. baseline data.
+ except_message: |-
+ Significant shift in percent of unique values vs. baseline.
+ measure_uom: Difference measure
+ measure_uom_description: |-
+ Cohen's H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)
+ selection_criteria: |-
+ distinct_value_ct > 10 AND functional_data_type NOT ILIKE 'Measurement%'
+ dq_score_prevalence_formula: |-
+ 2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_value_ct,baseline_unique_ct,threshold_value
+ default_parm_values: |-
+ value_ct,distinct_value_ct,0.5
+ default_parm_prompts: |-
+ Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Uniqueness
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected maximum Cohen's H Difference
+ usage_notes: |-
+ You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen's H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1029'
+ test_type: Unique_Pct
+ sql_flavor: redshift
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2029'
+ test_type: Unique_Pct
+ sql_flavor: snowflake
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3029'
+ test_type: Unique_Pct
+ sql_flavor: mssql
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS FLOAT) / CAST({BASELINE_VALUE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS FLOAT) )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4029'
+ test_type: Unique_Pct
+ sql_flavor: postgresql
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5029'
+ test_type: Unique_Pct
+ sql_flavor: trino
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) )))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6029'
+ test_type: Unique_Pct
+ sql_flavor: databricks
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1029'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1111'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1168'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Test Results
+ - id: '1225'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1326'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
+ error_type: Test Results
+ test_templates: []
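A worked example of the Cohen's H measure above, with invented counts: if 800 of 1,000 values were distinct at baseline and only 600 of 1,000 are distinct now, the standardized difference is about 0.44, just under the default 0.5 threshold.

-- Worked Cohen's H calculation with made-up counts (baseline 800/1000 distinct, current 600/1000 distinct)
SELECT ABS( 2.0 * ASIN(SQRT(800.0 / 1000.0))    -- 2 * arcsin(sqrt(p_baseline))
          - 2.0 * ASIN(SQRT(600.0 / 1000.0)) )  -- 2 * arcsin(sqrt(p_current))
       AS cohens_h;                             -- ~0.44, below the 0.5 default threshold, so the test passes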
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
new file mode 100644
index 00000000..e06ff91a
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
@@ -0,0 +1,128 @@
+test_types:
+ id: '1043'
+ test_type: Valid_Characters
+ test_name_short: Valid Characters
+ test_name_long: Column contains no invalid characters
+ test_description: |-
+ Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.
+ except_message: |-
+ Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.
+ measure_uom: Invalid records
+ measure_uom_description: |-
+ Expected count of values with invalid characters
+ selection_criteria: |-
+ general_type = 'A'
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: null
+ default_parm_help: |-
+ The acceptable number of records with invalid character values present.
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Threshold Invalid Value Count
+ usage_notes: |-
+    This test looks for the presence of non-printing and invisible characters, such as non-breaking and zero-width spaces, that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.
+ active: N
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1036'
+ test_type: Valid_Characters
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4036'
+ test_type: Valid_Characters
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2036'
+ test_type: Valid_Characters
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5036'
+ test_type: Valid_Characters
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3036'
+ test_type: Valid_Characters
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6036'
+ test_type: Valid_Characters
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1233'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1234'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ - id: '1235'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1236'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1330'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ test_templates: []
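For readability, here is the same TRANSLATE trick with the character codes spelled out, against an assumed column notes; each suspect character is mapped to 'X', so any difference after TRANSLATE means at least one was present:

-- Hypothetical rendering of the Valid_Characters measure (assumed table/column: sales.customers.notes)
SELECT SUM(CASE WHEN TRANSLATE(notes,
                               CHR(160)   ||  -- no-break space
                               CHR(8203)  ||  -- zero-width space
                               CHR(65279) ||  -- byte-order mark / zero-width no-break space
                               CHR(8239)  ||  -- narrow no-break space
                               CHR(8201)  ||  -- thin space
                               CHR(12288) ||  -- ideographic (full-width) space
                               CHR(8204),     -- zero-width non-joiner
                               'XXXXXXX') <> notes
                  OR notes LIKE ' %'           -- leading space
                  OR notes LIKE '''%'''        -- wrapped in single quotes
                  OR notes LIKE '"%"'          -- wrapped in double quotes
                THEN 1 ELSE 0 END) AS invalid_ct
FROM sales.customers;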
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
new file mode 100644
index 00000000..0ec9e5ad
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
@@ -0,0 +1,87 @@
+test_types:
+ id: '1042'
+ test_type: Valid_Month
+ test_name_short: Valid Month
+ test_name_long: Valid calendar month in expected format
+ test_description: |-
+ Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.
+ except_message: |-
+ Column values are not a valid representation of a calendar month consistent with the format at baseline.
+ measure_uom: Invalid months
+ measure_uom_description: null
+ selection_criteria: |-
+ functional_data_type = 'Period Month'
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value,baseline_value
+ default_parm_values: |-
+ 0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN '''January'',''February'',''March'',''April'',''May'',''June'',''July'',''August'',''September'',''October'',''November'',''December''' WHEN max_length > 3 AND upper(min_text) = min_text THEN '''JANUARY'',''FEBRUARY'',''MARCH'',''APRIL'',''MAY'',''JUNE'',''JULY'',''AUGUST'',''SEPTEMBER'',''OCTOBER'',''NOVEMBER'',''DECEMBER''' WHEN max_length > 3 AND lower(min_text) = min_text THEN '''january'',''february'',''march'',''april'',''may'',''june'',''july'',''august'',''september'',''october'',''november'',''december''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN '''Jan'',''Feb'',''Mar'',''Apr'',''May'',''Jun'',''Jul'',''Aug'',''Sep'',''Oct'',''Nov'',''Dec''' WHEN max_length = 3 AND upper(min_text) = min_text THEN '''JAN'',''FEB'',''MAR'',''APR'',''MAY'',''JUN'',''JUL'',''AUG'',''SEP'',''OCT'',''NOV'',''DEC''' WHEN max_length = 3 AND lower(min_text) = min_text THEN '''jan'',''feb'',''mar'',''apr'',''may'',''jun'',''jul'',''aug'',''sep'',''oct'',''nov'',''dec''' WHEN max_length = 2 AND min_text = '01' THEN '''01'',''02'',''03'',''04'',''05'',''06'',''07'',''08'',''09'',''10'',''11'',''12''' WHEN max_length = 2 AND min_text = '1' THEN '''1'',''2'',''3'',''4'',''5'',''6'',''7'',''8'',''9'',''10'',''11'',''12''' WHEN min_value = 1 THEN '1,2,3,4,5,6,7,8,9,10,11,12' ELSE 'NULL' END
+ default_parm_prompts: |-
+ Threshold Invalid Months,Valid Month List
+ default_parm_help: |-
+ The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Expected count of invalid months
+ usage_notes: null
+ active: N
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1033'
+ test_type: Valid_Month
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2033'
+ test_type: Valid_Month
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3033'
+ test_type: Valid_Month
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4033'
+ test_type: Valid_Month
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5033'
+ test_type: Valid_Month
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6033'
+ test_type: Valid_Month
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups: []
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
new file mode 100644
index 00000000..f8eaa0e5
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
@@ -0,0 +1,126 @@
+test_types:
+ id: '1044'
+ test_type: Valid_US_Zip
+ test_name_short: Valid US Zip
+ test_name_long: Valid USA Postal Codes
+ test_description: |-
+ Tests that postal codes match the 5 or 9 digit standard US format
+ except_message: |-
+ Invalid US Zip Code formats found.
+ measure_uom: Invalid Zip Codes
+ measure_uom_description: |-
+ Expected count of values with invalid Zip Codes
+ selection_criteria: |-
+ functional_data_type = 'Zip'
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: null
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Threshold Invalid Value Count
+ usage_notes: null
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1034'
+ test_type: Valid_US_Zip
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4034'
+ test_type: Valid_US_Zip
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2034'
+ test_type: Valid_US_Zip
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5034'
+ test_type: Valid_US_Zip
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3034'
+ test_type: Valid_US_Zip
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6034'
+ test_type: Valid_US_Zip
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1237'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1238'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ - id: '1239'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1240'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1331'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ test_templates: []
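Note on the conditions and lookups above: every Valid_US_Zip flavor relies on the same digit-masking trick, where TRANSLATE maps each digit to '9' so that any well-formed US ZIP collapses to one of three canonical masks and everything else is counted as invalid. A minimal Python sketch of the equivalent check, independent of TestGen internals (the helper name is illustrative only):

    # Digit-masking check mirroring the Valid_US_Zip measure above: map every
    # digit to '9' and accept only the three canonical ZIP masks.
    VALID_ZIP_MASKS = {"99999", "999999999", "99999-9999"}
    DIGIT_TO_NINE = str.maketrans("0123456789", "9999999999")

    def is_valid_us_zip(value: str) -> bool:
        return value.translate(DIGIT_TO_NINE) in VALID_ZIP_MASKS

    assert is_valid_us_zip("02139")
    assert is_valid_us_zip("02139-4307")
    assert not is_valid_us_zip("2139")     # too short
    assert not is_valid_us_zip("ABCDE")    # letters survive the mask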
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
new file mode 100644
index 00000000..dac90d63
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
@@ -0,0 +1,127 @@
+test_types:
+ id: '1045'
+ test_type: Valid_US_Zip3
+ test_name_short: 'Valid US Zip-3 '
+ test_name_long: Valid USA Zip-3 Prefix
+ test_description: |-
+ Tests that postal codes match the 3 digit format of a regional prefix.
+ except_message: |-
+ Invalid 3-digit US Zip Code regional prefix formats found.
+ measure_uom: Invalid Zip-3 Prefix
+ measure_uom_description: |-
+ Expected count of values with invalid Zip-3 Prefix Codes
+ selection_criteria: |-
+ functional_data_type = 'Zip3'
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: null
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Validity
+ health_dimension: Schema Drift
+ threshold_description: |-
+ Threshold Invalid Zip3 Count
+ usage_notes: |-
+ This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1035'
+ test_type: Valid_US_Zip3
+ sql_flavor: redshift
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4035'
+ test_type: Valid_US_Zip3
+ sql_flavor: postgresql
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2035'
+ test_type: Valid_US_Zip3
+ sql_flavor: snowflake
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5035'
+ test_type: Valid_US_Zip3
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3035'
+ test_type: Valid_US_Zip3
+ sql_flavor: mssql
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6035'
+ test_type: Valid_US_Zip3
+ sql_flavor: databricks
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1241'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1242'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') <> '' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ - id: '1243'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1244'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
+ - id: '1332'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+      SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
new file mode 100644
index 00000000..30804567
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
@@ -0,0 +1,132 @@
+test_types:
+ id: '1041'
+ test_type: Variability_Decrease
+ test_name_short: Variability Decrease
+ test_name_long: Variability has decreased below threshold
+ test_description: |-
+ Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.
+ except_message: |-
+ The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.
+ measure_uom: Pct SD shift
+ measure_uom_description: |-
+ Percent of baseline Standard Deviation
+ selection_criteria: |-
+ general_type = 'N' AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_sd,threshold_value
+ default_parm_values: |-
+ stdev_value, 80
+ default_parm_prompts: |-
+ Std Deviation at Baseline,Expected Minimum Percent
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected minimum pct of baseline Standard Deviation (SD)
+ usage_notes: "This test looks for percent shifts in standard deviation as a measure\
+ \ of the stability of a measure over time. A significant change could indicate\
+ \ that new values are erroneous, or that the cohort being evaluated is significantly\
+ \ different from baseline. A decrease in particular could indicate an improved\
+ \ process, better precision in measurement, the elimination of outliers, or a\
+ \ more homogeneous cohort. "
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1032'
+ test_type: Variability_Decrease
+ sql_flavor: redshift
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2032'
+ test_type: Variability_Decrease
+ sql_flavor: snowflake
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3032'
+ test_type: Variability_Decrease
+ sql_flavor: mssql
+ measure: |-
+ 100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4032'
+ test_type: Variability_Decrease
+ sql_flavor: postgresql
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6032'
+ test_type: Variability_Decrease
+ sql_flavor: databricks
+ measure: |-
+ 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5032'
+ test_type: Variability_Decrease
+ sql_flavor: trino
+ measure: |-
+ 100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD}
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1032'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1114'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1171'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1228'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1329'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ test_templates: []
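Note on the Variability conditions above: the measure is the current sample standard deviation expressed as a percent of the baseline value captured at profiling time, compared against a percent threshold (80 by default for a decrease, 120 for an increase). A short Python sketch of that measure, with made-up numbers for illustration:

    # "Pct SD shift" measure used by the Variability tests: current sample
    # standard deviation as a percent of the baseline standard deviation.
    from statistics import stdev

    def pct_sd_shift(values: list[float], baseline_sd: float) -> float:
        return 100.0 * stdev(values) / baseline_sd

    baseline_sd = 4.0                                  # stdev_value captured at baseline
    current = [10.2, 10.4, 9.9, 10.1, 10.3, 10.0]      # much tighter spread than baseline
    shift = pct_sd_shift(current, baseline_sd)
    print(f"{shift:.1f}% of baseline SD")              # well under 80%, so Variability_Decrease fires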
diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
new file mode 100644
index 00000000..1b4d4c8b
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
@@ -0,0 +1,136 @@
+test_types:
+ id: '1040'
+ test_type: Variability_Increase
+ test_name_short: Variability Increase
+ test_name_long: Variability has increased above threshold
+ test_description: |-
+ Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.
+ except_message: |-
+ The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.
+ measure_uom: Pct SD shift
+ measure_uom_description: |-
+ Percent of baseline Standard Deviation
+ selection_criteria: |-
+ general_type = 'N' AND functional_data_type ilike 'Measure%' AND functional_data_type <> 'Measurement Spike' AND column_name NOT ilike '%latitude%' AND column_name NOT ilike '%longitude%' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)
+ dq_score_prevalence_formula: |-
+ 1
+ dq_score_risk_factor: '0.75'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: baseline_sd,threshold_value
+ default_parm_values: |-
+ stdev_value,120
+ default_parm_prompts: |-
+ Std Deviation at Baseline,Expected Maximum Percent
+ default_parm_help: null
+ default_severity: Warning
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Accuracy
+ health_dimension: Data Drift
+ threshold_description: |-
+ Expected maximum pct of baseline Standard Deviation (SD)
+ usage_notes: "This test looks for percent shifts in standard deviation as a measure\
+ \ of the stability of a measure over time. A significant change could indicate\
+ \ that new values are erroneous, or that the cohort being evaluated is significantly\
+ \ different from baseline. An increase in particular could mark new problems\
+ \ in measurement, a more heterogeneous cohort, or that significant outliers have\
+ \ been introduced. Consider this test along with Average Shift and New Shift.\
+ \ If the average shifts as well, there may be a fundamental shift in the dataset\
+ \ or process used to collect the data point. This might suggest a data shift\
+ \ that should be noted and assessed by business users. If the average does not\
+ \ shift, this may point to a data quality or data collection problem. "
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1031'
+ test_type: Variability_Increase
+ sql_flavor: redshift
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2031'
+ test_type: Variability_Increase
+ sql_flavor: snowflake
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3031'
+ test_type: Variability_Increase
+ sql_flavor: mssql
+ measure: |-
+ 100.0*STDEV(CAST({COLUMN_NAME} AS FLOAT))/CAST({BASELINE_SD} AS FLOAT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4031'
+ test_type: Variability_Increase
+ sql_flavor: postgresql
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6031'
+ test_type: Variability_Increase
+ sql_flavor: databricks
+ measure: |-
+ 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5031'
+ test_type: Variability_Increase
+ sql_flavor: trino
+ measure: |-
+ 100.0*STDDEV(CAST({COLUMN_NAME} AS REAL))/{BASELINE_SD}
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1031'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1113'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1170'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1227'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1328'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
new file mode 100644
index 00000000..5f8ab3ee
--- /dev/null
+++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
@@ -0,0 +1,168 @@
+test_types:
+ id: '1037'
+ test_type: Weekly_Rec_Ct
+ test_name_short: Weekly Records
+ test_name_long: At least one date per week present within date range
+ test_description: |-
+ Tests for presence of at least one date per calendar week within min/max date range, per baseline data
+ except_message: |-
+ At least one date per week expected in min/max date range.
+ measure_uom: Missing weeks
+ measure_uom_description: |-
+ Calendar weeks without date values present
+ selection_criteria: |-
+ functional_data_type ILIKE 'Transactional Date%' AND date_days_present > 1 AND functional_table_type ILIKE '%cumulative%' AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF('week', '1800-01-05'::DATE, max_date) - DATEDIFF('week', '1800-01-05'::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75
+ dq_score_prevalence_formula: |-
+ ({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_WEEKS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)
+ dq_score_risk_factor: '1.0'
+ column_name_prompt: null
+ column_name_help: null
+ default_parm_columns: threshold_value
+ default_parm_values: |-
+ 0
+ default_parm_prompts: |-
+ Threshold Weeks without Dates
+ default_parm_help: null
+ default_severity: Fail
+ run_type: CAT
+ test_scope: column
+ dq_dimension: Completeness
+ health_dimension: Volume
+ threshold_description: |-
+ Expected maximum count of calendar weeks without dates present
+ usage_notes: |-
+ Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.
+ active: Y
+ result_visualization: line_chart
+ result_visualization_params: null
+ cat_test_conditions:
+ - id: '1030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: redshift
+ measure: |-
+ MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: snowflake
+ measure: |-
+ MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '3030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: mssql
+ measure: |-
+ MAX(DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME}))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '4030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: postgresql
+ measure: |-
+ MAX(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: trino
+ measure: |-
+ MAX(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) +1 - COUNT(DISTINCT DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME}))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '6030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: databricks
+ measure: |-
+ CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC('week', {COLUMN_NAME})) AS INT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1030'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
+ - id: '1112'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: postgresql
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
+ - id: '1169'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH
+ Pass0 as (select 1 as C union all select 1), --2 rows
+ Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
+ Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
+ Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
+ Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
+ All_Nums as (select row_number() over(order by C) as Number from Pass4),
+ tally as (SELECT Number FROM All_Nums WHERE Number <= 45000),
+
+ date_range as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period,
+ CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period,
+ DATEDIFF(WEEK,
+ CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE),
+ CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} ),
+ check_periods as ( SELECT d.min_period, d.max_period, t.number,
+ DATEADD(WEEK, -(t.number - 1), d.max_period) AS check_period
+ FROM date_range d
+ INNER JOIN tally t
+ ON (d.period_ct >= t.number) ),
+ data_by_period as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ GROUP BY CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) ),
+ data_by_prd_with_prior_next as (SELECT check_period,
+ RANK() OVER (ORDER BY check_period DESC) as ranked,
+ ISNULL(d.record_ct, 0) as record_ct,
+ ISNULL(LAG(d.record_ct) OVER (ORDER BY check_period), 0) as last_record_ct,
+ ISNULL(LEAD(d.record_ct) OVER (ORDER BY check_period), 0) as next_record_ct
+ FROM check_periods c
+ LEFT JOIN data_by_period d
+ ON (c.check_period = d.data_period) )
+ SELECT check_period, record_ct,
+ CASE
+ WHEN record_ct = 0 THEN 'MISSING'
+ ELSE 'Present'
+ END as status
+ FROM data_by_prd_with_prior_next
+ WHERE record_ct = 0
+ OR last_record_ct = 0
+ OR next_record_ct = 0
+ ORDER BY check_period DESC;
+ error_type: Test Results
+ - id: '1226'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period;
+ error_type: Test Results
+ - id: '1327'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period;
+ error_type: Test Results
+ test_templates: []
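Note on the Weekly_Rec_Ct measure above: it is plain date arithmetic, the number of calendar weeks spanned by the column's min and max dates minus the number of distinct weeks that actually contain rows. A hedged Python approximation using ISO weeks (each SQL flavor draws week boundaries slightly differently, so this is a sketch, not the exact engine logic):

    from datetime import date

    def missing_week_count(dates: list[date]) -> int:
        # Weeks that actually contain at least one record, keyed by (ISO year, ISO week).
        present = {d.isocalendar()[:2] for d in dates}
        first, last = min(present), max(present)
        # Count every week in the inclusive span, then subtract the weeks present.
        span, (y, w) = 0, first
        while (y, w) <= last:
            span += 1
            w += 1
            if w > date(y, 12, 28).isocalendar()[1]:   # Dec 28 always falls in the year's last ISO week
                y, w = y + 1, 1
        return span - len(present)

    dates = [date(2024, 1, 2), date(2024, 1, 9), date(2024, 1, 23)]   # the week of Jan 15 has no rows
    print(missing_week_count(dates))                                   # -> 1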
diff --git a/testgen/template/dbupgrade/0151_incremental_upgrade.sql b/testgen/template/dbupgrade/0151_incremental_upgrade.sql
new file mode 100644
index 00000000..730562e8
--- /dev/null
+++ b/testgen/template/dbupgrade/0151_incremental_upgrade.sql
@@ -0,0 +1,5 @@
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+ALTER TABLE target_data_lookups
+ ADD CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk
+ PRIMARY KEY (test_id, sql_flavor, error_type);
From 3bb17e0b838c68ff4704338bbaf9c7293ad2e352 Mon Sep 17 00:00:00 2001
From: Diogo Basto
Date: Fri, 19 Sep 2025 18:16:10 +0100
Subject: [PATCH 05/48] TG-920
---
.../template/dbsetup/030_initialize_new_schema_structure.sql | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
index 146e1c06..18f049e1 100644
--- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
+++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
@@ -621,7 +621,7 @@ CREATE TABLE target_data_lookups (
lookup_query VARCHAR,
error_type VARCHAR(30) NOT NULL,
CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk
- PRIMARY KEY (test_id, sql_flavor, error_type),
+ PRIMARY KEY (test_id, sql_flavor, error_type)
);
CREATE TABLE variant_codings (
From fa30b23b51beee7d6f27ddd06ade3761694be749 Mon Sep 17 00:00:00 2001
From: Diogo Basto
Date: Mon, 22 Sep 2025 17:06:09 +0100
Subject: [PATCH 06/48] Address Review
---
testgen/__main__.py | 13 ++-
testgen/commands/run_launch_db_config.py | 2 +-
.../commands/run_test_metadata_exporter.py | 16 +---
testgen/commands/run_upgrade_db_config.py | 2 +-
testgen/common/read_yaml_metadata_records.py | 71 ++++++++------
..._anomaly_types_Boolean_Value_Mismatch.yaml | 28 +++---
...anomaly_types_Char_Column_Date_Values.yaml | 28 +++---
...omaly_types_Char_Column_Number_Values.yaml | 28 +++---
...anomaly_types_Column_Pattern_Mismatch.yaml | 28 +++---
...anomaly_types_Delimited_Data_Embedded.yaml | 28 +++---
...ile_anomaly_types_Inconsistent_Casing.yaml | 52 +++++-----
...rofile_anomaly_types_Invalid_Zip3_USA.yaml | 28 +++---
...profile_anomaly_types_Invalid_Zip_USA.yaml | 28 +++---
.../profile_anomaly_types_Leading_Spaces.yaml | 28 +++---
...le_anomaly_types_Multiple_Types_Major.yaml | 28 +++---
...le_anomaly_types_Multiple_Types_Minor.yaml | 28 +++---
.../profile_anomaly_types_No_Values.yaml | 28 +++---
..._anomaly_types_Non_Alpha_Name_Address.yaml | 36 +++----
...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 36 +++----
...file_anomaly_types_Non_Printing_Chars.yaml | 94 +++++++++----------
...ile_anomaly_types_Non_Standard_Blanks.yaml | 28 +++---
...le_anomaly_types_Potential_Duplicates.yaml | 28 +++---
.../profile_anomaly_types_Potential_PII.yaml | 28 +++---
.../profile_anomaly_types_Quoted_Values.yaml | 28 +++---
...rofile_anomaly_types_Recency_One_Year.yaml | 16 ++--
...file_anomaly_types_Recency_Six_Months.yaml | 16 ++--
...omaly_types_Small_Divergent_Value_Ct.yaml} | 28 +++---
...anomaly_types_Small_Missing_Value_Ct.yaml} | 28 +++---
..._anomaly_types_Small_Numeric_Value_Ct.yaml | 28 +++---
...maly_types_Standardized_Value_Matches.yaml | 28 +++---
.../profile_anomaly_types_Suggested_Type.yaml | 24 ++---
..._anomaly_types_Table_Pattern_Mismatch.yaml | 34 +++----
...file_anomaly_types_Unexpected_Emails.yaml} | 28 +++---
...e_anomaly_types_Unexpected_US_States.yaml} | 28 +++---
...le_anomaly_types_Unlikely_Date_Values.yaml | 28 +++---
...le_anomaly_types_Variant_Coded_Values.yaml | 24 ++---
.../test_types_Aggregate_Balance.yaml | 40 ++++----
.../test_types_Aggregate_Balance_Percent.yaml | 40 ++++----
.../test_types_Aggregate_Balance_Range.yaml | 38 ++++----
.../test_types_Aggregate_Minimum.yaml | 40 ++++----
.../test_types_Alpha_Trunc.yaml | 60 ++++++------
.../test_types_Avg_Shift.yaml | 66 ++++++-------
.../dbsetup_test_types/test_types_CUSTOM.yaml | 20 ++--
.../test_types_Combo_Match.yaml | 40 ++++----
.../test_types_Condition_Flag.yaml | 56 +++++------
.../test_types_Constant.yaml | 56 +++++------
.../test_types_Daily_Record_Ct.yaml | 72 +++++++-------
.../test_types_Dec_Trunc.yaml | 60 ++++++------
.../test_types_Distinct_Date_Ct.yaml | 56 +++++------
.../test_types_Distinct_Value_Ct.yaml | 56 +++++------
.../test_types_Distribution_Shift.yaml | 48 +++++-----
.../test_types_Dupe_Rows.yaml | 40 ++++----
.../test_types_Email_Format.yaml | 66 ++++++-------
.../test_types_Future_Date.yaml | 62 ++++++------
.../test_types_Future_Date_1Y.yaml | 62 ++++++------
.../test_types_Incr_Avg_Shift.yaml | 62 ++++++------
.../test_types_LOV_All.yaml | 60 ++++++------
.../test_types_LOV_Match.yaml | 56 +++++------
.../test_types_Min_Date.yaml | 60 ++++++------
.../test_types_Min_Val.yaml | 56 +++++------
.../test_types_Missing_Pct.yaml | 66 ++++++-------
.../test_types_Monthly_Rec_Ct.yaml | 72 +++++++-------
.../test_types_Outlier_Pct_Above.yaml | 62 ++++++------
.../test_types_Outlier_Pct_Below.yaml | 62 ++++++------
.../test_types_Pattern_Match.yaml | 66 ++++++-------
.../test_types_Recency.yaml | 66 ++++++-------
.../test_types_Required.yaml | 62 ++++++------
.../dbsetup_test_types/test_types_Row_Ct.yaml | 56 +++++------
.../test_types_Row_Ct_Pct.yaml | 58 ++++++------
.../test_types_Street_Addr_Pattern.yaml | 66 ++++++-------
.../test_types_Table_Freshness.yaml | 20 ++--
.../test_types_Timeframe_Combo_Gain.yaml | 52 +++++-----
.../test_types_Timeframe_Combo_Match.yaml | 68 +++++++-------
.../test_types_US_State.yaml | 62 ++++++------
.../dbsetup_test_types/test_types_Unique.yaml | 56 +++++------
.../test_types_Unique_Pct.yaml | 62 ++++++------
.../test_types_Valid_Characters.yaml | 62 ++++++------
.../test_types_Valid_Month.yaml | 28 +++---
.../test_types_Valid_US_Zip.yaml | 52 +++++-----
.../test_types_Valid_US_Zip3.yaml | 56 +++++------
.../test_types_Variability_Decrease.yaml | 60 ++++++------
.../test_types_Variability_Increase.yaml | 60 ++++++------
.../test_types_Weekly_Rec_Ct.yaml | 72 +++++++-------
83 files changed, 1840 insertions(+), 1830 deletions(-)
rename testgen/template/dbsetup_anomaly_types/{profile_anomaly_types_Small Divergent Value Ct.yaml => profile_anomaly_types_Small_Divergent_Value_Ct.yaml} (100%)
rename testgen/template/dbsetup_anomaly_types/{profile_anomaly_types_Small Missing Value Ct.yaml => profile_anomaly_types_Small_Missing_Value_Ct.yaml} (100%)
rename testgen/template/dbsetup_anomaly_types/{profile_anomaly_types_Unexpected Emails.yaml => profile_anomaly_types_Unexpected_Emails.yaml} (100%)
rename testgen/template/dbsetup_anomaly_types/{profile_anomaly_types_Unexpected US States.yaml => profile_anomaly_types_Unexpected_US_States.yaml} (100%)
diff --git a/testgen/__main__.py b/testgen/__main__.py
index 942b02a4..fda81d29 100644
--- a/testgen/__main__.py
+++ b/testgen/__main__.py
@@ -504,12 +504,21 @@ def export_data(configuration: Configuration, project_key: str, test_suite_key:
click.echo("\nexport-observability completed successfully.\n")
+@click.option(
+ "--path",
+ help="Path to the templates folder. Defaults to path from project root.",
+ required=False,
+ default="testgen/template",
+)
@cli.command("export-test-metadata", help="Exports current test metadata records to yaml files.")
@pass_configuration
-def export_test_metadata(configuration: Configuration):
+def export_test_metadata(configuration: Configuration, path: str):
click.echo("export-test-metadata")
LOG.info("CurrentStep: Main Program - Test Metadata Export")
- run_test_metadata_exporter()
+ if not os.path.isdir(path):
+ LOG.error("Provided path {path} is not a directory. Please correct the --path option.")
+ return
+ run_test_metadata_exporter(path)
LOG.info("CurrentStep: Main Program - Test Metadata Export - DONE")
click.echo("\nexport-test-metadata completed successfully.\n")
diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py
index 6899f0dc..37226ea5 100644
--- a/testgen/commands/run_launch_db_config.py
+++ b/testgen/commands/run_launch_db_config.py
@@ -86,7 +86,7 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) ->
password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
user_type="schema_admin",
)
- import_metadata_records_from_yaml(params_mapping)
+ import_metadata_records_from_yaml()
ScoreDefinition.from_table_group(
TableGroup(
diff --git a/testgen/commands/run_test_metadata_exporter.py b/testgen/commands/run_test_metadata_exporter.py
index 30e3188c..89f7e8cc 100644
--- a/testgen/commands/run_test_metadata_exporter.py
+++ b/testgen/commands/run_test_metadata_exporter.py
@@ -1,22 +1,10 @@
import logging
-from testgen import settings
-from testgen.common.credentials import get_tg_schema
from testgen.common.models import with_database_session
from testgen.common.read_yaml_metadata_records import export_metadata_records_to_yaml
LOG = logging.getLogger("testgen")
-
-def _get_params_mapping() -> dict:
- return {
- "SCHEMA_NAME": get_tg_schema(),
- "TESTGEN_ADMIN_USER": settings.DATABASE_ADMIN_USER,
- "TESTGEN_ADMIN_PASSWORD": settings.DATABASE_ADMIN_PASSWORD,
- "OBSERVABILITY_URL": settings.OBSERVABILITY_API_URL,
- }
-
-
@with_database_session
-def run_test_metadata_exporter() -> None:
- export_metadata_records_to_yaml(_get_params_mapping())
+def run_test_metadata_exporter(templates_path: str) -> None:
+ export_metadata_records_to_yaml(templates_path)
diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py
index 95ec4bc0..a6b7d91c 100644
--- a/testgen/commands/run_upgrade_db_config.py
+++ b/testgen/commands/run_upgrade_db_config.py
@@ -97,7 +97,7 @@ def _refresh_static_metadata(params_mapping):
password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
user_type="schema_admin",
)
- import_metadata_records_from_yaml(params_mapping)
+ import_metadata_records_from_yaml()
strQueryMetadataConstraints = read_template_sql_file("055_recreate_metadata_constraints.sql", "dbsetup")
strQueryMetadataConstraints = replace_params(strQueryMetadataConstraints, params_mapping)
diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py
index 2602d147..971397c5 100644
--- a/testgen/common/read_yaml_metadata_records.py
+++ b/testgen/common/read_yaml_metadata_records.py
@@ -2,14 +2,20 @@
import logging
from importlib.resources import as_file
+from os import mkdir
+from os.path import isdir
+from os.path import sep as path_seperator
from yaml import SafeDumper, safe_dump, safe_load
+from testgen.common.credentials import get_tg_schema
from testgen.common.database.database_service import execute_db_queries, fetch_from_db_threaded
from testgen.common.read_file import get_template_files
LOG = logging.getLogger("testgen")
+
+TEST_TYPES_TEMPLATE_FOLDER = "dbsetup_test_types"
TEST_TYPES_PARENT_TABLE = "test_types"
TEST_TYPES_PARENT_KEY = "test_type"
TEST_TYPES_CHILD_TABLES = ["cat_test_conditions", "target_data_lookups", "test_templates"]
@@ -61,6 +67,7 @@
}
+ANOMALY_TYPES_TEMPLATE_FOLDER = "dbsetup_anomaly_types"
ANOMALY_TYPES_PARENT_TABLE = "profile_anomaly_types"
ANOMALY_TYPES_PARENT_KEY = "anomaly_type"
ANOMALY_TYPES_CHILD_TABLES = ["target_data_lookups"]
@@ -104,8 +111,9 @@ def _literal_representer(dumper, data):
SafeDumper.add_representer(LiteralString, _literal_representer)
-def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]]):
+def _process_yaml_for_import(data:dict, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]]):
queries = []
+ schema = get_tg_schema()
parent = data.get(parent_table)
if not isinstance(parent, dict):
@@ -132,7 +140,7 @@ def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str,
bound_values = {c: record[c] for c in columns}
sql = f"""
- INSERT INTO {params_mapping["SCHEMA_NAME"]}.{table_name} ({insert_cols})
+ INSERT INTO {schema}.{table_name} ({insert_cols})
VALUES ({insert_vals})
ON CONFLICT ({', '.join(pk_cols)}) DO UPDATE
SET {update_stmt};
@@ -146,7 +154,7 @@ def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str,
update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c != parent_key)
bound_values = {c: parent[c] for c in columns}
parent_insert_query = f"""
- INSERT INTO {params_mapping["SCHEMA_NAME"]}.{parent_table} ({insert_cols})
+ INSERT INTO {schema}.{parent_table} ({insert_cols})
VALUES ({insert_vals})
ON CONFLICT ({parent_key}) DO UPDATE
SET {update_stmt};
@@ -156,20 +164,16 @@ def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str,
execute_db_queries(
queries,
- user_override=params_mapping["TESTGEN_ADMIN_USER"],
- password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
- user_type="schema_admin",
)
return
-def import_metadata_records_from_yaml(params_mapping) -> None:
- files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory="dbsetup_test_types"), key=lambda key: str(key))
+def import_metadata_records_from_yaml() -> None:
+ files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory=TEST_TYPES_TEMPLATE_FOLDER), key=lambda key: str(key))
for yaml_file in files:
with as_file(yaml_file) as f:
with f.open("r") as file:
data = safe_load(file)
_process_yaml_for_import(
- params_mapping,
data,
TEST_TYPES_PARENT_TABLE,
TEST_TYPES_PARENT_KEY,
@@ -177,14 +181,13 @@ def import_metadata_records_from_yaml(params_mapping) -> None:
TEST_TYPES_DEFAULT_PK,
TEST_TYPES_PARENT_CHILD_COLUMN_MAP,
)
- files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory="dbsetup_anomaly_types"), key=lambda key: str(key))
+ files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory=ANOMALY_TYPES_TEMPLATE_FOLDER), key=lambda key: str(key))
for yaml_file in files:
with as_file(yaml_file) as f:
with f.open("r") as file:
LOG.info(f"Importing {yaml_file}")
data = safe_load(file)
_process_yaml_for_import(
- params_mapping,
data,
ANOMALY_TYPES_PARENT_TABLE,
ANOMALY_TYPES_PARENT_KEY,
@@ -194,16 +197,21 @@ def import_metadata_records_from_yaml(params_mapping) -> None:
)
return
-
-def _process_records_for_export(params_mapping: dict, parent_table:str, parent_key:str, child_tables:list[str], parent_child_column_map:dict[str, dict[str,str]], literal_fields:dict[str, list[str]]) -> None:
- def wrap_literal(table_name, recs):
- for rec in recs:
- for fld in literal_fields.get(table_name, []):
- val = rec.get(fld)
- if isinstance(val, str) and val != "":
- rec[fld] = LiteralString(val)
-
- fetch_parent_query = f"SELECT * FROM {params_mapping["SCHEMA_NAME"]}.{parent_table};"
+def _wrap_literal(table_name: str, recs: list[dict], literal_fields: dict[str, list[str]]):
+ for rec in recs:
+ for fld in literal_fields.get(table_name, []):
+ val = rec.get(fld)
+ if isinstance(val, str) and val != "":
+ rec[fld] = LiteralString(val)
+
+def _process_records_for_export(export_path:str, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]], literal_fields:dict[str, list[str]]) -> None:
+ if not isdir(export_path):
+ mkdir(export_path)
+ schema = get_tg_schema()
+ fetch_parent_query = f"""
+ SELECT *
+ FROM {schema}.{parent_table};
+ """
parent_records, parent_columns, _ = fetch_from_db_threaded(
[(fetch_parent_query, None)],
)
@@ -211,40 +219,45 @@ def wrap_literal(table_name, recs):
parent_record_dict = dict(zip(parent_columns, parent_record, strict=False))
for child_name in child_tables:
child_key = next(key for key, value in parent_child_column_map[child_name].items() if value==parent_key)
- fetch_children_query = f"SELECT * FROM {params_mapping["SCHEMA_NAME"]}.{child_name} WHERE {child_key} = '{parent_record_dict[parent_key]}';"
+ fetch_children_query = f"""
+ SELECT * FROM {schema}.{child_name}
+ WHERE {child_key} = '{parent_record_dict[parent_key]}'
+ ORDER BY {", ".join(default_pk[child_name])};
+ """
child_records, child_columns, _ = fetch_from_db_threaded(
[(fetch_children_query, None)],
)
child_records_dict = []
for child_record in child_records:
child_records_dict.append(dict(zip(child_columns, child_record, strict=False)))
- LOG.info(child_records_dict)
- wrap_literal(child_name, child_records_dict)
+ _wrap_literal(child_name, child_records_dict, literal_fields)
parent_record_dict[child_name] = child_records_dict
- wrap_literal(parent_table, [parent_record_dict])
+ _wrap_literal(parent_table, [parent_record_dict], literal_fields)
payload = {parent_table: parent_record_dict}
- out_file = f"{parent_table}_{parent_record_dict[parent_key]}.yaml"
+ out_file = f"{export_path}{path_seperator}{parent_table}_{parent_record_dict[parent_key].replace(' ','_')}.yaml"
LOG.info(f"Exporting {out_file}")
with open(out_file, "w") as f:
safe_dump(payload, f, sort_keys=False)
-def export_metadata_records_to_yaml(params_mapping: dict) -> None:
+def export_metadata_records_to_yaml(templates_path: str) -> None:
_add_literal_representer()
_process_records_for_export(
- params_mapping,
+ f"{templates_path}{path_seperator}{TEST_TYPES_TEMPLATE_FOLDER}",
TEST_TYPES_PARENT_TABLE,
TEST_TYPES_PARENT_KEY,
TEST_TYPES_CHILD_TABLES,
+ TEST_TYPES_DEFAULT_PK,
TEST_TYPES_PARENT_CHILD_COLUMN_MAP,
TEST_TYPES_LITERAL_FIELDS,
)
_process_records_for_export(
- params_mapping,
+ f"{templates_path}{path_seperator}{ANOMALY_TYPES_TEMPLATE_FOLDER}",
ANOMALY_TYPES_PARENT_TABLE,
ANOMALY_TYPES_PARENT_KEY,
ANOMALY_TYPES_CHILD_TABLES,
+ ANOMALY_TYPES_DEFAULT_PK,
ANOMALY_TYPES_PARENT_CHILD_COLUMN_MAP,
ANOMALY_TYPES_LITERAL_FIELDS,
)
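Note on the import path above: each parent and child record becomes a single idempotent upsert keyed on the table's default primary key columns (the 0151 upgrade makes test_id, sql_flavor, error_type the key for target_data_lookups). A self-contained sketch of the statement shape it produces; schema, table, and values here are placeholders:

    # Sketch of the INSERT ... ON CONFLICT upsert built per record during YAML import.
    def build_upsert(schema: str, table: str, record: dict, pk_cols: list[str]) -> str:
        columns = list(record)
        insert_cols = ", ".join(columns)
        insert_vals = ", ".join(f":{c}" for c in columns)          # bound parameters, one per column
        update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c not in pk_cols)
        return (
            f"INSERT INTO {schema}.{table} ({insert_cols})\n"
            f"VALUES ({insert_vals})\n"
            f"ON CONFLICT ({', '.join(pk_cols)}) DO UPDATE\n"
            f"SET {update_stmt};"
        )

    print(build_upsert(
        "testgen_schema",
        "target_data_lookups",
        {"test_id": "1044", "sql_flavor": "postgresql", "error_type": "Test Results", "lookup_query": "SELECT 1"},
        ["test_id", "sql_flavor", "error_type"],
    ))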
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
index 9aa56cd7..10cedbcf 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
@@ -23,13 +23,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1047'
+ - id: '1287'
test_id: '1015'
test_type: Boolean_Value_Mismatch
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
+ error_type: Profile Anomaly
+ - id: '1129'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- id: '1072'
test_id: '1015'
@@ -39,13 +47,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- - id: '1129'
+ - id: '1047'
test_id: '1015'
test_type: Boolean_Value_Mismatch
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- id: '1186'
test_id: '1015'
@@ -55,11 +63,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- - id: '1287'
- test_id: '1015'
- test_type: Boolean_Value_Mismatch
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
index 6e9ce327..f31357b2 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
@@ -22,13 +22,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1044'
+ - id: '1284'
test_id: '1012'
test_type: Char_Column_Date_Values
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ error_type: Profile Anomaly
+ - id: '1126'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- id: '1069'
test_id: '1012'
@@ -38,13 +46,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- - id: '1126'
+ - id: '1044'
test_id: '1012'
test_type: Char_Column_Date_Values
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- id: '1183'
test_id: '1012'
@@ -54,11 +62,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM (SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- - id: '1284'
- test_id: '1012'
- test_type: Char_Column_Date_Values
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
index 52730c32..fbc78bc8 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
@@ -22,13 +22,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1043'
+ - id: '1283'
test_id: '1011'
test_type: Char_Column_Number_Values
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ error_type: Profile Anomaly
+ - id: '1125'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- id: '1068'
test_id: '1011'
@@ -38,13 +46,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- - id: '1125'
+ - id: '1043'
test_id: '1011'
test_type: Char_Column_Number_Values
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- id: '1182'
test_id: '1011'
@@ -54,11 +62,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- - id: '1283'
- test_id: '1011'
- test_type: Char_Column_Number_Values
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
index d5389cbd..2c69ccfa 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
@@ -29,13 +29,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1039'
+ - id: '1279'
test_id: '1007'
test_type: Column_Pattern_Mismatch
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;
+ SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
+ - id: '1121'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;
error_type: Profile Anomaly
- id: '1064'
test_id: '1007'
@@ -45,13 +53,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;
error_type: Profile Anomaly
- - id: '1121'
+ - id: '1039'
test_id: '1007'
test_type: Column_Pattern_Mismatch
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;
+ SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;
error_type: Profile Anomaly
- id: '1178'
test_id: '1007'
@@ -61,11 +69,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC;
error_type: Profile Anomaly
- - id: '1279'
- test_id: '1007'
- test_type: Column_Pattern_Mismatch
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
index 066ec529..eb5d7db4 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
@@ -16,13 +16,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1057'
+ - id: '1297'
test_id: '1025'
test_type: Delimited_Data_Embedded
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1139'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- id: '1082'
test_id: '1025'
@@ -32,13 +40,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1139'
+ - id: '1057'
test_id: '1025'
test_type: Delimited_Data_Embedded
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1196'
test_id: '1025'
@@ -48,11 +56,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1297'
- test_id: '1025'
- test_type: Delimited_Data_Embedded
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
index 8995cbd3..865601d2 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
@@ -17,19 +17,33 @@ profile_anomaly_types:
dq_score_risk_factor: '1.0'
dq_dimension: Validity
target_data_lookups:
- - id: '1258'
+ - id: '1262'
test_id: '1028'
test_type: Inconsistent_Casing
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ (SELECT 'Upper Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}`
+ GROUP BY `{COLUMN_NAME}` LIMIT 20)
+ UNION ALL
+ (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`)
+ GROUP BY `{COLUMN_NAME}` LIMIT 20)
+ error_type: Profile Anomaly
+ - id: '1260'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ GROUP BY "{COLUMN_NAME}"
UNION ALL
- (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ SELECT TOP 20 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
- GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ GROUP BY "{COLUMN_NAME}"
error_type: Profile Anomaly
- id: '1259'
test_id: '1028'
@@ -45,19 +59,19 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
GROUP BY "{COLUMN_NAME}" LIMIT 20)
error_type: Profile Anomaly
- - id: '1260'
+ - id: '1258'
test_id: '1028'
test_type: Inconsistent_Casing
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 20 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
UNION ALL
- SELECT TOP 20 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
- GROUP BY "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
error_type: Profile Anomaly
- id: '1261'
test_id: '1028'
@@ -73,17 +87,3 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
GROUP BY "{COLUMN_NAME}" LIMIT 20)
error_type: Profile Anomaly
- - id: '1262'
- test_id: '1028'
- test_type: Inconsistent_Casing
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- (SELECT 'Upper Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}`
- GROUP BY `{COLUMN_NAME}` LIMIT 20)
- UNION ALL
- (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
- WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`)
- GROUP BY `{COLUMN_NAME}` LIMIT 20)
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
index 869819ac..14130bfd 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
@@ -20,43 +20,43 @@ profile_anomaly_types:
dq_score_risk_factor: '1'
dq_dimension: Validity
target_data_lookups:
- - id: '1056'
+ - id: '1296'
test_id: '1024'
test_type: Invalid_Zip3_USA
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500;
error_type: Profile Anomaly
- - id: '1081'
+ - id: '1138'
test_id: '1024'
test_type: Invalid_Zip3_USA
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1138'
+ - id: '1081'
test_id: '1024'
test_type: Invalid_Zip3_USA
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- - id: '1195'
+ - id: '1056'
test_id: '1024'
test_type: Invalid_Zip3_USA
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- - id: '1296'
+ - id: '1195'
test_id: '1024'
test_type: Invalid_Zip3_USA
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
index 0a0aa5d0..32cee0ac 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
@@ -16,43 +16,43 @@ profile_anomaly_types:
dq_score_risk_factor: '1.0'
dq_dimension: Validity
target_data_lookups:
- - id: '1035'
+ - id: '1275'
test_id: '1003'
test_type: Invalid_Zip_USA
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
error_type: Profile Anomaly
- - id: '1060'
+ - id: '1117'
test_id: '1003'
test_type: Invalid_Zip_USA
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1117'
+ - id: '1060'
test_id: '1003'
test_type: Invalid_Zip_USA
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- - id: '1174'
+ - id: '1035'
test_id: '1003'
test_type: Invalid_Zip_USA
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- - id: '1275'
+ - id: '1174'
test_id: '1003'
test_type: Invalid_Zip_USA
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
index 7bb29073..812146b8 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
@@ -17,13 +17,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1041'
+ - id: '1281'
test_id: '1009'
test_type: Leading_Spaces
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
+ - id: '1123'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1066'
test_id: '1009'
@@ -33,13 +41,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1123'
+ - id: '1041'
test_id: '1009'
test_type: Leading_Spaces
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1180'
test_id: '1009'
@@ -49,11 +57,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1281'
- test_id: '1009'
- test_type: Leading_Spaces
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
index 529ce1f7..53e7a7a9 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
@@ -16,13 +16,21 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: Consistency
target_data_lookups:
- - id: '1037'
+ - id: '1277'
test_id: '1005'
test_type: Multiple_Types_Major
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1119'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- id: '1062'
test_id: '1005'
@@ -32,13 +40,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- - id: '1119'
+ - id: '1037'
test_id: '1005'
test_type: Multiple_Types_Major
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- id: '1176'
test_id: '1005'
@@ -48,11 +56,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- - id: '1277'
- test_id: '1005'
- test_type: Multiple_Types_Major
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
index aacc90a7..f55ab2f6 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
@@ -16,13 +16,21 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: Consistency
target_data_lookups:
- - id: '1036'
+ - id: '1276'
test_id: '1004'
test_type: Multiple_Types_Minor
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ error_type: Profile Anomaly
+ - id: '1118'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- id: '1061'
test_id: '1004'
@@ -32,13 +40,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- - id: '1118'
+ - id: '1036'
test_id: '1004'
test_type: Multiple_Types_Minor
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- id: '1175'
test_id: '1004'
@@ -48,11 +56,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
error_type: Profile Anomaly
- - id: '1276'
- test_id: '1004'
- test_type: Multiple_Types_Minor
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
index 9b130d57..a70fea71 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
@@ -19,13 +19,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.33'
dq_dimension: Completeness
target_data_lookups:
- - id: '1038'
+ - id: '1278'
test_id: '1006'
test_type: No_Values
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
+ - id: '1120'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1063'
test_id: '1006'
@@ -35,13 +43,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1120'
+ - id: '1038'
test_id: '1006'
test_type: No_Values
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1177'
test_id: '1006'
@@ -51,11 +59,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1278'
- test_id: '1006'
- test_type: No_Values
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
index ea507508..b5e9f27c 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
@@ -17,15 +17,25 @@ profile_anomaly_types:
dq_score_risk_factor: '1.0'
dq_dimension: Validity
target_data_lookups:
- - id: '1263'
+ - id: '1267'
test_id: '1029'
test_type: Non_Alpha_Name_Address
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
+ WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > ''
+ GROUP BY `{COLUMN_NAME}` LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1265'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
- GROUP BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}"
error_type: Profile Anomaly
- id: '1264'
test_id: '1029'
@@ -37,15 +47,15 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
GROUP BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1265'
+ - id: '1263'
test_id: '1029'
test_type: Non_Alpha_Name_Address
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
- GROUP BY "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- id: '1266'
test_id: '1029'
@@ -57,13 +67,3 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
GROUP BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1267'
- test_id: '1029'
- test_type: Non_Alpha_Name_Address
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
- WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > ''
- GROUP BY `{COLUMN_NAME}` LIMIT 500
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
index 453e86b2..807e49d0 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
@@ -17,15 +17,25 @@ profile_anomaly_types:
dq_score_risk_factor: '1.0'
dq_dimension: Validity
target_data_lookups:
- - id: '1268'
+ - id: '1272'
test_id: '1030'
test_type: Non_Alpha_Prefixed_Name
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
+ WHERE `{COLUMN_NAME}` < 'A' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN ('"', ' ') AND RIGHT(`{COLUMN_NAME}`, 1) <> ''''
+ GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1270'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
error_type: Profile Anomaly
- id: '1269'
test_id: '1030'
@@ -37,15 +47,15 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1270'
+ - id: '1268'
test_id: '1030'
test_type: Non_Alpha_Prefixed_Name
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- id: '1271'
test_id: '1030'
@@ -57,13 +67,3 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1272'
- test_id: '1030'
- test_type: Non_Alpha_Prefixed_Name
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
- WHERE `{COLUMN_NAME}` < 'A' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN ('"', ' ') AND RIGHT(`{COLUMN_NAME}`, 1) <> ''''
- GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
index 1ca207a3..0abc0e99 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
@@ -17,10 +17,53 @@ profile_anomaly_types:
dq_score_risk_factor: '1.0'
dq_dimension: Validity
target_data_lookups:
- - id: '1273'
+ - id: '1277'
test_id: '1031'
test_type: Non_Printing_Chars
- sql_flavor: redshift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`{COLUMN_NAME}`,
+ '\u00a0', '\x160'),
+ '\u2009', '\x8201'),
+ '\u200b', '\x8203'),
+ '\u200c', '\x8204'),
+ '\u200d', '\x8205'),
+ '\u200e', '\x8206'),
+ '\u200f', '\x8207'),
+ '\u202f', '\x8239'),
+ '\u3000', '\x12288'),
+ '\ufeff', '\x65279') as `{COLUMN_NAME}_content`,
+ COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
+ WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}`
+ GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
+ error_type: Profile Anomaly
+ - id: '1275'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ NCHAR(160), '\x160'),
+ NCHAR(8201), '\x8201'),
+ NCHAR(8203), '\x8203'),
+ NCHAR(8204), '\x8204'),
+ NCHAR(8205), '\x8205'),
+ NCHAR(8206), '\x8206'),
+ NCHAR(8207), '\x8207'),
+ NCHAR(8239), '\x8239'),
+ NCHAR(12288), '\x12288'),
+ NCHAR(65279), '\x65279') AS "{COLUMN_NAME}_content",
+ COUNT(*) AS record_ct
+ FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
+ error_type: Profile Anomaly
+ - id: '1274'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
@@ -38,10 +81,10 @@ profile_anomaly_types:
WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1274'
+ - id: '1273'
test_id: '1031'
test_type: Non_Printing_Chars
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
@@ -59,28 +102,6 @@ profile_anomaly_types:
WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1275'
- test_id: '1031'
- test_type: Non_Printing_Chars
- sql_flavor: mssql
- lookup_type: null
- lookup_query: |-
- SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
- NCHAR(160), '\x160'),
- NCHAR(8201), '\x8201'),
- NCHAR(8203), '\x8203'),
- NCHAR(8204), '\x8204'),
- NCHAR(8205), '\x8205'),
- NCHAR(8206), '\x8206'),
- NCHAR(8207), '\x8207'),
- NCHAR(8239), '\x8239'),
- NCHAR(12288), '\x12288'),
- NCHAR(65279), '\x65279') AS "{COLUMN_NAME}_content",
- COUNT(*) AS record_ct
- FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
- WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
- error_type: Profile Anomaly
- id: '1276'
test_id: '1031'
test_type: Non_Printing_Chars
@@ -102,24 +123,3 @@ profile_anomaly_types:
WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
- - id: '1277'
- test_id: '1031'
- test_type: Non_Printing_Chars
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(`{COLUMN_NAME}`,
- '\u00a0', '\x160'),
- '\u2009', '\x8201'),
- '\u200b', '\x8203'),
- '\u200c', '\x8204'),
- '\u200d', '\x8205'),
- '\u200e', '\x8206'),
- '\u200f', '\x8207'),
- '\u202f', '\x8239'),
- '\u3000', '\x12288'),
- '\ufeff', '\x65279') as `{COLUMN_NAME}_content`,
- COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
- WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}`
- GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
index 73eb6c03..eaf2dae5 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
@@ -17,13 +17,21 @@ profile_anomaly_types:
dq_score_risk_factor: '1.0'
dq_dimension: Completeness
target_data_lookups:
- - id: '1034'
+ - id: '1274'
test_id: '1002'
test_type: Non_Standard_Blanks
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
+ - id: '1116'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1059'
test_id: '1002'
@@ -33,13 +41,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1116'
+ - id: '1034'
test_id: '1002'
test_type: Non_Standard_Blanks
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1173'
test_id: '1002'
@@ -49,11 +57,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1274'
- test_id: '1002'
- test_type: Non_Standard_Blanks
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
index bc9fe8c5..4aaaa825 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
@@ -18,13 +18,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.33'
dq_dimension: Uniqueness
target_data_lookups:
- - id: '1048'
+ - id: '1288'
test_id: '1016'
test_type: Potential_Duplicates
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1130'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- id: '1073'
test_id: '1016'
@@ -34,13 +42,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1130'
+ - id: '1048'
test_id: '1016'
test_type: Potential_Duplicates
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1187'
test_id: '1016'
@@ -50,11 +58,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1288'
- test_id: '1016'
- test_type: Potential_Duplicates
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
index d160615d..09a80941 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
@@ -17,21 +17,13 @@ profile_anomaly_types:
WHEN 'C' THEN 0.33 END
dq_dimension: Validity
target_data_lookups:
- - id: '1269'
- test_id: '1100'
- test_type: Potential_PII
- sql_flavor: redshift
- lookup_type: null
- lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
- error_type: Profile Anomaly
- - id: '1270'
+ - id: '1338'
test_id: '1100'
test_type: Potential_PII
- sql_flavor: snowflake
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1271'
test_id: '1100'
@@ -49,11 +41,19 @@ profile_anomaly_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1338'
+ - id: '1269'
test_id: '1100'
test_type: Potential_PII
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1270'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
index 4c9542c3..36c535fc 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
@@ -17,13 +17,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1042'
+ - id: '1282'
test_id: '1010'
test_type: Quoted_Values
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1124'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1067'
test_id: '1010'
@@ -33,13 +41,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- - id: '1124'
+ - id: '1042'
test_id: '1010'
test_type: Quoted_Values
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- id: '1181'
test_id: '1010'
@@ -49,11 +57,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
- - id: '1282'
- test_id: '1010'
- test_type: Quoted_Values
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
index cfefa973..6564153d 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
@@ -16,34 +16,34 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: Timeliness
target_data_lookups:
- - id: '1051'
+ - id: '1291'
test_id: '1019'
test_type: Recency_One_Year
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
- - id: '1076'
+ - id: '1133'
test_id: '1019'
test_type: Recency_One_Year
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
- - id: '1133'
+ - id: '1076'
test_id: '1019'
test_type: Recency_One_Year
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
- - id: '1291'
+ - id: '1051'
test_id: '1019'
test_type: Recency_One_Year
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
created_in_ui
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
index 7388aba5..ae3e25e5 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
@@ -16,34 +16,34 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: Timeliness
target_data_lookups:
- - id: '1052'
+ - id: '1292'
test_id: '1020'
test_type: Recency_Six_Months
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
- - id: '1077'
+ - id: '1134'
test_id: '1020'
test_type: Recency_Six_Months
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
- - id: '1134'
+ - id: '1077'
test_id: '1020'
test_type: Recency_Six_Months
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
- - id: '1292'
+ - id: '1052'
test_id: '1020'
test_type: Recency_Six_Months
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
created_in_ui
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
similarity index 100%
rename from testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml
rename to testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
index 798d99a9..0d950923 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Divergent Value Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
@@ -22,13 +22,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.33'
dq_dimension: Validity
target_data_lookups:
- - id: '1046'
+ - id: '1286'
test_id: '1014'
test_type: Small Divergent Value Ct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
+ error_type: Profile Anomaly
+ - id: '1128'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- id: '1071'
test_id: '1014'
@@ -38,13 +46,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- - id: '1128'
+ - id: '1046'
test_id: '1014'
test_type: Small Divergent Value Ct
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- id: '1185'
test_id: '1014'
@@ -54,11 +62,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
- - id: '1286'
- test_id: '1014'
- test_type: Small Divergent Value Ct
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
similarity index 100%
rename from testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml
rename to testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
index 5a3fc09a..58591b77 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small Missing Value Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
@@ -25,13 +25,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.33'
dq_dimension: Completeness
target_data_lookups:
- - id: '1045'
+ - id: '1285'
test_id: '1013'
test_type: Small Missing Value Ct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ error_type: Profile Anomaly
+ - id: '1127'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1070'
test_id: '1013'
@@ -41,13 +49,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1127'
+ - id: '1045'
test_id: '1013'
test_type: Small Missing Value Ct
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- id: '1184'
test_id: '1013'
@@ -57,11 +65,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
- - id: '1285'
- test_id: '1013'
- test_type: Small Missing Value Ct
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
index ef3d5d28..c56cafdc 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
@@ -19,13 +19,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Validity
target_data_lookups:
- - id: '1055'
+ - id: '1295'
test_id: '1023'
test_type: Small_Numeric_Value_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ error_type: Profile Anomaly
+ - id: '1137'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- id: '1080'
test_id: '1023'
@@ -35,13 +43,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- - id: '1137'
+ - id: '1055'
test_id: '1023'
test_type: Small_Numeric_Value_Ct
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- id: '1194'
test_id: '1023'
@@ -51,11 +59,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
- - id: '1295'
- test_id: '1023'
- test_type: Small_Numeric_Value_Ct
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
index 63498405..648128c5 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
@@ -18,13 +18,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Uniqueness
target_data_lookups:
- - id: '1049'
+ - id: '1289'
test_id: '1017'
test_type: Standardized_Value_Matches
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1131'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC;
error_type: Profile Anomaly
- id: '1074'
test_id: '1017'
@@ -34,13 +42,13 @@ profile_anomaly_types:
lookup_query: |-
WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1131'
+ - id: '1049'
test_id: '1017'
test_type: Standardized_Value_Matches
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC;
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1188'
test_id: '1017'
@@ -50,11 +58,3 @@ profile_anomaly_types:
lookup_query: |-
WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1289'
- test_id: '1017'
- test_type: Standardized_Value_Matches
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
index 36a2e3fa..46d1b9bc 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
@@ -18,10 +18,18 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: null
target_data_lookups:
- - id: '1033'
+ - id: '1273'
test_id: '1001'
test_type: Suggested_Type
- sql_flavor: redshift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Profile Anomaly
+ - id: '1115'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
@@ -34,10 +42,10 @@ profile_anomaly_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;
error_type: Profile Anomaly
- - id: '1115'
+ - id: '1033'
test_id: '1001'
test_type: Suggested_Type
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
@@ -50,11 +58,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Profile Anomaly
- - id: '1273'
- test_id: '1001'
- test_type: Suggested_Type
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
index dfc98e26..ced56d76 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
@@ -23,13 +23,24 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: Validity
target_data_lookups:
- - id: '1040'
+ - id: '1280'
test_id: '1008'
test_type: Table_Pattern_Mismatch
- sql_flavor: redshift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: "SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns\
+ \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\
+ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\
+ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\
+ \ TABLE' ORDER BY table_name; "
+ error_type: Profile Anomaly
+ - id: '1122'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type;
+ SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name;
error_type: Profile Anomaly
- id: '1065'
test_id: '1008'
@@ -39,13 +50,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name;
error_type: Profile Anomaly
- - id: '1122'
+ - id: '1040'
test_id: '1008'
test_type: Table_Pattern_Mismatch
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name;
+ SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type;
error_type: Profile Anomaly
- id: '1179'
test_id: '1008'
@@ -58,14 +69,3 @@ profile_anomaly_types:
\ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\
\ TABLE' ORDER BY table_name; "
error_type: Profile Anomaly
- - id: '1280'
- test_id: '1008'
- test_type: Table_Pattern_Mismatch
- sql_flavor: databricks
- lookup_type: null
- lookup_query: "SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns\
- \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\
- \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\
- \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\
- \ TABLE' ORDER BY table_name; "
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml
similarity index 100%
rename from testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml
rename to testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml
index 6aac3fd8..c975eec0 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected Emails.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml
@@ -17,13 +17,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.33'
dq_dimension: Consistency
target_data_lookups:
- - id: '1054'
+ - id: '1294'
test_id: '1022'
test_type: Unexpected Emails
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1136'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Profile Anomaly
- id: '1079'
test_id: '1022'
@@ -33,13 +41,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1136'
+ - id: '1054'
test_id: '1022'
test_type: Unexpected Emails
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1193'
test_id: '1022'
@@ -49,11 +57,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1294'
- test_id: '1022'
- test_type: Unexpected Emails
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml
similarity index 100%
rename from testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml
rename to testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml
index 81a28c35..d74cea69 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected US States.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml
@@ -19,13 +19,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.33'
dq_dimension: Consistency
target_data_lookups:
- - id: '1053'
+ - id: '1293'
test_id: '1021'
test_type: Unexpected US States
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1135'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Profile Anomaly
- id: '1078'
test_id: '1021'
@@ -35,13 +43,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1135'
+ - id: '1053'
test_id: '1021'
test_type: Unexpected US States
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1192'
test_id: '1021'
@@ -51,11 +59,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1293'
- test_id: '1021'
- test_type: Unexpected US States
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
- error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
index c42f354b..d111361c 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
@@ -19,13 +19,21 @@ profile_anomaly_types:
dq_score_risk_factor: '0.66'
dq_dimension: Accuracy
target_data_lookups:
- - id: '1050'
+ - id: '1290'
test_id: '1018'
test_type: Unlikely_Date_Values
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Profile Anomaly
+ - id: '1132'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 500 "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Profile Anomaly
- id: '1075'
test_id: '1018'
@@ -35,13 +43,13 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1132'
+ - id: '1050'
test_id: '1018'
test_type: Unlikely_Date_Values
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- id: '1189'
test_id: '1018'
@@ -51,11 +59,3 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
- - id: '1290'
- test_id: '1018'
- test_type: Unlikely_Date_Values
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
- error_type: Profile Anomaly
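The Unlikely_Date_Values lookups above all encode the same bound: a date is flagged when it falls before 1900-01-01 or more than 30 years after the profile run date. A minimal Python sketch of that rule (a hypothetical helper for illustration, not TestGen code):

```python
from datetime import date

# Flag a date as "unlikely" with the same bounds as the lookup queries:
# earlier than 1900-01-01, or more than 30 years past the profile run date.
# (Leap-day edge cases are ignored for brevity.)
def is_unlikely_date(value: date, profile_run_date: date) -> bool:
    lower_bound = date(1900, 1, 1)
    upper_bound = date(profile_run_date.year + 30,
                       profile_run_date.month, profile_run_date.day)
    return value < lower_bound or value > upper_bound

print(is_unlikely_date(date(2150, 1, 1), date(2025, 8, 6)))   # True
print(is_unlikely_date(date(2031, 5, 20), date(2025, 8, 6)))  # False
```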
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
index adab3e19..ae92b8b1 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
@@ -19,21 +19,13 @@ profile_anomaly_types:
dq_score_risk_factor: null
dq_dimension: Consistency
target_data_lookups:
- - id: '1229'
- test_id: '1027'
- test_type: Variant_Coded_Values
- sql_flavor: redshift
- lookup_type: null
- lookup_query: |-
- WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
- error_type: Profile Anomaly
- id: '1230'
test_id: '1027'
test_type: Variant_Coded_Values
- sql_flavor: snowflake
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}";
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}`;
error_type: Profile Anomaly
- id: '1231'
test_id: '1027'
@@ -51,11 +43,19 @@ profile_anomaly_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}";
error_type: Profile Anomaly
+ - id: '1229'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: redshift
+ lookup_type: null
+ lookup_query: |-
+ WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
- id: '1230'
test_id: '1027'
test_type: Variant_Coded_Values
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}`;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}";
error_type: Profile Anomaly
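The Databricks rewrite above swaps Snowflake's FLATTEN for EXPLODE(SPLIT(...)), but both parse the same thing: the pipe-delimited value list that follows the ': ' in the detail expression. A rough Python equivalent of that parsing step (hypothetical, illustration only):

```python
# Extract the candidate values from a detail expression such as
# "Variants found: YES| Yes | y", matching them case-insensitively,
# the same way the lookup queries split on ': ' and then on '|'.
def parse_detail_values(detail_expression: str) -> set[str]:
    _, _, value_list = detail_expression.partition(": ")
    return {value.strip().lower() for value in value_list.split("|")}

print(parse_detail_values("Variants found: YES| Yes | Y"))
# {'yes', 'y'}
```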
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
index 0e04bdf0..fb68a907 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
@@ -30,17 +30,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of group totals not matching aggregate value
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It's ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn't changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it's built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1245'
+ - id: '1333'
test_id: '1500'
test_type: Aggregate_Balance
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT *
@@ -61,10 +61,10 @@ test_types:
WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1246'
+ - id: '1247'
test_id: '1500'
test_type: Aggregate_Balance
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT *
@@ -85,10 +85,10 @@ test_types:
WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1247'
+ - id: '1248'
test_id: '1500'
test_type: Aggregate_Balance
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT *
@@ -109,10 +109,10 @@ test_types:
WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1248'
+ - id: '1245'
test_id: '1500'
test_type: Aggregate_Balance
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT *
@@ -133,10 +133,10 @@ test_types:
WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1333'
+ - id: '1246'
test_id: '1500'
test_type: Aggregate_Balance
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT *
@@ -158,13 +158,9 @@ test_types:
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
test_templates:
- - id: '2006'
- test_type: Aggregate_Balance
- sql_flavor: redshift
- template_name: ex_aggregate_match_same_generic.sql
- - id: '2106'
+ - id: '2406'
test_type: Aggregate_Balance
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_aggregate_match_same_generic.sql
- id: '2206'
test_type: Aggregate_Balance
@@ -174,7 +170,11 @@ test_types:
test_type: Aggregate_Balance
sql_flavor: postgresql
template_name: ex_aggregate_match_same_generic.sql
- - id: '2406'
+ - id: '2006'
test_type: Aggregate_Balance
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_same_generic.sql
+ - id: '2106'
+ test_type: Aggregate_Balance
+ sql_flavor: snowflake
template_name: ex_aggregate_match_same_generic.sql
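All five Aggregate_Balance lookups apply the same comparison: per-group totals from the two tables must match exactly, and a total present on one side but NULL or missing on the other also counts as a mismatch. A compact Python sketch of that rule over two per-group total maps (hypothetical names, not the generated SQL):

```python
# Return the group keys whose totals differ between the two tables; a key
# missing from one side behaves like a NULL total and is flagged.
def aggregate_balance_errors(totals: dict[str, float],
                             match_totals: dict[str, float]) -> list[str]:
    keys = set(totals) | set(match_totals)
    return sorted(key for key in keys
                  if totals.get(key) != match_totals.get(key))

raw = {"books": 120.0, "toys": 45.0, "games": 30.0}
rollup = {"books": 120.0, "toys": 40.0}
print(aggregate_balance_errors(raw, rollup))  # ['games', 'toys']
```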
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
index 8d7236ef..e293ba14 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
@@ -30,17 +30,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of group totals not matching aggregate value
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
    This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerance you set -- that the sum of a measure or count of a value remains sufficiently consistent between categories. You could use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 5% below to 10% above the prior month. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1245'
+ - id: '1248'
test_id: '1504'
test_type: Aggregate_Balance_Percent
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT *
@@ -63,10 +63,10 @@ test_types:
OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1246'
+ - id: '1247'
test_id: '1504'
test_type: Aggregate_Balance_Percent
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT *
@@ -89,10 +89,10 @@ test_types:
OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1247'
+ - id: '1248'
test_id: '1504'
test_type: Aggregate_Balance_Percent
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT *
@@ -115,10 +115,10 @@ test_types:
OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1248'
+ - id: '1245'
test_id: '1504'
test_type: Aggregate_Balance_Percent
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT *
@@ -141,10 +141,10 @@ test_types:
OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1248'
+ - id: '1246'
test_id: '1504'
test_type: Aggregate_Balance_Percent
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT *
@@ -168,13 +168,9 @@ test_types:
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
test_templates:
- - id: '2009'
- test_type: Aggregate_Balance_Percent
- sql_flavor: redshift
- template_name: ex_aggregate_match_percent_generic.sql
- - id: '2109'
+ - id: '2409'
test_type: Aggregate_Balance_Percent
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_aggregate_match_percent_generic.sql
- id: '2209'
test_type: Aggregate_Balance_Percent
@@ -184,7 +180,11 @@ test_types:
test_type: Aggregate_Balance_Percent
sql_flavor: postgresql
template_name: ex_aggregate_match_percent_generic.sql
- - id: '2409'
+ - id: '2009'
test_type: Aggregate_Balance_Percent
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_percent_generic.sql
+ - id: '2109'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: snowflake
template_name: ex_aggregate_match_percent_generic.sql
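The Aggregate_Balance_Percent lookups flag a group when its total falls outside match_total * (1 + LOWER_TOLERANCE/100) to match_total * (1 + UPPER_TOLERANCE/100). Assuming the lower tolerance is supplied as a negative percentage (the usage notes' "5% below to 10% above" example would be -5 and 10), a small worked sketch in Python:

```python
# A group total passes when it sits inside the percentage band derived from
# the reference total; tolerances are percentages, the lower one typically negative.
def within_percent_tolerance(total: float, match_total: float,
                             lower_pct: float, upper_pct: float) -> bool:
    low = match_total * (1 + lower_pct / 100.0)
    high = match_total * (1 + upper_pct / 100.0)
    return low <= total <= high

# Prior month sold 1000 units of a product; -5%/+10% accepts 950..1100.
print(within_percent_tolerance(1080, 1000, lower_pct=-5, upper_pct=10))  # True
print(within_percent_tolerance(920, 1000, lower_pct=-5, upper_pct=10))   # False
```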
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
index 0f5b5c43..a0976a2c 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
@@ -30,17 +30,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of group totals not matching aggregate value
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerances you define as specific values above or below the aggregate measure for the same categories in the reference dataset -- that the sum of a measure or count of a value remains sufficiently consistent between categories. For instance, you can use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 10000 dollars above or below the prior week. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- id: '1245'
test_id: '1505'
test_type: Aggregate_Balance_Range
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT *
@@ -63,10 +63,10 @@ test_types:
OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1246'
+ - id: '1247'
test_id: '1505'
test_type: Aggregate_Balance_Range
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT *
@@ -89,10 +89,10 @@ test_types:
OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1247'
+ - id: '1248'
test_id: '1505'
test_type: Aggregate_Balance_Range
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT *
@@ -115,10 +115,10 @@ test_types:
OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1248'
+ - id: '1245'
test_id: '1505'
test_type: Aggregate_Balance_Range
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT *
@@ -141,10 +141,10 @@ test_types:
OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1245'
+ - id: '1246'
test_id: '1505'
test_type: Aggregate_Balance_Range
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT *
@@ -168,13 +168,9 @@ test_types:
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
test_templates:
- - id: '2010'
- test_type: Aggregate_Balance_Range
- sql_flavor: redshift
- template_name: ex_aggregate_match_range_generic.sql
- - id: '2110'
+ - id: '2410'
test_type: Aggregate_Balance_Range
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_aggregate_match_range_generic.sql
- id: '2210'
test_type: Aggregate_Balance_Range
@@ -184,7 +180,11 @@ test_types:
test_type: Aggregate_Balance_Range
sql_flavor: postgresql
template_name: ex_aggregate_match_range_generic.sql
- - id: '2410'
+ - id: '2010'
test_type: Aggregate_Balance_Range
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_range_generic.sql
+ - id: '2110'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: snowflake
template_name: ex_aggregate_match_range_generic.sql
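Aggregate_Balance_Range is the additive variant of the same check: the band is match_total + LOWER_TOLERANCE to match_total + UPPER_TOLERANCE rather than a percentage. A one-function sketch under the same assumption that the lower tolerance is negative:

```python
# Absolute tolerance band around the reference total, mirroring
# "total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}".
def within_range_tolerance(total: float, match_total: float,
                           lower: float, upper: float) -> bool:
    return match_total + lower <= total <= match_total + upper

# Prior week sold $250,000 of a product; +/- $10,000 accepts 240,000..260,000.
print(within_range_tolerance(255_000, 250_000, lower=-10_000, upper=10_000))  # True
print(within_range_tolerance(225_000, 250_000, lower=-10_000, upper=10_000))  # False
```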
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
index 09bbc2a2..79c41e7f 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
@@ -30,17 +30,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of group totals below aggregate value
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1249'
+ - id: '1334'
test_id: '1501'
test_type: Aggregate_Minimum
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT *
@@ -61,10 +61,10 @@ test_types:
WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1250'
+ - id: '1251'
test_id: '1501'
test_type: Aggregate_Minimum
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT *
@@ -85,10 +85,10 @@ test_types:
WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1251'
+ - id: '1252'
test_id: '1501'
test_type: Aggregate_Minimum
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT *
@@ -109,10 +109,10 @@ test_types:
WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1252'
+ - id: '1249'
test_id: '1501'
test_type: Aggregate_Minimum
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT *
@@ -133,10 +133,10 @@ test_types:
WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
- - id: '1334'
+ - id: '1250'
test_id: '1501'
test_type: Aggregate_Minimum
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT *
@@ -158,13 +158,9 @@ test_types:
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
test_templates:
- - id: '2002'
- test_type: Aggregate_Minimum
- sql_flavor: redshift
- template_name: ex_aggregate_match_no_drops_generic.sql
- - id: '2102'
+ - id: '2402'
test_type: Aggregate_Minimum
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_aggregate_match_no_drops_generic.sql
- id: '2202'
test_type: Aggregate_Minimum
@@ -174,7 +170,11 @@ test_types:
test_type: Aggregate_Minimum
sql_flavor: postgresql
template_name: ex_aggregate_match_no_drops_generic.sql
- - id: '2402'
+ - id: '2002'
test_type: Aggregate_Minimum
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_aggregate_match_no_drops_generic.sql
+ - id: '2102'
+ test_type: Aggregate_Minimum
+ sql_flavor: snowflake
template_name: ex_aggregate_match_no_drops_generic.sql
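Aggregate_Minimum relaxes the exact match: a group fails only when its total drops below the reference total, or disappears while the reference still has a value; increases and new categories are allowed. A minimal sketch of that condition, mirroring the WHERE clause in the lookups above:

```python
from typing import Optional

# Mirror "total < match_total OR (total IS NULL AND match_total IS NOT NULL)".
def aggregate_minimum_fails(total: Optional[float],
                            match_total: Optional[float]) -> bool:
    if total is None:
        return match_total is not None
    if match_total is None:
        return False
    return total < match_total

print(aggregate_minimum_fails(90.0, 100.0))   # True: total dropped
print(aggregate_minimum_fails(120.0, 100.0))  # False: increases are fine
print(aggregate_minimum_fails(None, 100.0))   # True: the group vanished
```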
diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
index e59479d9..74dc41ce 100644
--- a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
@@ -29,23 +29,15 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Maximum length expected
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Alpha Truncation tests that the longest text value in a column hasn't become shorter than the defined threshold, initially 95% of the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1001'
- test_type: Alpha_Trunc
- sql_flavor: redshift
- measure: |-
- MAX(LENGTH({COLUMN_NAME}))
- test_operator: <
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2001'
+ - id: '6001'
test_type: Alpha_Trunc
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
MAX(LENGTH({COLUMN_NAME}))
test_operator: <
@@ -67,30 +59,46 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5001'
+ - id: '1001'
test_type: Alpha_Trunc
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
MAX(LENGTH({COLUMN_NAME}))
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6001'
+ - id: '2001'
test_type: Alpha_Trunc
- sql_flavor: databricks
+ sql_flavor: snowflake
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5001'
+ test_type: Alpha_Trunc
+ sql_flavor: trino
measure: |-
MAX(LENGTH({COLUMN_NAME}))
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1001'
+ - id: '1298'
test_id: '1004'
test_type: Alpha_Trunc
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ error_type: Test Results
+ - id: '1140'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;
error_type: Test Results
- id: '1083'
test_id: '1004'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
error_type: Test Results
- - id: '1140'
+ - id: '1001'
test_id: '1004'
test_type: Alpha_Trunc
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;
+ SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
error_type: Test Results
- id: '1197'
test_id: '1004'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
error_type: Test Results
- - id: '1298'
- test_id: '1004'
- test_type: Alpha_Trunc
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
- error_type: Test Results
test_templates: []
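Every Alpha_Trunc condition above reduces to the same check: the longest current text value must not be shorter than the threshold, which defaults to roughly 95% of the longest value seen at baseline profiling. A minimal Python sketch of that comparison (hypothetical helper, not TestGen code):

```python
# The test fails when the longest current value is shorter than the threshold.
def alpha_trunc_fails(values: list[str], threshold_length: int) -> bool:
    current_max = max((len(v) for v in values), default=0)
    return current_max < threshold_length

# Baseline longest value was 40 characters -> threshold of 38 (95%, rounded).
print(alpha_trunc_fails(["short", "a" * 35], threshold_length=38))  # True
print(alpha_trunc_fails(["a" * 40], threshold_length=38))           # False
```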
diff --git a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
index bdc7adfc..5dbc252f 100644
--- a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
@@ -30,25 +30,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Standardized Difference Measure
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen's D, a statistical technique to identify significant shifts in a value. Cohen's D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it's reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1002'
- test_type: Avg_Shift
- sql_flavor: redshift
- measure: |-
- ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
- test_operator: '>='
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2002'
+ - id: '6002'
test_type: Avg_Shift
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV({COLUMN_NAME}::FLOAT),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV_SAMP({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
@@ -68,30 +60,46 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5002'
+ - id: '1002'
test_type: Avg_Shift
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- ABS( (CAST(AVG({COLUMN_NAME} AS REAL)) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*STDDEV({COLUMN_NAME})^2 + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * CAST({BASELINE_SD} AS REAL)^2) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) ))
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME}::FLOAT)^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6002'
+ - id: '2002'
test_type: Avg_Shift
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV_SAMP({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*POWER(STDDEV({COLUMN_NAME}::FLOAT),2) + ({BASELINE_VALUE_CT}::FLOAT-1) * POWER({BASELINE_SD}::FLOAT,2)) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5002'
+ test_type: Avg_Shift
+ sql_flavor: trino
+ measure: |-
+ ABS( (CAST(AVG({COLUMN_NAME} AS REAL)) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*STDDEV({COLUMN_NAME})^2 + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * CAST({BASELINE_SD} AS REAL)^2) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) ))
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1002'
+ - id: '1299'
test_id: '1005'
test_type: Avg_Shift
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1141'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- id: '1084'
test_id: '1005'
@@ -101,13 +109,13 @@ test_types:
lookup_query: |-
SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1141'
+ - id: '1002'
test_id: '1005'
test_type: Avg_Shift
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- id: '1198'
test_id: '1005'
@@ -117,12 +125,4 @@ test_types:
lookup_query: |-
SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1299'
- test_id: '1005'
- test_type: Avg_Shift
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
- error_type: Test Results
test_templates: []
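The Avg_Shift measures above all compute Cohen's d with a pooled standard deviation: the absolute difference between the current and baseline averages, divided by the square root of the count-weighted blend of the two variances. A Python mirror of the SQL expression (a sketch with made-up numbers, not project code):

```python
import math
from statistics import mean, stdev

# Cohen's d against the baseline profile, using the same pooled-variance
# denominator as the SQL measure: ((n-1)*sd^2 + (n0-1)*sd0^2) / (n + n0).
def avg_shift(current: list[float], baseline_avg: float,
              baseline_n: int, baseline_sd: float) -> float:
    n = len(current)
    pooled_var = ((n - 1) * stdev(current) ** 2
                  + (baseline_n - 1) * baseline_sd ** 2) / (n + baseline_n)
    return abs((mean(current) - baseline_avg) / math.sqrt(pooled_var))

# A value around 0.8 or more is conventionally read as a "large" shift.
print(round(avg_shift([12.0, 14.0, 15.0, 16.0], baseline_avg=10.0,
                      baseline_n=4, baseline_sd=1.5), 2))
```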
diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
index 6d5454ec..e7f65499 100644
--- a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
@@ -31,21 +31,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of errors found by custom query
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
    This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow up.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups: []
test_templates:
- - id: '2004'
- test_type: CUSTOM
- sql_flavor: redshift
- template_name: ex_custom_query_generic.sql
- - id: '2104'
+ - id: '2404'
test_type: CUSTOM
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_custom_query_generic.sql
- id: '2204'
test_type: CUSTOM
@@ -55,7 +51,11 @@ test_types:
test_type: CUSTOM
sql_flavor: postgresql
template_name: ex_custom_query_generic.sql
- - id: '2404'
+ - id: '2004'
test_type: CUSTOM
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_custom_query_generic.sql
+ - id: '2104'
+ test_type: CUSTOM
+ sql_flavor: snowflake
template_name: ex_custom_query_generic.sql
diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
index 94d27b92..05c1ad6e 100644
--- a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
@@ -30,17 +30,17 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of non-matching value combinations
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1253'
+ - id: '1335'
test_id: '1502'
test_type: Combo_Match
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT *
@@ -58,10 +58,10 @@ test_types:
) test
ORDER BY {COLUMN_NAME_NO_QUOTES};
error_type: Test Results
- - id: '1254'
+ - id: '1255'
test_id: '1502'
test_type: Combo_Match
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT *
@@ -79,10 +79,10 @@ test_types:
) test
ORDER BY {COLUMN_NAME_NO_QUOTES};
error_type: Test Results
- - id: '1255'
+ - id: '1256'
test_id: '1502'
test_type: Combo_Match
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT *
@@ -100,10 +100,10 @@ test_types:
) test
ORDER BY {COLUMN_NAME_NO_QUOTES};
error_type: Test Results
- - id: '1256'
+ - id: '1253'
test_id: '1502'
test_type: Combo_Match
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT *
@@ -121,10 +121,10 @@ test_types:
) test
ORDER BY {COLUMN_NAME_NO_QUOTES};
error_type: Test Results
- - id: '1335'
+ - id: '1254'
test_id: '1502'
test_type: Combo_Match
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT *
@@ -143,13 +143,9 @@ test_types:
ORDER BY {COLUMN_NAME_NO_QUOTES};
error_type: Test Results
test_templates:
- - id: '2001'
- test_type: Combo_Match
- sql_flavor: redshift
- template_name: ex_data_match_generic.sql
- - id: '2101'
+ - id: '2401'
test_type: Combo_Match
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_data_match_generic.sql
- id: '2201'
test_type: Combo_Match
@@ -159,7 +155,11 @@ test_types:
test_type: Combo_Match
sql_flavor: postgresql
template_name: ex_data_match_generic.sql
- - id: '2401'
+ - id: '2001'
test_type: Combo_Match
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_data_match_generic.sql
+ - id: '2101'
+ test_type: Combo_Match
+ sql_flavor: snowflake
template_name: ex_data_match_generic.sql
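Combo_Match is effectively a referential-integrity check: every value combination present in the main table must also appear in the reference table, while extra combinations in the reference table are ignored. The same idea as a set difference in Python (schematic only, with invented example data):

```python
# Combinations found in the main table but absent from the reference table
# are the errors reported by the lookup query.
def combo_match_errors(main_combos: set[tuple],
                       reference_combos: set[tuple]) -> set[tuple]:
    return main_combos - reference_combos

main = {("shirt", "M", "blue"), ("shirt", "L", "red"), ("hat", "S", "green")}
reference = {("shirt", "M", "blue"), ("shirt", "L", "red")}
print(combo_match_errors(main, reference))  # {('hat', 'S', 'green')}
```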
diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
index 69bc6af2..91f8836d 100644
--- a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
@@ -30,97 +30,97 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Count of records that don't meet test condition
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1003'
+ - id: '6003'
test_type: Condition_Flag
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2003'
+ - id: '3003'
test_type: Condition_Flag
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3003'
+ - id: '4003'
test_type: Condition_Flag
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4003'
+ - id: '1003'
test_type: Condition_Flag
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5003'
+ - id: '2003'
test_type: Condition_Flag
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
- SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END)
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6003'
+ - id: '5003'
test_type: Condition_Flag
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
- SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1003'
+ - id: '1300'
test_id: '1006'
test_type: Condition_Flag
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
error_type: Test Results
- - id: '1085'
+ - id: '1142'
test_id: '1006'
test_type: Condition_Flag
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
+ SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY};
error_type: Test Results
- - id: '1142'
+ - id: '1085'
test_id: '1006'
test_type: Condition_Flag
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY};
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
error_type: Test Results
- - id: '1199'
+ - id: '1003'
test_id: '1006'
test_type: Condition_Flag
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
error_type: Test Results
- - id: '1300'
+ - id: '1199'
test_id: '1006'
test_type: Condition_Flag
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
index 4d41239c..848cc813 100644
--- a/testgen/template/dbsetup_test_types/test_types_Constant.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
@@ -29,99 +29,99 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Count of records with unexpected values
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1004'
+ - id: '6004'
test_type: Constant
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2004'
+ - id: '3004'
test_type: Constant
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3004'
+ - id: '4004'
test_type: Constant
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4004'
+ - id: '1004'
test_type: Constant
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5004'
+ - id: '2004'
test_type: Constant
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6004'
+ - id: '5004'
test_type: Constant
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1004'
+ - id: '1301'
test_id: '1007'
test_type: Constant
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
error_type: Test Results
- - id: '1086'
+ - id: '1143'
test_id: '1007'
test_type: Constant
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- - id: '1143'
+ - id: '1086'
test_id: '1007'
test_type: Constant
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1200'
+ - id: '1004'
test_id: '1007'
test_type: Constant
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1301'
+ - id: '1200'
test_id: '1007'
test_type: Constant
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
test_templates: []
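Condition_Flag and Constant share the same aggregate-condition shape: count the offending rows with SUM(CASE WHEN ... THEN 1 ELSE 0 END) and fail the test when that count exceeds the threshold. For Constant the offending condition is simply "value differs from the baseline constant"; a tiny sketch of the evaluation (NULL handling simplified):

```python
# Count values that differ from the baseline constant and compare the count
# to the threshold, the way the cat_test_conditions above are evaluated.
def constant_test_fails(values: list[str], baseline_value: str,
                        threshold: int = 0) -> bool:
    violations = sum(1 for v in values if v != baseline_value)
    return violations > threshold

print(constant_test_fails(["USD", "USD", "USD"], "USD"))  # False
print(constant_test_fails(["USD", "USD", "EUR"], "USD"))  # True
```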
diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
index cbe772c2..735a8ee5 100644
--- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
@@ -29,6 +29,8 @@ test_types:
health_dimension: Volume
threshold_description: |-
Missing calendar days within min/max range
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: "Daily Records tests that at least one record is present for every\
\ day within the minimum and maximum date range for the column. The test is relevant\
\ for transactional data, where you would expect at least one transaction to be\
@@ -36,22 +38,12 @@ test_types:
\ of days identified without data. You can adjust the threshold to accept a number\
\ of days that you know legitimately have no records. "
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1005'
- test_type: Daily_Record_Ct
- sql_flavor: redshift
- measure: |-
- DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2005'
+ - id: '6005'
test_type: Daily_Record_Ct
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+ <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
@@ -71,38 +63,38 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5005'
+ - id: '1005'
test_type: Daily_Record_Ct
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- DATE_DIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+ DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6005'
+ - id: '2005'
test_type: Daily_Record_Ct
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})
+ DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- target_data_lookups:
- - id: '1005'
- test_id: '1009'
+ - id: '5005'
test_type: Daily_Record_Ct
- sql_flavor: redshift
- lookup_type: null
- lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;
- error_type: Test Results
- - id: '1087'
+ sql_flavor: trino
+ measure: |-
+ DATE_DIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1302'
test_id: '1009'
test_type: Daily_Record_Ct
- sql_flavor: postgresql
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500;
+ WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM {TARGET_SCHEMA}.{TABLE_NAME}), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500;
error_type: Test Results
- id: '1144'
test_id: '1009'
@@ -152,20 +144,28 @@ test_types:
OR next_record_ct = 0
ORDER BY check_period DESC;
error_type: Test Results
- - id: '1201'
+ - id: '1087'
test_id: '1009'
test_type: Daily_Record_Ct
- sql_flavor: snowflake
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500;
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500;
error_type: Test Results
- - id: '1302'
+ - id: '1005'
test_id: '1009'
test_type: Daily_Record_Ct
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM {TARGET_SCHEMA}.{TABLE_NAME}), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500;
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;
+ error_type: Test Results
+ - id: '1201'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500;
error_type: Test Results
test_templates: []
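
The Daily_Record_Ct measures above all compute the same quantity for every flavor: the expected number of calendar days between the column's min and max (DATEDIFF + 1) minus the distinct days actually present. A minimal Python sketch of that arithmetic; the function name and sample dates are illustrative, not part of the templates:

    from datetime import date

    def missing_day_count(values: list[date]) -> int:
        """Calendar days in [min, max] with no record: DATEDIFF(day, min, max) + 1 - COUNT(DISTINCT)."""
        distinct = set(values)
        expected = (max(distinct) - min(distinct)).days + 1
        return expected - len(distinct)

    # 2024-01-01 .. 2024-01-05 with the 3rd missing -> 1 day without records
    sample = [date(2024, 1, 1), date(2024, 1, 2), date(2024, 1, 4), date(2024, 1, 5)]
    assert missing_day_count(sample) == 1
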
diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
index 0f20c746..e0f5818d 100644
--- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
@@ -30,76 +30,68 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Minimum expected sum of all fractional values
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1006'
+ - id: '6006'
test_type: Dec_Trunc
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2006'
+ - id: '3006'
test_type: Dec_Trunc
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3006'
+ - id: '4006'
test_type: Dec_Trunc
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4006'
+ - id: '1006'
test_type: Dec_Trunc
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5006'
+ - id: '2006'
test_type: Dec_Trunc
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6006'
+ - id: '5006'
test_type: Dec_Trunc
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1006'
- test_id: '1011'
- test_type: Dec_Trunc
- sql_flavor: redshift
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;
- error_type: Test Results
- - id: '1088'
+ - id: '1303'
test_id: '1011'
test_type: Dec_Trunc
- sql_flavor: postgresql
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
+ SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
error_type: Test Results
- id: '1145'
test_id: '1011'
@@ -111,20 +103,28 @@ test_types:
\ ) SELECT DISTINCT TOP 500 decimal_scale,COUNT(*) AS count FROM cte GROUP BY\
\ decimal_scale ORDER BY COUNT(*) DESC; "
error_type: Test Results
- - id: '1202'
+ - id: '1088'
test_id: '1011'
test_type: Dec_Trunc
- sql_flavor: snowflake
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
error_type: Test Results
- - id: '1303'
+ - id: '1006'
test_id: '1011'
test_type: Dec_Trunc
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
+ SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;
+ error_type: Test Results
+ - id: '1202'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY decimal_scale LIMIT 500;
error_type: Test Results
test_templates: []
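
The Dec_Trunc measure shared by every flavor above is SUM(ROUND(ABS(value % 1), 5)) + 1: the fractional parts of the column summed, plus one so an all-integer column still yields a usable baseline. A hedged Python equivalent (function name and sample values are invented for illustration):

    import math

    def fractional_sum(values: list[float]) -> float:
        """Mirror of SUM(ROUND(ABS(v % 1), 5)) + 1 from the Dec_Trunc cat_test_conditions."""
        return sum(round(abs(math.fmod(v, 1)), 5) for v in values) + 1

    baseline = fractional_sum([1.25, 2.50, 3.75])  # 0.25 + 0.5 + 0.75 + 1 = 2.5
    current = fractional_sum([1.0, 2.0, 3.0])      # 0 + 1 = 1.0
    assert current < baseline                      # decimals appear to have been truncated
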
diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
index 339d8f62..7c1c8794 100644
--- a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
@@ -30,99 +30,99 @@ test_types:
health_dimension: Recency
threshold_description: |-
Minimum distinct date count expected
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1007'
+ - id: '6007'
test_type: Distinct_Date_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2007'
+ - id: '3007'
test_type: Distinct_Date_Ct
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3007'
+ - id: '4007'
test_type: Distinct_Date_Ct
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4007'
+ - id: '1007'
test_type: Distinct_Date_Ct
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5007'
+ - id: '2007'
test_type: Distinct_Date_Ct
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6007'
+ - id: '5007'
test_type: Distinct_Date_Ct
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1007'
+ - id: '1304'
test_id: '1012'
test_type: Distinct_Date_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
error_type: Test Results
- - id: '1089'
+ - id: '1146'
test_id: '1012'
test_type: Distinct_Date_Ct
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- - id: '1146'
+ - id: '1089'
test_id: '1012'
test_type: Distinct_Date_Ct
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
- - id: '1203'
+ - id: '1007'
test_id: '1012'
test_type: Distinct_Date_Ct
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
- - id: '1304'
+ - id: '1203'
test_id: '1012'
test_type: Distinct_Date_Ct
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
index 95bc9080..bde871d8 100644
--- a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
@@ -29,68 +29,76 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected distinct value count
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1008'
+ - id: '6008'
test_type: Distinct_Value_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2008'
+ - id: '3008'
test_type: Distinct_Value_Ct
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3008'
+ - id: '4008'
test_type: Distinct_Value_Ct
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4008'
+ - id: '1008'
test_type: Distinct_Value_Ct
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5008'
+ - id: '2008'
test_type: Distinct_Value_Ct
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6008'
+ - id: '5008'
test_type: Distinct_Value_Ct
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
COUNT(DISTINCT {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1008'
+ - id: '1305'
test_id: '1013'
test_type: Distinct_Value_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1147'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- id: '1090'
test_id: '1013'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
- - id: '1147'
+ - id: '1008'
test_id: '1013'
test_type: Distinct_Value_Ct
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
- id: '1204'
test_id: '1013'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
- - id: '1305'
- test_id: '1013'
- test_type: Distinct_Value_Ct
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
index 52e3cfc5..098be2e6 100644
--- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
@@ -31,17 +31,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected maximum divergence level between 0 and 1
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1257'
+ - id: '1336'
test_id: '1503'
test_type: Distribution_Shift
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
WITH latest_ver
@@ -64,21 +64,21 @@ test_types:
ON (l.category = o.category)
ORDER BY COALESCE(l.category, o.category)
error_type: Test Results
- - id: '1258'
+ - id: '1259'
test_id: '1503'
test_type: Distribution_Shift
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
WITH latest_ver
AS ( SELECT {CONCAT_COLUMNS} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
WHERE {SUBSET_CONDITION}
GROUP BY {COLUMN_NAME_NO_QUOTES} ),
older_ver
AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
WHERE {MATCH_SUBSET_CONDITION}
GROUP BY {MATCH_GROUPBY_NAMES} )
@@ -90,21 +90,21 @@ test_types:
ON (l.category = o.category)
ORDER BY COALESCE(l.category, o.category)
error_type: Test Results
- - id: '1259'
+ - id: '1260'
test_id: '1503'
test_type: Distribution_Shift
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
WITH latest_ver
AS ( SELECT {CONCAT_COLUMNS} as category,
- CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
WHERE {SUBSET_CONDITION}
GROUP BY {COLUMN_NAME_NO_QUOTES} ),
older_ver
AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
- CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
WHERE {MATCH_SUBSET_CONDITION}
GROUP BY {MATCH_GROUPBY_NAMES} )
@@ -116,10 +116,10 @@ test_types:
ON (l.category = o.category)
ORDER BY COALESCE(l.category, o.category)
error_type: Test Results
- - id: '1260'
+ - id: '1257'
test_id: '1503'
test_type: Distribution_Shift
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
WITH latest_ver
@@ -142,10 +142,10 @@ test_types:
ON (l.category = o.category)
ORDER BY COALESCE(l.category, o.category)
error_type: Test Results
- - id: '1336'
+ - id: '1258'
test_id: '1503'
test_type: Distribution_Shift
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
WITH latest_ver
@@ -169,13 +169,9 @@ test_types:
ORDER BY COALESCE(l.category, o.category)
error_type: Test Results
test_templates:
- - id: '2003'
- test_type: Distribution_Shift
- sql_flavor: redshift
- template_name: ex_relative_entropy_generic.sql
- - id: '2103'
+ - id: '2403'
test_type: Distribution_Shift
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_relative_entropy_generic.sql
- id: '2203'
test_type: Distribution_Shift
@@ -185,7 +181,11 @@ test_types:
test_type: Distribution_Shift
sql_flavor: postgresql
template_name: ex_relative_entropy_generic.sql
- - id: '2403'
+ - id: '2003'
test_type: Distribution_Shift
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_relative_entropy_generic.sql
+ - id: '2103'
+ test_type: Distribution_Shift
+ sql_flavor: snowflake
template_name: ex_relative_entropy_generic.sql
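
The Distribution_Shift lookup queries above only produce the two category distributions (pct_of_total for the latest and older versions); the divergence itself is computed downstream via ex_relative_entropy_generic.sql. For reference, a plain-Python sketch of the Jensen-Shannon divergence named in the usage notes — not necessarily byte-for-byte what the template computes, just the metric it is based on:

    from math import log2

    def js_divergence(p: dict[str, float], q: dict[str, float]) -> float:
        """Jensen-Shannon divergence of two categorical distributions: 0 = identical, 1 = disjoint."""
        cats = set(p) | set(q)
        m = {c: (p.get(c, 0.0) + q.get(c, 0.0)) / 2 for c in cats}  # mixture distribution

        def kl(a: dict[str, float]) -> float:
            return sum(a[c] * log2(a[c] / m[c]) for c in cats if a[c] > 0)

        return 0.5 * kl({c: p.get(c, 0.0) for c in cats}) + 0.5 * kl({c: q.get(c, 0.0) for c in cats})

    latest = {"NY": 0.5, "CA": 0.3, "TX": 0.2}
    older = {"NY": 0.5, "CA": 0.3, "TX": 0.2}
    assert js_divergence(latest, older) == 0.0  # identical distributions score 0
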
diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
index a929a661..3705e014 100644
--- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
@@ -30,17 +30,17 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of duplicate value combinations
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test verifies that combinations of values are not repeated within the table. By default when auto-generated, the test considers all columns to protect against duplication of entire rows. If you know the minimum columns that should constitute a unique record, such as a set of ID's, you should use those to make the test as sensitive as possible. Alternatively, if you know of columns you can always exclude, such as file_date or refresh_snapshot_id, remove them to tighten the test somewhat.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1253'
+ - id: '1257'
test_id: '1510'
test_type: Dupe_Rows
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
@@ -50,10 +50,10 @@ test_types:
HAVING COUNT(*) > 1
ORDER BY {GROUPBY_NAMES}
error_type: Test Results
- - id: '1254'
+ - id: '1255'
test_id: '1510'
test_type: Dupe_Rows
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
@@ -63,10 +63,10 @@ test_types:
HAVING COUNT(*) > 1
ORDER BY {GROUPBY_NAMES}
error_type: Test Results
- - id: '1255'
+ - id: '1256'
test_id: '1510'
test_type: Dupe_Rows
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
@@ -76,10 +76,10 @@ test_types:
HAVING COUNT(*) > 1
ORDER BY {GROUPBY_NAMES}
error_type: Test Results
- - id: '1256'
+ - id: '1253'
test_id: '1510'
test_type: Dupe_Rows
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
@@ -89,10 +89,10 @@ test_types:
HAVING COUNT(*) > 1
ORDER BY {GROUPBY_NAMES}
error_type: Test Results
- - id: '1257'
+ - id: '1254'
test_id: '1510'
test_type: Dupe_Rows
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
@@ -103,13 +103,9 @@ test_types:
ORDER BY {GROUPBY_NAMES}
error_type: Test Results
test_templates:
- - id: '2011'
- test_type: Dupe_Rows
- sql_flavor: redshift
- template_name: ex_dupe_rows_generic.sql
- - id: '2111'
+ - id: '2411'
test_type: Dupe_Rows
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_dupe_rows_generic.sql
- id: '2211'
test_type: Dupe_Rows
@@ -119,7 +115,11 @@ test_types:
test_type: Dupe_Rows
sql_flavor: postgresql
template_name: ex_dupe_rows_generic.sql
- - id: '2411'
+ - id: '2011'
test_type: Dupe_Rows
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_dupe_rows_generic.sql
+ - id: '2111'
+ test_type: Dupe_Rows
+ sql_flavor: snowflake
template_name: ex_dupe_rows_generic.sql
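
The Dupe_Rows lookups and template above boil down to GROUP BY the chosen key columns HAVING COUNT(*) > 1. The usage note's advice about picking a minimal business key (and excluding snapshot columns like file_date) can be illustrated with a small, purely hypothetical sketch:

    from collections import Counter

    def duplicate_key_combos(rows: list[dict], key_columns: list[str]) -> dict[tuple, int]:
        """Key-column combinations appearing more than once, i.e. GROUP BY ... HAVING COUNT(*) > 1."""
        counts = Counter(tuple(row[col] for col in key_columns) for row in rows)
        return {combo: ct for combo, ct in counts.items() if ct > 1}

    rows = [
        {"order_id": 1, "line_no": 1, "file_date": "2024-01-01"},
        {"order_id": 1, "line_no": 1, "file_date": "2024-01-02"},  # same business key, new snapshot
        {"order_id": 2, "line_no": 1, "file_date": "2024-01-01"},
    ]
    # Grouping on the business key alone surfaces the duplicate that file_date would hide
    assert duplicate_key_combos(rows, ["order_id", "line_no"]) == {(1, 1): 2}
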
diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
index c32ab45b..7ce6ffc1 100644
--- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
@@ -30,24 +30,16 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of invalid email addresses
- usage_notes: null
- active: Y
result_visualization: line_chart
result_visualization_params: null
+ usage_notes: null
+ active: Y
cat_test_conditions:
- - id: '1009'
- test_type: Email_Format
- sql_flavor: redshift
- measure: |-
- SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END)
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2009'
+ - id: '6009'
test_type: Email_Format
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END)
+ SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -67,30 +59,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5009'
+ - id: '1009'
test_type: Email_Format
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') != TRUE THEN 1 ELSE 0 END)
+ SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6009'
+ - id: '2009'
test_type: Email_Format
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END)
+ SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5009'
+ test_type: Email_Format
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') != TRUE THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1009'
+ - id: '1306'
test_id: '1014'
test_type: Email_Format
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ - id: '1148'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1091'
test_id: '1014'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1148'
+ - id: '1009'
test_id: '1014'
test_type: Email_Format
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- id: '1205'
test_id: '1014'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1306'
- test_id: '1014'
- test_type: Email_Format
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
index aa1c8270..57f0acf5 100644
--- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
@@ -29,22 +29,14 @@ test_types:
health_dimension: Recency
threshold_description: |-
Expected count of future dates
- usage_notes: null
- active: Y
result_visualization: line_chart
result_visualization_params: null
+ usage_notes: null
+ active: Y
cat_test_conditions:
- - id: '1010'
- test_type: Future_Date
- sql_flavor: redshift
- measure: |-
- SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2010'
+ - id: '6010'
test_type: Future_Date
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
test_operator: '>'
@@ -66,30 +58,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5010'
+ - id: '1010'
test_type: Future_Date
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST('{RUN_DATE}' AS DATE) THEN 1 ELSE 0 END)
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6010'
+ - id: '2010'
test_type: Future_Date
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5010'
+ test_type: Future_Date
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST('{RUN_DATE}' AS DATE) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1010'
+ - id: '1307'
test_id: '1015'
test_type: Future_Date
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ - id: '1149'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1092'
test_id: '1015'
@@ -99,13 +107,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1149'
+ - id: '1010'
test_id: '1015'
test_type: Future_Date
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- id: '1206'
test_id: '1015'
@@ -115,12 +123,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1307'
- test_id: '1015'
- test_type: Future_Date
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
index d9be7bbe..e7cb1572 100644
--- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
@@ -29,23 +29,15 @@ test_types:
health_dimension: Recency
threshold_description: |-
Expected count of future dates beyond one year
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1011'
- test_type: Future_Date_1Y
- sql_flavor: redshift
- measure: |-
- SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2011'
+ - id: '6011'
test_type: Future_Date_1Y
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
test_operator: '>'
@@ -67,30 +59,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5011'
+ - id: '1011'
test_type: Future_Date_1Y
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE('{RUN_DATE}') + interval '365' day ) THEN 1 ELSE 0 END)
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6011'
+ - id: '2011'
test_type: Future_Date_1Y
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5011'
+ test_type: Future_Date_1Y
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE('{RUN_DATE}') + interval '365' day ) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1011'
+ - id: '1308'
test_id: '1016'
test_type: Future_Date_1Y
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ - id: '1150'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1093'
test_id: '1016'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1150'
+ - id: '1011'
test_id: '1016'
test_type: Future_Date_1Y
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- id: '1207'
test_id: '1016'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1308'
- test_id: '1016'
- test_type: Future_Date_1Y
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Test Results
test_templates: []
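
The SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365)))) measure used above is a branch-free count: SIGN is 1 only when the date lies more than 365 days past the run date, and GREATEST clamps everything else to 0, so the SUM is the number of offending rows. A small illustration with made-up dates (the trino variant counts >= the cutoff, so it can differ on the boundary day):

    from datetime import date, timedelta

    def beyond_one_year(value: date, run_date: date) -> int:
        """GREATEST(0, SIGN(value - (run_date + 365 days))): 1 if more than a year out, else 0."""
        delta = (value - (run_date + timedelta(days=365))).days
        sign = (delta > 0) - (delta < 0)
        return max(0, sign)

    run = date(2024, 1, 1)
    rows = [date(2024, 6, 1), date(2025, 6, 1), date(2026, 1, 1)]
    assert sum(beyond_one_year(d, run) for d in rows) == 2
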
diff --git a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
index f4051df5..71332c5f 100644
--- a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
@@ -30,23 +30,15 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Maximum Z-Score (number of SD's beyond mean) expected
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1012'
- test_type: Incr_Avg_Shift
- sql_flavor: redshift
- measure: |-
- NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
- test_operator: '>='
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2012'
+ - id: '6012'
test_type: Incr_Avg_Shift
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
test_operator: '>='
@@ -68,61 +60,69 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5012'
+ - id: '1012'
test_type: Incr_Avg_Shift
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6012'
+ - id: '2012'
test_type: Incr_Avg_Shift
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: trino
+ measure: |-
+ COALESCE(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1012'
+ - id: '1309'
test_id: '1017'
test_type: Incr_Avg_Shift
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1094'
+ - id: '1151'
test_id: '1017'
test_type: Incr_Avg_Shift
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_sum, NULLIF(CAST(COUNT("{COLUMN_NAME}") AS FLOAT), 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1151'
+ - id: '1094'
test_id: '1017'
test_type: Incr_Avg_Shift
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_sum, NULLIF(CAST(COUNT("{COLUMN_NAME}") AS FLOAT), 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1208'
+ - id: '1012'
test_id: '1017'
test_type: Incr_Avg_Shift
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1309'
+ - id: '1208'
test_id: '1017'
test_type: Incr_Avg_Shift
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
test_templates: []
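
The Incr_Avg_Shift measure above backs out the mean of only the rows added since baseline — (SUM(col) - BASELINE_SUM) / (COUNT(col) - BASELINE_VALUE_CT) — and expresses its distance from BASELINE_AVG in units of BASELINE_SD, i.e. a Z-score. A worked sketch with invented numbers (names are illustrative, not template parameters):

    def incremental_avg_z(current_sum: float, current_ct: float,
                          baseline_avg: float, baseline_sum: float,
                          baseline_ct: float, baseline_sd: float) -> float:
        """ABS((baseline_avg - avg_of_new_rows) / baseline_sd); 0 when there are no new rows."""
        new_ct = current_ct - baseline_ct
        if new_ct == 0 or baseline_sd == 0:
            return 0.0
        new_avg = (current_sum - baseline_sum) / new_ct
        return abs((baseline_avg - new_avg) / baseline_sd)

    # Baseline: 1,000 rows, avg 50, sd 5. Since then: 200 new rows summing to 13,000 (avg 65).
    z = incremental_avg_z(current_sum=63_000, current_ct=1_200,
                          baseline_avg=50, baseline_sum=50_000,
                          baseline_ct=1_000, baseline_sd=5)
    assert z == 3.0  # would fail a test with a Z-score threshold of 2
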
diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
index 54fa704b..e17caf4a 100644
--- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
@@ -27,25 +27,17 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
List of values expected, in form ('Val1','Val2')
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1013'
- test_type: LOV_All
- sql_flavor: redshift
- measure: |-
- LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
- test_operator: <>
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2013'
+ - id: '6013'
test_type: LOV_All
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
@@ -65,11 +57,19 @@ test_types:
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6013'
+ - id: '1013'
test_type: LOV_All
- sql_flavor: databricks
+ sql_flavor: redshift
measure: |-
- STRING_AGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2013'
+ test_type: LOV_All
+ sql_flavor: snowflake
+ measure: |-
+ LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
@@ -82,44 +82,44 @@ test_types:
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1013'
+ - id: '1310'
test_id: '1018'
test_type: LOV_All
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
+ SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT 500;
error_type: Test Results
- - id: '1095'
+ - id: '1152'
test_id: '1018'
test_type: LOV_All
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT 500;
+ WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}';
error_type: Test Results
- - id: '1152'
+ - id: '1095'
test_id: '1018'
test_type: LOV_All
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}';
+ SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT 500;
error_type: Test Results
- - id: '1209'
+ - id: '1013'
test_id: '1018'
test_type: LOV_All
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
error_type: Test Results
- - id: '1310'
+ - id: '1209'
test_id: '1018'
test_type: LOV_All
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT 500;
+ SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
error_type: Test Results
test_templates: []
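For reference, the LOV_All comparison above boils down to checking that the sorted, pipe-joined set of distinct column values equals the expected list; a rough Python approximation follows (function name and sample values are illustrative, not part of the templates):

def lov_all_matches(values, expected):
    """True when the distinct non-null values are exactly the expected set."""
    observed = sorted({v for v in values if v is not None})
    return "|".join(observed) == "|".join(sorted(expected))

print(lov_all_matches(["red", "blue", "red"], ["blue", "red"]))  # True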
diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
index 033c290a..4ee6b63d 100644
--- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
@@ -29,68 +29,76 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
List of values expected, in form ('Val1','Val2')
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1014'
+ - id: '6014'
test_type: LOV_Match
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2014'
+ - id: '3014'
test_type: LOV_Match
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3014'
+ - id: '4014'
test_type: LOV_Match
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4014'
+ - id: '1014'
test_type: LOV_Match
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5014'
+ - id: '2014'
test_type: LOV_Match
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6014'
+ - id: '5014'
test_type: LOV_Match
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1014'
+ - id: '1311'
test_id: '1019'
test_type: LOV_Match
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ - id: '1153'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ;
error_type: Test Results
- id: '1096'
test_id: '1019'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1153'
+ - id: '1014'
test_id: '1019'
test_type: LOV_Match
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ;
+ SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- id: '1210'
test_id: '1019'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1311'
- test_id: '1019'
- test_type: LOV_Match
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Test Results
test_templates: []
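The LOV_Match measures above count non-blank values that fall outside the hard-coded list; a minimal Python sketch of the same counting logic (names and sample data are illustrative):

def lov_match_violations(values, allowed):
    """Count non-null, non-blank values that are not in the allowed list."""
    allowed_set = set(allowed)
    return sum(1 for v in values if v not in (None, "") and v not in allowed_set)

print(lov_match_violations(["red", "blue", "teal", ""], ["red", "blue"]))  # 1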
diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
index 0a5874dc..e54dcb22 100644
--- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
@@ -29,68 +29,76 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of dates prior to minimum
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It's appropriate where new records are added with more recent dates, but old dates do not change.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1015'
+ - id: '6015'
test_type: Min_Date
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2015'
+ - id: '3015'
test_type: Min_Date
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3015'
+ - id: '4015'
test_type: Min_Date
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4015'
+ - id: '1015'
test_type: Min_Date
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5015'
+ - id: '2015'
test_type: Min_Date
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
- SUM(CASE WHEN {COLUMN_NAME} < CAST('{BASELINE_VALUE}' AS DATE) THEN 1 ELSE 0 END)
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6015'
+ - id: '5015'
test_type: Min_Date
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
- SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ SUM(CASE WHEN {COLUMN_NAME} < CAST('{BASELINE_VALUE}' AS DATE) THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1015'
+ - id: '1312'
test_id: '1020'
test_type: Min_Date
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ - id: '1154'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1097'
test_id: '1020'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1154'
+ - id: '1015'
test_id: '1020'
test_type: Min_Date
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- id: '1211'
test_id: '1020'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1312'
- test_id: '1020'
- test_type: Min_Date
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Test Results
test_templates: []
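The Min_Date measures above simply count dates earlier than the baseline minimum; a Python sketch under that reading (helper name and sample dates are illustrative):

from datetime import date

def count_before_min_date(values, baseline_min):
    """Count non-null dates that fall before the baseline minimum date."""
    return sum(1 for d in values if d is not None and d < baseline_min)

print(count_before_min_date([date(2023, 12, 31), date(2024, 1, 2)], date(2024, 1, 1)))  # 1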
diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
index effe37b6..d63d7db0 100644
--- a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
@@ -29,99 +29,99 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of values under limit
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1016'
+ - id: '6016'
test_type: Min_Val
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2016'
+ - id: '3016'
test_type: Min_Val
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3016'
+ - id: '4016'
test_type: Min_Val
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4016'
+ - id: '1016'
test_type: Min_Val
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5016'
+ - id: '2016'
test_type: Min_Val
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6016'
+ - id: '5016'
test_type: Min_Val
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1016'
+ - id: '1313'
test_id: '1021'
test_type: Min_Val
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500;
error_type: Test Results
- - id: '1098'
+ - id: '1155'
test_id: '1021'
test_type: Min_Val
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE};
error_type: Test Results
- - id: '1155'
+ - id: '1098'
test_id: '1021'
test_type: Min_Val
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE};
+ SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
error_type: Test Results
- - id: '1212'
+ - id: '1016'
test_id: '1021'
test_type: Min_Val
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
error_type: Test Results
- - id: '1313'
+ - id: '1212'
test_id: '1021'
test_type: Min_Val
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
error_type: Test Results
test_templates: []
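The Min_Val measures above count values below the baseline minimum, with a 1e-6 tolerance to absorb floating-point noise; a Python sketch of the same check (sample values are illustrative):

def count_below_minimum(values, baseline_min, tolerance=1e-6):
    """Count non-null values below the baseline minimum, allowing a small tolerance."""
    return sum(1 for v in values if v is not None and v < baseline_min - tolerance)

print(count_below_minimum([0.0, 1.5, -0.2, None], baseline_min=0.0))  # 1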
diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
index ba7b6fb7..ad4d5a02 100644
--- a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
@@ -30,25 +30,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected maximum Cohen's H Difference
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test uses Cohen's H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1017'
- test_type: Missing_Pct
- sql_flavor: redshift
- measure: |-
- ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
- test_operator: '>='
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2017'
+ - id: '6017'
test_type: Missing_Pct
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
@@ -68,61 +60,69 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5017'
+ - id: '1017'
test_type: Missing_Pct
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) )))
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6017'
+ - id: '2017'
test_type: Missing_Pct
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5017'
+ test_type: Missing_Pct
+ sql_flavor: trino
+ measure: |-
+ ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) )))
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1017'
+ - id: '1314'
test_id: '1022'
test_type: Missing_Pct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT 10;
error_type: Test Results
- - id: '1099'
+ - id: '1156'
test_id: '1022'
test_type: Missing_Pct
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT 10;
+ SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = '';
error_type: Test Results
- - id: '1156'
+ - id: '1099'
test_id: '1022'
test_type: Missing_Pct
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = '';
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT 10;
error_type: Test Results
- - id: '1213'
+ - id: '1017'
test_id: '1022'
test_type: Missing_Pct
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
error_type: Test Results
- - id: '1314'
+ - id: '1213'
test_id: '1022'
test_type: Missing_Pct
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT 10;
+ SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
error_type: Test Results
test_templates: []
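For context on the Missing_Pct measures above, the SQL expresses an absolute Cohen's H between the baseline and current non-null ratios; the same arithmetic can be sketched in Python as follows (function name and sample counts are illustrative, not part of the templates):

import math

def cohens_h(baseline_value_ct, baseline_ct, current_value_ct, current_ct):
    """Absolute Cohen's H between the baseline and current non-null ratios."""
    p1 = baseline_value_ct / baseline_ct
    p2 = current_value_ct / current_ct
    return abs(2.0 * math.asin(math.sqrt(p1)) - 2.0 * math.asin(math.sqrt(p2)))

# e.g. baseline 950/1000 non-null vs. current 880/1000 non-null
print(cohens_h(950, 1000, 880, 1000))  # small effect by the usual 0.2 / 0.5 / 0.8 rule of thumb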
diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
index 65f92f3b..58f8b1e4 100644
--- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
@@ -30,25 +30,17 @@ test_types:
health_dimension: Volume
threshold_description: |-
Expected maximum count of calendar months without dates present
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1018'
- test_type: Monthly_Rec_Ct
- sql_flavor: redshift
- measure: |-
- (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2018'
+ - id: '6018'
test_type: Monthly_Rec_Ct
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
+ (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -68,38 +60,38 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5018'
+ - id: '1018'
test_type: Monthly_Rec_Ct
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- (MAX(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) - MIN(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE)))
+ (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6018'
+ - id: '2018'
test_type: Monthly_Rec_Ct
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};'{RUN_DATE}'::DATE%>)
+ (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- target_data_lookups:
- - id: '1018'
- test_id: '1023'
+ - id: '5018'
test_type: Monthly_Rec_Ct
- sql_flavor: redshift
- lookup_type: null
- lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
- error_type: Test Results
- - id: '1100'
+ sql_flavor: trino
+ measure: |-
+ (MAX(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) - MIN(DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF('month', {COLUMN_NAME}, CAST('{RUN_DATE}' AS DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1315'
test_id: '1023'
test_type: Monthly_Rec_Ct
- sql_flavor: postgresql
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period;
error_type: Test Results
- id: '1157'
test_id: '1023'
@@ -149,20 +141,28 @@ test_types:
OR next_record_ct = 0
ORDER BY check_period DESC;
error_type: Test Results
- - id: '1214'
+ - id: '1100'
test_id: '1023'
test_type: Monthly_Rec_Ct
- sql_flavor: snowflake
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period;
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
error_type: Test Results
- - id: '1315'
+ - id: '1018'
test_id: '1023'
test_type: Monthly_Rec_Ct
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period;
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
+ - id: '1214'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period;
error_type: Test Results
test_templates: []
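The month-gap arithmetic behind the Monthly_Rec_Ct measures above (expected span of calendar months minus months actually observed) can be sketched in Python as follows; the helper name and sample dates are illustrative:

from datetime import date

def months_without_records(dates):
    """Count calendar months between the min and max date that have no record."""
    observed = {d.year * 12 + d.month for d in dates}
    return (max(observed) - min(observed) + 1) - len(observed)

sample = [date(2024, 1, 5), date(2024, 2, 17), date(2024, 4, 2)]  # March is missing
print(months_without_records(sample))  # 1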
diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
index 736de800..2d077ce8 100644
--- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
@@ -29,6 +29,8 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected maximum pct records over upper 2 SD limit
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: "This test counts the number of data points that may be considered\
\ as outliers, determined by whether their value exceeds 2 standard deviations\
\ above the mean at baseline. Assuming a normal distribution, a small percentage\
@@ -37,20 +39,10 @@ test_types:
\ you expect to see. This test uses the baseline mean rather than the mean for\
\ the latest dataset to capture systemic shift as well as individual outliers. "
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1019'
- test_type: Outlier_Pct_Above
- sql_flavor: redshift
- measure: |-
- SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2019'
+ - id: '6019'
test_type: Outlier_Pct_Above
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: '>'
@@ -72,30 +64,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5019'
+ - id: '1019'
test_type: Outlier_Pct_Above
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6019'
+ - id: '2019'
test_type: Outlier_Pct_Above
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: trino
+ measure: |-
+ CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1019'
+ - id: '1316'
test_id: '1024'
test_type: Outlier_Pct_Above
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;
+ error_type: Test Results
+ - id: '1158'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- id: '1101'
test_id: '1024'
@@ -105,13 +113,13 @@ test_types:
lookup_query: |-
SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- - id: '1158'
+ - id: '1019'
test_id: '1024'
test_type: Outlier_Pct_Above
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- id: '1215'
test_id: '1024'
@@ -121,12 +129,4 @@ test_types:
lookup_query: |-
SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- - id: '1316'
- test_id: '1024'
- test_type: Outlier_Pct_Above
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;
- error_type: Test Results
test_templates: []
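The Outlier_Pct_Above measures above compute the share of non-null values more than two standard deviations above the baseline mean; a Python sketch of that fraction (baseline statistics and sample values are illustrative placeholders):

def outlier_pct_above(values, baseline_avg, baseline_sd):
    """Fraction of non-null values more than 2 SD above the baseline mean."""
    present = [v for v in values if v is not None]
    if not present:
        return None
    upper = baseline_avg + 2.0 * baseline_sd
    return sum(1 for v in present if v > upper) / len(present)

print(outlier_pct_above([10, 11, 12, 40], baseline_avg=11.0, baseline_sd=1.5))  # 0.25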
diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
index 22559430..fea7e15f 100644
--- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
@@ -29,6 +29,8 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected maximum pct records under lower 2 SD limit
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: "This test counts the number of data points that may be considered\
\ as outliers, determined by whether their value exceeds 2 standard deviations\
\ below the mean at baseline. Assuming a normal distribution, a small percentage\
@@ -37,20 +39,10 @@ test_types:
\ you expect to see. This test uses the baseline mean rather than the mean for\
\ the latest dataset to capture systemic shift as well as individual outliers. "
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1020'
- test_type: Outlier_Pct_Below
- sql_flavor: redshift
- measure: |-
- SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2020'
+ - id: '6020'
test_type: Outlier_Pct_Below
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: '>'
@@ -72,30 +64,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5020'
+ - id: '1020'
test_type: Outlier_Pct_Below
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS REAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6020'
+ - id: '2020'
test_type: Outlier_Pct_Below
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: trino
+ measure: |-
+ CAST(SUM(CASE WHEN CAST( {COLUMN_NAME} AS REAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1020'
+ - id: '1317'
test_id: '1025'
test_type: Outlier_Pct_Below
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;
+ error_type: Test Results
+ - id: '1159'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- id: '1102'
test_id: '1025'
@@ -105,13 +113,13 @@ test_types:
lookup_query: |-
SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- - id: '1159'
+ - id: '1020'
test_id: '1025'
test_type: Outlier_Pct_Below
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- id: '1216'
test_id: '1025'
@@ -121,12 +129,4 @@ test_types:
lookup_query: |-
SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
- - id: '1317'
- test_id: '1025'
- test_type: Outlier_Pct_Below
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
index 835e5258..425d3e0f 100644
--- a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
@@ -29,25 +29,17 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of pattern mismatches
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1021'
- test_type: Pattern_Match
- sql_flavor: redshift
- measure: |-
- COUNT(NULLIF({COLUMN_NAME}, '')) - SUM((NULLIF({COLUMN_NAME}, '') SIMILAR TO '{BASELINE_VALUE}')::BIGINT)
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2021'
+ - id: '6021'
test_type: Pattern_Match
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''), '{BASELINE_VALUE}')::BIGINT)
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::STRING, ''), '{BASELINE_VALUE}')::BIGINT)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -67,30 +59,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5021'
+ - id: '1021'
test_type: Pattern_Match
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF({COLUMN_NAME}, '') , '{BASELINE_VALUE}') = TRUE THEN 1 ELSE 0 END)
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM((NULLIF({COLUMN_NAME}, '') SIMILAR TO '{BASELINE_VALUE}')::BIGINT)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6021'
+ - id: '2021'
test_type: Pattern_Match
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::STRING, ''), '{BASELINE_VALUE}')::BIGINT)
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''), '{BASELINE_VALUE}')::BIGINT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5021'
+ test_type: Pattern_Match
+ sql_flavor: trino
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF({COLUMN_NAME}, '') , '{BASELINE_VALUE}') = TRUE THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1021'
+ - id: '1318'
test_id: '1026'
test_type: Pattern_Match
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}`;
+ error_type: Test Results
+ - id: '1160'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1103'
test_id: '1026'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- - id: '1160'
+ - id: '1021'
test_id: '1026'
test_type: Pattern_Match
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1217'
test_id: '1026'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- - id: '1318'
- test_id: '1026'
- test_type: Pattern_Match
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}`;
- error_type: Test Results
test_templates: []
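The Pattern_Match measures above count non-blank values that fail the expected pattern; a rough Python equivalent using the standard re module follows (exact regex semantics vary by SQL flavor, and the sample pattern is illustrative):

import re

def pattern_mismatch_count(values, baseline_pattern):
    """Count non-null, non-blank values that do not fully match the pattern."""
    present = [v for v in values if v not in (None, "")]
    return sum(1 for v in present if not re.fullmatch(baseline_pattern, v))

print(pattern_mismatch_count(["A-123", "B-456", "bad", None, ""], r"[A-Z]-\d{3}"))  # 1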
diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml
index ebafb9a1..2ebd28e3 100644
--- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml
@@ -30,25 +30,17 @@ test_types:
health_dimension: Recency
threshold_description: |-
Expected maximum count of days preceding test date
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1022'
- test_type: Recency
- sql_flavor: redshift
- measure: |-
- DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2022'
+ - id: '6022'
test_type: Recency
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
+ <%DATEDIFF_DAY;MAX({COLUMN_NAME});'{RUN_DATE}'::DATE%>
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -68,61 +60,69 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5022'
+ - id: '1022'
test_type: Recency
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- DATE_DIFF('day', MAX({COLUMN_NAME}), CAST('{RUN_DATE}' AS DATE))
+ DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6022'
+ - id: '2022'
test_type: Recency
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- <%DATEDIFF_DAY;MAX({COLUMN_NAME});'{RUN_DATE}'::DATE%>
+ DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5022'
+ test_type: Recency
+ sql_flavor: trino
+ measure: |-
+ DATE_DIFF('day', MAX({COLUMN_NAME}), CAST('{RUN_DATE}' AS DATE))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1022'
+ - id: '1319'
test_id: '1028'
test_type: Recency
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1104'
+ - id: '1161'
test_id: '1028'
test_type: Recency
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE};
+ SELECT DISTINCT col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1161'
+ - id: '1104'
test_id: '1028'
test_type: Recency
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT DISTINCT col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE};
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1218'
+ - id: '1022'
test_id: '1028'
test_type: Recency
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1319'
+ - id: '1218'
test_id: '1028'
test_type: Recency
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE};
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
error_type: Test Results
test_templates: []
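For reference, a minimal Python sketch of the check the Recency conditions above express in SQL: the days between the latest date found in the column and the run date are compared to the threshold. The function name and sample dates below are illustrative, not part of the patch.

from datetime import date

def recency_exceeds_threshold(latest_value: date, run_date: date, threshold_days: int) -> bool:
    # Mirrors DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}') > {THRESHOLD_VALUE}
    age_days = (run_date - latest_value).days
    return age_days > threshold_days

# Data last refreshed 2025-08-01, evaluated on 2025-08-06 with a 3-day threshold: the test fails.
assert recency_exceeds_threshold(date(2025, 8, 1), date(2025, 8, 6), 3)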
diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml
index 27200ce5..c49cf447 100644
--- a/testgen/template/dbsetup_test_types/test_types_Required.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml
@@ -29,22 +29,14 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of missing values
- usage_notes: null
- active: Y
result_visualization: line_chart
result_visualization_params: null
+ usage_notes: null
+ active: Y
cat_test_conditions:
- - id: '1023'
- test_type: Required
- sql_flavor: redshift
- measure: |-
- COUNT(*) - COUNT( {COLUMN_NAME} )
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2023'
+ - id: '6023'
test_type: Required
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
COUNT(*) - COUNT( {COLUMN_NAME} )
test_operator: '>'
@@ -66,61 +58,69 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5023'
+ - id: '1023'
test_type: Required
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- COUNT(*) - COUNT({COLUMN_NAME})
+ COUNT(*) - COUNT( {COLUMN_NAME} )
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6023'
+ - id: '2023'
test_type: Required
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
COUNT(*) - COUNT( {COLUMN_NAME} )
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5023'
+ test_type: Required
+ sql_flavor: trino
+ measure: |-
+ COUNT(*) - COUNT({COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1023'
+ - id: '1320'
test_id: '1030'
test_type: Required
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL LIMIT 500;
error_type: Test Results
- - id: '1105'
+ - id: '1162'
test_id: '1030'
test_type: Required
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
+ SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL;
error_type: Test Results
- - id: '1162'
+ - id: '1105'
test_id: '1030'
test_type: Required
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- SELECT TOP 500 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL;
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
error_type: Test Results
- - id: '1219'
+ - id: '1023'
test_id: '1030'
test_type: Required
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
error_type: Test Results
- - id: '1320'
+ - id: '1219'
test_id: '1030'
test_type: Required
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
- SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` IS NULL LIMIT 500;
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
index e2470d79..35864b32 100644
--- a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
@@ -28,97 +28,97 @@ test_types:
health_dimension: Volume
threshold_description: |-
Expected minimum row count
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Because this tests the row count against a constant minimum threshold, it's appropriate for any dataset, as long as the number of rows doesn't radically change from refresh to refresh. But it's not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1024'
+ - id: '6024'
test_type: Row_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
COUNT(*)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2024'
+ - id: '3024'
test_type: Row_Ct
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
COUNT(*)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3024'
+ - id: '4024'
test_type: Row_Ct
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
COUNT(*)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4024'
+ - id: '1024'
test_type: Row_Ct
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
COUNT(*)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5024'
+ - id: '2024'
test_type: Row_Ct
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
COUNT(*)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6024'
+ - id: '5024'
test_type: Row_Ct
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
COUNT(*)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1024'
+ - id: '1321'
test_id: '1031'
test_type: Row_Ct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1106'
+ - id: '1163'
test_id: '1031'
test_type: Row_Ct
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(CAST(100 * (current_count - {THRESHOLD_VALUE}) AS NUMERIC) / CAST({THRESHOLD_VALUE} AS NUMERIC) ,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1163'
+ - id: '1106'
test_id: '1031'
test_type: Row_Ct
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(CAST(100 * (current_count - {THRESHOLD_VALUE}) AS NUMERIC) / CAST({THRESHOLD_VALUE} AS NUMERIC) ,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1220'
+ - id: '1024'
test_id: '1031'
test_type: Row_Ct
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
error_type: Test Results
- - id: '1321'
+ - id: '1220'
test_id: '1031'
test_type: Row_Ct
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
index c3c687bd..7850b6f0 100644
--- a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
@@ -29,23 +29,15 @@ test_types:
health_dimension: Volume
threshold_description: |-
Expected percent window below or above baseline
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1025'
- test_type: Row_Ct_Pct
- sql_flavor: redshift
- measure: |-
- ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2025'
+ - id: '6025'
test_type: Row_Ct_Pct
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
test_operator: '>'
@@ -67,59 +59,67 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5025'
+ - id: '1025'
test_type: Row_Ct_Pct
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT}) AS DECIMAL(18,4)) /CAST( {BASELINE_CT} AS DECIMAL(18,4) ), 2))
+ ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6025'
+ - id: '2025'
test_type: Row_Ct_Pct
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5025'
+ test_type: Row_Ct_Pct
+ sql_flavor: trino
+ measure: |-
+ ABS(ROUND(100.0 * CAST((COUNT(*) - {BASELINE_CT}) AS DECIMAL(18,4)) /CAST( {BASELINE_CT} AS DECIMAL(18,4) ), 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1025'
+ - id: '1322'
test_id: '1032'
test_type: Row_Ct_Pct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
error_type: Test Results
- - id: '1107'
+ - id: '1164'
test_id: '1032'
test_type: Row_Ct_Pct
- sql_flavor: postgresql
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte;
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(CAST(100 * (current_count - {BASELINE_CT}) AS NUMERIC) / CAST({BASELINE_CT} AS NUMERIC) ,2)) AS row_count_pct_difference FROM cte;
error_type: Test Results
- - id: '1164'
+ - id: '1107'
test_id: '1032'
test_type: Row_Ct_Pct
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(CAST(100 * (current_count - {BASELINE_CT}) AS NUMERIC) / CAST({BASELINE_CT} AS NUMERIC) ,2)) AS row_count_pct_difference FROM cte;
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte;
error_type: Test Results
- - id: '1221'
+ - id: '1025'
test_id: '1032'
test_type: Row_Ct_Pct
- sql_flavor: snowflake
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
error_type: Test Results
- - id: '1322'
+ - id: '1221'
test_id: '1032'
test_type: Row_Ct_Pct
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
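For reference, a minimal Python sketch of the Row_Ct_Pct measure above: the absolute percent difference between the current row count and the recorded baseline count. The function name and row counts are illustrative, not part of the patch.

def row_ct_pct_difference(current_count: int, baseline_count: int) -> float:
    # Mirrors ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT}) / {BASELINE_CT}, 2))
    return abs(round(100.0 * (current_count - baseline_count) / baseline_count, 2))

# 10,500 rows against a 10,000-row baseline is a 5.0% difference, so a threshold of 3
# (percent) would flag it while a threshold of 10 would not.
assert row_ct_pct_difference(10_500, 10_000) == 5.0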
diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
index c1775d5a..bddb98c1 100644
--- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
@@ -30,25 +30,17 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected percent of records that match standard street address pattern
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
The street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1026'
- test_type: Street_Addr_Pattern
- sql_flavor: redshift
- measure: |-
- 100.0*SUM(({COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
- test_operator: <
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2026'
+ - id: '6026'
test_type: Street_Addr_Pattern
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- 100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ 100.0*SUM((regexp_like({COLUMN_NAME}::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
@@ -68,30 +60,46 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5026'
+ - id: '1026'
test_type: Street_Addr_Pattern
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- CAST(100.0*SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') = TRUE THEN 1 ELSE 0 END) AS REAL )/ CAST(COUNT({COLUMN_NAME}) AS REAL)
+ 100.0*SUM(({COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6026'
+ - id: '2026'
test_type: Street_Addr_Pattern
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- 100.0*SUM((regexp_like({COLUMN_NAME}::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ 100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '5026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: trino
+ measure: |-
+ CAST(100.0*SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') = TRUE THEN 1 ELSE 0 END) AS REAL )/ CAST(COUNT({COLUMN_NAME}) AS REAL)
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1026'
+ - id: '1323'
test_id: '1033'
test_type: Street_Addr_Pattern
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1165'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Test Results
- id: '1108'
test_id: '1033'
@@ -101,13 +109,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- - id: '1165'
+ - id: '1026'
test_id: '1033'
test_type: Street_Addr_Pattern
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- id: '1222'
test_id: '1033'
@@ -117,12 +125,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- - id: '1323'
- test_id: '1033'
- test_type: Street_Addr_Pattern
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
index 08e74413..032a8e15 100644
--- a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
@@ -31,21 +31,17 @@ test_types:
health_dimension: Recency
threshold_description: |-
Most recent prior table fingerprint
+ result_visualization: binary_chart
+ result_visualization_params: '{"legend":{"labels":{"0":"Stale","1":"Updated"}}}'
usage_notes: |-
     This test compares the current table fingerprint, a calculated signature of column contents, to confirm that the table has been updated. The table fingerprint is derived from a set of values and aggregates from columns most likely to change. This test allows you to track the schedule and frequency of updates and refreshes to the table.
active: Y
- result_visualization: binary_chart
- result_visualization_params: '{"legend":{"labels":{"0":"Stale","1":"Updated"}}}'
cat_test_conditions: []
target_data_lookups: []
test_templates:
- - id: '2012'
- test_type: Table_Freshness
- sql_flavor: redshift
- template_name: ex_table_changed_generic.sql
- - id: '2112'
+ - id: '2412'
test_type: Table_Freshness
- sql_flavor: snowflake
+ sql_flavor: databricks
template_name: ex_table_changed_generic.sql
- id: '2212'
test_type: Table_Freshness
@@ -55,7 +51,11 @@ test_types:
test_type: Table_Freshness
sql_flavor: postgresql
template_name: ex_table_changed_generic.sql
- - id: '2412'
+ - id: '2012'
test_type: Table_Freshness
- sql_flavor: databricks
+ sql_flavor: redshift
+ template_name: ex_table_changed_generic.sql
+ - id: '2112'
+ test_type: Table_Freshness
+ sql_flavor: snowflake
template_name: ex_table_changed_generic.sql
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
index 2ae08ca0..213d7926 100644
--- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
@@ -31,36 +31,36 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of missing value combinations
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1261'
+ - id: '1263'
test_id: '1508'
test_type: Timeframe_Combo_Gain
- sql_flavor: redshift
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
GROUP BY {COLUMN_NAME_NO_QUOTES}
EXCEPT
SELECT {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
GROUP BY {COLUMN_NAME_NO_QUOTES}
error_type: Test Results
- - id: '1262'
+ - id: '1264'
test_id: '1508'
test_type: Timeframe_Combo_Gain
- sql_flavor: snowflake
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
SELECT {COLUMN_NAME_NO_QUOTES}
@@ -76,29 +76,29 @@ test_types:
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
GROUP BY {COLUMN_NAME_NO_QUOTES}
error_type: Test Results
- - id: '1263'
+ - id: '1261'
test_id: '1508'
test_type: Timeframe_Combo_Gain
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
GROUP BY {COLUMN_NAME_NO_QUOTES}
EXCEPT
SELECT {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
GROUP BY {COLUMN_NAME_NO_QUOTES}
error_type: Test Results
- - id: '1264'
+ - id: '1262'
test_id: '1508'
test_type: Timeframe_Combo_Gain
- sql_flavor: postgresql
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |-
SELECT {COLUMN_NAME_NO_QUOTES}
@@ -115,14 +115,10 @@ test_types:
GROUP BY {COLUMN_NAME_NO_QUOTES}
error_type: Test Results
test_templates:
- - id: '2007'
- test_type: Timeframe_Combo_Gain
- sql_flavor: redshift
- template_name: ex_window_match_no_drops_generic.sql
- - id: '2107'
+ - id: '2407'
test_type: Timeframe_Combo_Gain
- sql_flavor: snowflake
- template_name: ex_window_match_no_drops_generic.sql
+ sql_flavor: databricks
+ template_name: ex_window_match_no_drops_databricks.sql
- id: '2207'
test_type: Timeframe_Combo_Gain
sql_flavor: mssql
@@ -131,7 +127,11 @@ test_types:
test_type: Timeframe_Combo_Gain
sql_flavor: postgresql
template_name: ex_window_match_no_drops_postgresql.sql
- - id: '2407'
+ - id: '2007'
test_type: Timeframe_Combo_Gain
- sql_flavor: databricks
- template_name: ex_window_match_no_drops_databricks.sql
+ sql_flavor: redshift
+ template_name: ex_window_match_no_drops_generic.sql
+ - id: '2107'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: snowflake
+ template_name: ex_window_match_no_drops_generic.sql
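For reference, a minimal Python sketch of the window comparison the Timeframe_Combo_Gain lookups above perform with SQL EXCEPT: combinations present in the prior window but missing from the latest window are flagged, while new combinations are allowed. The set contents are illustrative, not part of the patch.

def lost_combinations(prior_window: set, latest_window: set) -> set:
    # Mirrors the SQL EXCEPT of prior-window combinations against latest-window combinations.
    return prior_window - latest_window

prior = {("US", "retail"), ("US", "web"), ("CA", "web")}
latest = {("US", "retail"), ("US", "web"), ("MX", "web")}   # a new combination is permitted
assert lost_combinations(prior, latest) == {("CA", "web")}  # a lost combination fails the test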
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
index ae338117..73283c62 100644
--- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
@@ -29,17 +29,17 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected count of non-matching value combinations
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions: []
target_data_lookups:
- - id: '1265'
+ - id: '1337'
test_id: '1509'
test_type: Timeframe_Combo_Match
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |2-
(
@@ -68,74 +68,74 @@ test_types:
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
)
error_type: Test Results
- - id: '1266'
+ - id: '1267'
test_id: '1509'
test_type: Timeframe_Combo_Match
- sql_flavor: snowflake
+ sql_flavor: mssql
lookup_type: null
lookup_query: |2-
(
SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
EXCEPT
SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
)
UNION ALL
(
SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
- AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
EXCEPT
SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
)
error_type: Test Results
- - id: '1267'
+ - id: '1268'
test_id: '1509'
test_type: Timeframe_Combo_Match
- sql_flavor: mssql
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |2-
(
SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
)
UNION ALL
(
SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
- AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
- AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
)
error_type: Test Results
- - id: '1268'
+ - id: '1265'
test_id: '1509'
test_type: Timeframe_Combo_Match
- sql_flavor: postgresql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |2-
(
@@ -164,10 +164,10 @@ test_types:
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
)
error_type: Test Results
- - id: '1337'
+ - id: '1266'
test_id: '1509'
test_type: Timeframe_Combo_Match
- sql_flavor: databricks
+ sql_flavor: snowflake
lookup_type: null
lookup_query: |2-
(
@@ -197,14 +197,10 @@ test_types:
)
error_type: Test Results
test_templates:
- - id: '2008'
- test_type: Timeframe_Combo_Match
- sql_flavor: redshift
- template_name: ex_window_match_same_generic.sql
- - id: '2108'
+ - id: '2408'
test_type: Timeframe_Combo_Match
- sql_flavor: snowflake
- template_name: ex_window_match_same_generic.sql
+ sql_flavor: databricks
+ template_name: ex_window_match_same_databricks.sql
- id: '2208'
test_type: Timeframe_Combo_Match
sql_flavor: mssql
@@ -213,7 +209,11 @@ test_types:
test_type: Timeframe_Combo_Match
sql_flavor: postgresql
template_name: ex_window_match_same_postgresql.sql
- - id: '2408'
+ - id: '2008'
test_type: Timeframe_Combo_Match
- sql_flavor: databricks
- template_name: ex_window_match_same_databricks.sql
+ sql_flavor: redshift
+ template_name: ex_window_match_same_generic.sql
+ - id: '2108'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: snowflake
+ template_name: ex_window_match_same_generic.sql
diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml
index d6d9dd8e..47a94fde 100644
--- a/testgen/template/dbsetup_test_types/test_types_US_State.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml
@@ -30,23 +30,15 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of values that are not US state abbreviations
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1027'
- test_type: US_State
- sql_flavor: redshift
- measure: |-
- SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2027'
+ - id: '6027'
test_type: US_State
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
test_operator: '>'
@@ -68,30 +60,46 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5027'
+ - id: '1027'
test_type: US_State
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6027'
+ - id: '2027'
test_type: US_State
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5027'
+ test_type: US_State
+ sql_flavor: trino
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1027'
+ - id: '1324'
test_id: '1036'
test_type: US_State
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT 500;
+ error_type: Test Results
+ - id: '1166'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}";
error_type: Test Results
- id: '1109'
test_id: '1036'
@@ -101,13 +109,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1166'
+ - id: '1027'
test_id: '1036'
test_type: US_State
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- id: '1223'
test_id: '1036'
@@ -117,12 +125,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
- - id: '1324'
- test_id: '1036'
- test_type: US_State
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT 500;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml
index 013e2d88..20a8df28 100644
--- a/testgen/template/dbsetup_test_types/test_types_Unique.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml
@@ -30,68 +30,76 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of duplicate values
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
     This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It's also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1028'
+ - id: '6028'
test_type: Unique
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2028'
+ - id: '3028'
test_type: Unique
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3028'
+ - id: '4028'
test_type: Unique
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4028'
+ - id: '1028'
test_type: Unique
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5028'
+ - id: '2028'
test_type: Unique
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6028'
+ - id: '5028'
test_type: Unique
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1028'
+ - id: '1325'
test_id: '1034'
test_type: Unique
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1167'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC;
error_type: Test Results
- id: '1110'
test_id: '1034'
@@ -101,13 +109,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- - id: '1167'
+ - id: '1028'
test_id: '1034'
test_type: Unique
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- id: '1224'
test_id: '1034'
@@ -117,12 +125,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- - id: '1325'
- test_id: '1034'
- test_type: Unique
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
index 0e74a8dd..a1dfdf46 100644
--- a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
@@ -30,23 +30,15 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected maximum Cohen's H Difference
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen's H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1029'
- test_type: Unique_Pct
- sql_flavor: redshift
- measure: |-
- ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
- test_operator: '>='
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2029'
+ - id: '6029'
test_type: Unique_Pct
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
test_operator: '>='
@@ -68,30 +60,46 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5029'
+ - id: '1029'
test_type: Unique_Pct
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) )))
+ ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6029'
+ - id: '2029'
test_type: Unique_Pct
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '5029'
+ test_type: Unique_Pct
+ sql_flavor: trino
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) )))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
target_data_lookups:
- - id: '1029'
+ - id: '1326'
test_id: '1035'
test_type: Unique_Pct
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
+ error_type: Test Results
+ - id: '1168'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Test Results
- id: '1111'
test_id: '1035'
@@ -101,13 +109,13 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- - id: '1168'
+ - id: '1029'
test_id: '1035'
test_type: Unique_Pct
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- id: '1225'
test_id: '1035'
@@ -117,12 +125,4 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
- - id: '1326'
- test_id: '1035'
- test_type: Unique_Pct
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;
- error_type: Test Results
test_templates: []
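For reference, a minimal Python sketch of the Cohen's H measure used by Unique_Pct above: the difference between the arcsine-transformed baseline and current unique-value ratios. The function name and counts in the example are illustrative, not part of the patch.

import math

def cohens_h(baseline_unique_ct: int, baseline_value_ct: int,
             current_unique_ct: int, current_value_ct: int) -> float:
    # Mirrors ABS(2 * ASIN(SQRT(p_baseline)) - 2 * ASIN(SQRT(p_current)))
    p_baseline = baseline_unique_ct / baseline_value_ct
    p_current = current_unique_ct / current_value_ct
    return abs(2.0 * math.asin(math.sqrt(p_baseline)) - 2.0 * math.asin(math.sqrt(p_current)))

# Uniqueness dropping from 90% to 70% of values gives h of roughly 0.52, a "medium" effect
# by the usual rule of thumb, so a threshold of 0.5 would flag it.
print(round(cohens_h(900, 1000, 700, 1000), 2))  # 0.52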
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
index e06ff91a..2e21cb68 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
@@ -30,68 +30,76 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Threshold Invalid Value Count
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.
active: N
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1036'
+ - id: '6036'
test_type: Valid_Characters
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4036'
+ - id: '3036'
test_type: Valid_Characters
- sql_flavor: postgresql
+ sql_flavor: mssql
measure: |-
- SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2036'
+ - id: '4036'
test_type: Valid_Characters
- sql_flavor: snowflake
+ sql_flavor: postgresql
measure: |-
- SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5036'
+ - id: '1036'
test_type: Valid_Characters
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3036'
+ - id: '2036'
test_type: Valid_Characters
- sql_flavor: mssql
+ sql_flavor: snowflake
measure: |-
- SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6036'
+ - id: '5036'
test_type: Valid_Characters
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1233'
+ - id: '1330'
test_id: '1043'
test_type: Valid_Characters
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ - id: '1235'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
error_type: Test Results
- id: '1234'
test_id: '1043'
@@ -101,13 +109,13 @@ test_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT 20;
error_type: Test Results
- - id: '1235'
+ - id: '1233'
test_id: '1043'
test_type: Valid_Characters
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
error_type: Test Results
- id: '1236'
test_id: '1043'
@@ -117,12 +125,4 @@ test_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
error_type: Test Results
- - id: '1330'
- test_id: '1043'
- test_type: Valid_Characters
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
index 0ec9e5ad..6cccbe0e 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
@@ -30,54 +30,54 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Expected count of invalid months
- usage_notes: null
- active: N
result_visualization: line_chart
result_visualization_params: null
+ usage_notes: null
+ active: N
cat_test_conditions:
- - id: '1033'
+ - id: '6033'
test_type: Valid_Month
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2033'
+ - id: '3033'
test_type: Valid_Month
- sql_flavor: snowflake
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3033'
+ - id: '4033'
test_type: Valid_Month
- sql_flavor: mssql
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4033'
+ - id: '1033'
test_type: Valid_Month
- sql_flavor: postgresql
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5033'
+ - id: '2033'
test_type: Valid_Month
- sql_flavor: trino
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6033'
+ - id: '5033'
test_type: Valid_Month
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
test_operator: '>'
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
index f8eaa0e5..a7d5a3d3 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
@@ -29,64 +29,72 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Threshold Invalid Value Count
- usage_notes: null
- active: Y
result_visualization: line_chart
result_visualization_params: null
+ usage_notes: null
+ active: Y
cat_test_conditions:
- - id: '1034'
+ - id: '6034'
test_type: Valid_US_Zip
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4034'
+ - id: '3034'
test_type: Valid_US_Zip
- sql_flavor: postgresql
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2034'
+ - id: '4034'
test_type: Valid_US_Zip
- sql_flavor: snowflake
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5034'
+ - id: '1034'
test_type: Valid_US_Zip
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3034'
+ - id: '2034'
test_type: Valid_US_Zip
- sql_flavor: mssql
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6034'
+ - id: '5034'
test_type: Valid_US_Zip
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1237'
+ - id: '1331'
test_id: '1044'
test_type: Valid_US_Zip
- sql_flavor: redshift
+ sql_flavor: databricks
+ lookup_type: null
+ lookup_query: |-
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ - id: '1239'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: mssql
lookup_type: null
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
@@ -99,10 +107,10 @@ test_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;
error_type: Test Results
- - id: '1239'
+ - id: '1237'
test_id: '1044'
test_type: Valid_US_Zip
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
@@ -115,12 +123,4 @@ test_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Test Results
- - id: '1331'
- test_id: '1044'
- test_type: Valid_US_Zip
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
index dac90d63..16159eef 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
@@ -29,68 +29,76 @@ test_types:
health_dimension: Schema Drift
threshold_description: |-
Threshold Invalid Zip3 Count
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1035'
+ - id: '6035'
test_type: Valid_US_Zip3
- sql_flavor: redshift
+ sql_flavor: databricks
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '4035'
+ - id: '3035'
test_type: Valid_US_Zip3
- sql_flavor: postgresql
+ sql_flavor: mssql
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '2035'
+ - id: '4035'
test_type: Valid_US_Zip3
- sql_flavor: snowflake
+ sql_flavor: postgresql
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5035'
+ - id: '1035'
test_type: Valid_US_Zip3
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '3035'
+ - id: '2035'
test_type: Valid_US_Zip3
- sql_flavor: mssql
+ sql_flavor: snowflake
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6035'
+ - id: '5035'
test_type: Valid_US_Zip3
- sql_flavor: databricks
+ sql_flavor: trino
measure: |-
SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1241'
+ - id: '1332'
test_id: '1045'
test_type: Valid_US_Zip3
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
+ error_type: Test Results
+ - id: '1243'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Test Results
- id: '1242'
test_id: '1045'
@@ -100,13 +108,13 @@ test_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') <> '' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;
error_type: Test Results
- - id: '1243'
+ - id: '1241'
test_id: '1045'
test_type: Valid_US_Zip3
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Test Results
- id: '1244'
test_id: '1045'
@@ -116,12 +124,4 @@ test_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Test Results
- - id: '1332'
- test_id: '1045'
- test_type: Valid_US_Zip3
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
index 30804567..8b0338ff 100644
--- a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
@@ -30,6 +30,8 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected minimum pct of baseline Standard Deviation (SD)
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: "This test looks for percent shifts in standard deviation as a measure\
\ of the stability of a measure over time. A significant change could indicate\
\ that new values are erroneous, or that the cohort being evaluated is significantly\
@@ -37,22 +39,12 @@ test_types:
\ process, better precision in measurement, the elimination of outliers, or a\
\ more homogeneous cohort. "
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1032'
- test_type: Variability_Decrease
- sql_flavor: redshift
- measure: |-
- 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
- test_operator: <
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2032'
+ - id: '6032'
test_type: Variability_Decrease
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
@@ -72,11 +64,19 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6032'
+ - id: '1032'
test_type: Variability_Decrease
- sql_flavor: databricks
+ sql_flavor: redshift
measure: |-
- 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2032'
+ test_type: Variability_Decrease
+ sql_flavor: snowflake
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
@@ -89,13 +89,21 @@ test_types:
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1032'
+ - id: '1329'
test_id: '1041'
test_type: Variability_Decrease
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1171'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- id: '1114'
test_id: '1041'
@@ -105,13 +113,13 @@ test_types:
lookup_query: |-
SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1171'
+ - id: '1032'
test_id: '1041'
test_type: Variability_Decrease
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- id: '1228'
test_id: '1041'
@@ -121,12 +129,4 @@ test_types:
lookup_query: |-
SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1329'
- test_id: '1041'
- test_type: Variability_Decrease
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
index 1b4d4c8b..7229c38b 100644
--- a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
@@ -30,6 +30,8 @@ test_types:
health_dimension: Data Drift
threshold_description: |-
Expected maximum pct of baseline Standard Deviation (SD)
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: "This test looks for percent shifts in standard deviation as a measure\
\ of the stability of a measure over time. A significant change could indicate\
\ that new values are erroneous, or that the cohort being evaluated is significantly\
@@ -41,22 +43,12 @@ test_types:
\ that should be noted and assessed by business users. If the average does not\
\ shift, this may point to a data quality or data collection problem. "
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1031'
- test_type: Variability_Increase
- sql_flavor: redshift
- measure: |-
- 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2031'
+ - id: '6031'
test_type: Variability_Increase
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -76,11 +68,19 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6031'
+ - id: '1031'
test_type: Variability_Increase
- sql_flavor: databricks
+ sql_flavor: redshift
measure: |-
- 100.0*STDDEV_SAMP({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ - id: '2031'
+ test_type: Variability_Increase
+ sql_flavor: snowflake
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -93,13 +93,21 @@ test_types:
test_condition: |-
{THRESHOLD_VALUE}
target_data_lookups:
- - id: '1031'
+ - id: '1328'
test_id: '1040'
test_type: Variability_Increase
- sql_flavor: redshift
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
+ - id: '1170'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: mssql
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- id: '1113'
test_id: '1040'
@@ -109,13 +117,13 @@ test_types:
lookup_query: |-
SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1170'
+ - id: '1031'
test_id: '1040'
test_type: Variability_Increase
- sql_flavor: mssql
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- id: '1227'
test_id: '1040'
@@ -125,12 +133,4 @@ test_types:
lookup_query: |-
SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
- - id: '1328'
- test_id: '1040'
- test_type: Variability_Increase
- sql_flavor: databricks
- lookup_type: null
- lookup_query: |-
- SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
- error_type: Test Results
test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
index 5f8ab3ee..16b73329 100644
--- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
@@ -30,25 +30,17 @@ test_types:
health_dimension: Volume
threshold_description: |-
Expected maximum count of calendar weeks without dates present
+ result_visualization: line_chart
+ result_visualization_params: null
usage_notes: |-
Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.
active: Y
- result_visualization: line_chart
- result_visualization_params: null
cat_test_conditions:
- - id: '1030'
- test_type: Weekly_Rec_Ct
- sql_flavor: redshift
- measure: |-
- MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
- test_operator: '>'
- test_condition: |-
- {THRESHOLD_VALUE}
- - id: '2030'
+ - id: '6030'
test_type: Weekly_Rec_Ct
- sql_flavor: snowflake
+ sql_flavor: databricks
measure: |-
- MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
+ CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC('week', {COLUMN_NAME})) AS INT)
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
@@ -68,38 +60,38 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '5030'
+ - id: '1030'
test_type: Weekly_Rec_Ct
- sql_flavor: trino
+ sql_flavor: redshift
measure: |-
- MAX(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) +1 - COUNT(DISTINCT DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME}))
+ MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- - id: '6030'
+ - id: '2030'
test_type: Weekly_Rec_Ct
- sql_flavor: databricks
+ sql_flavor: snowflake
measure: |-
- CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC('week', {COLUMN_NAME})) AS INT)
+ MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
- target_data_lookups:
- - id: '1030'
- test_id: '1037'
+ - id: '5030'
test_type: Weekly_Rec_Ct
- sql_flavor: redshift
- lookup_type: null
- lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
- error_type: Test Results
- - id: '1112'
+ sql_flavor: trino
+ measure: |-
+ MAX(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME})) +1 - COUNT(DISTINCT DATE_DIFF('week', CAST('1800-01-01' AS DATE), {COLUMN_NAME}))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
+ target_data_lookups:
+ - id: '1327'
test_id: '1037'
test_type: Weekly_Rec_Ct
- sql_flavor: postgresql
+ sql_flavor: databricks
lookup_type: null
lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period;
error_type: Test Results
- id: '1169'
test_id: '1037'
@@ -149,20 +141,28 @@ test_types:
OR next_record_ct = 0
ORDER BY check_period DESC;
error_type: Test Results
- - id: '1226'
+ - id: '1112'
test_id: '1037'
test_type: Weekly_Rec_Ct
- sql_flavor: snowflake
+ sql_flavor: postgresql
lookup_type: null
lookup_query: |-
- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period;
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
error_type: Test Results
- - id: '1327'
+ - id: '1030'
test_id: '1037'
test_type: Weekly_Rec_Ct
- sql_flavor: databricks
+ sql_flavor: redshift
lookup_type: null
lookup_query: |-
- WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME})), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period;
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
+ - id: '1226'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: snowflake
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period;
error_type: Test Results
test_templates: []
From 79bafc283a20700f242da49bc2370fedf2a72fe7 Mon Sep 17 00:00:00 2001
From: Diogo Basto
Date: Mon, 22 Sep 2025 17:30:43 +0100
Subject: [PATCH 07/48] Re-review
---
testgen/__main__.py | 2 +-
testgen/common/read_yaml_metadata_records.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/testgen/__main__.py b/testgen/__main__.py
index fda81d29..c0d7a7f9 100644
--- a/testgen/__main__.py
+++ b/testgen/__main__.py
@@ -516,7 +516,7 @@ def export_test_metadata(configuration: Configuration, path: str):
click.echo("export-test-metadata")
LOG.info("CurrentStep: Main Program - Test Metadata Export")
if not os.path.isdir(path):
- LOG.error("Provided path {path} is not a directory. Please correct the --path option.")
+ LOG.error(f"Provided path {path} is not a directory. Please correct the --path option.")
return
run_test_metadata_exporter(path)
LOG.info("CurrentStep: Main Program - Test Metadata Export - DONE")
diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py
index 971397c5..26733ab5 100644
--- a/testgen/common/read_yaml_metadata_records.py
+++ b/testgen/common/read_yaml_metadata_records.py
@@ -1,4 +1,4 @@
-__all__ = ["import_metadata_records_from_yaml", "export_metadata_records_to_yaml"]
+__all__ = ["export_metadata_records_to_yaml", "import_metadata_records_from_yaml"]
import logging
from importlib.resources import as_file
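Note on the __main__.py hunk above: the original call passed a plain string, so the literal text "{path}" was logged instead of the offending path; adding the f prefix interpolates the value. A minimal standalone illustration (not project code):

    import logging

    logging.basicConfig(level=logging.ERROR)
    log = logging.getLogger("example")

    path = "/tmp/not-a-dir"
    log.error("Provided path {path} is not a directory.")   # logs the braces literally
    log.error(f"Provided path {path} is not a directory.")  # logs /tmp/not-a-dir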
From 446c4ac1610235fc9d4fa55d1f0cd5d2835a00db Mon Sep 17 00:00:00 2001
From: Diogo Basto
Date: Mon, 22 Sep 2025 18:02:23 +0100
Subject: [PATCH 08/48] re-re-review
---
testgen/commands/run_launch_db_config.py | 2 +-
.../commands/run_test_metadata_exporter.py | 12 ++++++++-
testgen/commands/run_upgrade_db_config.py | 2 +-
testgen/common/read_yaml_metadata_records.py | 27 ++++++++++---------
4 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py
index 37226ea5..6899f0dc 100644
--- a/testgen/commands/run_launch_db_config.py
+++ b/testgen/commands/run_launch_db_config.py
@@ -86,7 +86,7 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) ->
password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
user_type="schema_admin",
)
- import_metadata_records_from_yaml()
+ import_metadata_records_from_yaml(params_mapping)
ScoreDefinition.from_table_group(
TableGroup(
diff --git a/testgen/commands/run_test_metadata_exporter.py b/testgen/commands/run_test_metadata_exporter.py
index 89f7e8cc..126839ef 100644
--- a/testgen/commands/run_test_metadata_exporter.py
+++ b/testgen/commands/run_test_metadata_exporter.py
@@ -1,10 +1,20 @@
import logging
+from testgen import settings
+from testgen.common.credentials import get_tg_schema
from testgen.common.models import with_database_session
from testgen.common.read_yaml_metadata_records import export_metadata_records_to_yaml
LOG = logging.getLogger("testgen")
+def _get_params_mapping() -> dict:
+ return {
+ "SCHEMA_NAME": get_tg_schema(),
+ "TESTGEN_ADMIN_USER": settings.DATABASE_ADMIN_USER,
+ "TESTGEN_ADMIN_PASSWORD": settings.DATABASE_ADMIN_PASSWORD,
+ "OBSERVABILITY_URL": settings.OBSERVABILITY_API_URL,
+ }
+
@with_database_session
def run_test_metadata_exporter(templates_path) -> None:
- export_metadata_records_to_yaml(templates_path)
+ export_metadata_records_to_yaml(_get_params_mapping(), templates_path)
diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py
index a6b7d91c..95ec4bc0 100644
--- a/testgen/commands/run_upgrade_db_config.py
+++ b/testgen/commands/run_upgrade_db_config.py
@@ -97,7 +97,7 @@ def _refresh_static_metadata(params_mapping):
password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
user_type="schema_admin",
)
- import_metadata_records_from_yaml()
+ import_metadata_records_from_yaml(params_mapping)
strQueryMetadataConstraints = read_template_sql_file("055_recreate_metadata_constraints.sql", "dbsetup")
strQueryMetadataConstraints = replace_params(strQueryMetadataConstraints, params_mapping)
diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py
index 26733ab5..28f8cf59 100644
--- a/testgen/common/read_yaml_metadata_records.py
+++ b/testgen/common/read_yaml_metadata_records.py
@@ -8,7 +8,6 @@
from yaml import SafeDumper, safe_dump, safe_load
-from testgen.common.credentials import get_tg_schema
from testgen.common.database.database_service import execute_db_queries, fetch_from_db_threaded
from testgen.common.read_file import get_template_files
@@ -111,10 +110,8 @@ def _literal_representer(dumper, data):
SafeDumper.add_representer(LiteralString, _literal_representer)
-def _process_yaml_for_import(data:dict, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]]):
+def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]]):
queries = []
- schema = get_tg_schema()
-
parent = data.get(parent_table)
if not isinstance(parent, dict):
raise TypeError(f"YAML key '{parent_table}' must be a dict")
@@ -140,7 +137,7 @@ def _process_yaml_for_import(data:dict, parent_table:str, parent_key:str, child_
bound_values = {c: record[c] for c in columns}
sql = f"""
- INSERT INTO {schema}.{table_name} ({insert_cols})
+ INSERT INTO {params_mapping["SCHEMA_NAME"]}.{table_name} ({insert_cols})
VALUES ({insert_vals})
ON CONFLICT ({', '.join(pk_cols)}) DO UPDATE
SET {update_stmt};
@@ -154,7 +151,7 @@ def _process_yaml_for_import(data:dict, parent_table:str, parent_key:str, child_
update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c != parent_key)
bound_values = {c: parent[c] for c in columns}
parent_insert_query = f"""
- INSERT INTO {schema}.{parent_table} ({insert_cols})
+ INSERT INTO {params_mapping["SCHEMA_NAME"]}.{parent_table} ({insert_cols})
VALUES ({insert_vals})
ON CONFLICT ({parent_key}) DO UPDATE
SET {update_stmt};
@@ -164,16 +161,20 @@ def _process_yaml_for_import(data:dict, parent_table:str, parent_key:str, child_
execute_db_queries(
queries,
+ user_override=params_mapping["TESTGEN_ADMIN_USER"],
+ password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"],
+ user_type="schema_admin",
)
return
-def import_metadata_records_from_yaml() -> None:
+def import_metadata_records_from_yaml(params_mapping: dict) -> None:
files = sorted(get_template_files(mask="^.*ya?ml$", sub_directory=TEST_TYPES_TEMPLATE_FOLDER), key=lambda key: str(key))
for yaml_file in files:
with as_file(yaml_file) as f:
with f.open("r") as file:
data = safe_load(file)
_process_yaml_for_import(
+ params_mapping,
data,
TEST_TYPES_PARENT_TABLE,
TEST_TYPES_PARENT_KEY,
@@ -188,6 +189,7 @@ def import_metadata_records_from_yaml() -> None:
LOG.info(f"Importing {yaml_file}")
data = safe_load(file)
_process_yaml_for_import(
+ params_mapping,
data,
ANOMALY_TYPES_PARENT_TABLE,
ANOMALY_TYPES_PARENT_KEY,
@@ -204,13 +206,12 @@ def _wrap_literal(table_name: str, recs: list[dict], literal_fields: dict[str, l
if isinstance(val, str) and val != "":
rec[fld] = LiteralString(val)
-def _process_records_for_export(export_path:str, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]], literal_fields:dict[str, list[str]]) -> None:
+def _process_records_for_export(params_mapping: dict, export_path:str, parent_table:str, parent_key:str, child_tables:list[str], default_pk:dict[str, list[str]], parent_child_column_map:dict[str, dict[str,str]], literal_fields:dict[str, list[str]]) -> None:
if not isdir(export_path):
mkdir(export_path)
- schema = get_tg_schema()
fetch_parent_query = f"""
SELECT *
- FROM {schema}.{parent_table};
+ FROM {params_mapping["SCHEMA_NAME"]}.{parent_table};
"""
parent_records, parent_columns, _ = fetch_from_db_threaded(
[(fetch_parent_query, None)],
@@ -220,7 +221,7 @@ def _process_records_for_export(export_path:str, parent_table:str, parent_key:st
for child_name in child_tables:
child_key = next(key for key, value in parent_child_column_map[child_name].items() if value==parent_key)
fetch_children_query = f"""
- SELECT * FROM {schema}.{child_name}
+ SELECT * FROM {params_mapping["SCHEMA_NAME"]}.{child_name}
WHERE {child_key} = '{parent_record_dict[parent_key]}'
ORDER BY {", ".join(default_pk[child_name])};
"""
@@ -241,9 +242,10 @@ def _process_records_for_export(export_path:str, parent_table:str, parent_key:st
safe_dump(payload, f, sort_keys=False)
-def export_metadata_records_to_yaml(templates_path) -> None:
+def export_metadata_records_to_yaml(params_mapping: dict, templates_path: str) -> None:
_add_literal_representer()
_process_records_for_export(
+ params_mapping,
f"{templates_path}{path_seperator}{TEST_TYPES_TEMPLATE_FOLDER}",
TEST_TYPES_PARENT_TABLE,
TEST_TYPES_PARENT_KEY,
@@ -253,6 +255,7 @@ def export_metadata_records_to_yaml(templates_path) -> None:
TEST_TYPES_LITERAL_FIELDS,
)
_process_records_for_export(
+ params_mapping,
f"{templates_path}{path_seperator}{ANOMALY_TYPES_TEMPLATE_FOLDER}",
ANOMALY_TYPES_PARENT_TABLE,
ANOMALY_TYPES_PARENT_KEY,
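Note on the read_yaml_metadata_records.py changes above: the schema name and admin credentials now come from the params_mapping passed by the caller (run_launch_db_config, run_upgrade_db_config, and the exporter) instead of being resolved inside the module, and each imported record is written with an INSERT ... ON CONFLICT ... DO UPDATE upsert. A simplified sketch of that statement shape, with placeholder names rather than the project's tables and placeholder bind syntax:

    # Illustrative only: mirrors the upsert pattern in _process_yaml_for_import,
    # assuming named bind parameters (:col) and EXCLUDED.* for the update branch.
    def build_upsert(schema: str, table: str, record: dict, pk: str) -> str:
        columns = list(record.keys())
        insert_cols = ", ".join(columns)
        insert_vals = ", ".join(f":{c}" for c in columns)
        update_stmt = ", ".join(f"{c}=EXCLUDED.{c}" for c in columns if c != pk)
        return (
            f"INSERT INTO {schema}.{table} ({insert_cols}) "
            f"VALUES ({insert_vals}) "
            f"ON CONFLICT ({pk}) DO UPDATE SET {update_stmt};"
        )

    # e.g. build_upsert("my_schema", "test_types", {"test_type": "Valid_Month", "active": "N"}, "test_type")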
From c70ec9e2d1bbbc6c415aa4fe3d6b77c00cb57eb8 Mon Sep 17 00:00:00 2001
From: Diogo Basto
Date: Mon, 22 Sep 2025 18:06:50 +0100
Subject: [PATCH 09/48] remove obs url
---
testgen/commands/run_test_metadata_exporter.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/testgen/commands/run_test_metadata_exporter.py b/testgen/commands/run_test_metadata_exporter.py
index 126839ef..b204a554 100644
--- a/testgen/commands/run_test_metadata_exporter.py
+++ b/testgen/commands/run_test_metadata_exporter.py
@@ -12,7 +12,6 @@ def _get_params_mapping() -> dict:
"SCHEMA_NAME": get_tg_schema(),
"TESTGEN_ADMIN_USER": settings.DATABASE_ADMIN_USER,
"TESTGEN_ADMIN_PASSWORD": settings.DATABASE_ADMIN_PASSWORD,
- "OBSERVABILITY_URL": settings.OBSERVABILITY_API_URL,
}
@with_database_session
From c60a840d66d10a86f051c073a87b0a56be67a1db Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Tue, 16 Sep 2025 17:19:20 -0400
Subject: [PATCH 10/48] fix(sorting selector): selected items not sorted
correctly
---
.../ui/components/frontend/js/components/sorting_selector.js | 2 +-
testgen/ui/views/hygiene_issues.py | 2 +-
testgen/ui/views/test_results.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/testgen/ui/components/frontend/js/components/sorting_selector.js b/testgen/ui/components/frontend/js/components/sorting_selector.js
index 0833d3dd..847850e5 100644
--- a/testgen/ui/components/frontend/js/components/sorting_selector.js
+++ b/testgen/ui/components/frontend/js/components/sorting_selector.js
@@ -139,7 +139,7 @@ const SortingSelector = (/** @type {Properties} */ props) => {
},
Object.entries(componentState)
.filter(([, colState]) => colState.val.order != null)
- .sort(([, colState]) => colState.val.order)
+ .sort(([, colStateA], [, colStateB]) => colStateA.val.order - colStateB.val.order)
.map(([colId,]) => activeColumnItem(colId))
),
div(
diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py
index 7cc788d0..dbf6082e 100644
--- a/testgen/ui/views/hygiene_issues.py
+++ b/testgen/ui/views/hygiene_issues.py
@@ -610,7 +610,7 @@ def source_data_dialog(selected_row):
if len(df_bad) == 500:
testgen.caption("* Top 500 records displayed", "text-align: right;")
# Display the dataframe
- st.dataframe(df_bad, height=500, width=1050, hide_index=True)
+ st.dataframe(df_bad, width=1050, hide_index=True)
def do_disposition_update(selected, str_new_status):
diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py
index 80d24bf1..5b747ec1 100644
--- a/testgen/ui/views/test_results.py
+++ b/testgen/ui/views/test_results.py
@@ -838,7 +838,7 @@ def source_data_dialog(selected_row):
if len(df_bad) == 500:
testgen.caption("* Top 500 records displayed", "text-align: right;")
# Display the dataframe
- st.dataframe(df_bad, height=500, width=1050, hide_index=True)
+ st.dataframe(df_bad, width=1050, hide_index=True)
def view_edit_test(button_container, test_definition_id):
From 35ccbefe27e10e288926073769878cf81510988b Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Wed, 17 Sep 2025 17:08:17 -0400
Subject: [PATCH 11/48] fix(sql): quote snowflake identifiers correctly
---
testgen/commands/queries/execute_tests_query.py | 5 +++--
testgen/common/clean_sql.py | 14 +++++++++-----
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py
index 93010829..c0c1d88f 100644
--- a/testgen/commands/queries/execute_tests_query.py
+++ b/testgen/commands/queries/execute_tests_query.py
@@ -1,6 +1,7 @@
from typing import ClassVar, TypedDict
-from testgen.common import AddQuotesToIdentifierCSV, CleanSQL, ConcatColumnList, date_service, read_template_sql_file
+from testgen.common import date_service, read_template_sql_file
+from testgen.common.clean_sql import CleanSQL, ConcatColumnList, quote_identifiers
from testgen.common.database.database_service import replace_params
@@ -113,7 +114,7 @@ def _get_query(
if self.test_params:
column_name = self.test_params["column_name"]
- params["COLUMN_NAME"] = AddQuotesToIdentifierCSV(column_name) if column_name else ""
+ params["COLUMN_NAME"] = quote_identifiers(column_name, self.flavor) if column_name else ""
# Shows contents without double-quotes for display and aggregate expressions
params["COLUMN_NAME_NO_QUOTES"] = column_name or ""
# Concatenates column list into single expression for relative entropy
diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py
index 8c275ac4..8f1375c6 100644
--- a/testgen/common/clean_sql.py
+++ b/testgen/common/clean_sql.py
@@ -1,5 +1,3 @@
-__all__ = ["AddQuotesToIdentifierCSV", "CleanSQL", "ConcatColumnList"]
-
import re
@@ -16,7 +14,7 @@ def CleanSQL(strInput: str) -> str:
return " ".join(parts)
-def AddQuotesToIdentifierCSV(strInput: str) -> str:
+def quote_identifiers(identifiers: str, flavor: str) -> str:
# Keywords -- identifiers to quote
keywords = [
"select",
@@ -28,11 +26,17 @@ def AddQuotesToIdentifierCSV(strInput: str) -> str:
]
quoted_values = []
- for value in strInput.split(","):
+ for value in identifiers.split(","):
value = value.strip()
if value.startswith('"') and value.endswith('"'):
quoted_values.append(value)
- elif any(c.isupper() or c.isspace() or value.lower() in keywords for c in value):
+ elif any(
+ (flavor == "snowflake" and c.lower())
+ or (flavor != "snowflake" and c.isupper())
+ or c.isspace()
+ or value.lower() in keywords
+ for c in value
+ ):
quoted_values.append(f'"{value}"')
else:
quoted_values.append(value)
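Note on the clean_sql.py change above: identifier quoting is now flavor-aware. As the new condition reads, an unquoted Snowflake identifier is always wrapped in double quotes (Snowflake upper-cases unquoted identifiers, so quoting preserves the stored case), while other flavors keep the earlier rule of quoting only on uppercase letters, whitespace, or reserved words. A simplified standalone sketch of that decision, not the project's quote_identifiers itself:

    # Illustrative model of the flavor-aware quoting rule; KEYWORDS is an
    # assumed subset of the list in the patch.
    KEYWORDS = {"select", "from", "where"}

    def quote_identifier(value: str, flavor: str) -> str:
        value = value.strip()
        if value.startswith('"') and value.endswith('"'):
            return value  # already quoted: leave as-is
        needs_quotes = (
            flavor == "snowflake"
            or any(c.isupper() or c.isspace() for c in value)
            or value.lower() in KEYWORDS
        )
        return f'"{value}"' if needs_quotes else value

    # quote_identifier("Order Date", "postgresql") -> '"Order Date"'
    # quote_identifier("order_date", "snowflake")  -> '"order_date"'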
From cc50412b1a758b2dc8abb64960dac6a40dd43108 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Wed, 17 Sep 2025 18:22:21 -0400
Subject: [PATCH 12/48] fix(tests): don't quote columns in Timeframe Combo
Match
---
.../test_types_Timeframe_Combo_Match.yaml | 40 +++++++++----------
.../ex_window_match_same_databricks.sql | 8 ++--
.../ex_window_match_same_generic.sql | 8 ++--
.../ex_window_match_same_postgresql.sql | 8 ++--
4 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
index 73283c62..1b35da33 100644
--- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
@@ -43,12 +43,12 @@ test_types:
lookup_type: null
lookup_query: |2-
(
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
@@ -56,13 +56,13 @@ test_types:
)
UNION ALL
(
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
@@ -75,12 +75,12 @@ test_types:
lookup_type: null
lookup_query: |2-
(
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
EXCEPT
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
@@ -88,13 +88,13 @@ test_types:
)
UNION ALL
(
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
EXCEPT
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}))
@@ -107,12 +107,12 @@ test_types:
lookup_type: null
lookup_query: |2-
(
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
@@ -120,13 +120,13 @@ test_types:
)
UNION ALL
(
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
@@ -139,12 +139,12 @@ test_types:
lookup_type: null
lookup_query: |2-
(
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
@@ -152,13 +152,13 @@ test_types:
)
UNION ALL
(
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
@@ -171,12 +171,12 @@ test_types:
lookup_type: null
lookup_query: |2-
(
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
@@ -184,13 +184,13 @@ test_types:
)
UNION ALL
(
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
- SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {TARGET_SCHEMA}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql
index 2fe39587..ce52e82f 100644
--- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql
+++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql
@@ -32,12 +32,12 @@ SELECT '{TEST_TYPE}' as test_type,
NULL as result_query
FROM (
(
-SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
EXCEPT
-SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
@@ -45,13 +45,13 @@ WHERE {SUBSET_CONDITION}
)
UNION ALL
(
-SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
AND {WINDOW_DATE_COLUMN} < DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
EXCEPT
-SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD(day, - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql
index 42e603be..c2088220 100644
--- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql
+++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql
@@ -32,12 +32,12 @@ SELECT '{TEST_TYPE}' as test_type,
NULL as result_query
FROM (
(
-SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
EXCEPT
-SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
@@ -45,13 +45,13 @@ WHERE {SUBSET_CONDITION}
)
UNION ALL
(
-SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
EXCEPT
-SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}))
diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql
index 4a6aaee4..81106fb0 100644
--- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql
+++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql
@@ -32,12 +32,12 @@ SELECT '{TEST_TYPE}' as test_type,
NULL as result_query
FROM (
(
-SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
-SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
@@ -45,13 +45,13 @@ WHERE {SUBSET_CONDITION}
)
UNION ALL
(
-SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS}
EXCEPT
-SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES}
FROM {SCHEMA_NAME}.{TABLE_NAME}
WHERE {SUBSET_CONDITION}
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS}
From 4d1e261ad55c9c3141ec4a998bf75347f0640a42 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Thu, 18 Sep 2025 15:08:06 -0400
Subject: [PATCH 13/48] fix(redshift): profiling error for timestamps with time
zone
---
.../project_profiling_query_redshift.yaml | 20 +++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
index 1596dd1d..b3cf7277 100644
--- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
+++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
@@ -136,17 +136,17 @@ strTemplate11_D: CASE
ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01')
END as min_date,
MAX("{COL_NAME}") as max_date,
- COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct,
- COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct,
- COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct,
- COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct,
- COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct,
- COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct,
COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct,
- COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct,
- COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
- COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
- COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
+ COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct,
+ COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_days_present,
+ COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_weeks_present,
+ COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_months_present,
strTemplate11_else: NULL as min_date,
NULL as max_date,
From 32517abdd0935bc02b565e1e0efc66d03e040ac4 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Thu, 18 Sep 2025 15:57:04 -0400
Subject: [PATCH 14/48] fix(postgres): profiling bugs on money and time data
types
---
.../schema_ddf_query_postgresql.sql | 2 +-
.../project_profiling_query_postgresql.yaml | 20 +++++++++----------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql b/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql
index 5e3136ca..7807e942 100644
--- a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql
+++ b/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql
@@ -26,7 +26,7 @@ SELECT '{PROJECT_CODE}' as project_code,
WHEN c.data_type ILIKE 'date'
OR c.data_type ILIKE 'timestamp%'
THEN 'D'
- WHEN c.data_type ILIKE 'time without time zone'
+ WHEN c.data_type ILIKE 'time with%'
THEN 'T'
WHEN LOWER(c.data_type) IN ('bigint', 'integer', 'smallint', 'double precision', 'real', 'numeric', 'money')
THEN 'N'
diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
index 763dd4b7..8bc65688 100644
--- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
+++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
@@ -25,7 +25,7 @@ strTemplate03_else: NULL as min_length,
strTemplate04_A: SUM(CASE
WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 ELSE 0
END) AS zero_value_ct,
-strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct,
+strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct,
strTemplate04_else: NULL as zero_value_ct,
strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct,
SUM(CASE
@@ -135,10 +135,10 @@ strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1
) AS top_freq_values,
strTemplate07_else: NULL as top_freq_values,
strTemplate08_N: MIN("{COL_NAME}") AS min_value,
- MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0,
+ MIN(CASE WHEN "{COL_NAME}"::NUMERIC > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0,
MAX("{COL_NAME}") AS max_value,
- AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value,
- STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value,
+ AVG(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS avg_value,
+ STDDEV(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS stdev_value,
MIN(pct_25) as percentile_25,
MIN(pct_50) as percentile_50,
MIN(pct_75) as percentile_75,
@@ -243,16 +243,16 @@ strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE} '
strTemplate99_N: |
, (SELECT
- PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25,
- PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50,
- PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25,
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50,
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75
FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
strTemplate99_N_sampling: |
, (SELECT
- PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25,
- PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50,
- PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25,
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50,
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75
FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile
strTemplate99_else: ' '
From 3c7f5ead048969b0619b642f3d51c132ab954be2 Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Tue, 23 Sep 2025 09:28:25 -0400
Subject: [PATCH 15/48] fix: Applying sampling to the secondary profiling query
---
testgen/commands/queries/profiling_query.py | 29 +++++++++++++++++++
...t_secondary_profiling_query_databricks.sql | 3 ++
...roject_secondary_profiling_query_mssql.sql | 5 +++-
...t_secondary_profiling_query_postgresql.sql | 3 ++
...ect_secondary_profiling_query_redshift.sql | 3 ++
...ct_secondary_profiling_query_snowflake.sql | 3 ++
6 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py
index 4eadab10..18e8e0b0 100644
--- a/testgen/commands/queries/profiling_query.py
+++ b/testgen/commands/queries/profiling_query.py
@@ -1,3 +1,4 @@
+import re
import typing
from testgen.commands.queries.refresh_data_chars_query import CRefreshDataCharsSQL
@@ -130,6 +131,7 @@ def _get_query(
params = {}
if query:
+ query = self._process_conditionals(query)
if extra_params:
params.update(extra_params)
params.update(self._get_params())
@@ -139,6 +141,33 @@ def _get_query(
return query, params
+ def _process_conditionals(self, query: str):
+ re_pattern = re.compile(r"^--\s+TG-(IF|ELSE|ENDIF)(?:\s+(\w+))?\s*$")
+ condition = None
+ updated_query = []
+ for line in query.splitlines(True):
+ if re_match := re_pattern.match(line):
+ match re_match.group(1):
+ case "IF" if condition is None and re_match.group(2) is not None:
+ condition = bool(getattr(self, re_match.group(2)))
+ case "ELSE" if condition is not None:
+ condition = not condition
+ case "ENDIF" if condition is not None:
+ condition = None
+ case _:
+ raise ValueError("Template conditional misused")
+ elif condition is not False:
+ updated_query.append(line)
+
+ if condition is not None:
+ raise ValueError("Template conditional misused")
+
+ return "".join(updated_query)
+
+ @property
+ def do_sample_bool(self):
+ return self.parm_do_sample == "Y"
+
def GetSecondProfilingColumnsQuery(self) -> tuple[str, dict]:
# Runs on App database
return self._get_query("secondary_profiling_columns.sql")
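The TG-IF / TG-ELSE / TG-ENDIF markers consumed by _process_conditionals above form a small line-level preprocessor: a block opened by "-- TG-IF <attribute>" is kept only when the named attribute on the query object (here do_sample_bool) is truthy, "-- TG-ELSE" inverts the condition, and "-- TG-ENDIF" closes it. A minimal standalone sketch of that behavior follows; the strip_conditionals helper and the sample template are illustrative only, not code from this patch.

import re

MARKER = re.compile(r"^--\s+TG-(IF|ELSE|ENDIF)(?:\s+(\w+))?\s*$")

def strip_conditionals(query: str, flags: dict) -> str:
    # Keep or drop lines between TG-IF / TG-ELSE / TG-ENDIF based on flags.
    condition = None
    kept = []
    for line in query.splitlines(True):
        if match := MARKER.match(line):
            keyword, attribute = match.group(1), match.group(2)
            if keyword == "IF" and condition is None and attribute:
                condition = bool(flags.get(attribute))
            elif keyword == "ELSE" and condition is not None:
                condition = not condition
            elif keyword == "ENDIF" and condition is not None:
                condition = None
            else:
                raise ValueError("Template conditional misused")
        elif condition is not False:
            kept.append(line)
    if condition is not None:
        raise ValueError("Template conditional misused")
    return "".join(kept)

template = (
    "FROM {DATA_SCHEMA}.{DATA_TABLE}\n"
    "-- TG-IF do_sample_bool\n"
    " TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)\n"
    "-- TG-ENDIF\n"
    "WHERE \"{COL_NAME}\" > ' '\n"
)
print(strip_conditionals(template, {"do_sample_bool": False}))
# Prints only the FROM and WHERE lines; with do_sample_bool=True the
# TABLESAMPLE line is kept, matching how the flavor templates below opt in.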
diff --git a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql
index 483fb373..601098dc 100644
--- a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql
+++ b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql
@@ -5,6 +5,9 @@ AS
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn
FROM {DATA_SCHEMA}.{DATA_TABLE}
+-- TG-IF do_sample_bool
+ TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)
+-- TG-ENDIF
WHERE `{COL_NAME}` > ' '
GROUP BY `{COL_NAME}`
),
diff --git a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql
index cdb368fe..eee2416c 100644
--- a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql
+++ b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql
@@ -5,6 +5,9 @@ AS
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn
FROM {DATA_SCHEMA}.{DATA_TABLE}
+-- TG-IF do_sample_bool
+ TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)
+-- TG-ENDIF
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
),
@@ -33,4 +36,4 @@ SELECT '{PROJECT_CODE}' as project_code,
) as distinct_value_hash
FROM consol_vals;
--- Convert function has style = 2 : The characters 0x aren't added to the left of the converted result for style 2.
\ No newline at end of file
+-- Convert function has style = 2 : The characters 0x aren't added to the left of the converted result for style 2.
diff --git a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql
index 857af0d3..e3261f14 100644
--- a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql
+++ b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql
@@ -4,6 +4,9 @@ WITH ranked_vals AS (
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
FROM {DATA_SCHEMA}.{DATA_TABLE}
+-- TG-IF do_sample_bool
+ TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)
+-- TG-ENDIF
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
),
diff --git a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql
index 84c25587..6a0a3d5b 100644
--- a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql
+++ b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql
@@ -5,6 +5,9 @@ WITH ranked_vals AS (
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
FROM {DATA_SCHEMA}.{DATA_TABLE}
WHERE "{COL_NAME}" > ' '
+-- TG-IF do_sample_bool
+ AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}
+-- TG-ENDIF
GROUP BY "{COL_NAME}"
),
consol_vals AS (
diff --git a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql
index 3e186892..709643b5 100644
--- a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql
+++ b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql
@@ -4,6 +4,9 @@ WITH ranked_vals AS (
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
FROM {DATA_SCHEMA}.{DATA_TABLE}
+-- TG-IF do_sample_bool
+ SAMPLE ({SAMPLE_SIZE} rows)
+-- TG-ENDIF
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
),
From 6fcca5eb0a4ef46d0a69a070bbef104d476521d6 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Wed, 3 Sep 2025 23:44:15 -0400
Subject: [PATCH 16/48] feat: add support for redshift spectrum
---
.../queries/refresh_data_chars_query.py | 52 +++--
.../test_parameter_validation_query.py | 6 +-
testgen/common/database/database_service.py | 2 +-
.../common/database/flavor/flavor_service.py | 2 +-
.../redshift_spectrum_flavor_service.py | 5 +
testgen/common/models/connection.py | 2 +-
.../020_create_standard_functions_sprocs.sql | 2 +-
..._anomaly_types_Boolean_Value_Mismatch.yaml | 8 +
...anomaly_types_Char_Column_Date_Values.yaml | 8 +
...omaly_types_Char_Column_Number_Values.yaml | 8 +
...anomaly_types_Column_Pattern_Mismatch.yaml | 8 +
...anomaly_types_Delimited_Data_Embedded.yaml | 8 +
...ile_anomaly_types_Inconsistent_Casing.yaml | 14 ++
...rofile_anomaly_types_Invalid_Zip3_USA.yaml | 8 +
...profile_anomaly_types_Invalid_Zip_USA.yaml | 8 +
.../profile_anomaly_types_Leading_Spaces.yaml | 8 +
...le_anomaly_types_Multiple_Types_Major.yaml | 8 +
...le_anomaly_types_Multiple_Types_Minor.yaml | 8 +
.../profile_anomaly_types_No_Values.yaml | 8 +
..._anomaly_types_Non_Alpha_Name_Address.yaml | 10 +
...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 10 +
...file_anomaly_types_Non_Printing_Chars.yaml | 21 ++
...ile_anomaly_types_Non_Standard_Blanks.yaml | 8 +
...le_anomaly_types_Potential_Duplicates.yaml | 8 +
.../profile_anomaly_types_Potential_PII.yaml | 8 +
.../profile_anomaly_types_Quoted_Values.yaml | 8 +
...rofile_anomaly_types_Recency_One_Year.yaml | 8 +
...file_anomaly_types_Recency_Six_Months.yaml | 8 +
...nomaly_types_Small_Divergent_Value_Ct.yaml | 8 +
..._anomaly_types_Small_Missing_Value_Ct.yaml | 8 +
..._anomaly_types_Small_Numeric_Value_Ct.yaml | 8 +
...maly_types_Standardized_Value_Matches.yaml | 8 +
.../profile_anomaly_types_Suggested_Type.yaml | 8 +
..._anomaly_types_Table_Pattern_Mismatch.yaml | 8 +
...ofile_anomaly_types_Unexpected_Emails.yaml | 8 +
...le_anomaly_types_Unexpected_US_States.yaml | 8 +
...le_anomaly_types_Unlikely_Date_Values.yaml | 8 +
...le_anomaly_types_Variant_Coded_Values.yaml | 8 +
.../test_types_Aggregate_Balance.yaml | 28 +++
.../test_types_Aggregate_Balance_Percent.yaml | 30 +++
.../test_types_Aggregate_Balance_Range.yaml | 30 +++
.../test_types_Aggregate_Minimum.yaml | 28 +++
.../test_types_Alpha_Trunc.yaml | 16 ++
.../test_types_Avg_Shift.yaml | 16 ++
.../dbsetup_test_types/test_types_CUSTOM.yaml | 4 +
.../test_types_Combo_Match.yaml | 25 ++
.../test_types_Condition_Flag.yaml | 16 ++
.../test_types_Constant.yaml | 16 ++
.../test_types_Daily_Record_Ct.yaml | 16 ++
.../test_types_Dec_Trunc.yaml | 16 ++
.../test_types_Distinct_Date_Ct.yaml | 16 ++
.../test_types_Distinct_Value_Ct.yaml | 16 ++
.../test_types_Distribution_Shift.yaml | 30 +++
.../test_types_Dupe_Rows.yaml | 17 ++
.../test_types_Email_Format.yaml | 16 ++
.../test_types_Future_Date.yaml | 16 ++
.../test_types_Future_Date_1Y.yaml | 16 ++
.../test_types_Incr_Avg_Shift.yaml | 16 ++
.../test_types_LOV_All.yaml | 16 ++
.../test_types_LOV_Match.yaml | 16 ++
.../test_types_Min_Date.yaml | 16 ++
.../test_types_Min_Val.yaml | 16 ++
.../test_types_Missing_Pct.yaml | 16 ++
.../test_types_Monthly_Rec_Ct.yaml | 16 ++
.../test_types_Outlier_Pct_Above.yaml | 16 ++
.../test_types_Outlier_Pct_Below.yaml | 16 ++
.../test_types_Pattern_Match.yaml | 16 ++
.../test_types_Recency.yaml | 16 ++
.../test_types_Required.yaml | 16 ++
.../dbsetup_test_types/test_types_Row_Ct.yaml | 16 ++
.../test_types_Row_Ct_Pct.yaml | 16 ++
.../test_types_Street_Addr_Pattern.yaml | 16 ++
.../test_types_Table_Freshness.yaml | 4 +
.../test_types_Timeframe_Combo_Gain.yaml | 23 ++
.../test_types_Timeframe_Combo_Match.yaml | 36 +++
.../test_types_US_State.yaml | 16 ++
.../dbsetup_test_types/test_types_Unique.yaml | 16 ++
.../test_types_Unique_Pct.yaml | 16 ++
.../test_types_Valid_Characters.yaml | 16 ++
.../test_types_Valid_Month.yaml | 8 +
.../test_types_Valid_US_Zip.yaml | 16 ++
.../test_types_Valid_US_Zip3.yaml | 16 ++
.../test_types_Variability_Decrease.yaml | 16 ++
.../test_types_Variability_Increase.yaml | 16 ++
.../test_types_Weekly_Rec_Ct.yaml | 16 ++
.../dbupgrade/0152_incremental_upgrade.sql | 27 +++
...ric.sql => ex_get_project_column_list.sql} | 0
.../schema_ddf_query_redshift_spectrum.sql | 37 +++
...t_table_sample_count_redshift_spectrum.sql | 23 ++
...ect_profiling_query_redshift_spectrum.yaml | 213 ++++++++++++++++++
...dary_profiling_query_redshift_spectrum.sql | 32 +++
.../profiling/templated_functions.yaml | 101 +++++++++
.../ex_get_project_column_list.sql | 3 +
.../profiling/datatype_suggestions.sql | 2 +
.../frontend/js/components/connection_form.js | 12 +
testgen/ui/views/connections.py | 6 +
tests/unit/test_profiling_query.py | 4 +-
97 files changed, 1594 insertions(+), 31 deletions(-)
create mode 100644 testgen/common/database/flavor/redshift_spectrum_flavor_service.py
create mode 100644 testgen/template/dbupgrade/0152_incremental_upgrade.sql
rename testgen/template/flavors/generic/validate_tests/{ex_get_project_column_list_generic.sql => ex_get_project_column_list.sql} (100%)
create mode 100644 testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql
create mode 100644 testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql
create mode 100644 testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml
create mode 100644 testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql
create mode 100644 testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml
create mode 100644 testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql
diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py
index 414616f0..983b360a 100644
--- a/testgen/commands/queries/refresh_data_chars_query.py
+++ b/testgen/commands/queries/refresh_data_chars_query.py
@@ -44,43 +44,47 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "data_
query = replace_params(query, params)
return query, params
- def _get_mask_query(self, mask: str, is_include: bool) -> str:
- escape = ""
+ def _get_table_criteria(self) -> str:
+ table_criteria = ""
+
+ table_ref = "c.table_name"
+ escape_clause = ""
+ escaped_underscore = "\\_"
if self.sql_flavor.startswith("mssql"):
escaped_underscore = "[_]"
elif self.sql_flavor == "snowflake":
escaped_underscore = "\\\\_"
- escape = "ESCAPE '\\\\'"
+ escape_clause = "ESCAPE '\\\\'"
elif self.sql_flavor == "redshift":
escaped_underscore = "\\\\_"
- else:
- escaped_underscore = "\\_"
-
- table_names = [ item.strip().replace("_", escaped_underscore) for item in mask.split(",") ]
- sub_query = f"""
- AND {"NOT" if not is_include else ""} (
- {" OR ".join([ f"(c.table_name LIKE '{item}' {escape})" for item in table_names ])}
- )
- """
-
- return sub_query
-
- def GetDDFQuery(self) -> tuple[str, dict]:
- # Runs on Target database
- query, params = self._get_query(f"schema_ddf_query_{self.sql_flavor}.sql", f"flavors/{self.sql_flavor}/data_chars")
+ elif self.sql_flavor == "redshift_spectrum":
+ table_ref = "c.tablename"
- table_criteria = ""
if self.profiling_table_set:
- table_criteria += f" AND c.table_name IN ({self.profiling_table_set})"
+ table_criteria += f" AND {table_ref} IN ({self.profiling_table_set})"
if self.profiling_include_mask:
- table_criteria += self._get_mask_query(self.profiling_include_mask, is_include=True)
+ include_table_names = [ item.strip().replace("_", escaped_underscore) for item in self.profiling_include_mask.split(",") ]
+ table_criteria += f"""
+ AND (
+ {" OR ".join([ f"({table_ref} LIKE '{item}' {escape_clause})" for item in include_table_names ])}
+ )
+ """
if self.profiling_exclude_mask:
- table_criteria += self._get_mask_query(self.profiling_exclude_mask, is_include=False)
-
- query = query.replace("{TABLE_CRITERIA}", table_criteria)
+ exclude_table_names = [ item.strip().replace("_", escaped_underscore) for item in self.profiling_exclude_mask.split(",") ]
+ table_criteria += f"""
+ AND NOT (
+ {" OR ".join([ f"({table_ref} LIKE '{item}' {escape_clause})" for item in exclude_table_names ])}
+ )
+ """
+ return table_criteria
+
+ def GetDDFQuery(self) -> tuple[str, dict]:
+ # Runs on Target database
+ query, params = self._get_query(f"schema_ddf_query_{self.sql_flavor}.sql", f"flavors/{self.sql_flavor}/data_chars")
+ query = query.replace("{TABLE_CRITERIA}", self._get_table_criteria())
return query, params
def GetRecordCountQueries(self, schema_tables: list[str]) -> list[tuple[str, None]]:
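The include and exclude masks handled by _get_table_criteria above are comma-separated LIKE patterns whose literal underscores are escaped per flavor before being OR-ed into the table filter. A rough standalone sketch of that transformation; mask_to_criteria is a hypothetical helper, not code from this patch.

def mask_to_criteria(mask: str, negate: bool = False,
                     table_ref: str = "c.table_name",
                     escaped_underscore: str = "\\_",
                     escape_clause: str = "") -> str:
    # Escape literal underscores so they match themselves rather than any character.
    names = [item.strip().replace("_", escaped_underscore) for item in mask.split(",")]
    likes = " OR ".join(f"({table_ref} LIKE '{name}' {escape_clause})" for name in names)
    # negate=True corresponds to the exclude-mask branch (AND NOT (...)).
    return f" AND {'NOT ' if negate else ''}({likes})"

print(mask_to_criteria("stg_%, tmp_orders"))
#  AND ((c.table_name LIKE 'stg\_%' ) OR (c.table_name LIKE 'tmp\_orders' ))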
diff --git a/testgen/commands/queries/test_parameter_validation_query.py b/testgen/commands/queries/test_parameter_validation_query.py
index ec8cf408..bfad6a50 100644
--- a/testgen/commands/queries/test_parameter_validation_query.py
+++ b/testgen/commands/queries/test_parameter_validation_query.py
@@ -47,7 +47,11 @@ def GetTestValidationColumns(self) -> tuple[str, dict]:
def GetProjectTestValidationColumns(self) -> tuple[str, dict]:
# Runs on Target database
- return self._get_query("ex_get_project_column_list_generic.sql", "flavors/generic/validate_tests")
+ filename = "ex_get_project_column_list.sql"
+ try:
+ return self._get_query(filename, f"flavors/{self.flavor}/validate_tests")
+ except ModuleNotFoundError:
+ return self._get_query(filename, "flavors/generic/validate_tests")
def PrepFlagTestsWithFailedValidation(self) -> tuple[str, dict]:
# Runs on App database
diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py
index 75016501..42cb268e 100644
--- a/testgen/common/database/database_service.py
+++ b/testgen/common/database/database_service.py
@@ -77,7 +77,7 @@ def set_target_db_params(connection_params: ConnectionParams) -> None:
def get_flavor_service(flavor: SQLFlavor) -> FlavorService:
module_path = f"testgen.common.database.flavor.{flavor}_flavor_service"
- class_name = f"{flavor.capitalize()}FlavorService"
+ class_name = f"{flavor.replace('_', ' ').title().replace(' ', '')}FlavorService"
module = importlib.import_module(module_path)
flavor_class = getattr(module, class_name)
return flavor_class()
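The revised class-name derivation above is what lets a two-word flavor code such as redshift_spectrum resolve to its service class: snake_case codes become CamelCase class names. A quick self-contained check; flavor_class_name is an illustrative standalone function, not part of the patch.

def flavor_class_name(flavor: str) -> str:
    # Mirrors the expression used in get_flavor_service: snake_case -> CamelCase.
    return f"{flavor.replace('_', ' ').title().replace(' ', '')}FlavorService"

assert flavor_class_name("redshift") == "RedshiftFlavorService"
assert flavor_class_name("redshift_spectrum") == "RedshiftSpectrumFlavorService"
assert flavor_class_name("postgresql") == "PostgresqlFlavorService"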
diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py
index a257b2da..a5618bc6 100644
--- a/testgen/common/database/flavor/flavor_service.py
+++ b/testgen/common/database/flavor/flavor_service.py
@@ -4,7 +4,7 @@
from testgen.common.encrypt import DecryptText
-SQLFlavor = Literal["redshift", "snowflake", "mssql", "postgresql", "databricks"]
+SQLFlavor = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "postgresql", "databricks"]
class ConnectionParams(TypedDict):
diff --git a/testgen/common/database/flavor/redshift_spectrum_flavor_service.py b/testgen/common/database/flavor/redshift_spectrum_flavor_service.py
new file mode 100644
index 00000000..6554a8a8
--- /dev/null
+++ b/testgen/common/database/flavor/redshift_spectrum_flavor_service.py
@@ -0,0 +1,5 @@
+from testgen.common.database.flavor.redshift_flavor_service import RedshiftFlavorService
+
+
+class RedshiftSpectrumFlavorService(RedshiftFlavorService):
+ pass
diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py
index 660f51fd..444ac3e5 100644
--- a/testgen/common/models/connection.py
+++ b/testgen/common/models/connection.py
@@ -27,7 +27,7 @@
from testgen.common.models.table_group import TableGroup
from testgen.utils import is_uuid4
-SQLFlavorCode = Literal["redshift", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks"]
+SQLFlavorCode = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks"]
@dataclass
diff --git a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql
index 40e7d585..01b65623 100644
--- a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql
+++ b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql
@@ -74,7 +74,7 @@ BEGIN
IF lower_case_sql_flavor IN ('postgres', 'postgresql') THEN
escaped_value := QUOTE_LITERAL(var_value);
- ELSIF lower_case_sql_flavor IN ('redshift', 'snowflake') THEN
+ ELSIF lower_case_sql_flavor IN ('redshift', 'redshift_spectrum', 'snowflake') THEN
escaped_value := TRIM(LEADING 'E' FROM QUOTE_LITERAL(var_value));
ELSIF lower_case_sql_flavor = 'mssql' THEN
escaped_value := '''' || REPLACE(var_value, '''', '''''') || '''';
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
index 10cedbcf..b3e7f5aa 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml
@@ -55,6 +55,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
+ - id: '1447'
+ test_id: '1015'
+ test_type: Boolean_Value_Mismatch
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
- id: '1186'
test_id: '1015'
test_type: Boolean_Value_Mismatch
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
index f31357b2..96e07437 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml
@@ -54,6 +54,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
+ - id: '1444'
+ test_id: '1012'
+ test_type: Char_Column_Date_Values
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
- id: '1183'
test_id: '1012'
test_type: Char_Column_Date_Values
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
index fbc78bc8..fe734a80 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml
@@ -54,6 +54,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
+ - id: '1443'
+ test_id: '1011'
+ test_type: Char_Column_Number_Values
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
- id: '1182'
test_id: '1011'
test_type: Char_Column_Number_Values
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
index 2c69ccfa..55a33bd1 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml
@@ -61,6 +61,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;
error_type: Profile Anomaly
+ - id: '1439'
+ test_id: '1007'
+ test_type: Column_Pattern_Mismatch
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;
+ error_type: Profile Anomaly
- id: '1178'
test_id: '1007'
test_type: Column_Pattern_Mismatch
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
index eb5d7db4..5f1efb38 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml
@@ -48,6 +48,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1457'
+ test_id: '1025'
+ test_type: Delimited_Data_Embedded
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1196'
test_id: '1025'
test_type: Delimited_Data_Embedded
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
index 865601d2..4e33416d 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
@@ -73,6 +73,20 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
GROUP BY "{COLUMN_NAME}" LIMIT 20)
error_type: Profile Anomaly
+ - id: '1473'
+ test_id: '1028'
+ test_type: Inconsistent_Casing
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ UNION ALL
+ (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}")
+ GROUP BY "{COLUMN_NAME}" LIMIT 20)
+ error_type: Profile Anomaly
- id: '1261'
test_id: '1028'
test_type: Inconsistent_Casing
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
index 14130bfd..5ad0fe18 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml
@@ -52,6 +52,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
+ - id: '1456'
+ test_id: '1024'
+ test_type: Invalid_Zip3_USA
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
- id: '1195'
test_id: '1024'
test_type: Invalid_Zip3_USA
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
index 32cee0ac..dc5c80b0 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml
@@ -48,6 +48,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
+ - id: '1435'
+ test_id: '1003'
+ test_type: Invalid_Zip_USA
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
- id: '1174'
test_id: '1003'
test_type: Invalid_Zip_USA
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
index 812146b8..a0ce8896 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml
@@ -49,6 +49,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
+ - id: '1441'
+ test_id: '1009'
+ test_type: Leading_Spaces
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
- id: '1180'
test_id: '1009'
test_type: Leading_Spaces
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
index 53e7a7a9..aec5f134 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml
@@ -48,6 +48,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
error_type: Profile Anomaly
+ - id: '1437'
+ test_id: '1005'
+ test_type: Multiple_Types_Major
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename;
+ error_type: Profile Anomaly
- id: '1176'
test_id: '1005'
test_type: Multiple_Types_Major
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
index f55ab2f6..1030b543 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
@@ -48,6 +48,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
error_type: Profile Anomaly
+ - id: '1436'
+ test_id: '1004'
+ test_type: Multiple_Types_Minor
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename;
+ error_type: Profile Anomaly
- id: '1175'
test_id: '1004'
test_type: Multiple_Types_Minor
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
index a70fea71..68dde091 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
@@ -51,6 +51,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
+ - id: '1438'
+ test_id: '1006'
+ test_type: No_Values
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
- id: '1177'
test_id: '1006'
test_type: No_Values
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
index b5e9f27c..aba7ef44 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
@@ -57,6 +57,16 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
GROUP BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
+ - id: '1474'
+ test_id: '1029'
+ test_type: Non_Alpha_Name_Address
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
+ GROUP BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
- id: '1266'
test_id: '1029'
test_type: Non_Alpha_Name_Address
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
index 807e49d0..b75744b7 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
@@ -57,6 +57,16 @@ profile_anomaly_types:
WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
+ - id: '1475'
+ test_id: '1030'
+ test_type: Non_Alpha_Prefixed_Name
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
- id: '1271'
test_id: '1030'
test_type: Non_Alpha_Prefixed_Name
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
index 0abc0e99..6761e2bc 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
@@ -102,6 +102,27 @@ profile_anomaly_types:
WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
error_type: Profile Anomaly
+ - id: '1476'
+ test_id: '1031'
+ test_type: Non_Printing_Chars
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ CHR(160), '\x160'),
+ CHR(8201), '\x8201'),
+ CHR(8203), '\x8203'),
+ CHR(8204), '\x8204'),
+ CHR(8205), '\x8205'),
+ CHR(8206), '\x8206'),
+ CHR(8207), '\x8207'),
+ CHR(8239), '\x8239'),
+ CHR(12288), '\x12288'),
+ CHR(65279), '\x65279') as "{COLUMN_NAME}_content",
+ COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ error_type: Profile Anomaly
- id: '1276'
test_id: '1031'
test_type: Non_Printing_Chars
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
index eaf2dae5..9c81c6c2 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
@@ -49,6 +49,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
+ - id: '1434'
+ test_id: '1002'
+ test_type: Non_Standard_Blanks
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
- id: '1173'
test_id: '1002'
test_type: Non_Standard_Blanks
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
index 4aaaa825..8ea8cc52 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
@@ -50,6 +50,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1448'
+ test_id: '1016'
+ test_type: Potential_Duplicates
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1187'
test_id: '1016'
test_type: Potential_Duplicates
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
index 09a80941..9fadca96 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
@@ -49,6 +49,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1470'
+ test_id: '1100'
+ test_type: Potential_PII
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1270'
test_id: '1100'
test_type: Potential_PII
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
index 36c535fc..34335441 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
@@ -49,6 +49,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
error_type: Profile Anomaly
+ - id: '1442'
+ test_id: '1010'
+ test_type: Quoted_Values
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Profile Anomaly
- id: '1181'
test_id: '1010'
test_type: Quoted_Values
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
index 6564153d..68823ef7 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml
@@ -48,6 +48,14 @@ profile_anomaly_types:
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
+ - id: '1451'
+ test_id: '1019'
+ test_type: Recency_One_Year
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
- id: '1190'
test_id: '1019'
test_type: Recency_One_Year
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
index ae3e25e5..0ee99bfd 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml
@@ -48,6 +48,14 @@ profile_anomaly_types:
lookup_query: |-
created_in_ui
error_type: Profile Anomaly
+ - id: '1452'
+ test_id: '1020'
+ test_type: Recency_Six_Months
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ created_in_ui
+ error_type: Profile Anomaly
- id: '1191'
test_id: '1020'
test_type: Recency_Six_Months
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
index 0d950923..20110211 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
@@ -54,6 +54,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
error_type: Profile Anomaly
+ - id: '1446'
+ test_id: '1014'
+ test_type: Small Divergent Value Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ error_type: Profile Anomaly
- id: '1185'
test_id: '1014'
test_type: Small Divergent Value Ct
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
index 58591b77..6d3e8d69 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
@@ -57,6 +57,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
+ - id: '1445'
+ test_id: '1013'
+ test_type: Small Missing Value Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
- id: '1184'
test_id: '1013'
test_type: Small Missing Value Ct
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
index c56cafdc..7a8bdc71 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
@@ -51,6 +51,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
error_type: Profile Anomaly
+ - id: '1455'
+ test_id: '1023'
+ test_type: Small_Numeric_Value_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ error_type: Profile Anomaly
- id: '1194'
test_id: '1023'
test_type: Small_Numeric_Value_Ct
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
index 648128c5..291e3355 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
@@ -50,6 +50,14 @@ profile_anomaly_types:
lookup_query: |-
WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1449'
+ test_id: '1017'
+ test_type: Standardized_Value_Matches
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1188'
test_id: '1017'
test_type: Standardized_Value_Matches
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
index 46d1b9bc..1f3bc6b3 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml
@@ -50,6 +50,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Profile Anomaly
+ - id: '1433'
+ test_id: '1001'
+ test_type: Suggested_Type
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Profile Anomaly
- id: '1172'
test_id: '1001'
test_type: Suggested_Type
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
index ced56d76..8d1f3e50 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml
@@ -58,6 +58,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type;
error_type: Profile Anomaly
+ - id: '1440'
+ test_id: '1008'
+ test_type: Table_Pattern_Mismatch
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type;
+ error_type: Profile Anomaly
- id: '1179'
test_id: '1008'
test_type: Table_Pattern_Mismatch
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml
index c975eec0..a5b244b3 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml
@@ -49,6 +49,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1454'
+ test_id: '1022'
+ test_type: Unexpected Emails
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1193'
test_id: '1022'
test_type: Unexpected Emails
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml
index d74cea69..085a39c4 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml
@@ -51,6 +51,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1453'
+ test_id: '1021'
+ test_type: Unexpected US States
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1192'
test_id: '1021'
test_type: Unexpected US States
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
index d111361c..0752e201 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml
@@ -51,6 +51,14 @@ profile_anomaly_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Profile Anomaly
+ - id: '1450'
+ test_id: '1018'
+ test_type: Unlikely_Date_Values
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Profile Anomaly
- id: '1189'
test_id: '1018'
test_type: Unlikely_Date_Values
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
index ae92b8b1..786be14d 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml
@@ -51,6 +51,14 @@ profile_anomaly_types:
lookup_query: |-
WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
error_type: Profile Anomaly
+ - id: '1458'
+ test_id: '1027'
+ test_type: Variant_Coded_Values
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ error_type: Profile Anomaly
- id: '1230'
test_id: '1027'
test_type: Variant_Coded_Values
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
index fb68a907..e51bd26c 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml
@@ -133,6 +133,30 @@ test_types:
WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
+ - id: '1462'
+ test_id: '1500'
+ test_type: Aggregate_Balance
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
- id: '1246'
test_id: '1500'
test_type: Aggregate_Balance
@@ -174,6 +198,10 @@ test_types:
test_type: Aggregate_Balance
sql_flavor: redshift
template_name: ex_aggregate_match_same_generic.sql
+ - id: '2506'
+ test_type: Aggregate_Balance
+ sql_flavor: redshift_spectrum
+ template_name: ex_aggregate_match_same_generic.sql
- id: '2106'
test_type: Aggregate_Balance
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
index e293ba14..5219941a 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml
@@ -141,6 +141,32 @@ test_types:
OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
+ - id: '1466'
+ test_id: '1504'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0))
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
- id: '1246'
test_id: '1504'
test_type: Aggregate_Balance_Percent
@@ -184,6 +210,10 @@ test_types:
test_type: Aggregate_Balance_Percent
sql_flavor: redshift
template_name: ex_aggregate_match_percent_generic.sql
+ - id: '2509'
+ test_type: Aggregate_Balance_Percent
+ sql_flavor: redshift_spectrum
+ template_name: ex_aggregate_match_percent_generic.sql
- id: '2109'
test_type: Aggregate_Balance_Percent
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
index a0976a2c..d18ffc55 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml
@@ -141,6 +141,32 @@ test_types:
OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
+ - id: '1467'
+ test_id: '1505'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE (total IS NOT NULL AND match_total IS NULL)
+ OR (total IS NULL AND match_total IS NOT NULL)
+ OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE})
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
- id: '1246'
test_id: '1505'
test_type: Aggregate_Balance_Range
@@ -184,6 +210,10 @@ test_types:
test_type: Aggregate_Balance_Range
sql_flavor: redshift
template_name: ex_aggregate_match_range_generic.sql
+ - id: '2510'
+ test_type: Aggregate_Balance_Range
+ sql_flavor: redshift_spectrum
+ template_name: ex_aggregate_match_range_generic.sql
- id: '2110'
test_type: Aggregate_Balance_Range
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
index 79c41e7f..491e499b 100644
--- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml
@@ -133,6 +133,30 @@ test_types:
WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
ORDER BY {GROUPBY_NAMES};
error_type: Test Results
+ - id: '1463'
+ test_id: '1501'
+ test_type: Aggregate_Minimum
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL
+ FROM
+ ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ {HAVING_CONDITION}
+ UNION ALL
+ SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION} ) a
+ GROUP BY {GROUPBY_NAMES} ) s
+ WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL)
+ ORDER BY {GROUPBY_NAMES};
+ error_type: Test Results
- id: '1250'
test_id: '1501'
test_type: Aggregate_Minimum
@@ -174,6 +198,10 @@ test_types:
test_type: Aggregate_Minimum
sql_flavor: redshift
template_name: ex_aggregate_match_no_drops_generic.sql
+ - id: '2502'
+ test_type: Aggregate_Minimum
+ sql_flavor: redshift_spectrum
+ template_name: ex_aggregate_match_no_drops_generic.sql
- id: '2102'
test_type: Aggregate_Minimum
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
index 74dc41ce..0cce0c35 100644
--- a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7001'
+ test_type: Alpha_Trunc
+ sql_flavor: redshift_spectrum
+ measure: |-
+ MAX(LENGTH({COLUMN_NAME}))
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2001'
test_type: Alpha_Trunc
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
error_type: Test Results
+ - id: '1401'
+ test_id: '1004'
+ test_type: Alpha_Trunc
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;
+ error_type: Test Results
- id: '1197'
test_id: '1004'
test_type: Alpha_Trunc
diff --git a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
index 5dbc252f..749a8aaa 100644
--- a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7002'
+ test_type: Avg_Shift
+ sql_flavor: redshift_spectrum
+ measure: |-
+ ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME})^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2002'
test_type: Avg_Shift
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT AVG("{COLUMN_NAME}"::FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
+ - id: '1402'
+ test_id: '1005'
+ test_type: Avg_Shift
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
- id: '1198'
test_id: '1005'
test_type: Avg_Shift
diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
index e7f65499..5f53770c 100644
--- a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml
@@ -55,6 +55,10 @@ test_types:
test_type: CUSTOM
sql_flavor: redshift
template_name: ex_custom_query_generic.sql
+ - id: '2504'
+ test_type: CUSTOM
+ sql_flavor: redshift_spectrum
+ template_name: ex_custom_query_generic.sql
- id: '2104'
test_type: CUSTOM
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
index 05c1ad6e..6d2fdea1 100644
--- a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml
@@ -121,6 +121,27 @@ test_types:
) test
ORDER BY {COLUMN_NAME_NO_QUOTES};
error_type: Test Results
+ - id: '1464'
+ test_id: '1502'
+ test_type: Combo_Match
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT *
+ FROM ( SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ {HAVING_CONDITION}
+ EXCEPT
+ SELECT {MATCH_GROUPBY_NAMES}
+ FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME}
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES}
+ {MATCH_HAVING_CONDITION}
+ ) test
+ ORDER BY {COLUMN_NAME_NO_QUOTES};
+ error_type: Test Results
- id: '1254'
test_id: '1502'
test_type: Combo_Match
@@ -159,6 +180,10 @@ test_types:
test_type: Combo_Match
sql_flavor: redshift
template_name: ex_data_match_generic.sql
+ - id: '2501'
+ test_type: Combo_Match
+ sql_flavor: redshift_spectrum
+ template_name: ex_data_match_generic.sql
- id: '2101'
test_type: Combo_Match
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
index 91f8836d..194f5d61 100644
--- a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7003'
+ test_type: Condition_Flag
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2003'
test_type: Condition_Flag
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
error_type: Test Results
+ - id: '1403'
+ test_id: '1006'
+ test_type: Condition_Flag
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {CUSTOM_QUERY} LIMIT 500;
+ error_type: Test Results
- id: '1199'
test_id: '1006'
test_type: Condition_Flag
diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
index 848cc813..2faf6d21 100644
--- a/testgen/template/dbsetup_test_types/test_types_Constant.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7004'
+ test_type: Constant
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2004'
test_type: Constant
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1404'
+ test_id: '1007'
+ test_type: Constant
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1200'
test_id: '1007'
test_type: Constant
diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
index 735a8ee5..debd6938 100644
--- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
@@ -71,6 +71,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7005'
+ test_type: Daily_Record_Ct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2005'
test_type: Daily_Record_Ct
sql_flavor: snowflake
@@ -160,6 +168,14 @@ test_types:
lookup_query: |-
WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;
error_type: Test Results
+ - id: '1405'
+ test_id: '1009'
+ test_type: Daily_Record_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500;
+ error_type: Test Results
- id: '1201'
test_id: '1009'
test_type: Daily_Record_Ct
diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
index e0f5818d..4d1bb73e 100644
--- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7006'
+ test_type: Dec_Trunc
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2006'
test_type: Dec_Trunc
sql_flavor: snowflake
@@ -119,6 +127,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;
error_type: Test Results
+ - id: '1406'
+ test_id: '1011'
+ test_type: Dec_Trunc
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500;
+ error_type: Test Results
- id: '1202'
test_id: '1011'
test_type: Dec_Trunc
diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
index 7c1c8794..142ceb5a 100644
--- a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7007'
+ test_type: Distinct_Date_Ct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2007'
test_type: Distinct_Date_Ct
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
+ - id: '1407'
+ test_id: '1012'
+ test_type: Distinct_Date_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
- id: '1203'
test_id: '1012'
test_type: Distinct_Date_Ct
diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
index bde871d8..29363f06 100644
--- a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7008'
+ test_type: Distinct_Value_Ct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2008'
test_type: Distinct_Value_Ct
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
error_type: Test Results
+ - id: '1408'
+ test_id: '1013'
+ test_type: Distinct_Value_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ error_type: Test Results
- id: '1204'
test_id: '1013'
test_type: Distinct_Value_Ct
diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
index 098be2e6..51a6d24b 100644
--- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml
@@ -142,6 +142,32 @@ test_types:
ON (l.category = o.category)
ORDER BY COALESCE(l.category, o.category)
error_type: Test Results
+ - id: '1465'
+ test_id: '1503'
+ test_type: Distribution_Shift
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH latest_ver
+ AS ( SELECT {CONCAT_COLUMNS} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {TARGET_SCHEMA}.{TABLE_NAME} v1
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {COLUMN_NAME_NO_QUOTES} ),
+ older_ver
+ AS ( SELECT {CONCAT_MATCH_GROUPBY} as category,
+ COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total
+ FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2
+ WHERE {MATCH_SUBSET_CONDITION}
+ GROUP BY {MATCH_GROUPBY_NAMES} )
+ SELECT COALESCE(l.category, o.category) AS category,
+ o.pct_of_total AS old_pct,
+ l.pct_of_total AS new_pct
+ FROM latest_ver l
+ FULL JOIN older_ver o
+ ON (l.category = o.category)
+ ORDER BY COALESCE(l.category, o.category)
+ error_type: Test Results
- id: '1258'
test_id: '1503'
test_type: Distribution_Shift
@@ -185,6 +211,10 @@ test_types:
test_type: Distribution_Shift
sql_flavor: redshift
template_name: ex_relative_entropy_generic.sql
+ - id: '2503'
+ test_type: Distribution_Shift
+ sql_flavor: redshift_spectrum
+ template_name: ex_relative_entropy_generic.sql
- id: '2103'
test_type: Distribution_Shift
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
index 3705e014..e0cae9af 100644
--- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml
@@ -89,6 +89,19 @@ test_types:
HAVING COUNT(*) > 1
ORDER BY {GROUPBY_NAMES}
error_type: Test Results
+ - id: '1472'
+ test_id: '1510'
+ test_type: Dupe_Rows
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ GROUP BY {GROUPBY_NAMES}
+ HAVING COUNT(*) > 1
+ ORDER BY {GROUPBY_NAMES}
+ error_type: Test Results
- id: '1254'
test_id: '1510'
test_type: Dupe_Rows
@@ -119,6 +132,10 @@ test_types:
test_type: Dupe_Rows
sql_flavor: redshift
template_name: ex_dupe_rows_generic.sql
+ - id: '2511'
+ test_type: Dupe_Rows
+ sql_flavor: redshift_spectrum
+ template_name: ex_dupe_rows_generic.sql
- id: '2111'
test_type: Dupe_Rows
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
index 7ce6ffc1..fecb5f98 100644
--- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7009'
+ test_type: Email_Format
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2009'
test_type: Email_Format
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1409'
+ test_id: '1014'
+ test_type: Email_Format
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1205'
test_id: '1014'
test_type: Email_Format
diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
index 57f0acf5..aa2406eb 100644
--- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml
@@ -66,6 +66,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7010'
+ test_type: Future_Date
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - '{RUN_DATE}'::DATE)))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2010'
test_type: Future_Date
sql_flavor: snowflake
@@ -115,6 +123,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1410'
+ test_id: '1015'
+ test_type: Future_Date
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1206'
test_id: '1015'
test_type: Future_Date
diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
index e7cb1572..d7a416f4 100644
--- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7011'
+ test_type: Future_Date_1Y
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(GREATEST(0, SIGN({COLUMN_NAME}::DATE - ('{RUN_DATE}'::DATE+365))))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2011'
test_type: Future_Date_1Y
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1411'
+ test_id: '1016'
+ test_type: Future_Date_1Y
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1207'
test_id: '1016'
test_type: Future_Date_1Y
diff --git a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
index 71332c5f..8b23c335 100644
--- a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7012'
+ test_type: Incr_Avg_Shift
+ sql_flavor: redshift_spectrum
+ measure: |-
+ NVL(ABS( ({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME})::FLOAT - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD} ), 0)
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2012'
test_type: Incr_Avg_Shift
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
+ - id: '1412'
+ test_id: '1017'
+ test_type: Incr_Avg_Shift
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
- id: '1208'
test_id: '1017'
test_type: Incr_Avg_Shift
diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
index e17caf4a..00f9b545 100644
--- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml
@@ -65,6 +65,14 @@ test_types:
test_operator: <>
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7013'
+ test_type: LOV_All
+ sql_flavor: redshift_spectrum
+ measure: |-
+ LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME})
+ test_operator: <>
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2013'
test_type: LOV_All
sql_flavor: snowflake
@@ -114,6 +122,14 @@ test_types:
lookup_query: |-
SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
error_type: Test Results
+ - id: '1413'
+ test_id: '1018'
+ test_type: LOV_All
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500;
+ error_type: Test Results
- id: '1209'
test_id: '1018'
test_type: LOV_All
diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
index 4ee6b63d..e3006661 100644
--- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7014'
+ test_type: LOV_Match
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2014'
test_type: LOV_Match
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1414'
+ test_id: '1019'
+ test_type: LOV_Match
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1210'
test_id: '1019'
test_type: LOV_Match
diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
index e54dcb22..9fd503bb 100644
--- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7015'
+ test_type: Min_Date
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < '{BASELINE_VALUE}' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2015'
test_type: Min_Date
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1415'
+ test_id: '1020'
+ test_type: Min_Date
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1211'
test_id: '1020'
test_type: Min_Date
diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
index d63d7db0..0b6ce887 100644
--- a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7016'
+ test_type: Min_Val
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2016'
test_type: Min_Val
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
error_type: Test Results
+ - id: '1416'
+ test_id: '1021'
+ test_type: Min_Val
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500;
+ error_type: Test Results
- id: '1212'
test_id: '1021'
test_type: Min_Val
diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
index ad4d5a02..3747423f 100644
--- a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7017'
+ test_type: Missing_Pct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2017'
test_type: Missing_Pct
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
error_type: Test Results
+ - id: '1417'
+ test_id: '1022'
+ test_type: Missing_Pct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 10 * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ;
+ error_type: Test Results
- id: '1213'
test_id: '1022'
test_type: Missing_Pct
diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
index 58f8b1e4..4439e83e 100644
--- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7018'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ (MAX(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, '{RUN_DATE}'::DATE))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2018'
test_type: Monthly_Rec_Ct
sql_flavor: snowflake
@@ -157,6 +165,14 @@ test_types:
lookup_query: |-
WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
error_type: Test Results
+ - id: '1418'
+ test_id: '1023'
+ test_type: Monthly_Rec_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
- id: '1214'
test_id: '1023'
test_type: Monthly_Rec_Ct
diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
index 2d077ce8..38801f88 100644
--- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml
@@ -72,6 +72,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7019'
+ test_type: Outlier_Pct_Above
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2019'
test_type: Outlier_Pct_Above
sql_flavor: snowflake
@@ -121,6 +129,14 @@ test_types:
lookup_query: |-
SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
+ - id: '1419'
+ test_id: '1024'
+ test_type: Outlier_Pct_Above
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
- id: '1215'
test_id: '1024'
test_type: Outlier_Pct_Above
diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
index fea7e15f..d6e664cb 100644
--- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml
@@ -72,6 +72,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7020'
+ test_type: Outlier_Pct_Below
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2020'
test_type: Outlier_Pct_Below
sql_flavor: snowflake
@@ -121,6 +129,14 @@ test_types:
lookup_query: |-
SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
error_type: Test Results
+ - id: '1420'
+ test_id: '1025'
+ test_type: Outlier_Pct_Below
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ error_type: Test Results
- id: '1216'
test_id: '1025'
test_type: Outlier_Pct_Below
diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
index 425d3e0f..e9feff93 100644
--- a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7021'
+ test_type: Pattern_Match
+ sql_flavor: redshift_spectrum
+ measure: |-
+ COUNT(NULLIF({COLUMN_NAME}, '')) - SUM((NULLIF({COLUMN_NAME}, '') SIMILAR TO '{BASELINE_VALUE}')::BIGINT)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2021'
test_type: Pattern_Match
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
error_type: Test Results
+ - id: '1421'
+ test_id: '1026'
+ test_type: Pattern_Match
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}";
+ error_type: Test Results
- id: '1217'
test_id: '1026'
test_type: Pattern_Match
diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml
index 2ebd28e3..a2c43d0c 100644
--- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7022'
+ test_type: Recency
+ sql_flavor: redshift_spectrum
+ measure: |-
+ DATEDIFF('D', MAX({COLUMN_NAME}), '{RUN_DATE}'::DATE)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2022'
test_type: Recency
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
error_type: Test Results
+ - id: '1422'
+ test_id: '1028'
+ test_type: Recency
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE};
+ error_type: Test Results
- id: '1218'
test_id: '1028'
test_type: Recency
diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml
index c49cf447..27284b2c 100644
--- a/testgen/template/dbsetup_test_types/test_types_Required.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml
@@ -66,6 +66,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7023'
+ test_type: Required
+ sql_flavor: redshift_spectrum
+ measure: |-
+ COUNT(*) - COUNT( {COLUMN_NAME} )
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2023'
test_type: Required
sql_flavor: snowflake
@@ -115,6 +123,14 @@ test_types:
lookup_query: |-
SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
error_type: Test Results
+ - id: '1423'
+ test_id: '1030'
+ test_type: Required
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;
+ error_type: Test Results
- id: '1219'
test_id: '1030'
test_type: Required
diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
index 35864b32..27c4fcc8 100644
--- a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml
@@ -66,6 +66,14 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7024'
+ test_type: Row_Ct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ COUNT(*)
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2024'
test_type: Row_Ct
sql_flavor: snowflake
@@ -115,6 +123,14 @@ test_types:
lookup_query: |-
WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
error_type: Test Results
+ - id: '1424'
+ test_id: '1031'
+ test_type: Row_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};
+ error_type: Test Results
- id: '1220'
test_id: '1031'
test_type: Row_Ct
diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
index 7850b6f0..00078d63 100644
--- a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7025'
+ test_type: Row_Ct_Pct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2025'
test_type: Row_Ct_Pct
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
error_type: Test Results
+ - id: '1425'
+ test_id: '1032'
+ test_type: Row_Ct_Pct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte;
+ error_type: Test Results
- id: '1221'
test_id: '1032'
test_type: Row_Ct_Pct
diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
index bddb98c1..74fb31db 100644
--- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7026'
+ test_type: Street_Addr_Pattern
+ sql_flavor: redshift_spectrum
+ measure: |-
+ 100.0*SUM(({COLUMN_NAME} ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2026'
test_type: Street_Addr_Pattern
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
+ - id: '1426'
+ test_id: '1033'
+ test_type: Street_Addr_Pattern
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
- id: '1222'
test_id: '1033'
test_type: Street_Addr_Pattern
diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
index 032a8e15..d5d75273 100644
--- a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml
@@ -55,6 +55,10 @@ test_types:
test_type: Table_Freshness
sql_flavor: redshift
template_name: ex_table_changed_generic.sql
+ - id: '2512'
+ test_type: Table_Freshness
+ sql_flavor: redshift_spectrum
+ template_name: ex_table_changed_generic.sql
- id: '2112'
test_type: Table_Freshness
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
index 213d7926..649e007d 100644
--- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml
@@ -95,6 +95,25 @@ test_types:
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
GROUP BY {COLUMN_NAME_NO_QUOTES}
error_type: Test Results
+ - id: '1468'
+ test_id: '1508'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ EXCEPT
+ SELECT {COLUMN_NAME_NO_QUOTES}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ GROUP BY {COLUMN_NAME_NO_QUOTES}
+ error_type: Test Results
- id: '1262'
test_id: '1508'
test_type: Timeframe_Combo_Gain
@@ -131,6 +150,10 @@ test_types:
test_type: Timeframe_Combo_Gain
sql_flavor: redshift
template_name: ex_window_match_no_drops_generic.sql
+ - id: '2507'
+ test_type: Timeframe_Combo_Gain
+ sql_flavor: redshift_spectrum
+ template_name: ex_window_match_no_drops_generic.sql
- id: '2107'
test_type: Timeframe_Combo_Gain
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
index 1b35da33..5d075980 100644
--- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml
@@ -164,6 +164,38 @@ test_types:
AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
)
error_type: Test Results
+ - id: '1469'
+ test_id: '1509'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |2-
+ (
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ UNION ALL
+ (
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS}
+ AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ EXCEPT
+ SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME}
+ FROM {TARGET_SCHEMA}.{TABLE_NAME}
+ WHERE {SUBSET_CONDITION}
+ AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS}
+ )
+ error_type: Test Results
- id: '1266'
test_id: '1509'
test_type: Timeframe_Combo_Match
@@ -213,6 +245,10 @@ test_types:
test_type: Timeframe_Combo_Match
sql_flavor: redshift
template_name: ex_window_match_same_generic.sql
+ - id: '2508'
+ test_type: Timeframe_Combo_Match
+ sql_flavor: redshift_spectrum
+ template_name: ex_window_match_same_generic.sql
- id: '2108'
test_type: Timeframe_Combo_Match
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml
index 47a94fde..e9f06e9f 100644
--- a/testgen/template/dbsetup_test_types/test_types_US_State.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7027'
+ test_type: US_State
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN {COLUMN_NAME} NOT IN ('','AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2027'
test_type: US_State
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
error_type: Test Results
+ - id: '1427'
+ test_id: '1036'
+ test_type: US_State
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500;
+ error_type: Test Results
- id: '1223'
test_id: '1036'
test_type: US_State
diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml
index 20a8df28..04864ed7 100644
--- a/testgen/template/dbsetup_test_types/test_types_Unique.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7028'
+ test_type: Unique
+ sql_flavor: redshift_spectrum
+ measure: |-
+ COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2028'
test_type: Unique
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
+ - id: '1428'
+ test_id: '1034'
+ test_type: Unique
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
- id: '1224'
test_id: '1034'
test_type: Unique
diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
index a1dfdf46..1b6c0930 100644
--- a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>='
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7029'
+ test_type: Unique_Pct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )
+ test_operator: '>='
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2029'
test_type: Unique_Pct
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
error_type: Test Results
+ - id: '1429'
+ test_id: '1035'
+ test_type: Unique_Pct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+ error_type: Test Results
- id: '1225'
test_id: '1035'
test_type: Unique_Pct
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
index 2e21cb68..b7e64893 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7036'
+ test_type: Valid_Characters
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2036'
test_type: Valid_Characters
sql_flavor: snowflake
@@ -117,6 +125,14 @@ test_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
error_type: Test Results
+ - id: '1459'
+ test_id: '1043'
+ test_type: Valid_Characters
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC;
+ error_type: Test Results
- id: '1236'
test_id: '1043'
test_type: Valid_Characters
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
index 6cccbe0e..bca30c7b 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7033'
+ test_type: Valid_Month
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2033'
test_type: Valid_Month
sql_flavor: snowflake
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
index a7d5a3d3..710fdcfe 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml
@@ -66,6 +66,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7034'
+ test_type: Valid_US_Zip
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2034'
test_type: Valid_US_Zip
sql_flavor: snowflake
@@ -115,6 +123,14 @@ test_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Test Results
+ - id: '1460'
+ test_id: '1044'
+ test_type: Valid_US_Zip
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
- id: '1240'
test_id: '1044'
test_type: Valid_US_Zip
diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
index 16159eef..5a79bdec 100644
--- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml
@@ -67,6 +67,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7035'
+ test_type: Valid_US_Zip3
+ sql_flavor: redshift_spectrum
+ measure: |-
+ SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END)
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2035'
test_type: Valid_US_Zip3
sql_flavor: snowflake
@@ -116,6 +124,14 @@ test_types:
lookup_query: |-
SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
error_type: Test Results
+ - id: '1461'
+ test_id: '1045'
+ test_type: Valid_US_Zip3
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;
+ error_type: Test Results
- id: '1244'
test_id: '1045'
test_type: Valid_US_Zip3
diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
index 8b0338ff..c38409ec 100644
--- a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml
@@ -72,6 +72,14 @@ test_types:
test_operator: <
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7032'
+ test_type: Variability_Decrease
+ sql_flavor: redshift_spectrum
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: <
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2032'
test_type: Variability_Decrease
sql_flavor: snowflake
@@ -121,6 +129,14 @@ test_types:
lookup_query: |-
SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
+ - id: '1432'
+ test_id: '1041'
+ test_type: Variability_Decrease
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
- id: '1228'
test_id: '1041'
test_type: Variability_Decrease
diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
index 7229c38b..55004409 100644
--- a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml
@@ -76,6 +76,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7031'
+ test_type: Variability_Increase
+ sql_flavor: redshift_spectrum
+ measure: |-
+ 100.0*STDDEV({COLUMN_NAME}::FLOAT)/{BASELINE_SD}::FLOAT
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2031'
test_type: Variability_Increase
sql_flavor: snowflake
@@ -125,6 +133,14 @@ test_types:
lookup_query: |-
SELECT STDDEV("{COLUMN_NAME}"::FLOAT) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
error_type: Test Results
+ - id: '1431'
+ test_id: '1040'
+ test_type: Variability_Increase
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};
+ error_type: Test Results
- id: '1227'
test_id: '1040'
test_type: Variability_Increase
diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
index 16b73329..42ca30ff 100644
--- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml
@@ -68,6 +68,14 @@ test_types:
test_operator: '>'
test_condition: |-
{THRESHOLD_VALUE}
+ - id: '7030'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: redshift_spectrum
+ measure: |-
+ MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))
+ test_operator: '>'
+ test_condition: |-
+ {THRESHOLD_VALUE}
- id: '2030'
test_type: Weekly_Rec_Ct
sql_flavor: snowflake
@@ -157,6 +165,14 @@ test_types:
lookup_query: |-
WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
error_type: Test Results
+ - id: '1430'
+ test_id: '1037'
+ test_type: Weekly_Rec_Ct
+ sql_flavor: redshift_spectrum
+ lookup_type: null
+ lookup_query: |-
+ WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM {TARGET_SCHEMA}.{TABLE_NAME} UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM {TARGET_SCHEMA}.{TABLE_NAME}) ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates;
+ error_type: Test Results
- id: '1226'
test_id: '1037'
test_type: Weekly_Rec_Ct
diff --git a/testgen/template/dbupgrade/0152_incremental_upgrade.sql b/testgen/template/dbupgrade/0152_incremental_upgrade.sql
new file mode 100644
index 00000000..2184830e
--- /dev/null
+++ b/testgen/template/dbupgrade/0152_incremental_upgrade.sql
@@ -0,0 +1,27 @@
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+CREATE OR REPLACE FUNCTION fn_quote_literal_escape(var_value varchar, sql_flavor varchar) RETURNS varchar
+ LANGUAGE plpgsql
+AS
+$$
+DECLARE
+ escaped_value varchar;
+ lower_case_sql_flavor varchar;
+BEGIN
+ lower_case_sql_flavor := LOWER(sql_flavor);
+
+ IF lower_case_sql_flavor IN ('postgres', 'postgresql') THEN
+ escaped_value := QUOTE_LITERAL(var_value);
+ ELSIF lower_case_sql_flavor IN ('redshift', 'redshift_spectrum', 'snowflake') THEN
+ escaped_value := TRIM(LEADING 'E' FROM QUOTE_LITERAL(var_value));
+ ELSIF lower_case_sql_flavor = 'mssql' THEN
+ escaped_value := '''' || REPLACE(var_value, '''', '''''') || '''';
+ ELSIF lower_case_sql_flavor = 'databricks' THEN
+ escaped_value := '''' || REPLACE(REPLACE(var_value, '\', '\\'), '''', '\''') || '''';
+ ELSE
+ RAISE EXCEPTION 'Invalid sql_flavor name: %', sql_flavor;
+ END IF;
+
+ RETURN escaped_value;
+END;
+$$;
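For reference, fn_quote_literal_escape dispatches on the lower-cased flavor name, and the upgraded version accepts 'redshift_spectrum' alongside 'redshift' and 'snowflake'. A minimal sketch of the expected behavior, with results inferred from the CASE branches above rather than taken from a test run (assumes the function is on the search path):

-- Illustrative calls; expected results are derived from the branches above, not verified output.
SELECT fn_quote_literal_escape('O''Brien', 'redshift_spectrum');  -- 'O''Brien'  (QUOTE_LITERAL, leading E trimmed)
SELECT fn_quote_literal_escape('O''Brien', 'mssql');              -- 'O''Brien'  (single quotes doubled, then wrapped)
SELECT fn_quote_literal_escape('a\b''c', 'databricks');           -- 'a\\b\'c'   (backslashes escaped first, then quotes)
SELECT fn_quote_literal_escape('x', 'oracle');                    -- raises: Invalid sql_flavor name: oracle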
diff --git a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql b/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql
similarity index 100%
rename from testgen/template/flavors/generic/validate_tests/ex_get_project_column_list_generic.sql
rename to testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql
diff --git a/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql
new file mode 100644
index 00000000..278caa75
--- /dev/null
+++ b/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql
@@ -0,0 +1,37 @@
+SELECT '{PROJECT_CODE}' AS project_code,
+ CURRENT_TIMESTAMP AT TIME ZONE 'UTC' AS refresh_timestamp,
+ c.schemaname AS table_schema,
+ c.tablename AS table_name,
+ c.columnname AS column_name,
+ c.external_type AS data_type,
+ NULLIF(
+ REGEXP_SUBSTR(c.external_type, 'char\\(([0-9]+)\\)', 1, 1, 'e'),
+ ''
+ ) AS character_maximum_length,
+ c.columnnum AS ordinal_position,
+ CASE
+ WHEN c.external_type = 'string'
+ OR c.external_type ILIKE 'varchar%'
+ OR c.external_type ILIKE 'char%'
+ THEN 'A'
+ WHEN c.external_type = 'boolean'
+ THEN 'B'
+ WHEN c.external_type IN ('date', 'timestamp')
+ THEN 'D'
+ WHEN c.external_type IN ('long', 'double', 'float')
+ OR c.external_type ILIKE '%int%'
+ OR c.external_type ILIKE 'decimal%'
+ THEN 'N'
+ ELSE 'X'
+ END AS general_type,
+ CASE
+ WHEN REGEXP_SUBSTR(c.external_type, 'decimal\\([0-9]+,([0-9]+)\\)', 1, 1, 'e') > 0
+ THEN 1
+ ELSE 0
+ END AS is_decimal
+FROM svv_external_columns c
+WHERE c.schemaname = '{DATA_SCHEMA}'
+ {TABLE_CRITERIA}
+ORDER BY c.schemaname,
+ c.tablename,
+ c.columnnum
\ No newline at end of file
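The DDF query above folds Spectrum's free-form external_type strings into TestGen's one-letter general_type codes (per the branches: 'A' for character data, 'B' for boolean, 'D' for dates and timestamps, 'N' for numerics, 'X' otherwise). A standalone sanity check of that mapping against a few hypothetical type strings:

-- Checks the CASE mapping above against sample external_type values (all hypothetical).
SELECT t.external_type,
       CASE
           WHEN t.external_type = 'string'
                OR t.external_type ILIKE 'varchar%'
                OR t.external_type ILIKE 'char%' THEN 'A'
           WHEN t.external_type = 'boolean' THEN 'B'
           WHEN t.external_type IN ('date', 'timestamp') THEN 'D'
           WHEN t.external_type IN ('long', 'double', 'float')
                OR t.external_type ILIKE '%int%'
                OR t.external_type ILIKE 'decimal%' THEN 'N'
           ELSE 'X'
       END AS general_type
FROM (SELECT 'varchar(255)' AS external_type
      UNION ALL SELECT 'decimal(10,2)'
      UNION ALL SELECT 'timestamp'
      UNION ALL SELECT 'binary') t;
-- expected: A, N, D, X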
diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql
new file mode 100644
index 00000000..9a62c3d6
--- /dev/null
+++ b/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql
@@ -0,0 +1,23 @@
+WITH stats
+ AS (SELECT COUNT(*)::FLOAT as record_ct,
+ ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct,
+ CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct,
+ CAST(999000 as FLOAT) as max_sample_ct
+ FROM {SAMPLING_TABLE} )
+SELECT '{SAMPLING_TABLE}' as schema_table,
+ CASE WHEN record_ct <= min_sample_ct THEN -1
+ WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct
+ WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct
+ ELSE {PROFILE_SAMPLE_MIN_COUNT}
+ END as sample_count,
+ CASE WHEN record_ct <= min_sample_ct THEN 1
+ WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct
+ WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct
+ ELSE record_ct / min_sample_ct
+ END as sample_ratio,
+ ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100
+ WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct
+ WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct
+ ELSE 100.0 * min_sample_ct / record_ct
+ END, 4) as sample_percent_calc
+ FROM stats;
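The CASE logic above clamps the percentage-based sample between the configured minimum and a hard ceiling of 999,000 rows, returning -1 as the sample count when the table is already at or below the minimum (presumably a sentinel meaning profile every row). A worked sketch with hypothetical inputs of 1,000,000 rows, a 30 percent sample, and a 15,000-row minimum:

-- Worked example with hypothetical parameter values substituted for the template placeholders.
WITH stats AS (
    SELECT 1000000.0 AS record_ct,
           ROUND(30.0 * 1000000.0 / 100.0) AS calc_sample_ct,   -- 300000
           15000.0 AS min_sample_ct,
           999000.0 AS max_sample_ct
)
SELECT CASE WHEN record_ct <= min_sample_ct THEN -1
            WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct
            WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct
            ELSE min_sample_ct
       END AS sample_count,                                     -- 300000
       ROUND(100.0 * calc_sample_ct / record_ct, 4) AS sample_percent_calc  -- 30.0000
  FROM stats;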
diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml
new file mode 100644
index 00000000..1596dd1d
--- /dev/null
+++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml
@@ -0,0 +1,213 @@
+---
+strTemplate01_sampling: "SELECT "
+strTemplate01_else: "SELECT "
+strTemplate02_all: |
+ {CONNECTION_ID} as connection_id,
+ '{PROJECT_CODE}' as project_code,
+ '{TABLE_GROUPS_ID}' as table_groups_id,
+ '{DATA_SCHEMA}' AS schema_name,
+ '{RUN_DATE}' AS run_date,
+ '{DATA_TABLE}' AS table_name,
+ {COL_POS} AS position,
+ '{COL_NAME_SANITIZED}' AS column_name,
+ '{COL_TYPE}' AS column_type,
+ '{COL_GEN_TYPE}' AS general_type,
+ COUNT(*) AS record_ct,
+ COUNT("{COL_NAME}") AS value_ct,
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
+strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length,
+ MAX(LEN("{COL_NAME}")) AS max_length,
+ AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length,
+strTemplate03_else: NULL as min_length,
+ NULL as max_length,
+ NULL as avg_length,
+strTemplate04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct,
+strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct,
+strTemplate04_else: NULL as zero_value_ct,
+strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct,
+ COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct,
+ COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END ) AS lead_space_ct,
+ COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct,
+ COUNT( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct,
+ COUNT( CASE
+ WHEN LENGTH("{COL_NAME}") > 0 AND "{COL_NAME}" IN ('.', '?', ' ') THEN 1
+ WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1
+ WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd',
+ 'n/a','#na','none','null','unknown') THEN 1
+ WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)',
+ '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1
+ WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]',
+ '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1
+ END ) AS filled_value_ct,
+ LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text,
+ LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text,
+ COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct,
+ COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct,
+ COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct,
+ COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct,
+ SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct,
+ SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct,
+ CASE
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR'
+ WHEN SUM( CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA')
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA'
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$'
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA'
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL'
+ WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999')
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA'
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$'
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME'
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$'
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD'
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'
+ AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s'
+ THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA'
+ WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$'
+ AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749'
+ AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN'
+ END as std_pattern_match,
+strTemplate05_else: NULL as distinct_std_value_ct,
+ NULL as zero_length_ct,
+ NULL as lead_space_ct,
+ NULL as quoted_value_ct,
+ NULL as includes_digit_ct,
+ NULL as filled_value_ct,
+ NULL as min_text,
+ NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
+ NULL as non_printing_ct,
+ NULL as numeric_ct,
+ NULL as date_ct,
+ NULL as std_pattern_match,
+strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats
+ FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern,
+ COUNT(*) AS ct
+ FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE(
+ "{COL_NAME}", '[a-z]', 'a'),
+ '[A-Z]', 'A'),
+ '[0-9]', 'N') AS pattern
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
+ WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}"))
+ FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
+ GROUP BY pattern
+ HAVING pattern > ' '
+ ORDER BY COUNT(*) DESC) as ps) AS top_patterns,
+strTemplate06_else: NULL as top_patterns,
+strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals
+ FROM (
+ SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
+ COUNT(*) as ct
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
+ WHERE "{COL_NAME}" > ' '
+ GROUP BY "{COL_NAME}"
+ HAVING "{COL_NAME}" > ' '
+ ORDER BY COUNT(*), "{COL_NAME}" DESC
+ ) ps
+ ) AS top_freq_values,
+strTemplate07_else: NULL as top_freq_values,
+strTemplate08_N: MIN("{COL_NAME}") AS min_value,
+ MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0,
+ MAX("{COL_NAME}") AS max_value,
+ AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value,
+ STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value,
+ MIN(pct_25) as percentile_25,
+ MIN(pct_50) as percentile_50,
+ MIN(pct_75) as percentile_75,
+strTemplate08_else: NULL as min_value,
+ NULL as min_value_over_0,
+ NULL as max_value,
+ NULL as avg_value,
+ NULL as stdev_value,
+ NULL as percentile_25,
+ NULL as percentile_50,
+ NULL as percentile_75,
+strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum,
+
+strTemplate10_else: NULL as fractional_sum,
+
+strTemplate11_D: CASE
+ WHEN MIN("{COL_NAME}") IS NULL THEN NULL
+ ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01')
+ END as min_date,
+ MAX("{COL_NAME}") as max_date,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct,
+ COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct,
+ COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct,
+ COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct,
+ COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
+ COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
+ COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
+
+strTemplate11_else: NULL as min_date,
+ NULL as max_date,
+ NULL as before_1yr_date_ct,
+ NULL as before_5yr_date_ct,
+ NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
+ NULL as within_1yr_date_ct,
+ NULL as within_1mo_date_ct,
+ NULL as future_date_ct,
+ NULL as distant_future_date_ct,
+ NULL as date_days_present,
+ NULL as date_weeks_present,
+ NULL as date_months_present,
+
+strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct,
+
+strTemplate12_else: NULL as boolean_true_ct,
+
+strTemplate13_ALL: NULL AS datatype_suggestion,
+strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE(
+ "{COL_NAME}", '[a-z]', 'a'),
+ '[A-Z]', 'A'),
+ '[0-9]', 'N')
+ ) AS pattern_ct
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
+ WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
+ SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct,
+ AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces,
+
+strTemplate14_A_no_patterns: NULL as distinct_pattern_ct,
+ SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct,
+ AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces,
+
+strTemplate14_else: NULL as distinct_pattern_ct,
+ NULL as embedded_space_ct,
+ NULL as avg_embedded_spaces,
+
+strTemplate15_ALL: NULL as functional_data_type,
+ NULL as functional_table_type,
+
+strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
+
+strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} '
+
+strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
+
+strTemplate99_N: |
+ , (SELECT
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
+ FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+
+strTemplate99_N_sampling: |
+ , (SELECT
+ PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
+ PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
+ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
+ FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+
+strTemplate99_else: ' '
+
+strTemplate100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}'
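These strTemplate fragments are concatenated by the profiling engine into one wide SELECT per profiled column, with the _sampling variants swapped in when row sampling is enabled. A rough sketch of how an assembled statement might look for a hypothetical numeric column (schema, table, and column names are illustrative, and a real statement carries many more of the aliases defined above):

-- Hypothetical assembly for numeric column "amount" in my_spectrum_schema.sales, sampled 1-in-10.
SELECT COUNT(*)                        AS record_ct,
       COUNT("amount")                 AS value_ct,
       COUNT(DISTINCT "amount")        AS distinct_value_ct,
       MIN("amount")                   AS min_value,
       MAX("amount")                   AS max_value,
       AVG(CAST("amount" AS FLOAT))    AS avg_value,
       STDDEV(CAST("amount" AS FLOAT)) AS stdev_value,
       MIN(pct_25)                     AS percentile_25
  FROM my_spectrum_schema.sales
     , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "amount") OVER () AS pct_25
          FROM my_spectrum_schema.sales LIMIT 1) pctile
 WHERE RAND() <= 1.0 / 10;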
diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql
new file mode 100644
index 00000000..6a0a3d5b
--- /dev/null
+++ b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql
@@ -0,0 +1,32 @@
+-- Get Freqs for selected columns
+WITH ranked_vals AS (
+ SELECT "{COL_NAME}",
+ COUNT(*) AS ct,
+ ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
+ FROM {DATA_SCHEMA}.{DATA_TABLE}
+ WHERE "{COL_NAME}" > ' '
+-- TG-IF do_sample_bool
+ AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}
+-- TG-ENDIF
+ GROUP BY "{COL_NAME}"
+),
+consol_vals AS (
+ SELECT COALESCE(CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || CAST(ct AS VARCHAR)
+ ELSE NULL
+ END, '| Other Values (' || CAST(COUNT(DISTINCT "{COL_NAME}") as VARCHAR) || ') | ' || CAST(SUM(ct) as VARCHAR) ) AS val,
+ MIN(rn) as min_rn
+ FROM ranked_vals
+ GROUP BY CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || CAST(ct AS VARCHAR)
+ ELSE NULL
+ END
+)
+SELECT '{PROJECT_CODE}' as project_code,
+ '{DATA_SCHEMA}' as schema_name,
+ '{RUN_DATE}' as run_date,
+ '{DATA_TABLE}' as table_name,
+ '{COL_NAME}' as column_name,
+ REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values,
+ ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|')
+ WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh
+ FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash
+ FROM consol_vals;
diff --git a/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml b/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml
new file mode 100644
index 00000000..4953e254
--- /dev/null
+++ b/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml
@@ -0,0 +1,101 @@
+IS_NUM: CASE
+ WHEN {$1} ~ '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1
+ ELSE 0
+ END
+
+IS_DATE: CASE
+ /* YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS */
+ WHEN {$1} ~
+ '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$'
+ THEN CASE
+ WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200
+ AND (
+ (SUBSTRING({$1}, 6, 2) IN ('01', '03', '05', '07', '08',
+ '10', '12')
+ AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 31)
+ OR (SUBSTRING({$1}, 6, 2) IN ('04', '06', '09')
+ AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 30)
+ OR (SUBSTRING({$1}, 6, 2) = '02'
+                      AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 29)
+ )
+ THEN 1
+ ELSE 0
+ END
+ /* YYYYMMDDHHMMSSSSSS or YYYYMMDD */
+ WHEN {$1} ~
+ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$'
+ OR {$1} ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$'
+ THEN CASE
+ WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200
+ AND (
+ (SUBSTRING({$1}, 5, 2) IN ('01', '03', '05', '07', '08',
+ '10', '12')
+ AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 31)
+ OR (SUBSTRING({$1}, 5, 2) IN ('04', '06', '09')
+ AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 30)
+ OR (SUBSTRING({$1}, 5, 2) = '02'
+                        AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 29)
+ )
+ THEN 1
+ ELSE 0
+ END
+ /* Exclude anything else long */
+ WHEN LENGTH({$1}) > 11 THEN 0
+ /* YYYY-MMM/MM-DD */
+ WHEN REGEXP_REPLACE(UPPER({$1}), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12')
+ ~ '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]'
+ THEN CASE
+ WHEN SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1800 AND 2200
+ AND (
+ (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('01', '03', '05', '07', '08',
+ '1', '3', '5', '7', '8', '10', '12',
+ 'JAN', 'MAR', 'MAY', 'JUL', 'AUG',
+ 'OCT', 'DEC')
+ AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 31)
+ OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11',
+ 'APR', 'JUN', 'SEP', 'NOV')
+ AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 30)
+ OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('02', '2', 'FEB')
+ AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 29)
+ )
+ THEN 1
+ ELSE 0
+ END
+ /* MM/-DD/-YY/YYYY */
+ WHEN REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$'
+ OR REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$'
+ THEN
+ CASE
+ WHEN SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12
+ AND (
+ (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12)
+ AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31)
+ OR (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11)
+ AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30)
+ OR (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT = 2
+ AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29)
+ )
+ AND
+              ('20' || RIGHT(SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200
+ THEN 1
+ ELSE 0
+ END
+ /* DD-MMM-YYYY */
+ WHEN UPPER({$1}) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]'
+ THEN
+ CASE
+ WHEN SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1800 AND 2200
+ AND (
+ (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC')
+ AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 31)
+ OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV')
+ AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 30)
+ OR (UPPER(SPLIT_PART({$1}, '-', 2)) = 'FEB'
+ AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 29)
+ )
+ THEN 1
+ ELSE 0
+ END
+ ELSE 0
+ END
+
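The {$1} tokens in these templated functions are positional macro arguments: the profiling query builder substitutes a concrete column expression before the SQL ever reaches the warehouse. A minimal sketch of that substitution, assuming PyYAML for loading and a hypothetical expand_macro helper (names here are illustrative, not the project's actual API):

    import re
    import yaml

    def expand_macro(template: str, *args: str) -> str:
        # Replace {$1}, {$2}, ... with the caller's positional arguments.
        return re.sub(r"\{\$(\d+)\}", lambda m: args[int(m.group(1)) - 1], template)

    with open("templated_functions.yaml") as fh:
        templates = yaml.safe_load(fh)

    # Expanding IS_NUM for a quoted Spectrum column yields a CASE expression that can
    # be dropped straight into the profiling SELECT list.
    print(expand_macro(templates["IS_NUM"], '"order_total"'))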
diff --git a/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql
new file mode 100644
index 00000000..83cc6091
--- /dev/null
+++ b/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql
@@ -0,0 +1,3 @@
+select concat(concat(concat(schemaname, '.'), concat(tablename, '.')), columnname) as columns
+from svv_external_columns
+where schemaname in ({TEST_SCHEMAS});
diff --git a/testgen/template/profiling/datatype_suggestions.sql b/testgen/template/profiling/datatype_suggestions.sql
index 6eff3b95..785350a4 100644
--- a/testgen/template/profiling/datatype_suggestions.sql
+++ b/testgen/template/profiling/datatype_suggestions.sql
@@ -47,6 +47,7 @@ SET datatype_suggestion =
AND POSITION('+' IN pr.top_freq_values) > 0
THEN CASE
WHEN '{SQL_FLAVOR}' = 'redshift' THEN 'TIMESTAMPZ'
+ WHEN '{SQL_FLAVOR}' = 'redshift_spectrum' THEN 'TIMESTAMPZ'
WHEN '{SQL_FLAVOR}' = 'postgresql' THEN 'TIMESTAMPZ'
WHEN '{SQL_FLAVOR}' = 'snowflake' THEN 'TIMESTAMP_TZ'
WHEN '{SQL_FLAVOR}' LIKE 'mssql%' THEN 'DATETIMEOFFSET'
@@ -61,6 +62,7 @@ SET datatype_suggestion =
AND POSITION(':' IN pr.top_freq_values) > 0
THEN CASE
WHEN '{SQL_FLAVOR}' = 'redshift' THEN 'TIMESTAMP'
+ WHEN '{SQL_FLAVOR}' = 'redshift_spectrum' THEN 'TIMESTAMP'
WHEN '{SQL_FLAVOR}' = 'postgresql' THEN 'TIMESTAMP'
WHEN '{SQL_FLAVOR}' = 'snowflake' THEN 'TIMESTAMP_NTZ'
WHEN '{SQL_FLAVOR}' LIKE 'mssql%' THEN 'DATETIME2'
diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js
index 0a2b43f4..c5142ed0 100644
--- a/testgen/ui/components/frontend/js/components/connection_form.js
+++ b/testgen/ui/components/frontend/js/components/connection_form.js
@@ -72,6 +72,7 @@ const clearSentinel = '';
const secretsPlaceholder = '';
const defaultPorts = {
redshift: '5439',
+ redshift_spectrum: '5439',
azure_mssql: '1433',
synapse_mssql: '1433',
mssql: '1433',
@@ -155,6 +156,15 @@ const ConnectionForm = (props, saveButton) => {
connection,
dynamicConnectionUrl,
),
+ redshift_spectrum: () => RedshiftSpectrumForm(
+ updatedConnection,
+ getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal),
+ (formValue, isValid) => {
+ updatedConnection.val = {...updatedConnection.val, ...formValue};
+ setFieldValidity('redshift_spectrum_form', isValid);
+ },
+ connection,
+ ),
azure_mssql: () => AzureMSSQLForm(
updatedConnection,
getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal),
@@ -517,6 +527,8 @@ const RedshiftForm = (
);
};
+const RedshiftSpectrumForm = RedshiftForm;
+
const PostgresqlForm = RedshiftForm;
const AzureMSSQLForm = RedshiftForm;
diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py
index ab52de96..fd8f4056 100644
--- a/testgen/ui/views/connections.py
+++ b/testgen/ui/views/connections.py
@@ -441,6 +441,12 @@ class ConnectionFlavor:
flavor="redshift",
icon=get_asset_data_url("flavors/redshift.svg"),
),
+ ConnectionFlavor(
+ label="Amazon Redshift Spectrum",
+ value="redshift_spectrum",
+ flavor="redshift_spectrum",
+ icon=get_asset_data_url("flavors/redshift.svg"),
+ ),
ConnectionFlavor(
label="Azure SQL Database",
value="azure_mssql",
diff --git a/tests/unit/test_profiling_query.py b/tests/unit/test_profiling_query.py
index 6ca71ecc..368fb5b6 100644
--- a/tests/unit/test_profiling_query.py
+++ b/tests/unit/test_profiling_query.py
@@ -18,7 +18,7 @@ def test_include_exclude_mask_basic():
# test assertions
assert "SELECT 'dummy_project_code'" in query
- assert r"""AND (
+ assert r"""AND (
(c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful%' )
)""" in query
assert r"""AND NOT (
@@ -63,6 +63,6 @@ def test_include_empty_include_mask(mask):
print(query)
# test assertions
- assert r"""AND (
+ assert r"""AND (
(c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful[_]%' )
)""" in query
From ffb63761697fed31589f8dbfefa01ceee74a6f28 Mon Sep 17 00:00:00 2001
From: "Chip.Bloche"
Date: Fri, 26 Sep 2025 14:58:31 -0400
Subject: [PATCH 17/48] Profiling: fix irregular table names and X datatypes
---
testgen/commands/queries/profiling_query.py | 10 +++----
.../project_profiling_query_databricks.yaml | 22 +++++++++-------
...t_secondary_profiling_query_databricks.sql | 4 +--
.../project_profiling_query_mssql.yaml | 26 +++++++++++--------
...roject_secondary_profiling_query_mssql.sql | 4 +--
.../project_profiling_query_postgresql.yaml | 26 +++++++++++--------
...t_secondary_profiling_query_postgresql.sql | 4 +--
.../project_profiling_query_redshift.yaml | 26 +++++++++++--------
...ect_secondary_profiling_query_redshift.sql | 4 +--
...ect_profiling_query_redshift_spectrum.yaml | 26 +++++++++++--------
...dary_profiling_query_redshift_spectrum.sql | 4 +--
.../project_profiling_query_snowflake.yaml | 26 +++++++++++--------
...ct_secondary_profiling_query_snowflake.sql | 4 +--
.../project_profiling_query_trino.yaml | 26 +++++++++++--------
14 files changed, 120 insertions(+), 92 deletions(-)
diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py
index 18e8e0b0..bfe62fbc 100644
--- a/testgen/commands/queries/profiling_query.py
+++ b/testgen/commands/queries/profiling_query.py
@@ -284,12 +284,12 @@ def GetProfilingQuery(self) -> tuple[str, dict]:
# Assemble in function
strQ = ""
- if self.parm_do_sample == "Y":
- strQ += dctSnippetTemplate["strTemplate01_sampling"]
- else:
- strQ += dctSnippetTemplate["strTemplate01_else"]
+ strQ += dctSnippetTemplate["strTemplate01"]
- strQ += dctSnippetTemplate["strTemplate02_all"]
+ if self.col_gen_type == "X":
+ strQ += dctSnippetTemplate["strTemplate02_X"]
+ else:
+ strQ += dctSnippetTemplate["strTemplate02_else"]
if self.col_gen_type in ["A", "D", "N"]:
strQ += dctSnippetTemplate["strTemplate03_ADN"]
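In effect, the assembly now always opens with the shared strTemplate01 SELECT header and then branches on the column's general type, so 'X' columns (large text types) get the reduced aggregate block. A rough sketch of the resulting control flow, assuming dctSnippetTemplate is the dict loaded from the flavor's project_profiling_query_*.yaml (illustrative only, not a verbatim copy of the module):

    def assemble_profiling_select(dctSnippetTemplate: dict, col_gen_type: str) -> str:
        # Shared SELECT header plus the identifying columns for every flavor.
        query = dctSnippetTemplate["strTemplate01"]
        # 'X' columns (e.g. TEXT/NTEXT on SQL Server) take the restricted counts;
        # every other general type keeps the full value/distinct/null counts.
        if col_gen_type == "X":
            query += dctSnippetTemplate["strTemplate02_X"]
        else:
            query += dctSnippetTemplate["strTemplate02_else"]
        # Alpha, date and numeric columns continue with their type-specific snippets.
        if col_gen_type in ("A", "D", "N"):
            query += dctSnippetTemplate["strTemplate03_ADN"]
        return query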
diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml
index d7612bd3..4df551e7 100644
--- a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml
+++ b/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT(`{COL_NAME}`) AS value_ct,
+ COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct,
+ SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct,
+strTemplate02_else: |
COUNT(`{COL_NAME}`) AS value_ct,
COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct,
SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct,
@@ -151,7 +155,7 @@ strTemplate07_A_freq: ( SELECT LEFT(CONCAT_WS(' | ', collect_list(val)), 1000)
FROM (
SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || `{COL_NAME}` as val,
COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`
WHERE `{COL_NAME}` > ' '
GROUP BY `{COL_NAME}`
HAVING `{COL_NAME}` > ' '
@@ -244,7 +248,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`,
'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'
)
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`
WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct,
SUM(CAST(SIGN(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ',''))) AS BIGINT)) AS embedded_space_ct,
AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces,
@@ -262,23 +266,23 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)'
+strTemplate98_sampling: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)'
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
+strTemplate98_else: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`'
strTemplate99_N: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` LIMIT 1) pctile
strTemplate99_N_sampling: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile
+ FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile
strTemplate99_else: ' '
diff --git a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql
index 601098dc..7def8c78 100644
--- a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql
+++ b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql
@@ -4,7 +4,7 @@ AS
(SELECT `{COL_NAME}`,
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`
-- TG-IF do_sample_bool
TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)
-- TG-ENDIF
@@ -33,6 +33,6 @@ SELECT '{PROJECT_CODE}' as project_code,
)), '^#^', '\n') AS top_freq_values,
(SELECT MD5(CONCAT_WS('|', ARRAY_SORT(COLLECT_LIST(NULLIF(dist_col_name,''))))) as dvh
FROM (SELECT DISTINCT `{COL_NAME}` as dist_col_name
- FROM {DATA_SCHEMA}.{DATA_TABLE}) a
+ FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`) a
) as distinct_value_hash
FROM consol_vals;
diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml
index 2b8aae99..f0cacef2 100644
--- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml
+++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct,
+ NULL AS distinct_value_ct,
+ SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct,
+strTemplate02_else: |
COUNT("{COL_NAME}") AS value_ct,
COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct,
@@ -137,9 +141,9 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' )
AS pattern
- FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)
WHERE "{COL_NAME}" > ' ' AND ((SELECT MAX(LEN("{COL_NAME}"))
- FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH})) p
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH})) p
GROUP BY pattern
HAVING pattern > ' '
ORDER BY COUNT(*) DESC
@@ -149,7 +153,7 @@ strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ') WITHIN GROUP (ORDER
FROM (
SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) + ' | ' + "{COL_NAME}" as val,
COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
HAVING "{COL_NAME}" > ' '
@@ -241,7 +245,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" CO
'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'
)
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)
WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct,
AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces,
@@ -259,23 +263,23 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)'
+strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)'
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)'
+strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)'
strTemplate99_N: |
, (SELECT TOP 1
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} WITH (NOLOCK)) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) pctile
strTemplate99_N_sampling: |
, (SELECT TOP 1
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile
strTemplate99_else: ' '
diff --git a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql
index eee2416c..54505605 100644
--- a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql
+++ b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql
@@ -4,7 +4,7 @@ AS
(SELECT "{COL_NAME}",
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
-- TG-IF do_sample_bool
TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)
-- TG-ENDIF
@@ -32,7 +32,7 @@ SELECT '{PROJECT_CODE}' as project_code,
(SELECT CONVERT(VARCHAR(40), HASHBYTES('MD5', STRING_AGG( NULLIF(dist_col_name,''),
'|') WITHIN GROUP (ORDER BY dist_col_name)), 2) as dvh
FROM (SELECT DISTINCT "{COL_NAME}" as dist_col_name
- FROM {DATA_SCHEMA}.{DATA_TABLE}) a
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}") a
) as distinct_value_hash
FROM consol_vals;
diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
index 8bc65688..2b1294cd 100644
--- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
+++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT("{COL_NAME}") AS value_ct,
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
+ SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct,
+strTemplate02_else: |
COUNT("{COL_NAME}") AS value_ct,
COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct,
@@ -113,9 +117,9 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DE
"{COL_NAME}", '[a-z]', 'a', 'g'),
'[A-Z]', 'A', 'g'),
'[0-9]', 'N', 'g') AS pattern
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}"))
- FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
GROUP BY pattern
HAVING pattern > ' '
ORDER BY COUNT(*) DESC
@@ -126,7 +130,7 @@ strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1
FROM (
SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
HAVING "{COL_NAME}" > ' '
@@ -219,7 +223,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL
'[A-Z]', 'A', 'g'),
'[0-9]', 'N', 'g')
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct,
AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces,
@@ -237,23 +241,23 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)'
+strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)'
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE} '
+strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" '
strTemplate99_N: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_N_sampling: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile
strTemplate99_else: ' '
diff --git a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql
index e3261f14..b9b0c3d6 100644
--- a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql
+++ b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql
@@ -3,7 +3,7 @@ WITH ranked_vals AS (
SELECT "{COL_NAME}",
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
-- TG-IF do_sample_bool
TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)
-- TG-ENDIF
@@ -27,5 +27,5 @@ SELECT '{PROJECT_CODE}' as project_code,
'{COL_NAME}' as column_name,
REPLACE(STRING_AGG(val, '^#^' ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values,
( SELECT MD5(STRING_AGG(DISTINCT "{COL_NAME}", '|' ORDER BY "{COL_NAME}")) as dvh
- FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash
FROM consol_vals;
diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
index b3cf7277..eb85a465 100644
--- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
+++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT("{COL_NAME}") AS value_ct,
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
+strTemplate02_else: |
COUNT("{COL_NAME}") AS value_ct,
COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
@@ -92,9 +96,9 @@ strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORD
"{COL_NAME}", '[a-z]', 'a'),
'[A-Z]', 'A'),
'[0-9]', 'N') AS pattern
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}"))
- FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
GROUP BY pattern
HAVING pattern > ' '
ORDER BY COUNT(*) DESC) as ps) AS top_patterns,
@@ -103,7 +107,7 @@ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY
FROM (
SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
HAVING "{COL_NAME}" > ' '
@@ -172,7 +176,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL
'[A-Z]', 'A'),
'[0-9]', 'N')
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct,
AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces,
@@ -190,23 +194,23 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} '
+strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" '
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
+strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"'
strTemplate99_N: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_N_sampling: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_else: ' '
diff --git a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql
index 6a0a3d5b..58b86519 100644
--- a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql
+++ b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql
@@ -3,7 +3,7 @@ WITH ranked_vals AS (
SELECT "{COL_NAME}",
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
-- TG-IF do_sample_bool
AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}
@@ -28,5 +28,5 @@ SELECT '{PROJECT_CODE}' as project_code,
REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values,
( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|')
WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh
- FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash
FROM consol_vals;
diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml
index 1596dd1d..110a2baa 100644
--- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml
+++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT("{COL_NAME}") AS value_ct,
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
+strTemplate02_else: |
COUNT("{COL_NAME}") AS value_ct,
COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
@@ -92,9 +96,9 @@ strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORD
"{COL_NAME}", '[a-z]', 'a'),
'[A-Z]', 'A'),
'[0-9]', 'N') AS pattern
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}"))
- FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
GROUP BY pattern
HAVING pattern > ' '
ORDER BY COUNT(*) DESC) as ps) AS top_patterns,
@@ -103,7 +107,7 @@ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY
FROM (
SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
HAVING "{COL_NAME}" > ' '
@@ -172,7 +176,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL
'[A-Z]', 'A'),
'[0-9]', 'N')
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct,
AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces,
@@ -190,23 +194,23 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} '
+strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" '
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
+strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"'
strTemplate99_N: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_N_sampling: |
, (SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_else: ' '
diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql
index 6a0a3d5b..58b86519 100644
--- a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql
+++ b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql
@@ -3,7 +3,7 @@ WITH ranked_vals AS (
SELECT "{COL_NAME}",
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
-- TG-IF do_sample_bool
AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}
@@ -28,5 +28,5 @@ SELECT '{PROJECT_CODE}' as project_code,
REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values,
( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|')
WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh
- FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash
FROM consol_vals;
diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml
index bc0f1e7d..6cfaf2e7 100644
--- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml
+++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT("{COL_NAME}") AS value_ct,
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
+strTemplate02_else: |
COUNT("{COL_NAME}") AS value_ct,
COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
@@ -100,9 +104,9 @@ strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (OR
"{COL_NAME}"::VARCHAR, '[a-z]', 'a'),
'[A-Z]', 'A'),
'[0-9]', 'N') AS pattern
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}"))
- FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
GROUP BY pattern
HAVING pattern > ' '
ORDER BY COUNT(*) DESC) as ps) AS top_patterns,
@@ -111,7 +115,7 @@ strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY
FROM (
SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val,
COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
HAVING "{COL_NAME}" > ' '
@@ -177,7 +181,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REP
'[A-Z]', 'A'),
'[0-9]', 'N')
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct,
AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces,
@@ -195,9 +199,9 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id "
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} SAMPLE ({SAMPLE_SIZE} rows)'
+strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows)'
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
+strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"'
strTemplate99_N: |
,
@@ -205,7 +209,7 @@ strTemplate99_N: |
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_N_sampling: |
,
@@ -213,7 +217,7 @@ strTemplate99_N_sampling: |
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile
strTemplate99_else: ;
diff --git a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql
index 709643b5..7b80fc70 100644
--- a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql
+++ b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql
@@ -3,7 +3,7 @@ WITH ranked_vals AS (
SELECT "{COL_NAME}",
COUNT(*) AS ct,
ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
-- TG-IF do_sample_bool
SAMPLE ({SAMPLE_SIZE} rows)
-- TG-ENDIF
@@ -28,5 +28,5 @@ SELECT '{PROJECT_CODE}' as project_code,
REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values,
( SELECT MD5(LISTAGG(DISTINCT NULLIF("{COL_NAME}", ''), '|')
WITHIN GROUP (ORDER BY NULLIF("{COL_NAME}", ''))) as dvh
- FROM {DATA_SCHEMA}.{DATA_TABLE} ) as distinct_value_hash
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash
FROM consol_vals;
diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml
index e3ee9f83..6f369d11 100644
--- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml
+++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml
@@ -1,7 +1,6 @@
---
-strTemplate01_sampling: "SELECT "
-strTemplate01_else: "SELECT "
-strTemplate02_all: |
+strTemplate01: |
+ SELECT
{CONNECTION_ID} as connection_id,
'{PROJECT_CODE}' as project_code,
'{TABLE_GROUPS_ID}' as table_groups_id,
@@ -13,6 +12,11 @@ strTemplate02_all: |
'{COL_TYPE}' AS column_type,
'{COL_GEN_TYPE}' AS general_type,
COUNT(*) AS record_ct,
+strTemplate02_X: |
+ COUNT("{COL_NAME}") AS value_ct,
+ COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
+ SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
+strTemplate02_else: |
COUNT("{COL_NAME}") AS value_ct,
COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct,
SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct,
@@ -113,9 +117,9 @@ strTemplate06_A_patterns: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP
"{COL_NAME}", '[a-z]', 'a'),
'[A-Z]', 'A'),
'[0-9]', 'N') AS pattern
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}"))
- FROM {DATA_SCHEMA}.{DATA_TABLE}) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p
GROUP BY pattern
HAVING pattern > ' '
ORDER BY COUNT(*) DESC LIMIT 5) as ps) AS top_patterns,
@@ -123,7 +127,7 @@ strTemplate06_else: NULL as top_patterns,
strTemplate07_A_freq: ( SELECT SUBSTRING(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) as concat_vals
FROM (
SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' '
GROUP BY "{COL_NAME}"
HAVING "{COL_NAME}" > ' '
@@ -215,7 +219,7 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL
'[A-Z]', 'A'),
'[0-9]', 'N')
) AS pattern_ct
- FROM {DATA_SCHEMA}.{DATA_TABLE}
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}"
WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct,
SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct,
AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces,
@@ -233,23 +237,23 @@ strTemplate15_ALL: NULL as functional_data_type,
strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id"
-strTemplate98_sampling: ' FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})'
+strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})'
-strTemplate98_else: ' FROM {DATA_SCHEMA}.{DATA_TABLE}'
+strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"'
strTemplate99_N: |
, (SELECT
APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25,
APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50,
APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} LIMIT 1) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile
strTemplate99_N_sampling: |
, (SELECT
APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25,
APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50,
APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75
- FROM {DATA_SCHEMA}.{DATA_TABLE} TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile
+ FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile
strTemplate99_else: ' '
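The common thread in these template edits is that {DATA_SCHEMA} and {DATA_TABLE} are now emitted with flavor-appropriate identifier quoting, which is what lets irregular table names (mixed case, reserved words, embedded spaces) profile cleanly. A hedged sketch of the idea with a hypothetical helper, not the project's actual code:

    def quote_identifier(name: str, flavor: str) -> str:
        # Databricks quotes identifiers with backticks; the Redshift, Postgres,
        # Snowflake, Trino and SQL Server templates above use double quotes.
        quote = "`" if flavor == "databricks" else '"'
        return f"{quote}{name}{quote}"

    # Mixed case and embedded spaces survive the round trip once quoted.
    qualified = ".".join(
        quote_identifier(part, "redshift") for part in ("My Schema", "Order Items")
    )
    print(qualified)  # "My Schema"."Order Items"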
From 352ab1c2dc82b85b5abb2b49e2571894ea954709 Mon Sep 17 00:00:00 2001
From: "Chip.Bloche"
Date: Fri, 26 Sep 2025 15:10:38 -0400
Subject: [PATCH 18/48] Identify TEXT, NTEXT as general_type X
---
.../template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql | 1 -
1 file changed, 1 deletion(-)
diff --git a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql b/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql
index 44d659ca..e74b6939 100644
--- a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql
+++ b/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql
@@ -16,7 +16,6 @@ SELECT '{PROJECT_CODE}' as project_code,
c.ordinal_position,
CASE
WHEN LOWER(c.data_type) LIKE '%char%'
- OR c.data_type LIKE '%text%'
THEN 'A'
WHEN c.data_type = 'bit'
THEN 'B'
From 53bfb66f124889b49b18f02c40945fb6c1082cf2 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Wed, 24 Sep 2025 12:35:20 -0400
Subject: [PATCH 19/48] feat(pagination): paginate grid component
---
.../frontend/js/components/paginator.js | 3 +-
testgen/ui/components/widgets/paginator.py | 20 +-
testgen/ui/services/form_service.py | 185 +++++----
testgen/ui/views/hygiene_issues.py | 118 +++---
testgen/ui/views/profiling_results.py | 42 +-
testgen/ui/views/test_definitions.py | 389 +++++++++---------
testgen/ui/views/test_results.py | 300 +++++++-------
7 files changed, 540 insertions(+), 517 deletions(-)
diff --git a/testgen/ui/components/frontend/js/components/paginator.js b/testgen/ui/components/frontend/js/components/paginator.js
index 602302b2..7799e7f2 100644
--- a/testgen/ui/components/frontend/js/components/paginator.js
+++ b/testgen/ui/components/frontend/js/components/paginator.js
@@ -21,7 +21,8 @@ const Paginator = (/** @type Properties */ props) => {
}
const { count, pageSize } = props;
- const pageIndexState = van.state(getValue(props.pageIndex) ?? 0);
+ const pageIndexState = van.derive(() => getValue(props.pageIndex) ?? 0);
+
van.derive(() => {
const onChange = props.onChange?.val ?? props.onChange ?? changePage;
onChange(pageIndexState.val);
diff --git a/testgen/ui/components/widgets/paginator.py b/testgen/ui/components/widgets/paginator.py
index c98a335e..114729b1 100644
--- a/testgen/ui/components/widgets/paginator.py
+++ b/testgen/ui/components/widgets/paginator.py
@@ -1,10 +1,17 @@
+import typing
+
+import streamlit as st
+
from testgen.ui.components.utils.component import component
+from testgen.ui.navigation.router import Router
def paginator(
count: int,
page_size: int,
- page_index: int = 0,
+ page_index: int | None = None,
+ bind_to_query: str | None = None,
+ on_change: typing.Callable | None = None,
key: str = "testgen:paginator",
) -> bool:
"""
@@ -17,10 +24,21 @@ def paginator(
:param key: unique key to give the component a persisting state
"""
+ def on_page_change():
+ if bind_to_query:
+ if event_data := st.session_state[key]:
+ Router().set_query_params({ bind_to_query: event_data.get("page_index", 0) })
+ if on_change:
+ on_change()
+
+ if page_index is None:
+ page_index = int(st.query_params.get(bind_to_query, 0)) if bind_to_query else 0
+
event_data = component(
id_="paginator",
key=key,
         default={ "page_index": page_index },
props={"count": count, "pageSize": page_size, "pageIndex": page_index},
+ on_change=on_page_change,
)
return event_data.get("page_index", 0)
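With bind_to_query set, the paginator reads its starting page from the URL query string and writes the new index back through the router on every change, so paginated grids stay deep-linkable. A minimal usage sketch under those assumptions, meant to run inside a Streamlit page:

    import streamlit as st

    from testgen.ui.components import widgets as testgen

    rows = list(range(1234))   # any list-like result set
    page_size = 500

    # Reads ?page=<n> on first render and pushes the new value back on change.
    page_index = testgen.paginator(
        count=len(rows),
        page_size=page_size,
        bind_to_query="page",
        key="example:paginator",
    )
    st.write(rows[page_size * page_index : page_size * (page_index + 1)])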
diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py
index d9e7223c..2c8e50b6 100644
--- a/testgen/ui/services/form_service.py
+++ b/testgen/ui/services/form_service.py
@@ -9,6 +9,7 @@
from pandas.api.types import is_datetime64_any_dtype
from st_aggrid import AgGrid, ColumnsAutoSizeMode, DataReturnMode, GridOptionsBuilder, GridUpdateMode, JsCode
+from testgen.ui.components import widgets as testgen
from testgen.ui.navigation.router import Router
"""
@@ -142,30 +143,26 @@ def render_html_list(dct_row, lst_columns, str_section_header=None, int_data_wid
def render_grid_select(
df: pd.DataFrame,
- show_columns,
- str_prompt=None,
- int_height=400,
- do_multi_select: bool | None = None,
+ columns: list[str],
+ column_headers: list[str] | None = None,
+ id_column: str | None = None,
selection_mode: typing.Literal["single", "multiple", "disabled"] = "single",
- show_column_headers=None,
- render_highlights=True,
- bind_to_query_name: str | None = None,
- bind_to_query_prop: str | None = None,
+ page_size: int = 500,
+ reset_pagination: bool = False,
+ bind_to_query: bool = False,
+ render_highlights: bool = True,
key: str = "aggrid",
-):
+) -> tuple[list[dict], dict]:
"""
- :param do_multi_select: DEPRECATED. boolean to choose between single
- or multiple selection.
:param selection_mode: one of single, multiple or disabled. defaults
to single.
- :param bind_to_query_name: name of the query param where to bind the
- selected row.
- :param bind_to_query_prop: name of the property of the selected row
- which value will be set in the query param.
+ :param bind_to_query: whether to bind the selected row and page to
+ query params.
:param key: Streamlit cache key for the grid. required when binding
selection to query.
"""
- show_prompt(str_prompt)
+ if selection_mode != "disabled" and not id_column:
+ raise ValueError("id_column is required when using 'single' or 'multiple' selection mode")
# Set grid formatting
cellstyle_jscode = JsCode(
@@ -253,39 +250,62 @@ def render_grid_select(
rendering_counter = st.session_state.get(f"{key}_counter") or 0
previous_dataframe = st.session_state.get(f"{key}_dataframe")
- df = df.copy()
if previous_dataframe is not None:
data_changed = not df.equals(previous_dataframe)
- dct_col_to_header = dict(zip(show_columns, show_column_headers, strict=True)) if show_column_headers else None
+ page_changed = st.session_state.get(f"{key}_page_change", False)
+ if page_changed:
+ st.session_state[f"{key}_page_change"] = False
- gb = GridOptionsBuilder.from_dataframe(df)
- selection_mode_ = selection_mode
- if do_multi_select is not None:
- selection_mode_ = "multiple" if do_multi_select else "single"
+ grid_container = st.container()
+ selected_column, paginator_column = st.columns([.5, .5])
+ with paginator_column:
+ def on_page_change():
+ st.session_state[f"{key}_page_change"] = True
+
+ page_index = testgen.paginator(
+ count=len(df),
+ page_size=page_size,
+ page_index=0 if reset_pagination else None,
+ bind_to_query="page" if bind_to_query else None,
+ on_change=on_page_change,
+ key=f"{key}_paginator",
+ )
+ # Prevent flickering data when filters are changed (which triggers 2 reruns - one from filter and another from paginator)
+ page_index = 0 if reset_pagination else page_index
+ paginated_df = df.iloc[page_size * page_index : page_size * (page_index + 1)]
+
+ dct_col_to_header = dict(zip(columns, column_headers, strict=True)) if column_headers else None
+
+ gb = GridOptionsBuilder.from_dataframe(paginated_df)
pre_selected_rows: typing.Any = {}
- if bind_to_query_name and bind_to_query_prop:
- bound_value = st.query_params.get(bind_to_query_name)
- bound_items = df[df[bind_to_query_prop] == bound_value]
+ if selection_mode == "single" and bind_to_query:
+ bound_value = st.query_params.get("selected")
+ bound_items = paginated_df[paginated_df[id_column] == bound_value]
if len(bound_items) > 0:
# https://github.com/PablocFonseca/streamlit-aggrid/issues/207#issuecomment-1793039564
- pre_selected_rows = {str(bound_items.iloc[0][bind_to_query_prop]): True}
+ pre_selected_rows = {str(bound_value): True}
else:
- if data_changed and st.query_params.get(bind_to_query_name):
+ if data_changed and st.query_params.get("selected"):
rendering_counter += 1
- Router().set_query_params({bind_to_query_name: None})
+ Router().set_query_params({"selected": None})
+
+ selection = set()
+ if selection_mode == "multiple":
+ selection = st.session_state.get(f"{key}_multiselection", set())
+ pre_selected_rows = {str(item): True for item in selection}
gb.configure_selection(
- selection_mode=selection_mode_,
- use_checkbox=selection_mode_ == "multiple",
+ selection_mode=selection_mode,
+ use_checkbox=selection_mode == "multiple",
pre_selected_rows=pre_selected_rows,
)
- if bind_to_query_prop:
- gb.configure_grid_options(getRowId=JsCode(f"""function(row) {{ return row.data['{bind_to_query_prop}'] }}"""))
+ if id_column:
+ gb.configure_grid_options(getRowId=JsCode(f"function(row) {{ return row.data['{id_column}'] }}"))
- all_columns = list(df.columns)
+ all_columns = list(paginated_df.columns)
for column in all_columns:
# Define common kwargs for all columns: NOTE THAT FIRST COLUMN HOLDS CHECKBOX AND SHOULD BE SHOWN!
@@ -293,9 +313,9 @@ def render_grid_select(
common_kwargs = {
"field": column,
"header_name": str_header if str_header else ut_prettify_header(column),
- "hide": column not in show_columns,
- "headerCheckboxSelection": selection_mode_ == "multiple" and column == show_columns[0],
- "headerCheckboxSelectionFilteredOnly": selection_mode_ == "multiple" and column == show_columns[0],
+ "hide": column not in columns,
+ "headerCheckboxSelection": selection_mode == "multiple" and column == columns[0],
+ "headerCheckboxSelectionFilteredOnly": selection_mode == "multiple" and column == columns[0],
}
highlight_kwargs = {
"cellStyle": cellstyle_jscode,
@@ -307,8 +327,8 @@ def render_grid_select(
}
# Check if the column is a date-time column
- if is_datetime64_any_dtype(df[column]):
- if (df[column].dt.time == pd.Timestamp("00:00:00").time()).all():
+ if is_datetime64_any_dtype(paginated_df[column]):
+ if (paginated_df[column].dt.time == pd.Timestamp("00:00:00").time()).all():
format_string = "yyyy-MM-dd"
else:
format_string = "yyyy-MM-dd HH:mm"
@@ -327,49 +347,66 @@ def render_grid_select(
# Apply configuration using kwargs
gb.configure_column(**all_kwargs)
- grid_options = gb.build()
-
# Render Grid: custom_css fixes spacing bug and tightens empty space at top of grid
- grid_data = AgGrid(
- df,
- gridOptions=grid_options,
- theme="balham",
- enable_enterprise_modules=False,
- allow_unsafe_jscode=True,
- update_mode=GridUpdateMode.NO_UPDATE,
- update_on=["selectionChanged"],
- data_return_mode=DataReturnMode.FILTERED_AND_SORTED,
- columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS,
- height=int_height,
- custom_css={
- "#gridToolBar": {
- "padding-bottom": "0px !important",
- },
- ".ag-row-hover .ag-cell.status-tag": {
- "border-color": "var(--ag-row-hover-color) !important",
+ with grid_container:
+ grid_options = gb.build()
+ grid_data = AgGrid(
+ paginated_df.copy(),
+ gridOptions=grid_options,
+ theme="balham",
+ enable_enterprise_modules=False,
+ allow_unsafe_jscode=True,
+ update_mode=GridUpdateMode.NO_UPDATE,
+ update_on=["selectionChanged"],
+ data_return_mode=DataReturnMode.FILTERED_AND_SORTED,
+ columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS,
+ height=400,
+ custom_css={
+ "#gridToolBar": {
+ "padding-bottom": "0px !important",
+ },
+ ".ag-row-hover .ag-cell.status-tag": {
+ "border-color": "var(--ag-row-hover-color) !important",
+ },
+ ".ag-row-selected .ag-cell.status-tag": {
+ "border-color": "var(--ag-selected-row-background-color) !important",
+ },
},
- ".ag-row-selected .ag-cell.status-tag": {
- "border-color": "var(--ag-selected-row-background-color) !important",
- },
- },
- key=f"{key}_{selection_mode_}_{rendering_counter}",
- reload_data=data_changed,
- )
+ key=f"{key}_{page_index}_{selection_mode}_{rendering_counter}",
+ reload_data=data_changed,
+ )
st.session_state[f"{key}_counter"] = rendering_counter
st.session_state[f"{key}_dataframe"] = df
- selected_rows = grid_data["selected_rows"]
- if len(selected_rows) > 0:
- if bind_to_query_name and bind_to_query_prop:
- Router().set_query_params({bind_to_query_name: selected_rows[0][bind_to_query_prop]})
-
+ if selection_mode != "disabled":
+ selected_rows = grid_data["selected_rows"]
+ # During page change, there are 2 reruns and the first one does not return the selected rows
+ # So we ignore that run to prevent flickering the selected count
+ if not page_changed:
+ selection.difference_update(paginated_df[id_column].to_list())
+ selection.update([row[id_column] for row in selected_rows])
+ st.session_state[f"{key}_multiselection"] = selection
+
+ if selection:
# We need to get the data from the original dataframe
# Otherwise changes to the dataframe (e.g., editing the current selection) do not get reflected in the returned rows
# Adding "modelUpdated" to AgGrid(update_on=...) does not work
# because it causes unnecessary reruns that cause dialogs to close abruptly
- selected_props = [row[bind_to_query_prop] for row in selected_rows]
- selected_df = df[df[bind_to_query_prop].isin(selected_props)]
- selected_rows = json.loads(selected_df.to_json(orient="records"))
-
- return selected_rows
+ selected_df = df[df[id_column].isin(selection)]
+ selected_data = json.loads(selected_df.to_json(orient="records"))
+
+ selected_id, selected_item = None, None
+ if selected_rows:
+ selected_id = selected_rows[len(selected_rows) - 1][id_column]
+ selected_item = next((item for item in selected_data if item[id_column] == selected_id), None)
+ if bind_to_query:
+ Router().set_query_params({"selected": selected_id})
+
+ if selection_mode == "multiple" and (count := len(selected_data)):
+ with selected_column:
+ testgen.caption(f"{count} item{'s' if count != 1 else ''} selected")
+
+ return selected_data, selected_item
+
+ return None, None
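render_grid_select now returns a (selected_rows, last_selected_row) pair rather than a bare list, with selection and page bound to the selected/page query params when bind_to_query=True. A sketch of the call shape after this patch, using a placeholder dataframe and intended to run inside a Streamlit page:

    import pandas as pd

    import testgen.ui.services.form_service as fm

    df = pd.DataFrame({
        "id": ["a1", "b2"],
        "table_name": ["orders", "customers"],
        "column_name": ["total", "email"],
    })
    selected, selected_row = fm.render_grid_select(
        df,
        ["table_name", "column_name"],   # columns shown in the grid
        ["Table", "Column"],             # matching display headers
        id_column="id",                  # required for any selectable mode
        selection_mode="multiple",
        bind_to_query=True,              # selection and page land in ?selected= / ?page=
        key="example:grid",
    )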
diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py
index dbf6082e..e0b91968 100644
--- a/testgen/ui/views/hygiene_issues.py
+++ b/testgen/ui/views/hygiene_issues.py
@@ -44,9 +44,9 @@ def render(
self,
run_id: str,
likelihood: str | None = None,
- issue_type: str | None = None,
table_name: str | None = None,
column_name: str | None = None,
+ issue_type: str | None = None,
action: str | None = None,
**_kwargs,
) -> None:
@@ -77,6 +77,12 @@ def render(
testgen.flex_row_end(actions_column)
testgen.flex_row_end(export_button_column)
+ filters_changed = False
+ current_filters = (likelihood, table_name, column_name, issue_type, action)
+ if st.session_state.get("hygiene_issues:filters") != current_filters:
+ filters_changed = True
+ st.session_state["hygiene_issues:filters"] = current_filters
+
with liklihood_filter_column:
likelihood = testgen.select(
options=["Definite", "Likely", "Possible", "Potential PII"],
@@ -160,8 +166,10 @@ def render(
sorting_columns = testgen.sorting_selector(sortable_columns, default)
with actions_column:
- str_help = "Toggle on to perform actions on multiple Hygiene Issues"
- do_multi_select = st.toggle("Multi-Select", help=str_help)
+ multi_select = st.toggle(
+ "Multi-Select",
+ help="Toggle on to perform actions on multiple Hygiene Issues",
+ )
with st.container():
with st.spinner("Loading data ..."):
@@ -195,31 +203,14 @@ def render(
width=400,
)
- lst_show_columns = [
- "table_name",
- "column_name",
- "issue_likelihood",
- "action",
- "anomaly_name",
- "detail",
- ]
-
- # Show main grid and retrieve selections
- selected = fm.render_grid_select(
+ selected, selected_row = fm.render_grid_select(
df_pa,
- lst_show_columns,
- int_height=400,
- do_multi_select=do_multi_select,
- bind_to_query_name="selected",
- bind_to_query_prop="id",
- show_column_headers=[
- "Table",
- "Column",
- "Likelihood",
- "Action",
- "Issue Type",
- "Detail"
- ]
+ ["table_name", "column_name", "issue_likelihood", "action", "anomaly_name", "detail"],
+ ["Table", "Column", "Likelihood", "Action", "Issue Type", "Detail"],
+ id_column="id",
+ selection_mode="multiple" if multi_select else "single",
+ reset_pagination=filters_changed,
+ bind_to_query=True,
)
popover_container = export_button_column.empty()
@@ -245,22 +236,16 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
if selected:
st.button(label="Selected issues", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected)))
- if not df_pa.empty:
- if selected:
- # Always show details for last selected row
- selected_row = selected[len(selected) - 1]
- else:
- selected_row = None
-
- # Display hygiene issue detail for selected row
- if not selected_row:
- st.markdown(":orange[Select a record to see more information.]")
- else:
- _, buttons_column = st.columns([0.5, 0.5])
+ # Display hygiene issue detail for selected row
+ if not selected:
+ st.markdown(":orange[Select a record to see more information.]")
+ else:
+ _, buttons_column = st.columns([0.5, 0.5])
- with buttons_column:
- col1, col2, col3 = st.columns([.3, .3, .3])
+ with buttons_column:
+ col1, col2, col3 = st.columns([.3, .3, .3])
+ if selected_row:
with col1:
view_profiling_button(
selected_row["column_name"], selected_row["table_name"], selected_row["table_groups_id"]
@@ -277,32 +262,33 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
)
source_data_dialog(selected_row)
- with col3:
- if st.button(
- ":material/download: Issue Report",
- use_container_width=True,
- help="Generate a PDF report for each selected issue",
- ):
- MixpanelService().send_event(
- "download-issue-report",
- page=self.path,
- issue_count=len(selected),
+ with col3:
+ if st.button(
+ ":material/download: Issue Report",
+ use_container_width=True,
+ help="Generate a PDF report for each selected issue",
+ ):
+ MixpanelService().send_event(
+ "download-issue-report",
+ page=self.path,
+ issue_count=len(selected),
+ )
+ dialog_title = "Download Issue Report"
+ if len(selected) == 1:
+ download_dialog(
+ dialog_title=dialog_title,
+ file_content_func=get_report_file_data,
+ args=(selected[0],),
)
- dialog_title = "Download Issue Report"
- if len(selected) == 1:
- download_dialog(
- dialog_title=dialog_title,
- file_content_func=get_report_file_data,
- args=(selected[0],),
- )
- else:
- zip_func = zip_multi_file_data(
- "testgen_hygiene_issue_reports.zip",
- get_report_file_data,
- [(arg,) for arg in selected],
- )
- download_dialog(dialog_title=dialog_title, file_content_func=zip_func)
+ else:
+ zip_func = zip_multi_file_data(
+ "testgen_hygiene_issue_reports.zip",
+ get_report_file_data,
+ [(arg,) for arg in selected],
+ )
+ download_dialog(dialog_title=dialog_title, file_content_func=zip_func)
+ if selected_row:
fm.render_html_list(
selected_row,
[
@@ -318,8 +304,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
"Hygiene Issue Detail",
int_data_width=700,
)
- else:
- st.markdown(":green[**No Hygiene Issues Found**]")
cached_functions = [get_anomaly_disposition, get_profiling_anomaly_summary, get_profiling_anomalies]
diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py
index 9519c314..edc5ea2d 100644
--- a/testgen/ui/views/profiling_results.py
+++ b/testgen/ui/views/profiling_results.py
@@ -60,6 +60,13 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str |
[.3, .3, .08, .32], vertical_alignment="bottom"
)
+ filters_changed = False
+ current_filters = (table_name, column_name)
+ if st.session_state.get("profiling_results:filters") != current_filters:
+ filters_changed = True
+ st.session_state["profiling_results:filters"] = current_filters
+
with table_filter_column:
# Table Name filter
df = get_profiling_run_tables(run_id)
@@ -107,27 +114,13 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str |
sorting_columns=sorting_columns,
)
- show_columns = [
- "table_name",
- "column_name",
- "column_type",
- "semantic_data_type",
- "hygiene_issues",
- ]
- show_column_headers = [
- "Table",
- "Column",
- "Data Type",
- "Semantic Data Type",
- "Hygiene Issues",
- ]
-
- selected_row = fm.render_grid_select(
+ selected, selected_row = fm.render_grid_select(
df,
- show_columns,
- bind_to_query_name="selected",
- bind_to_query_prop="id",
- show_column_headers=show_column_headers,
+ ["table_name", "column_name", "column_type", "semantic_data_type", "hygiene_issues"],
+ ["Table", "Column", "Data Type", "Semantic Data Type", "Hygiene Issues"],
+ id_column="id",
+ reset_pagination=filters_changed,
+ bind_to_query=True,
)
popover_container = export_button_column.empty()
@@ -150,19 +143,18 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
css_class("tg--export-wrapper")
st.button(label="All results", type="tertiary", on_click=open_download_dialog)
st.button(label="Filtered results", type="tertiary", on_click=partial(open_download_dialog, df))
- if selected_row:
- st.button(label="Selected results", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected_row)))
+ if selected:
+ st.button(label="Selected results", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected)))
# Display profiling for selected row
if not selected_row:
st.markdown(":orange[Select a row to see profiling details.]")
else:
- item = selected_row[0]
- item["hygiene_issues"] = profiling_queries.get_hygiene_issues(run_id, item["table_name"], item.get("column_name"))
+ selected_row["hygiene_issues"] = profiling_queries.get_hygiene_issues(run_id, selected_row["table_name"], selected_row.get("column_name"))
testgen_component(
"column_profiling_results",
- props={ "column": json.dumps(item), "data_preview": True },
+ props={ "column": json.dumps(selected_row), "data_preview": True },
on_change_handlers={
"DataPreviewClicked": lambda item: data_preview_dialog(
item["table_group_id"],
diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py
index 4dbe4c76..f0763fe4 100644
--- a/testgen/ui/views/test_definitions.py
+++ b/testgen/ui/views/test_definitions.py
@@ -44,7 +44,14 @@ class TestDefinitionsPage(Page):
lambda: "test_suite_id" in st.query_params or "test-suites",
]
- def render(self, test_suite_id: str, table_name: str | None = None, column_name: str | None = None, **_kwargs) -> None:
+ def render(
+ self,
+ test_suite_id: str,
+ table_name: str | None = None,
+ column_name: str | None = None,
+ test_type: str | None = None,
+ **_kwargs,
+ ) -> None:
test_suite = TestSuite.get(test_suite_id)
if not test_suite:
self.router.navigate_with_warning(
@@ -74,25 +81,34 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name:
testgen.flex_row_start(actions_column)
testgen.flex_row_end(disposition_column)
+ filters_changed = False
+ current_filters = (table_name, column_name, test_type)
+ if st.session_state.get("test_definitions:filters") != current_filters:
+ filters_changed = True
+ st.session_state["test_definitions:filters"] = current_filters
+
with table_filter_column:
columns_df = get_test_suite_columns(test_suite_id)
table_options = list(columns_df["table_name"].unique())
table_name = testgen.select(
options=table_options,
value_column="table_name",
- default_value=table_name or (table_options[0] if table_options else None),
+ default_value=table_name,
bind_to_query="table_name",
- required=True,
label="Table",
)
with column_filter_column:
- column_options = columns_df.loc[columns_df["table_name"] == table_name]["column_name"].dropna().unique().tolist()
+ if table_name:
+ column_options = columns_df.loc[
+ columns_df["table_name"] == table_name
+ ]["column_name"].dropna().unique().tolist()
+ else:
+ column_options = columns_df.groupby("column_name").first().reset_index().sort_values("column_name", key=lambda x: x.str.lower())
column_name = testgen.select(
options=column_options,
default_value=column_name,
bind_to_query="column_name",
label="Column",
- disabled=not table_name,
accept_new_options=True,
)
with test_filter_column:
@@ -101,29 +117,57 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name:
options=test_options,
value_column="test_type",
display_column="test_name_short",
- default_value=None,
+ default_value=test_type,
bind_to_query="test_type",
label="Test Type",
)
- with disposition_column:
- str_help = "Toggle on to perform actions on multiple test definitions"
- do_multi_select = user_can_disposition and st.toggle("Multi-Select", help=str_help)
+        multi_select = False
+        if user_can_disposition:
+            with disposition_column:
+                multi_select = st.toggle("Multi-Select", help="Toggle on to perform actions on multiple test definitions")
- if user_can_edit and actions_column.button(
- ":material/add: Add", help="Add a new Test Definition"
- ):
- add_test_dialog(table_group, test_suite, table_name, column_name)
+ if user_can_edit:
+ if actions_column.button(
+ ":material/add: Add",
+ help="Add a new Test Definition",
+ ):
+ add_test_dialog(table_group, test_suite, table_name, column_name)
- if user_can_edit and table_actions_column.button(
- ":material/play_arrow: Run Tests",
- help="Run test suite's tests",
- ):
- run_tests_dialog(project_code, test_suite)
+ if table_actions_column.button(
+ ":material/play_arrow: Run Tests",
+ help="Run test suite's tests",
+ ):
+ run_tests_dialog(project_code, test_suite)
+
+ with st.container():
+ with st.spinner("Loading data ..."):
+ df = get_test_definitions(test_suite, table_name, column_name, test_type)
+
+ selected, selected_test_def = render_grid(df, multi_select, filters_changed)
+
+ popover_container = table_actions_column.empty()
+
+ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
+ # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849
+ with popover_container.container():
+ flex_row_end()
+ st.button(label="Export", icon=":material/download:", disabled=True)
+
+ download_dialog(
+ dialog_title="Download Excel Report",
+ file_content_func=get_excel_report_data,
+ args=(test_suite, table_group.table_group_schema, data),
+ )
+
+ with popover_container.container(key="tg--export-popover"):
+ flex_row_end()
+ with st.popover(label="Export", icon=":material/download:", help="Download test definitions to Excel"):
+ css_class("tg--export-wrapper")
+ st.button(label="All tests", type="tertiary", on_click=open_download_dialog)
+ st.button(label="Filtered tests", type="tertiary", on_click=partial(open_download_dialog, df))
+ if selected:
+ st.button(label="Selected tests", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected)))
- selected = show_test_defs_grid(
- test_suite, table_name, column_name, test_type, do_multi_select, table_actions_column, table_group
- )
fm.render_refresh_button(table_actions_column)
if user_can_disposition:
@@ -156,9 +200,6 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name:
lst_cached_functions=[],
)
- if selected:
- selected_test_def = selected[0]
-
if user_can_edit:
if actions_column.button(
":material/edit: Edit",
@@ -178,6 +219,102 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name:
):
delete_test_dialog(selected)
+ if selected_test_def:
+ render_selected_details(selected_test_def, table_group)
+
+
+def render_grid(df: pd.DataFrame, multi_select: bool, filters_changed: bool) -> tuple[list[dict] | None, dict | None]:
+ columns = [
+ "table_name",
+ "column_name",
+ "test_name_short",
+ "test_active_display",
+ "lock_refresh_display",
+ "urgency",
+ "export_to_observability_display",
+ "profiling_as_of_date",
+ "last_manual_update",
+ ]
+ # Multiselect checkboxes do not display correctly if the dataframe column order does not start with the first displayed column -_-
+ df = df.reindex(columns=[columns[0]] + [ col for col in df.columns.to_list() if col != columns[0] ])
+
+ selected, selected_row = fm.render_grid_select(
+ df,
+ columns,
+ [
+ "Table",
+ "Columns / Focus",
+ "Test Type",
+ "Active",
+ "Locked",
+ "Urgency",
+            "Export to Observability",
+ "Based on Profiling",
+ "Last Manual Update",
+ ],
+ id_column="id",
+ selection_mode="multiple" if multi_select else "single",
+ reset_pagination=filters_changed,
+ bind_to_query=True,
+ render_highlights=False,
+ )
+
+ return selected, selected_row
+
+
+def render_selected_details(selected_test: dict, table_group: TableGroupMinimal) -> None:
+ columns = [
+ "schema_name",
+ "table_name",
+ "column_name",
+ "test_type",
+ "test_active_display",
+ "test_definition_status",
+ "lock_refresh_display",
+ "urgency",
+ "export_to_observability",
+ ]
+
+ labels = [
+ "schema_name",
+ "table_name",
+ "column_name",
+ "test_type",
+ "test_active",
+ "test_definition_status",
+ "lock_refresh",
+ "urgency",
+ "export_to_observability",
+ ]
+
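+    # Test-specific parameter columns to show, in addition to the shared columns above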
+ additional_columns = [val.strip() for val in selected_test["default_parm_columns"].split(",")]
+ columns = columns + additional_columns
+ labels = labels + additional_columns
+ labels = list(map(snake_case_to_title_case, labels))
+
+ left_column, right_column = st.columns([0.5, 0.5])
+
+ with left_column:
+ fm.render_html_list(
+ selected_test,
+ columns,
+ "Test Definition Information",
+ int_data_width=700,
+ lst_labels=labels,
+ )
+
+ _, col_profile_button = right_column.columns([0.7, 0.3])
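+    # The profiling shortcut only applies to column-scoped tests with an associated profiling run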
+ if selected_test["test_scope"] == "column" and selected_test["profile_run_id"]:
+ with col_profile_button:
+ view_profiling_button(
+ selected_test["column_name"],
+ selected_test["table_name"],
+ str(table_group.id),
+ )
+
+ with right_column:
+ st.write(generate_test_defs_help(selected_test["test_type"]))
+
@st.dialog("Delete Tests")
@with_database_session
@@ -472,51 +609,51 @@ def show_test_form(
# schema_name
test_definition["schema_name"] = left_column.text_input(
- label="Schema Name", max_chars=100, value=schema_name, disabled=True
+ label="Schema", max_chars=100, value=schema_name, disabled=True
)
# table_name
- test_definition["table_name"] = left_column.text_input(
- label="Table Name", max_chars=100, value=table_name, disabled=False
- )
-
- # column_name
- if selected_test_type_row["column_name_prompt"]:
- column_name_label = selected_test_type_row["column_name_prompt"]
- else:
- column_name_label = "Test Focus"
- if selected_test_type_row["column_name_help"]:
- column_name_help = selected_test_type_row["column_name_help"]
+ table_column_list = get_columns(table_groups_id)
+ if test_scope == "custom":
+ test_definition["table_name"] = left_column.text_input(
+ label="Table", max_chars=100, value=table_name, disabled=False
+ )
else:
- column_name_help = "Help is not available"
+ table_name_options = { item["table_name"] for item in table_column_list }
+        if table_name and table_name not in table_name_options:
+ table_name_options.add(table_name)
+ table_name_options = list(table_name_options)
+ table_name_options.sort(key=lambda x: x.lower())
+ test_definition["table_name"] = st.selectbox(
+ label="Table",
+ options=table_name_options,
+ index=table_name_options.index(table_name) if table_name else 0,
+ disabled=mode == "edit",
+ key="table-name-form",
+ )
+ column_name_label = None
if test_scope == "table":
test_definition["column_name"] = None
- column_name_label = None
- elif test_scope == "referential":
+ elif test_scope in ("referential", "custom"):
+ column_name_label = selected_test_type_row["column_name_prompt"] if selected_test_type_row["column_name_prompt"] else "Test Focus"
test_definition["column_name"] = left_column.text_input(
label=column_name_label,
value=column_name,
max_chars=500,
- help=column_name_help,
- )
- elif test_scope == "custom":
- test_definition["column_name"] = left_column.text_input(
- label=column_name_label,
- value=column_name,
- max_chars=100,
- help=column_name_help,
+ help=selected_test_type_row["column_name_help"] if selected_test_type_row["column_name_help"] else None,
)
elif test_scope == "column": # CAT column test
- column_name_label = "Column Name"
- column_name_options = get_column_names(table_groups_id, test_definition["table_name"])
- column_name_help = "Select the column to test"
- column_name_index = column_name_options.index(column_name) if column_name else 0
+ column_name_label = "Column"
+ column_name_options = { item["column_name"] for item in table_column_list if item["table_name"] == test_definition["table_name"]}
+        if column_name and column_name not in column_name_options:
+ column_name_options.add(column_name)
+ column_name_options = list(column_name_options)
+ column_name_options.sort(key=lambda x: x.lower())
test_definition["column_name"] = st.selectbox(
label=column_name_label,
options=column_name_options,
- index=column_name_index,
- help=column_name_help,
+ index=column_name_options.index(column_name) if column_name else 0,
key="column-name-form",
)
@@ -865,143 +1002,6 @@ def update_test_definition(selected, attribute, value, message):
return result
-def show_test_defs_grid(
- test_suite: TestSuite,
- table_name: str | None,
- column_name: str | None,
- test_type: str | None,
- do_multi_select: bool,
- export_container: DeltaGenerator,
- table_group: TableGroupMinimal,
-):
- with st.container():
- with st.spinner("Loading data ..."):
- df = get_test_definitions(test_suite, table_name, column_name, test_type)
-
- lst_show_columns = [
- "table_name",
- "column_name",
- "test_name_short",
- "test_active_display",
- "lock_refresh_display",
- "urgency",
- "export_to_observability_display",
- "profiling_as_of_date",
- "last_manual_update",
- ]
- show_column_headers = [
- "Table",
- "Columns / Focus",
- "Test Type",
- "Active",
- "Locked",
- "Urgency",
- "Export to Observabilty",
- "Based on Profiling",
- "Last Manual Update",
- ]
- # Multiselect checkboxes do not display correctly if the dataframe column order does not start with the first displayed column -_-
- columns = [lst_show_columns[0]] + [ col for col in df.columns.to_list() if col != lst_show_columns[0] ]
- df = df.reindex(columns=columns)
-
- dct_selected_row = fm.render_grid_select(
- df,
- lst_show_columns,
- do_multi_select=do_multi_select,
- show_column_headers=show_column_headers,
- render_highlights=False,
- bind_to_query_name="selected",
- bind_to_query_prop="id",
- )
-
- popover_container = export_container.empty()
-
- def open_download_dialog(data: pd.DataFrame | None = None) -> None:
- # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849
- with popover_container.container():
- flex_row_end()
- st.button(label="Export", icon=":material/download:", disabled=True)
-
- download_dialog(
- dialog_title="Download Excel Report",
- file_content_func=get_excel_report_data,
- args=(test_suite, table_group.table_group_schema, data),
- )
-
- with popover_container.container(key="tg--export-popover"):
- flex_row_end()
- with st.popover(label="Export", icon=":material/download:", help="Download test definitions to Excel"):
- css_class("tg--export-wrapper")
- st.button(label="All tests", type="tertiary", on_click=open_download_dialog)
- st.button(label="Filtered tests", type="tertiary", on_click=partial(open_download_dialog, df))
- if dct_selected_row:
- st.button(label="Selected tests", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(dct_selected_row)))
-
- if dct_selected_row:
-            st.html("")
- selected_row = dct_selected_row[0]
- str_test_id = selected_row["id"]
- row_selected = df[df["id"] == str_test_id].iloc[0]
- str_parm_columns = selected_row["default_parm_columns"]
-
- # Shared columns to show
- lst_show_columns = [
- "schema_name",
- "table_name",
- "column_name",
- "test_type",
- "test_active_display",
- "test_definition_status",
- "lock_refresh_display",
- "urgency",
- "export_to_observability",
- ]
-
- labels = [
- "schema_name",
- "table_name",
- "column_name",
- "test_type",
- "test_active",
- "test_definition_status",
- "lock_refresh",
- "urgency",
- "export_to_observability",
- ]
-
- # Test-specific columns to show
- additional_columns = [val.strip() for val in str_parm_columns.split(",")]
- lst_show_columns = lst_show_columns + additional_columns
- labels = labels + additional_columns
-
- labels = list(map(snake_case_to_title_case, labels))
-
- left_column, right_column = st.columns([0.5, 0.5])
-
- with left_column:
- fm.render_html_list(
- selected_row,
- lst_show_columns,
- "Test Definition Information",
- int_data_width=700,
- lst_labels=labels,
- )
-
- _, col_profile_button = right_column.columns([0.7, 0.3])
- if selected_row["test_scope"] == "column" and selected_row["profile_run_id"]:
- with col_profile_button:
- view_profiling_button(
- selected_row["column_name"],
- selected_row["table_name"],
- str(table_group.id),
- )
-
- with right_column:
- st.write(generate_test_defs_help(row_selected["test_type"]))
-
- return dct_selected_row
-
-
@with_database_session
def get_excel_report_data(
update_progress: PROGRESS_UPDATE_TYPE,
@@ -1198,22 +1198,19 @@ def get_test_definitions_collision(
return to_dataframe(results, TestDefinitionMinimal.columns())
-def get_column_names(table_groups_id: str, table_name: str) -> list[str]:
+def get_columns(table_groups_id: str) -> list[dict]:
results = fetch_all_from_db(
"""
- SELECT column_name
+ SELECT table_name, column_name
FROM data_column_chars
WHERE table_groups_id = :table_groups_id
- AND table_name = :table_name
AND drop_date IS NULL
- ORDER BY column_name
""",
{
"table_groups_id": table_groups_id,
- "table_name": table_name,
},
)
- return [ row.column_name for row in results ]
+ return [ dict(row) for row in results ]
def validate_test(test_definition, table_group: TableGroupMinimal):
diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py
index 5b747ec1..bfab94c3 100644
--- a/testgen/ui/views/test_results.py
+++ b/testgen/ui/views/test_results.py
@@ -5,13 +5,11 @@
from io import BytesIO
from itertools import zip_longest
from operator import attrgetter
-from uuid import UUID
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
-from streamlit.delta_generator import DeltaGenerator
import testgen.ui.services.form_service as fm
from testgen.commands.run_rollup_scores import run_test_rollup_scoring_queries
@@ -21,7 +19,7 @@
from testgen.common.models.table_group import TableGroup
from testgen.common.models.test_definition import TestDefinition
from testgen.common.models.test_run import TestRun
-from testgen.common.models.test_suite import TestSuite
+from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal
from testgen.ui.components import widgets as testgen
from testgen.ui.components.widgets.download_dialog import (
FILE_DATA_TYPE,
@@ -61,9 +59,9 @@ def render(
self,
run_id: str,
status: str | None = None,
- test_type: str | None = None,
table_name: str | None = None,
column_name: str | None = None,
+ test_type: str | None = None,
action: str | None = None,
**_kwargs,
) -> None:
@@ -95,6 +93,12 @@ def render(
testgen.flex_row_end(actions_column)
testgen.flex_row_end(export_button_column)
+ filters_changed = False
+ current_filters = (status, table_name, column_name, test_type, action)
+ if st.session_state.get("test_results:filters") != current_filters:
+ filters_changed = True
+ st.session_state["test_results:filters"] = current_filters
+
with summary_column:
tests_summary = get_test_result_summary(run_id)
testgen.summary_bar(items=tests_summary, height=20, width=800)
@@ -175,8 +179,10 @@ def render(
sorting_columns = testgen.sorting_selector(sortable_columns, default)
with actions_column:
- str_help = "Toggle on to perform actions on multiple results"
- do_multi_select = st.toggle("Multi-Select", help=str_help)
+ multi_select = st.toggle(
+ "Multi-Select",
+ help="Toggle on to perform actions on multiple results",
+ )
match status:
case None:
@@ -186,22 +192,80 @@ def render(
case _:
status = [status]
- # Display main grid and retrieve selection
- selected = show_result_detail(
- run_id,
- run_date,
- run.test_suite_id,
- export_button_column,
- session.auth.user_has_permission("edit"),
- status,
- test_type,
- table_name,
- column_name,
- action,
- sorting_columns,
- do_multi_select,
+ with st.container():
+ with st.spinner("Loading data ..."):
+ # Retrieve test results (always cached, action as null)
+ df = test_result_queries.get_test_results(
+ run_id, status, test_type, table_name, column_name, action, sorting_columns
+ )
+ # Retrieve disposition action (cache refreshed)
+ df_action = get_test_disposition(run_id)
+ # Update action from disposition df
+ action_map = df_action.set_index("id")["action"].to_dict()
+ df["action"] = df["test_result_id"].map(action_map).fillna(df["action"])
+
+ test_suite = TestSuite.get_minimal(run.test_suite_id)
+ table_group = TableGroup.get_minimal(test_suite.table_groups_id)
+
+ selected, selected_row = fm.render_grid_select(
+ df,
+ [
+ "table_name",
+ "column_names",
+ "test_name_short",
+ "result_measure",
+ "measure_uom",
+ "result_status",
+ "action",
+ "result_message",
+ ],
+ [
+ "Table",
+ "Columns/Focus",
+ "Test Type",
+ "Result Measure",
+ "Unit of Measure",
+ "Status",
+ "Action",
+ "Details",
+ ],
+ id_column="test_result_id",
+ selection_mode="multiple" if multi_select else "single",
+ reset_pagination=filters_changed,
+ bind_to_query=True,
)
+ popover_container = export_button_column.empty()
+
+ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
+ # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849
+ with popover_container.container():
+ flex_row_end()
+ st.button(label="Export", icon=":material/download:", disabled=True)
+
+ download_dialog(
+ dialog_title="Download Excel Report",
+ file_content_func=get_excel_report_data,
+ args=(test_suite.test_suite, table_group.table_group_schema, run_date, run_id, data),
+ )
+
+ with popover_container.container(key="tg--export-popover"):
+ flex_row_end()
+ with st.popover(label="Export", icon=":material/download:", help="Download test results to Excel"):
+ css_class("tg--export-wrapper")
+ st.button(label="All tests", type="tertiary", on_click=open_download_dialog)
+ st.button(label="Filtered tests", type="tertiary", on_click=partial(open_download_dialog, df))
+ if selected:
+ st.button(
+ label="Selected tests",
+ type="tertiary",
+ on_click=partial(open_download_dialog, pd.DataFrame(selected)),
+ )
+
# Need to render toolbar buttons after grid, so selection status is maintained
affected_cached_functions = [get_test_disposition, test_result_queries.get_test_results]
@@ -238,10 +302,17 @@ def render(
with score_column:
render_score(run.project_code, run_id)
+ if selected:
+ render_selected_details(
+ selected,
+ selected_row,
+ test_suite,
+ session.auth.user_has_permission("edit"),
+ multi_select,
+ )
+
# Help Links
- st.markdown(
- "[Help on Test Types](https://docs.datakitchen.io/article/dataops-testgen-help/testgen-test-types)"
- )
+ st.markdown("[Help on Test Types](https://docs.datakitchen.io/article/dataops-testgen-help/testgen-test-types)")
@st.fragment
@@ -366,7 +437,7 @@ def get_test_result_summary(test_run_id: str) -> list[dict]:
]
-def show_test_def_detail(test_definition_id: str, test_suite: TestSuite):
+def show_test_def_detail(test_definition_id: str, test_suite: TestSuiteMinimal):
def readable_boolean(v: bool):
return "Yes" if v else "No"
@@ -431,128 +502,51 @@ def readable_boolean(v: bool):
)
-def show_result_detail(
- run_id: str,
- run_date: str,
- test_suite_id: UUID,
- export_container: DeltaGenerator,
+@with_database_session
+def render_selected_details(
+ selected_rows: list[dict],
+ selected_item: dict,
+ test_suite: TestSuiteMinimal,
user_can_edit: bool,
- test_statuses: list[str] | None = None,
- test_type_id: str | None = None,
- table_name: str | None = None,
- column_name: str | None = None,
- action: typing.Literal["Confirmed", "Dismissed", "Muted", "No Action"] | None = None,
- sorting_columns: list[str] | None = None,
- do_multi_select: bool = False,
-):
- with st.container():
- with st.spinner("Loading data ..."):
- # Retrieve test results (always cached, action as null)
- df = test_result_queries.get_test_results(run_id, test_statuses, test_type_id, table_name, column_name, action, sorting_columns)
- # Retrieve disposition action (cache refreshed)
- df_action = get_test_disposition(run_id)
- # Update action from disposition df
- action_map = df_action.set_index("id")["action"].to_dict()
- df["action"] = df["test_result_id"].map(action_map).fillna(df["action"])
-
- # Update action from disposition df
- action_map = df_action.set_index("id")["action"].to_dict()
- df["action"] = df["test_result_id"].map(action_map).fillna(df["action"])
-
- test_suite = TestSuite.get_minimal(test_suite_id)
- table_group = TableGroup.get_minimal(test_suite.table_groups_id)
-
- lst_show_columns = [
- "table_name",
- "column_names",
- "test_name_short",
- "result_measure",
- "measure_uom",
- "result_status",
- "action",
- "result_message",
- ]
-
- lst_show_headers = [
- "Table",
- "Columns/Focus",
- "Test Type",
- "Result Measure",
- "Unit of Measure",
- "Status",
- "Action",
- "Details",
- ]
-
- selected_rows = fm.render_grid_select(
- df,
- lst_show_columns,
- do_multi_select=do_multi_select,
- show_column_headers=lst_show_headers,
- bind_to_query_name="selected",
- bind_to_query_prop="test_result_id",
- )
-
- popover_container = export_container.empty()
-
- def open_download_dialog(data: pd.DataFrame | None = None) -> None:
- # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849
- with popover_container.container():
- flex_row_end()
- st.button(label="Export", icon=":material/download:", disabled=True)
-
- download_dialog(
- dialog_title="Download Excel Report",
- file_content_func=get_excel_report_data,
- args=(test_suite.test_suite, table_group.table_group_schema, run_date, run_id, data),
- )
-
- with popover_container.container(key="tg--export-popover"):
- flex_row_end()
- with st.popover(label="Export", icon=":material/download:", help="Download test results to Excel"):
- css_class("tg--export-wrapper")
- st.button(label="All tests", type="tertiary", on_click=open_download_dialog)
- st.button(label="Filtered tests", type="tertiary", on_click=partial(open_download_dialog, df))
- if selected_rows:
- st.button(label="Selected tests", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected_rows)))
-
- # Display history and detail for selected row
+ multi_select: bool = False,
+) -> None:
if not selected_rows:
st.markdown(":orange[Select a record to see more information.]")
else:
- selected_row = selected_rows[0]
- dfh = test_result_queries.get_test_result_history(selected_row)
- show_hist_columns = ["test_date", "threshold_value", "result_measure", "result_status"]
-
- time_columns = ["test_date"]
- date_service.accommodate_dataframe_to_timezone(dfh, st.session_state, time_columns)
-
pg_col1, pg_col2 = st.columns([0.5, 0.5])
with pg_col2:
v_col1, v_col2, v_col3, v_col4 = st.columns([.25, .25, .25, .25])
- if user_can_edit:
- view_edit_test(v_col1, selected_row["test_definition_id_current"])
-
- if selected_row["test_scope"] == "column":
- with v_col2:
- view_profiling_button(
- selected_row["column_names"],
- selected_row["table_name"],
- selected_row["table_groups_id"],
- )
- with v_col3:
- if st.button(
- ":material/visibility: Source Data", help="View current source data for highlighted result",
- use_container_width=True
- ):
- MixpanelService().send_event(
- "view-source-data",
- page=PAGE_PATH,
- test_type=selected_row["test_name_short"],
- )
- source_data_dialog(selected_row)
+ if selected_item:
+ dfh = test_result_queries.get_test_result_history(selected_item)
+ show_hist_columns = ["test_date", "threshold_value", "result_measure", "result_status"]
+
+ time_columns = ["test_date"]
+ date_service.accommodate_dataframe_to_timezone(dfh, st.session_state, time_columns)
+
+ if user_can_edit:
+ view_edit_test(v_col1, selected_item["test_definition_id_current"])
+
+ if selected_item["test_scope"] == "column":
+ with v_col2:
+ view_profiling_button(
+ selected_item["column_names"],
+ selected_item["table_name"],
+ selected_item["table_groups_id"],
+ )
+
+ with v_col3:
+ if st.button(
+ ":material/visibility: Source Data", help="View current source data for highlighted result",
+ use_container_width=True
+ ):
+ MixpanelService().send_event(
+ "view-source-data",
+ page=PAGE_PATH,
+ test_type=selected_item["test_name_short"],
+ )
+ source_data_dialog(selected_item)
with v_col4:
@@ -561,7 +555,7 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
if row["result_status"] != "Passed" and row["disposition"] in (None, "Confirmed")
]
- if do_multi_select:
+ if multi_select:
report_btn_help = (
"Generate PDF reports for the selected results that are not muted or dismissed and are not Passed"
)
@@ -594,21 +588,21 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None:
)
download_dialog(dialog_title=dialog_title, file_content_func=zip_func)
- with pg_col1:
- fm.show_subheader(selected_row["test_name_short"])
- st.markdown(f"###### {selected_row['test_description']}")
- st.caption(empty_if_null(selected_row["measure_uom_description"]))
- fm.render_grid_select(dfh, show_hist_columns, selection_mode="disabled")
- with pg_col2:
- ut_tab1, ut_tab2 = st.tabs(["History", "Test Definition"])
- with ut_tab1:
- if dfh.empty:
- st.write("Test history not available.")
- else:
- write_history_graph(dfh)
- with ut_tab2:
- show_test_def_detail(selected_row["test_definition_id_current"], test_suite)
- return selected_rows
+ if selected_item:
+ with pg_col1:
+ fm.show_subheader(selected_item["test_name_short"])
+ st.markdown(f"###### {selected_item['test_description']}")
+ st.caption(empty_if_null(selected_item["measure_uom_description"]))
+ fm.render_grid_select(dfh, show_hist_columns, selection_mode="disabled", key="test_history")
+ with pg_col2:
+ ut_tab1, ut_tab2 = st.tabs(["History", "Test Definition"])
+ with ut_tab1:
+ if dfh.empty:
+ st.write("Test history not available.")
+ else:
+ write_history_graph(dfh)
+ with ut_tab2:
+ show_test_def_detail(selected_item["test_definition_id_current"], test_suite)
@with_database_session
From c83787d02918381a9987c9c530eb783292b19bd0 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Wed, 24 Sep 2025 12:36:10 -0400
Subject: [PATCH 20/48] refactor(run-pages): move filters and pagination to
vanjs
---
.../frontend/js/components/select.js | 6 +-
.../frontend/js/pages/profiling_runs.js | 336 +++++++++++------
.../components/frontend/js/pages/test_runs.js | 341 +++++++++++++-----
.../frontend/js/pages/test_suites.js | 11 +-
testgen/ui/views/profiling_runs.py | 156 +++-----
testgen/ui/views/test_runs.py | 190 +++-------
6 files changed, 588 insertions(+), 452 deletions(-)
diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js
index 967aee4e..72bb11cc 100644
--- a/testgen/ui/components/frontend/js/components/select.js
+++ b/testgen/ui/components/frontend/js/components/select.js
@@ -1,5 +1,5 @@
/**
- * @typedef Option
+ * @typedef SelectOption
* @type {object}
* @property {string} label
* @property {string} value
@@ -11,7 +11,7 @@
* @property {string?} id
* @property {string} label
* @property {string?} value
- * @property {Array.