DBT Macro: Test Synonym Check
DBT macro name: trase_duckdb.test_synonym_check
DBT details
Description
No description
Details
{% test synonym_check(
model,
column_name,
synonym_table,
official_name_field,
reference_synonym_field,
reference_official_name_field,
where_from=None,
where_to=None,
check_trimmed_official_names=False,
return_reference_official_names=True
) %}
with model_data as (
select
{{ column_name }} as synonym,
trim(lower({{ column_name }})) as synonym_normalized,
{{ official_name_field }} as official_name,
trim(lower({{ official_name_field }})) as official_name_normalized,
count(*) as occurrences
from {{ model }}
where {{ column_name }} is not null
{% if where_from %}
and {{ where_from }}
{% endif %}
group by 1, 2, 3, 4
),
reference_data as (
select
t.synonym,
trim(lower(t.synonym)) as synonym_normalized,
{{ reference_official_name_field }} as official_name,
trim(lower({{ reference_official_name_field }})) as official_name_normalized
from {{ synonym_table }}
{% if where_to %}
where {{ where_to }}
{% endif %}
cross join unnest({{ reference_synonym_field }}) as t(synonym)
),
reference_grouped as (
select
synonym_normalized,
list(distinct official_name order by official_name) as reference_official_names,
list(distinct official_name_normalized order by official_name_normalized) as reference_official_names_normalized,
count(distinct official_name) as reference_official_name_count
from reference_data
group by 1
)
select
m.synonym as model_synonym,
m.official_name as model_official_name,
m.occurrences as model_occurrences,
rg.reference_official_name_count,
{% if return_reference_official_names %}
rg.reference_official_names,
{% endif %}
case
when rg.reference_official_name_count is null then 'synonym not found in reference'
when {% if check_trimmed_official_names %}
not list_contains(rg.reference_official_names_normalized, m.official_name_normalized)
{% else %}
not list_contains(rg.reference_official_names, m.official_name)
{% endif %}
then 'official name does not match reference'
end as error_reason
from model_data m
left join reference_grouped rg
on m.synonym_normalized = rg.synonym_normalized
where rg.reference_official_name_count is null
or {% if check_trimmed_official_names %}
not list_contains(rg.reference_official_names_normalized, m.official_name_normalized)
{% else %}
not list_contains(rg.reference_official_names, m.official_name)
{% endif %}
order by m.occurrences desc
{% endtest %}