Skip to content

Synacomex 2020 Importers Unmask Cleaned

s3://trase-storage/brazil/trade/traders/synacomex_2020_importers_unmask_cleaned.csv

Dbt path: trase_production.main_brazil.synacomex_2020_importers_unmask_cleaned

Explore on Metabase: Full table; summary statistics

Containing yaml file link: trase/data_pipeline/models/brazil/trade/traders/_schema.yml

Model file link: trase/data_pipeline/models/brazil/trade/traders/synacomex_2020_importers_unmask_cleaned.py

Calls script: trase/data/brazil/trade/traders/synacomex_20xx_importers_unmask_cleaned.py

Dbt test runs & lineage: Test results · Lineage

Full dbt_docs page: Open in dbt docs (includes lineage graph -at the bottom right-, tests, and downstream dependencies)

Tags: mock_model, brazil, trade, traders


synacomex_2020_importers_unmask_cleaned

Description

This model was auto-generated based off .yml 'lineage' files in S3. The DBT model just raises an error; the actual script that created the data lives elsewhere. The script is located at trase/data/brazil/trade/traders/synacomex_20xx_importers_unmask_cleaned.py [permalink]. It was last run by Harry Biddle.


Details

Column Type Description

Models / Seeds

  • source.trase_duckdb.trase-storage-raw.synacomex_2019_importers_unmask
  • source.trase_duckdb.trase-storage-raw.synacomex_2020_importers_unmask

Sources

  • ['trase-storage-raw', 'synacomex_2019_importers_unmask']
  • ['trase-storage-raw', 'synacomex_2020_importers_unmask']
from psycopg2 import sql
import unicodedata

from unidecode import unidecode

from trase.tools import get_country_id
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.aws.metadata import write_csv_for_upload
from trase.tools.pandasdb.find import find_traders_and_groups_by_label


def clean_string(text, upper=True):
    """
    Take a string and clean it!

    - Remove double-whitespace
    - Remove tab, newline, return, formfeed, etc.
    - Replace accented characters (e.g. ö becomes o)
    - Trim leading and trailing whitespace
    - Convert to upper-case
    """

    def keep(character):
        category = unicodedata.category(character)
        return (
            category[0] != "C"  # ignore control characters
            and category != "Zl"  # ignore line separator
            and category != "Zp"  # ignore paragraph separator
            and category != "Pe"  # ignore parentheses left
            and category != "Ps"  # ignore parentheses right
            and category != "Po"  # ignore asterisk
        )

    text = "".join(c for c in text if keep(c))
    text = " ".join(text.split())
    text = unidecode(text)
    return text.upper() if upper else text


def download_and_clean(s3_key):
    df = get_pandas_df_once(
        s3_key,
        encoding="latin-1",
        sep=",",
        dtype=str,
        keep_default_na=False,
    )
    country_id = get_country_id("BRAZIL")
    for column, group_column in [
        ("EXPORTER_SYNACOMEX", "EXPORTER_GROUP_SYNACOMEX"),
        ("EXPORTER_BOL", "EXPORTER_GROUP_BOL"),
        ("IMPORTER_SYNACOMEX", "IMPORTER_GROUP_SYNACOMEX"),
        ("IMPORTER_BOL", "IMPORTER_GROUP_BOL"),
    ]:
        label_column = f"{column}_LABEL"
        df[label_column] = df[column].apply(clean_string)
        df[[column, group_column, "count"]] = find_traders_and_groups_by_label(
            df,
            trader_label=sql.Identifier(f"{column}_LABEL"),
            country_id=sql.Literal(country_id),
            returning=["trader_name", "group_name", "count"],
            year=sql.Literal(2019),
            on_extra_columns="ignore",
        )
        bad = df["count"] != 1
        if any(bad):
            labels = df[[label_column, "count"]][bad].drop_duplicates()
            raise ValueError(
                f"Missing some {column}, add and re-run this script:\n{labels}"
            )
        df.pop("count")
        df.pop(label_column)

    return df


# 2019
df = download_and_clean("brazil/trade/traders/Synacomex_2019_Importers_Unmask.csv")
write_csv_for_upload(
    df, "brazil/trade/traders/synacomex_2019_importers_unmask_cleaned.csv"
)


# 2019
df = download_and_clean("brazil/trade/traders/Synacomex_2020_Importers_Unmask.csv")
write_csv_for_upload(
    df, "brazil/trade/traders/synacomex_2020_importers_unmask_cleaned.csv"
)
import pandas as pd


def model(dbt, cursor):
    dbt.source("trase-storage-raw", "synacomex_2019_importers_unmask")
    dbt.source("trase-storage-raw", "synacomex_2020_importers_unmask")

    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})