Skip to content

Brazil Port Urfs

s3://trase-storage/brazil/metadata/brazil_port_urfs.parquet

Dbt path: trase_production.main_brazil.brazil_port_urfs

Explore on Metabase: Full table; summary statistics

Containing yaml file link: trase/data_pipeline/models/brazil/metadata/_schema.yml

Model file link: trase/data_pipeline/models/brazil/metadata/brazil_port_urfs.py

Calls script: trase/data/brazil/metadata/brazil_port_urfs.py

Dbt test runs & lineage: Test results · Lineage

Full dbt_docs page: Open in dbt docs (includes the lineage graph — at the bottom right — plus tests and downstream dependencies)

Tags: brazil, rfb, urf


brazil_port_urfs

Description

Brazil URF (Unidade de Receita Federal)

A Unidade de Receita Federal (URF) is a "Customs" or "Federal Revenue" Unit in Brazil. These units are located at ports, airports, and border crossings to oversee customs clearance, taxation, and trade compliance.

How to re-fetch the dataset from the original source

The file is downloaded ad-hoc from https://www.gov.br/mdic/pt-br/assuntos/comercio-exterior/estatisticas/base-de-dados-bruta. Search for the hyperlink "URF (Unidade da RFB)", under 5. Tabelas de Correlações de Códigos e Classificações. The file is then converted to UTF-8 encoding before upload:

iconv -f latin1 -t utf8 URF.csv > URF_utf8.csv

History of changes to the dataset

  • February 2024: since last retrieval (of unknown date), three entries were added: 0710252 - IRF MACAE, 0710304 - ITAGUAI and 0811400 - SANTO ANDRE.

Details

Column Type Description
urf_code VARCHAR
label VARCHAR
name VARCHAR
trase_id VARCHAR
found_in_database BOOLEAN

Models / Seeds

  • source.trase_duckdb.trase-storage-raw.brazil_rfb_urf_retrieved_2025_02_15
  • model.trase_duckdb.postgres_regions_without_geometry

Sources

  • ['trase-storage-raw', 'brazil_rfb_urf_retrieved_2025_02_15']
import sys

import numpy as np
import pandas as pd


from trase.tools import uses_database
from trase.tools.aws import get_pandas_df
from trase.tools.aws.metadata import write_parquet_for_upload
from trase.tools.utilities.helpers import clean_string


def get_regions(cnx):
    """
    Return ``views.regions`` as a DataFrame, excluding the ``_geometry`` column.

    The column list is read from ``information_schema`` first so the heavy
    geometry blob is never transferred over the wire.

    :param cnx: database handle whose ``.cnx`` attribute is a connection
        object usable by ``pd.read_sql`` — presumably Postgres, given the
        ``information_schema`` query (TODO confirm).
    :return: DataFrame with every ``views.regions`` column except ``_geometry``.
    """
    # get columns of views.regions
    columns_df = pd.read_sql(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = 'views' AND table_name = 'regions'
        """,
        cnx.cnx,
    )
    columns = list(columns_df["column_name"].values)

    # remove _geometry column; list.remove is the idiomatic single-scan
    # replacement for pop(index(...))
    assert "_geometry" in columns
    columns.remove("_geometry")

    # select all other columns
    return pd.read_sql(f"select {', '.join(columns)} from views.regions", cnx.cnx)


def clean_urf_name(urf: str):
    """
    Strip the numeric code prefix from a URF label, keeping only the name.

    For example:

        "0140100 - CAMPO GRANDE"  becomes "CAMPO GRANDE"
        "0147600 - ALF - CORUMBÁ" becomes "CORUMBA"

    """
    # A couple of labels contain " - " inside the name itself; return those
    # known suffixes verbatim instead of splitting them apart.
    special_suffixes = (
        "IRF/BELO HORIZONTE - MG",
        "SEFAZ-SC - CONVENIO CONSULTA SISTEMA CARGA",
    )
    for suffix in special_suffixes:
        if urf.endswith(suffix):
            return suffix
    # General case: everything after the last " - " separator, normalized.
    return clean_string(urf.split(" - ")[-1]).upper()


def rename_and_split_columns(df):
    """
    Mutate *df* in place: move ``CO_URF`` to ``urf_code`` and ``NO_URF``
    (cleaned via ``clean_urf_name``) to ``label``. Returns ``None``.
    """
    df["urf_code"] = df.pop("CO_URF")
    df["label"] = df.pop("NO_URF").map(clean_urf_name)


def get_port_names_from_dataframe(df_regions):
    """
    Build a lookup of Brazilian port synonyms from the regions table.

    :param df_regions: regions DataFrame with at least ``region_type``,
        ``country``, ``name``, ``synonyms`` (list-valued) and ``trase_id``.
    :return: deduplicated DataFrame with columns ``name`` (canonical port
        name), ``label`` (one synonym per row) and ``trase_id`` (empty
        string where missing).
    """
    # Restrict to Brazilian ports only.
    is_brazil_port = (df_regions["region_type"] == "PORT") & (
        df_regions["country"] == "BRAZIL"
    )
    ports = df_regions[is_brazil_port]

    # One row per synonym.
    ports = ports.explode("synonyms")

    # Distinct (name, synonym, trase_id) triples; missing ids become "".
    lookup = ports[["name", "synonyms", "trase_id"]].drop_duplicates()
    lookup["trase_id"] = lookup["trase_id"].fillna("")

    # The synonym column becomes the join key "label".
    return lookup.rename(columns={"synonyms": "label"}, errors="raise")


def clean_port_names(df, df_regions):
    """
    Attach canonical port names and trase_ids to the URF table.

    Left-joins *df* on ``label`` against the port-synonym lookup built from
    *df_regions*. Rows whose label is not found keep the label as the name,
    get an empty ``trase_id``, and are reported to stderr (best-effort: the
    pipeline continues). Adds the boolean column ``found_in_database``.

    :param df: DataFrame with a ``label`` column.
    :param df_regions: regions table passed to ``get_port_names_from_dataframe``.
    :return: merged DataFrame (new object; inputs are not mutated).
    """
    df_port_names = get_port_names_from_dataframe(df_regions)
    df = pd.merge(
        df,
        df_port_names,
        on="label",
        how="left",
        validate="many_to_one",
        indicator=True,
    )
    # if the port name is missing in the database, just use the label
    df["found_in_database"] = df.pop("_merge") == "both"
    missing = ~df["found_in_database"]
    # Series.any() is the idiomatic, vectorized form (any(missing) would
    # iterate the Series element by element in Python).
    if missing.any():
        print("Missing some port names! Continuing anyway:", file=sys.stderr)
        with pd.option_context("display.max_columns", 999):
            print(df.loc[missing], file=sys.stderr)
    df.loc[missing, "trase_id"] = ""
    df["name"] = np.where(missing, df["label"], df["name"])
    return df


def clean_urf(df, df_regions):
    """
    Transform the raw URF table into the published schema.

    :param df: raw URF DataFrame with at least ``CO_URF`` and ``NO_URF``.
    :param df_regions: regions table used to resolve port names.
    :return: DataFrame with columns ``urf_code``, ``label``, ``name``,
        ``trase_id`` and ``found_in_database``.
    """
    # .copy() makes an explicit new frame, so the in-place column edits in
    # rename_and_split_columns cannot trigger pandas' SettingWithCopyWarning
    # or chained-assignment issues on a slice of the caller's DataFrame.
    df = df[["CO_URF", "NO_URF"]].copy()
    rename_and_split_columns(df)
    df = clean_port_names(df, df_regions)
    return df


def get_urf():
    """
    Load the raw ``URF.csv`` from trase-storage as all-string columns.

    The file is latin-1 encoded and semicolon-separated; empty fields are
    kept as empty strings rather than being converted to NaN.
    """
    path = "brazil/metadata/originals/URF.csv"
    return get_pandas_df(
        path,
        sep=";",
        encoding="latin-1",
        dtype=str,
        keep_default_na=False,
    )


@uses_database
def main(cnx=None):
    """Build brazil_port_urfs.parquet and stage it for upload.

    ``cnx`` is injected by the ``uses_database`` decorator.
    """
    df_regions = get_regions(cnx=cnx)
    df_urf = get_urf()
    write_parquet_for_upload(
        clean_urf(df_urf, df_regions),
        "brazil/metadata/brazil_port_urfs.parquet",
    )


if __name__ == "__main__":
    main()
from trase.data.brazil.metadata.brazil_port_urfs import clean_urf


def model(dbt, cursor):
    """dbt Python model: thin wrapper around ``clean_urf``.

    Materialized as an external table; reads the raw URF source and the
    geometry-free regions model, and returns the cleaned URF DataFrame.
    """
    dbt.config(materialized="external")

    raw_urf = dbt.source("trase-storage-raw", "brazil_rfb_urf_retrieved_2025_02_15")
    df_original = raw_urf.to_df()
    df_regions = dbt.ref("postgres_regions_without_geometry").to_df()

    return clean_urf(df_original, df_regions)