Brazil Port Urfs
s3://trase-storage/brazil/metadata/brazil_port_urfs.parquet
Dbt path: trase_production.main_brazil.brazil_port_urfs
Explore on Metabase: Full table; summary statistics
Containing yaml file link: trase/data_pipeline/models/brazil/metadata/_schema.yml
Model file link: trase/data_pipeline/models/brazil/metadata/brazil_port_urfs.py
Calls script: trase/data/brazil/metadata/brazil_port_urfs.py
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph — at the bottom right — plus tests and downstream dependencies)
Tags: brazil, rfb, urf
brazil_port_urfs
Description
Brazil URF (Unidade de Receita Federal)
A Unidade de Receita Federal (URF) is a "Customs" or "Federal Revenue" Unit in Brazil. These units are located at ports, airports, and border crossings to oversee customs clearance, taxation, and trade compliance.
How to re-fetch the dataset from the original source
The file is downloaded ad-hoc from https://www.gov.br/mdic/pt-br/assuntos/comercio-exterior/estatisticas/base-de-dados-bruta. Search for the hyperlink "URF (Unidade da RFB)", under 5. Tabelas de Correlações de Códigos e Classificações. The file is then converted to UTF-8 encoding before upload:
iconv -f latin1 -t utf8 URF.csv > URF_utf8.csv
History of changes to the dataset
- February 2024: since last retrieval (of unknown date), three entries were added:
0710252 - IRF MACAE, 0710304 - ITAGUAI, and 0811400 - SANTO ANDRE.
Details
| Column | Type | Description |
|---|---|---|
| urf_code | VARCHAR | |
| label | VARCHAR | |
| name | VARCHAR | |
| trase_id | VARCHAR | |
| found_in_database | BOOLEAN | |
Models / Seeds
source.trase_duckdb.trase-storage-raw.brazil_rfb_urf_retrieved_2025_02_15
model.trase_duckdb.postgres_regions_without_geometry
Sources
['trase-storage-raw', 'brazil_rfb_urf_retrieved_2025_02_15']
import sys
import numpy as np
import pandas as pd
from trase.tools import uses_database
from trase.tools.aws import get_pandas_df
from trase.tools.aws.metadata import write_parquet_for_upload
from trase.tools.utilities.helpers import clean_string
def get_regions(cnx):
    """Return ``views.regions`` as a DataFrame, excluding the geometry column.

    The ``_geometry`` column is dropped because it is large and not needed for
    the URF-to-port matching done downstream.

    :param cnx: project database connection wrapper; its ``.cnx`` attribute is
        the DB-API connection handed to pandas (assumed Postgres — it exposes
        ``information_schema``).
    :raises AssertionError: if ``views.regions`` has no ``_geometry`` column,
        which would indicate an unexpected schema change.
    """
    # Discover the table's columns so we can select everything but _geometry.
    columns_df = pd.read_sql(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = 'views' AND table_name = 'regions'
        """,
        cnx.cnx,
    )
    columns = list(columns_df["column_name"].values)
    assert "_geometry" in columns
    columns.remove("_geometry")
    # Column names come from information_schema, not user input, so direct
    # interpolation into the query is safe here.
    return pd.read_sql(f"select {', '.join(columns)} from views.regions", cnx.cnx)
def clean_urf_name(urf: str):
    """Strip the numeric code prefix from a raw URF label.

    For example:
    "0140100 - CAMPO GRANDE" becomes "CAMPO GRANDE"
    "0147600 - ALF - CORUMBÁ" becomes "CORUMBA"

    A couple of labels contain " - " inside the name itself and are returned
    verbatim (without their code prefix) instead of being split.
    """
    verbatim_suffixes = (
        "IRF/BELO HORIZONTE - MG",
        "SEFAZ-SC - CONVENIO CONSULTA SISTEMA CARGA",
    )
    for suffix in verbatim_suffixes:
        if urf.endswith(suffix):
            return suffix
    # rpartition keeps only the text after the last " - "; when the separator
    # is absent, the tail is the whole string (same as split(" - ")[-1]).
    _, _, tail = urf.rpartition(" - ")
    return clean_string(tail).upper()
def rename_and_split_columns(df):
    """Rename the raw CSV columns in place: CO_URF -> urf_code, NO_URF -> label.

    Mutates ``df`` (removes the raw columns, appends the cleaned ones) and
    returns ``None``. The label column is normalised via ``clean_urf_name``.
    """
    raw_codes = df.pop("CO_URF")
    raw_labels = df.pop("NO_URF")
    df["urf_code"] = raw_codes
    df["label"] = raw_labels.map(clean_urf_name)
def get_port_names_from_dataframe(df_regions):
# Filter the dataframe for the required conditions
filtered_df = df_regions[
(df_regions["region_type"] == "PORT") & (df_regions["country"] == "BRAZIL")
]
# Unnest the synonyms and create a new dataframe
unnested_df = filtered_df.explode("synonyms")
# Select the distinct columns and coalesce trase_id
df = unnested_df[["name", "synonyms", "trase_id"]].drop_duplicates()
df["trase_id"] = df["trase_id"].fillna("")
# Rename the columns to match the expected output
df = df.rename(columns={"synonyms": "label"}, errors="raise")
return df
def clean_port_names(df, df_regions):
    """Attach canonical port ``name``/``trase_id`` to each URF row by label.

    Left-joins the URF rows against the Brazilian port lookup built from
    ``df_regions``. Rows whose label has no match in the database are reported
    on stderr (the pipeline continues anyway), get an empty ``trase_id``, and
    fall back to using their label as the name. Adds a boolean
    ``found_in_database`` column recording match status.
    """
    port_lookup = get_port_names_from_dataframe(df_regions)
    merged = df.merge(
        port_lookup,
        on="label",
        how="left",
        validate="many_to_one",
        indicator=True,
    )
    merged["found_in_database"] = merged.pop("_merge") == "both"
    unmatched = ~merged["found_in_database"]
    if unmatched.any():
        # Surface the unmatched rows for the operator but do not fail the run.
        print("Missing some port names! Continuing anyway:", file=sys.stderr)
        with pd.option_context("display.max_columns", 999):
            print(merged.loc[unmatched], file=sys.stderr)
    merged.loc[unmatched, "trase_id"] = ""
    # Fall back to the label where no canonical name was found.
    merged["name"] = np.where(unmatched, merged["label"], merged["name"])
    return merged
def clean_urf(df, df_regions):
    """Transform the raw URF table into the cleaned port-URF output.

    :param df: raw URF DataFrame with at least CO_URF and NO_URF columns.
    :param df_regions: regions DataFrame used to resolve port names/trase ids.
    :return: DataFrame with urf_code, label, name, trase_id, found_in_database.
    """
    # .copy() is required: rename_and_split_columns mutates its argument in
    # place, and mutating a column-selection of the caller's frame would raise
    # SettingWithCopyWarning (and silently not propagate under copy-on-write).
    df = df[["CO_URF", "NO_URF"]].copy()
    rename_and_split_columns(df)
    df = clean_port_names(df, df_regions)
    return df
def get_urf():
    """Fetch the raw URF CSV from storage as an all-string DataFrame.

    Reads everything as ``str`` with default-NA parsing disabled so codes keep
    their leading zeros and no value is coerced to NaN.
    """
    csv_options = {
        "encoding": "latin-1",
        "sep": ";",
        "dtype": str,
        "keep_default_na": False,
    }
    return get_pandas_df("brazil/metadata/originals/URF.csv", **csv_options)
@uses_database
def main(cnx=None):
    """Run the pipeline: fetch raw URFs, clean them, write the parquet output.

    :param cnx: database connection injected by the ``uses_database`` decorator.
    """
    raw_urfs = get_urf()
    regions = get_regions(cnx=cnx)
    cleaned = clean_urf(raw_urfs, regions)
    write_parquet_for_upload(cleaned, "brazil/metadata/brazil_port_urfs.parquet")


if __name__ == "__main__":
    main()
from trase.data.brazil.metadata.brazil_port_urfs import clean_urf
def model(dbt, cursor):
    """dbt Python model producing ``brazil_port_urfs``.

    Loads the raw URF source and the geometry-free regions model, then reuses
    the pipeline's ``clean_urf`` transformation to build the output table.
    """
    dbt.config(materialized="external")
    raw_source = dbt.source("trase-storage-raw", "brazil_rfb_urf_retrieved_2025_02_15")
    raw_df = raw_source.to_df()
    regions_df = dbt.ref("postgres_regions_without_geometry").to_df()
    return clean_urf(raw_df, regions_df)