DBT: Brazil Bol 2014
File location: s3://trase-storage/brazil/trade/bol/2014/BRAZIL_BOL_2014.csv
DBT model name: brazil_bol_2014
Explore on Metabase: Full table; summary statistics
DBT details
- Lineage
-
Dbt path:
trase_production.main_brazil.brazil_bol_2014 -
Containing yaml link: trase/data_pipeline/models/brazil/trade/bol/2014/_schema.yml
-
Model file: trase/data_pipeline/models/brazil/trade/bol/2014/brazil_bol_2014.py
-
Calls script:
trase/data/brazil/trade/bol/2014/BRAZIL_BOL_2014.py -
Tags:
mock_model,2014,bol,brazil,trade
Description
This model was auto-generated based off .yml 'lineage' files in S3. The DBT model just raises an error; the actual script that created the data lives elsewhere. The script is located at trase/data/brazil/trade/bol/2014/BRAZIL_BOL_2014.py [permalink]. It was last run by Harry Biddle.
Details
| Column | Type | Description |
|---|---|---|
Models / Seeds
source.trase_duckdb.trase-storage-raw.university of wisconsin - madison jan 14 - dec14 brazil exp order 0061400001aszqd _rerun
Sources
['trase-storage-raw', 'university of wisconsin - madison jan 14 - dec14 brazil exp order 0061400001aszqd _rerun']
import pandas as pd
from trase.tools import uses_database
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.aws.metadata import write_csv_for_upload
from trase.tools.sei_pcs.pandas_utilities import append, full_merge
from trase.tools.utilities.helpers import clean_string
COLUMNS = {
"DIRECTION": "DIRECTION", # 1=Export; 2=Imports
"LOCAL ORIGIN/DEST": "origin.name", # Is the actual Origin or Final Destination of a Cargo
"CODE LOCAL ORIGIN/DEST": "origin.code", # Code for the actual Origin or Final Destination of a Cargo
"LOCAL PORT": "port_of_export.label", # Port in Brazil that has being used
"TRANSHIPPORT": "port_of_transhipment.label", # Port of Transhipment
"FOREIGN PORT": "port_of_import.label", # Foreign Port
"CODE FOREIGN PORT": "port_of_import.code", # Code of the foreign Port
"CODE COUNTRY": "country_of_destination.trase_id", # Code for the country
"FOREIGN ORIGIN/DEST PLACE": "destination.label", # Foreign Origin or Destination
"CODE FOREIGN ORIGIN/DEST PLACE": "destination.code", # Code - Foreign Origin or Destination
"TRADELANE": "tradelane", # Route used by merchant ships
"CARRIER": "carrier", # Shipping line
"VESSEL": "vessel", # Name of the Freight Ship
"ARRDATE": "date", # Arrival /Departure Date
# "MONTH & YEAR": "", # Arrival /Departure Month & Yr
"NCM/HS": "hs8", # 8 Digits HSCODE
# "COMMODITY DESCRIPTION": "", # Harmonized Commodity Description
"CARGO TYPE": "cargo_type", # 12=Container; 11=Breakbulk; 21=Liquid Bulk; 22=DryBulk
"LOCAL TRADER 1": "exporter", # Brazil Company (Exporter-Importer)
"LOCAL TRADER 2": "owner", # Brazil Real Owner of the cargo (Importer - Exporter)
"STREET": "exporter.address", # Brazil Importer - Exporter Address
"CITY": "exporter.municipality.label", # Brazil Importer - Exporter City
"STATE CODE": "exporter.state.code", # Brazil Importer - Exporter State
"ZIP CODE": "exporter.zip_code", # Brazil Importer - Exporter Zip Code
"FOREIGN TRADER": "importer", # Foreign Company (Importer/Exporter)
"QTY OF PACKAGES": "quantity", # Quatity of Packaging
"PACKAGE TYPE": "unit", # Unit of Measure
"WEIGHT (KG)": "vol", # Total Weight in KG
"CNT TYPE": "container_type", # R=Reefer; T=Tankers; D=Dry bulk; CNT=Container
"20' CNT": "number_twenty_foot_containers", # Quantity containers 20'
"40' CNT": "number_forty_foot_containers", # Quantity containers 40'
"TEU": "total_teus", # Total of the Teus
}
STATE_TRASE_IDS = {
"RO": "BR-11",
"AC": "BR-12",
"AM": "BR-13",
"RR": "BR-14",
"PA": "BR-15",
"AP": "BR-16",
"TO": "BR-17",
"MA": "BR-21",
"PI": "BR-22",
"CE": "BR-23",
"RN": "BR-24",
"PB": "BR-25",
"PE": "BR-26",
"AL": "BR-27",
"SE": "BR-28",
"BA": "BR-29",
"MG": "BR-31",
"ES": "BR-32",
"RJ": "BR-33",
"SP": "BR-35",
"PR": "BR-41",
"SC": "BR-42",
"RS": "BR-43",
"MS": "BR-50",
"MT": "BR-51",
"GO": "BR-52",
"DF": "BR-53",
"": "BR-XX",
}
UNKNOWN_MUNICIPALITIES = [
("", "BR-26"),
("", "BR-31"),
("", "BR-33"),
("", "BR-35"),
("", "BR-41"),
("", "BR-42"),
("", "BR-51"),
("", "BR-53"),
("", "BR-XX"),
("BAGE RIO GRANDE DO SUL", "BR-43"),
("BALSA NOVA", "BR-XX"),
("CABO", "BR-26"),
("CORDEIROPOLIS", "BR-XX"),
("CRUZ DAS ALMAS", "BR-35"),
("CURITIBA", "BR-XX"),
("FORTALEZA", "BR-XX"),
("GUAICARA", "BR-XX"),
("ITABUNA", "BR-15"),
("ITAJI", "BR-42"),
("JARU", "BR-15"),
("MOGI GUACO", "BR-35"),
("MUNICIPIO", "BR-35"),
("NOVO HAMBURGO", "BR-35"),
("OURO FINO", "BR-XX"),
("PAICANDU", "BR-XX"),
("PARANA", "BR-41"),
("PORT ALERGE", "BR-43"),
("RIO GRANDE DO SUL", "BR-43"),
("SANTA MARIA", "BR-XX"),
("SAO PAULO", "BR-XX"),
("SAPIRANGA", "BR-23"),
("VALPARAISO", "BR-XX"),
("VICTORIA", "BR-32"),
("VITORIA", "BR-42"),
("ZONA RURAL", "BR-35"),
]
def clean_states(df):
df["exporter.state.code"] = df["exporter.state.code"].apply(clean_string)
df["exporter.state.trase_id"] = df["exporter.state.code"].map(STATE_TRASE_IDS)
assert not any(df["exporter.state.trase_id"].isna())
def clean_hs_codes(df):
hs8 = df["hs8"].astype(str).str.rjust(8, "0")
assert all(hs8.str.len() == 8)
return df.assign(hs8=hs8, hs6=hs8.str.slice(0, 6), hs4=hs8.str.slice(0, 4))
@uses_database
def clean_countries(df, cnx=None):
df_country_labels = pd.read_sql(
"""
select distinct
name as "country_of_destination.name",
trase_id as "country_of_destination.trase_id"
from views.regions where level = 1 and length(trase_id) = 2
""",
cnx.cnx,
)
df.loc[
df["country_of_destination.trase_id"] == "ZZ", "country_of_destination.trase_id"
] = "XX"
return full_merge(
df,
df_country_labels,
on="country_of_destination.trase_id",
validate="many_to_one",
how="left",
)
@uses_database
def clean_ports(df, cnx=None):
df_port_labels = pd.read_sql(
"""
select distinct
name as "port_of_export.name",
unnest(synonyms) as "port_of_export.label"
from views.regions where region_type = 'PORT' and country = 'BRAZIL'
""",
cnx.cnx,
)
return full_merge(
df,
df_port_labels,
on="port_of_export.label",
validate="many_to_one",
how="left",
)
@uses_database
def clean_municipalities(df, cnx=None):
df_municipalities = pd.read_sql(
"""
select distinct
name as "exporter.municipality.name",
unnest(synonyms) as "exporter.municipality.label",
trase_id as "exporter.municipality.trase_id",
substring(trase_id, 0, 6) as "exporter.state.trase_id"
from views.regions
where country = 'BRAZIL' and region_type = 'MUNICIPALITY'
""",
cnx.cnx,
)
df_municipalities = append(
df_municipalities,
[
{
"exporter.municipality.name": "UNKNOWN",
"exporter.municipality.label": label,
"exporter.municipality.trase_id": "BR-XXXXXX",
"exporter.state.trase_id": state_trase_id,
}
for (label, state_trase_id) in UNKNOWN_MUNICIPALITIES
],
)
return full_merge(
df,
df_municipalities,
left_on=["exporter.municipality.label", "exporter.state.trase_id"],
right_on=["exporter.municipality.label", "exporter.state.trase_id"],
how="left",
validate="many_to_one",
)
def main():
df = get_pandas_df_once(
"brazil/trade/bol/2014/originals/University of Wisconsin - Madison Jan 14 - Dec14 Brazil Exp Order 0061400001AszqD _Rerun.xlsx",
xlsx=True,
keep_default_na=False,
)
df = df[COLUMNS].rename(columns=COLUMNS, errors="raise")
assert all(df.pop("DIRECTION") == 1)
date = df.pop("date").astype(str)
df["year"] = date.str.slice(0, 4).astype(int)
df["month"] = date.str.slice(4, 6).astype(int)
df["day"] = date.str.slice(6, 8).astype(int)
df = clean_hs_codes(df)
df = clean_countries(df)
df = clean_ports(df)
clean_states(df)
df = clean_municipalities(df)
write_csv_for_upload(df, "brazil/trade/bol/2014/BRAZIL_BOL_2014.csv")
if __name__ == "__main__":
main()
import pandas as pd
def model(dbt, cursor):
dbt.source(
"trase-storage-raw",
"university of wisconsin - madison jan 14 - dec14 brazil exp order 0061400001aszqd _rerun",
)
raise NotImplementedError()
return pd.DataFrame({"hello": ["world"]})