Skip to content

DBT: Brazil Bol 2014

File location: s3://trase-storage/brazil/trade/bol/2014/BRAZIL_BOL_2014.csv

DBT model name: brazil_bol_2014

Explore on Metabase: Full table; summary statistics

DBT details


Description

This model was auto-generated based off .yml 'lineage' files in S3. The DBT model just raises an error; the actual script that created the data lives elsewhere. The script is located at trase/data/brazil/trade/bol/2014/BRAZIL_BOL_2014.py [permalink]. It was last run by Harry Biddle.


Details

Column Type Description

Models / Seeds

  • source.trase_duckdb.trase-storage-raw.university of wisconsin - madison jan 14 - dec14 brazil exp order 0061400001aszqd _rerun

Sources

  • ['trase-storage-raw', 'university of wisconsin - madison jan 14 - dec14 brazil exp order 0061400001aszqd _rerun']
import pandas as pd

from trase.tools import uses_database
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.aws.metadata import write_csv_for_upload
from trase.tools.sei_pcs.pandas_utilities import append, full_merge
from trase.tools.utilities.helpers import clean_string

COLUMNS = {
    "DIRECTION": "DIRECTION",  # 1=Export; 2=Imports
    "LOCAL ORIGIN/DEST": "origin.name",  # Is the actual Origin or Final Destination of a Cargo
    "CODE LOCAL ORIGIN/DEST": "origin.code",  # Code for the actual Origin or Final Destination of a Cargo
    "LOCAL PORT": "port_of_export.label",  # Port in Brazil that has being used
    "TRANSHIPPORT": "port_of_transhipment.label",  # Port of Transhipment
    "FOREIGN PORT": "port_of_import.label",  # Foreign Port
    "CODE FOREIGN PORT": "port_of_import.code",  # Code of the foreign Port
    "CODE COUNTRY": "country_of_destination.trase_id",  # Code for the country
    "FOREIGN ORIGIN/DEST PLACE": "destination.label",  # Foreign Origin or Destination
    "CODE FOREIGN ORIGIN/DEST PLACE": "destination.code",  # Code - Foreign Origin or Destination
    "TRADELANE": "tradelane",  # Route used by merchant ships
    "CARRIER": "carrier",  # Shipping line
    "VESSEL": "vessel",  # Name of the Freight Ship
    "ARRDATE": "date",  # Arrival /Departure Date
    # "MONTH & YEAR": "",  # Arrival /Departure   Month & Yr
    "NCM/HS": "hs8",  # 8 Digits HSCODE
    # "COMMODITY DESCRIPTION": "",  # Harmonized Commodity Description
    "CARGO TYPE": "cargo_type",  # 12=Container; 11=Breakbulk; 21=Liquid Bulk; 22=DryBulk
    "LOCAL TRADER 1": "exporter",  # Brazil Company (Exporter-Importer)
    "LOCAL TRADER 2": "owner",  # Brazil Real Owner of the cargo (Importer - Exporter)
    "STREET": "exporter.address",  # Brazil Importer - Exporter Address
    "CITY": "exporter.municipality.label",  # Brazil Importer - Exporter City
    "STATE CODE": "exporter.state.code",  # Brazil Importer - Exporter State
    "ZIP CODE": "exporter.zip_code",  # Brazil Importer - Exporter  Zip Code
    "FOREIGN TRADER": "importer",  # Foreign Company (Importer/Exporter)
    "QTY OF PACKAGES": "quantity",  # Quatity of Packaging
    "PACKAGE TYPE": "unit",  # Unit of Measure
    "WEIGHT (KG)": "vol",  # Total Weight in KG
    "CNT TYPE": "container_type",  # R=Reefer; T=Tankers; D=Dry bulk; CNT=Container
    "20' CNT": "number_twenty_foot_containers",  # Quantity containers 20'
    "40' CNT": "number_forty_foot_containers",  # Quantity containers 40'
    "TEU": "total_teus",  # Total of the Teus
}

STATE_TRASE_IDS = {
    "RO": "BR-11",
    "AC": "BR-12",
    "AM": "BR-13",
    "RR": "BR-14",
    "PA": "BR-15",
    "AP": "BR-16",
    "TO": "BR-17",
    "MA": "BR-21",
    "PI": "BR-22",
    "CE": "BR-23",
    "RN": "BR-24",
    "PB": "BR-25",
    "PE": "BR-26",
    "AL": "BR-27",
    "SE": "BR-28",
    "BA": "BR-29",
    "MG": "BR-31",
    "ES": "BR-32",
    "RJ": "BR-33",
    "SP": "BR-35",
    "PR": "BR-41",
    "SC": "BR-42",
    "RS": "BR-43",
    "MS": "BR-50",
    "MT": "BR-51",
    "GO": "BR-52",
    "DF": "BR-53",
    "": "BR-XX",
}


UNKNOWN_MUNICIPALITIES = [
    ("", "BR-26"),
    ("", "BR-31"),
    ("", "BR-33"),
    ("", "BR-35"),
    ("", "BR-41"),
    ("", "BR-42"),
    ("", "BR-51"),
    ("", "BR-53"),
    ("", "BR-XX"),
    ("BAGE RIO GRANDE DO SUL", "BR-43"),
    ("BALSA NOVA", "BR-XX"),
    ("CABO", "BR-26"),
    ("CORDEIROPOLIS", "BR-XX"),
    ("CRUZ DAS ALMAS", "BR-35"),
    ("CURITIBA", "BR-XX"),
    ("FORTALEZA", "BR-XX"),
    ("GUAICARA", "BR-XX"),
    ("ITABUNA", "BR-15"),
    ("ITAJI", "BR-42"),
    ("JARU", "BR-15"),
    ("MOGI GUACO", "BR-35"),
    ("MUNICIPIO", "BR-35"),
    ("NOVO HAMBURGO", "BR-35"),
    ("OURO FINO", "BR-XX"),
    ("PAICANDU", "BR-XX"),
    ("PARANA", "BR-41"),
    ("PORT ALERGE", "BR-43"),
    ("RIO GRANDE DO SUL", "BR-43"),
    ("SANTA MARIA", "BR-XX"),
    ("SAO PAULO", "BR-XX"),
    ("SAPIRANGA", "BR-23"),
    ("VALPARAISO", "BR-XX"),
    ("VICTORIA", "BR-32"),
    ("VITORIA", "BR-42"),
    ("ZONA RURAL", "BR-35"),
]


def clean_states(df):
    df["exporter.state.code"] = df["exporter.state.code"].apply(clean_string)
    df["exporter.state.trase_id"] = df["exporter.state.code"].map(STATE_TRASE_IDS)
    assert not any(df["exporter.state.trase_id"].isna())


def clean_hs_codes(df):
    hs8 = df["hs8"].astype(str).str.rjust(8, "0")
    assert all(hs8.str.len() == 8)
    return df.assign(hs8=hs8, hs6=hs8.str.slice(0, 6), hs4=hs8.str.slice(0, 4))


@uses_database
def clean_countries(df, cnx=None):
    df_country_labels = pd.read_sql(
        """
        select distinct
            name as "country_of_destination.name",
            trase_id as "country_of_destination.trase_id"
        from views.regions where level = 1 and length(trase_id) = 2
        """,
        cnx.cnx,
    )
    df.loc[
        df["country_of_destination.trase_id"] == "ZZ", "country_of_destination.trase_id"
    ] = "XX"
    return full_merge(
        df,
        df_country_labels,
        on="country_of_destination.trase_id",
        validate="many_to_one",
        how="left",
    )


@uses_database
def clean_ports(df, cnx=None):
    df_port_labels = pd.read_sql(
        """
        select distinct
            name as "port_of_export.name",
            unnest(synonyms) as "port_of_export.label"
        from views.regions where region_type = 'PORT' and country = 'BRAZIL'
        """,
        cnx.cnx,
    )

    return full_merge(
        df,
        df_port_labels,
        on="port_of_export.label",
        validate="many_to_one",
        how="left",
    )


@uses_database
def clean_municipalities(df, cnx=None):
    df_municipalities = pd.read_sql(
        """
        select distinct
            name as "exporter.municipality.name",
            unnest(synonyms) as "exporter.municipality.label",
            trase_id as "exporter.municipality.trase_id",
            substring(trase_id, 0, 6) as "exporter.state.trase_id"
        from views.regions 
        where country = 'BRAZIL' and region_type = 'MUNICIPALITY'
        """,
        cnx.cnx,
    )
    df_municipalities = append(
        df_municipalities,
        [
            {
                "exporter.municipality.name": "UNKNOWN",
                "exporter.municipality.label": label,
                "exporter.municipality.trase_id": "BR-XXXXXX",
                "exporter.state.trase_id": state_trase_id,
            }
            for (label, state_trase_id) in UNKNOWN_MUNICIPALITIES
        ],
    )
    return full_merge(
        df,
        df_municipalities,
        left_on=["exporter.municipality.label", "exporter.state.trase_id"],
        right_on=["exporter.municipality.label", "exporter.state.trase_id"],
        how="left",
        validate="many_to_one",
    )


def main():
    df = get_pandas_df_once(
        "brazil/trade/bol/2014/originals/University of Wisconsin - Madison Jan 14 - Dec14 Brazil Exp Order 0061400001AszqD _Rerun.xlsx",
        xlsx=True,
        keep_default_na=False,
    )

    df = df[COLUMNS].rename(columns=COLUMNS, errors="raise")

    assert all(df.pop("DIRECTION") == 1)

    date = df.pop("date").astype(str)
    df["year"] = date.str.slice(0, 4).astype(int)
    df["month"] = date.str.slice(4, 6).astype(int)
    df["day"] = date.str.slice(6, 8).astype(int)

    df = clean_hs_codes(df)
    df = clean_countries(df)
    df = clean_ports(df)
    clean_states(df)
    df = clean_municipalities(df)

    write_csv_for_upload(df, "brazil/trade/bol/2014/BRAZIL_BOL_2014.csv")


if __name__ == "__main__":
    main()
import pandas as pd


def model(dbt, cursor):
    dbt.source(
        "trase-storage-raw",
        "university of wisconsin - madison jan 14 - dec14 brazil exp order 0061400001aszqd _rerun",
    )

    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})