Skip to content

View or edit on GitHub

This page is synchronized from trase/models/brazil/beef/QA of new customs data.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Brazil Beef Customs Data

This script describes the Brazil beef disaggregated-and-matched customs data for 2018–2020 and compares it to the CD_COMBINED data for 2015–2017.

import pandas as pd
import numpy as np
import plotly.express as px

from trase.tools import sps
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once

# Canonical column set shared by the customs and MDIC dataframes so they can
# be concatenated into one table below. Columns a given source does not
# provide are filled with placeholder values ("N/A", "UNKNOWN", ...) by the
# read_* helpers.
COLUMNS = [
    "year",
    "matched",
    "matching_stage",
    "message",
    "is_maritime",
    "hs4",
    "hs5",
    "hs6",
    "hs8",
    "state",
    "municipality",
    "port",
    "exporter",
    "exporter.cnpj",
    "importer",
    "country",
    "vol",
    "fob",
]

Now let's read the customs data from 2015 to 2020 inclusive (2018, previously missing, is now included). By doing this we can look at trends.

def read(path, **kwargs):
    """Load a semicolon-separated CSV as strings and normalise column names.

    All values are kept as strings (``dtype=str``) and empty cells are NOT
    converted to NaN (``keep_default_na=False``); extra keyword arguments are
    forwarded to the loader.
    """
    # Map the source-specific headers onto the shared COLUMNS vocabulary.
    # NOTE: two municipality variants both map to "municipality" because
    # different files use different header names.
    column_aliases = {
        "cnpj": "exporter.cnpj",
        "exporter.label": "exporter",
        "municipality.trase_id": "municipality",
        "exporter.municipality.trase_id": "municipality",
        "country_of_destination.name": "country",
        "ncm": "hs8",
        "port_of_export.name": "port",
        "importer.label": "importer",
        "state.trase_id": "state",
    }
    raw = get_pandas_df_once(path, sep=";", dtype=str, keep_default_na=False, **kwargs)
    return raw.rename(columns=column_aliases)


def read_old(path):
    """Read a CD_COMBINED (2015-2017) file and align it with the new schema.

    Adds the HS prefix columns and fills the matching metadata columns —
    which only exist in the new disaggregated data — with placeholders.
    """
    df = read(path)
    # Zero-pad NCM codes to 8 digits, then derive the coarser HS prefixes.
    df["hs8"] = df["hs8"].str.rjust(8, "0")
    for width in (6, 5, 4):
        df[f"hs{width}"] = df["hs8"].str.slice(0, width)
    # The old data carries no matching metadata and no transport mode.
    for column in ("matched", "matching_stage", "message"):
        df[column] = "N/A"
    df["is_maritime"] = "Unknown"
    return df


def read_new(path, year):
    """Read a disaggregated/matched customs file for a single *year*."""
    df = read(path)
    # QA printout: total volume for HS4 code 0202 (frozen bovine meat,
    # per the BEEF_HS6 list below).
    print(sum(df[df["hs4"] == "0202"]["vol"].astype(float)))
    # Transport-mode code "01" marks a maritime shipment.
    maritime = df["via"] == "01"
    df["is_maritime"] = np.where(maritime, "True", "False")
    df["year"] = str(year)
    if "hs8" not in df:
        # Some files lack NCM codes entirely; use an 8-char sentinel.
        df["hs8"] = "XXXXXXXX"
    return df


# One dataframe per year: old (combined) data for 2015-2017, new
# (disaggregated/matched) data for 2018-2020.
dataframes = {}
for y in (2015, 2016, 2017):
    dataframes[y] = read_old(
        f"brazil/beef/trade/cd/combined/cleaned/CD_COMBINED_BEEF_{y}.csv"
    )
for y in (2018, 2019, 2020):
    dataframes[y] = read_new(
        f"brazil/beef/trade/cd/disaggregated/CD_DISAGGREGATED_BEEF_{y}.csv", y
    )

# Stack the per-year dataframes into one customs table: tag each row with its
# year, keep only the shared COLUMNS, cast the measures to float, then
# consolidate on the measure columns.
per_year = (frame.assign(year=yr)[COLUMNS] for yr, frame in dataframes.items())
df_customs = sps.concat(per_year)
df_customs = df_customs.astype({"vol": float, "fob": float})
df_customs = sps.consolidate(df_customs, ["vol", "fob"])

Now let's read MDIC (Port)

def read_port(path, year):
    """Read one year of MDIC port data and align it with the customs schema.

    MDIC has no exporter/importer or matching detail, so those columns are
    filled with explicit placeholder values.
    """
    df = get_pandas_df_once(path, sep=";", keep_default_na=False, dtype=str)
    df = sps.rename(
        df,
        {
            "country_of_destination.name": "country",
            "port.name": "port",
            "state.trase_id": "state",
        },
    )
    # Transport-mode code "01" marks a maritime shipment.
    df["is_maritime"] = np.where(df["via"] == "01", "True", "False")
    # Placeholders for the customs-only columns.
    placeholders = {
        "matched": "N/A",
        "matching_stage": "N/A",
        "message": "N/A",
        "municipality": "BR-XXXXXXX",
        "exporter": "UNKNOWN",
        "exporter.cnpj": "0",
        "importer": "UNKNOWN",
    }
    for column, value in placeholders.items():
        df[column] = value
    df["hs5"] = df["hs6"].str.slice(0, 5)
    return df.assign(year=year)


# Concatenate the six years of MDIC port data and cast the numeric columns.
# NOTE(review): the 2020 file is read from an `archive/` subfolder unlike the
# other years — presumably deliberate, but worth confirming it is the
# intended version of the file.
df_port = sps.concat(
    [
        read_port("brazil/trade/mdic/port/brazil_mdic_port_2015.csv", 2015),
        read_port("brazil/trade/mdic/port/brazil_mdic_port_2016.csv", 2016),
        read_port("brazil/trade/mdic/port/brazil_mdic_port_2017.csv", 2017),
        read_port("brazil/trade/mdic/port/brazil_mdic_port_2018.csv", 2018),
        read_port("brazil/trade/mdic/port/brazil_mdic_port_2019.csv", 2019),
        read_port("brazil/trade/mdic/port/archive/brazil_mdic_port_2020.csv", 2020),
    ]
)
df_port = df_port.astype({"vol": float, "fob": float, "year": int})

# Restrict the MDIC data to beef-related HS6 codes. An earlier approach
# (commented out) kept only the HS6 codes present in the customs data.
# # keep only the HS6 codes present in the customs data (all years!)
# df_port = pd.merge(df_port, df_customs["hs6"].drop_duplicates())

BEEF_HS6 = [
    "010221",  # Cattle; live, pure-bred breeding animals
    "010229",  # Cattle; live, other than pure-bred breeding animals
    "010290",  # Bovine animals; live, other than cattle and buffalo
    "020120",  # Meat; of bovine animals, cuts with bone in, fresh or chilled
    "020130",  # Meat; of bovine animals, boneless cuts, fresh or chilled
    "020220",  # Meat; of bovine animals, cuts with bone in, frozen
    "020230",  # Meat; of bovine animals, boneless cuts, frozen
    "020610",  # Offal, edible; of bovine animals, fresh or chilled
    "020621",  # Offal, edible; of bovine animals, tongues, frozen
    "020622",  # Offal, edible; of bovine animals, livers, frozen
    "020629",  # Offal, edible; of bovine animals, (other than tongues & livers), frozen
    "021020",  # Meat; salted, in brine, dried or smoked, of bovine animals
    "160250",  # Meat preparations; of bovine animals, meat or offal, prepared/preserved
    "050400",  # Animal products; guts, bladders and stomachs of animals (other than fish)
]
df_port = df_port[df_port["hs6"].isin(BEEF_HS6)].copy()

# Keep the shared schema and consolidate on the measure columns.
df_port = sps.consolidate(df_port[COLUMNS], ["vol", "fob"])

Concat the two - this will be easier for plotting

# Stack customs and MDIC into one frame, tagged by "source", for plotting.
df = sps.concat([df_customs.assign(source="customs"), df_port.assign(source="mdic")])

Define some useful variables

# Boolean masks reused by several plots below.
new_years = df["year"].isin([2018, 2019, 2020])  # years with disaggregated data
# All customs rows, but only the maritime subset of MDIC (labelled
# "MDIC (Only maritime)" in the charts).
customs_or_maritime_mdic = (df["source"] == "customs") | (df["is_maritime"] == "True")

Volume per HS4

import plotly.express as px

# Restrict to the new years, keeping customs rows and maritime MDIC rows.
data = df.loc[new_years & customs_or_maritime_mdic]

# Aggregate volume per year/source/HS4 so the chart stays small.
data = sps.consolidate(data, ["vol"], ["year", "source", "hs4"])

# Human-friendly legend labels.
source_labels = {"customs": "Customs", "mdic": "MDIC (Only maritime)"}
data["source"] = data["source"].map(source_labels)

# Horizontal grouped bars, one facet row per year.
px.bar(
    data,
    x="vol",
    y="hs4",
    color="source",
    facet_row="year",
    barmode="group",
    orientation="h",
    title="<b>Volume per HS4</b><br />Customs versus MDIC",
    height=600,
)

Row Count

In this section we are simply looking at how many rows the datasets have. The 2018 data was initially not present (it had not yet been made), although it is now read above. The 2019 data has vastly more rows.

import plotly.express as px

# Row counts per year after deduplicating over the descriptive columns —
# a proxy for how granular each year's customs data is.
data = df[df["source"] == "customs"]
over = [
    "year",
    "hs4",
    "state",
    "municipality",
    "port",
    "exporter",
    "importer",
    "country",
]
data = (
    data[over]
    .drop_duplicates()
    .groupby("year")
    .size()  # idiomatic per-group row count (was .apply(len) — same values)
    .rename("count")
    .reset_index()
)

px.bar(
    data,
    x="year",
    y="count",
    title=f"<b>Number of rows</b><br>When consolidated over {'/'.join(over)}",
)

Volume histogram

import plotly.express as px

# Distribution of consolidated flow volumes in the customs data.
data = df[df["source"] == "customs"]
# BUG FIX: this previously re-filtered from `df` (data = df[df["year"]...]),
# silently discarding the customs-only filter above; filter the
# already-filtered frame instead.
data = data[data["year"].isin([2017, 2018, 2019, 2020])]
over = [
    "year",
    "hs4",
    "state",
    "municipality",
    "port",
    "exporter",
    "importer",
    "country",
]
data = sps.consolidate(data, ["vol"], over)

# Histogram of volumes, cut off at 5M to keep the x-axis readable; log y
# because the counts span orders of magnitude.
fig = px.histogram(
    data[(data["vol"] >= 0) & (data["vol"] < 5e6)],
    x="vol",
    color="year",
    log_y=True,
    nbins=50,
)
fig.update_layout(
    title=f"<b>Distribution of volume</b><br />When consolidated over {'/'.join(over)}",
    yaxis_title="count (log scale)",
    xaxis_title="volume, cutoff at 5M",
)

Unknowns

Here we look at how much volume is "UNKNOWN". I previously broke this down by all columns, but actually an unknown in one usually implies an unknown in the others.

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Share of volume whose state of origin is unknown ("BR-XX"), per year.
data = df[df["year"].isin([2018, 2019, 2020])].copy()
data["is_unknown"] = np.where(
    data["state"] == "BR-XX",
    "Unknown state",
    "Known state",
)
data = sps.consolidate(data, ["vol"], ["year", "is_unknown"])


# NOTE(review): this label list is unused in this cell — presumably left
# over from a copied example; kept for reference.
labels = [
    "US",
    "China",
    "European Union",
    "Russian Federation",
    "Brazil",
    "India",
    "Rest of World",
]

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(
    rows=1, cols=3, specs=[[{"type": "domain"}, {"type": "domain"}, {"type": "domain"}]]
)

d = data[data["year"] == 2018]
# BUG FIX: this trace was previously named "2019", giving the 2018 donut a
# wrong hover label.
fig.add_trace(go.Pie(labels=d["is_unknown"], values=d["vol"], name="2018"), 1, 1)

d = data[data["year"] == 2019]
fig.add_trace(go.Pie(labels=d["is_unknown"], values=d["vol"], name="2019"), 1, 2)

d = data[data["year"] == 2020]
fig.add_trace(go.Pie(labels=d["is_unknown"], values=d["vol"], name="2020"), 1, 3)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=0.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Percentage of volume with an unknown state of origin",
    # Add annotations in the center of the donut pies.
    annotations=[
        dict(text="2018", x=0.11, y=0.5, font_size=20, showarrow=False),
        dict(text="2019", x=0.50, y=0.5, font_size=20, showarrow=False),
        dict(text="2020", x=0.89, y=0.5, font_size=20, showarrow=False),
    ],
)
fig.show()
# Quick look at the 2018 rows that DO have a known state.
data_2018 = df[df["year"].isin([2018])]
data_2018[data_2018["state"] != "BR-XX"]

Top Exporters per Year

import plotly.express as px

n, word = 5, "five"


def top_exporters(df):
    """Return volume per exporter, with names outside the top `n` collapsed
    into "OTHER".

    Exporter names are normalised first ("S/A"/"S.A"/"SA." -> "SA", then the
    " SA" suffix is dropped) so spelling variants of a company merge.
    """
    # Work on a copy: groupby.apply passes views of the parent frame, and the
    # original code mutated them in place.
    df = df.copy()
    # BUG FIX: pass regex=False — the previous calls used the (version-
    # dependent) default, under which "S.A" and "SA." are regexes where "."
    # matches ANY character, corrupting unrelated names.
    for pattern in ("S/A", "S.A", "SA."):
        df["exporter"] = df["exporter"].str.replace(pattern, "SA", regex=False)
    df["exporter"] = df["exporter"].str.replace(" SA", "", regex=False)
    df = sps.consolidate(df, ["vol"], ["exporter"]).sort_values("vol", ascending=False)
    df = df.reset_index(drop=True)
    # Everything below rank n becomes "OTHER", then re-consolidate.
    df.iloc[n:, list(df.columns).index("exporter")] = "OTHER"
    df = sps.consolidate(df, ["vol"], ["exporter"]).sort_values("vol", ascending=False)
    return df


data = df[df["source"] == "customs"].groupby("year").apply(top_exporters).reset_index()

px.bar(
    data,
    x="year",
    y="vol",
    color="exporter",
    title=f"Top {word} exporting companies",
    height=500,
)

Top Countries of Destination per Year - by volume

import plotly.express as px

# filter to only the new years of data
data = df[new_years & customs_or_maritime_mdic].copy()
# Treat Hong Kong and mainland China as a single destination.
data["country"] = data["country"].replace(
    {"CHINA (HONG KONG)": "CHINA", "CHINA (MAINLAND)": "CHINA"}
)

# filter to the top countries
n = 7


def top_countries(df):
    # Top-n countries by total volume within the frame passed in.
    top = sps.consolidate(df, ["vol"], ["country"]).sort_values("vol", ascending=False)
    return top[:n]["country"].drop_duplicates()


# Union of each year's top-n countries (duplicates across years dropped).
countries = (
    data.groupby(["year"]).apply(top_countries).reset_index(drop=True).drop_duplicates()
)

# Left-merge with indicator=True: rows whose country is NOT in the top set
# get _merge == "left_only" and are relabelled "OTHER".
data = pd.merge(data, countries, how="left", indicator=True)
data["country"] = np.where(data.pop("_merge") == "both", data["country"], "OTHER")
data = sps.consolidate(data, ["vol"], ["year", "source", "country"])

# neater labels
data["source"] = data["source"].map(
    {"customs": "Customs", "mdic": "MDIC (Only maritime)"}
)

# scale volume to 100%
# Attach each (year, source) total so volumes can be expressed as shares of
# their own dataset, making customs and MDIC comparable.
data = pd.merge(
    data,
    data.groupby(["year", "source"])["vol"].sum().rename("total_volume").reset_index(),
    on=["year", "source"],
)
data["volume_percent"] = 100 * data["vol"] / data["total_volume"]

# plot
fig = px.bar(
    data,
    x="volume_percent",
    y="country",
    color="source",
    facet_row="year",
    barmode="group",
    orientation="h",
    title="<b>Volume per country</b><br />Customs versus MDIC",
)
fig.update_layout(
    height=800,
    xaxis_title="Volume as a percentage of total volume in customs/MDIC respectively",
)
import plotly.express as px

n, word = 5, "five"


def top_countries(df):
    """Return volume per country, with countries outside the top `n`
    collapsed into "OTHER". (Redefines the helper from the previous cell.)"""
    df = sps.consolidate(df, ["vol"], ["country"]).sort_values("vol", ascending=False)
    df = df.reset_index(drop=True)
    df.iloc[n:, list(df.columns).index("country")] = "OTHER"
    df = sps.consolidate(df, ["vol"], ["country"]).sort_values("vol", ascending=False)
    return df


# BUG FIX: copy the slice before assigning — the column assignment below
# previously wrote to a view of the full `df` (SettingWithCopy hazard); the
# equivalent cell above already uses .copy().
data = df[df["source"] == "customs"].copy()
# Treat Hong Kong and mainland China as a single destination.
data["country"] = data["country"].replace(
    {"CHINA (HONG KONG)": "CHINA", "CHINA (MAINLAND)": "CHINA"}
)
data = data.groupby("year").apply(top_countries).reset_index()

px.bar(
    data, x="year", y="vol", color="country", title=f"Top {word} importing countries"
)

Top Countries of Destination per Year - by difference in volume

import plotly.subplots

# NOTE(review): `figures` is never appended to or read — apparently dead.
figures = []

# Shared x-axis range so the three yearly charts are visually comparable.
xrange = [0, 60]

for year in [2018, 2019, 2020]:

    data = df[df["year"] == year]
    # Merge the two China labels. Note this assigns from `df["country"]`
    # (not `data`): pandas aligns the Series on the row index, and `data`'s
    # index is a subset of `df`'s, so the values still line up.
    data = data.assign(
        country=df["country"].replace(
            {"CHINA (HONG KONG)": "CHINA", "CHINA (MAINLAND)": "CHINA"}
        )
    )

    # a: maritime MDIC, b: customs — each expressed as % of its own total so
    # the two datasets are comparable.
    a = data[(data["source"] == "mdic") & (data["is_maritime"] == "True")].copy()
    a["vol_percentage"] = 100 * a["vol"] / a["vol"].sum()

    b = data[data["source"] == "customs"].copy()
    b["vol_percentage"] = 100 * b["vol"] / b["vol"].sum()

    # Dumbbell chart of the countries with the largest absolute difference
    # between the two datasets.
    fig = sps.dumbbell_compare(
        a,
        b,
        "vol_percentage",
        ["country"],
        labels=("MDIC (Maritime)", "Customs"),
        xaxis_title="Volume as a percentage of respective dataset",
        max_rows=7,
        comparator="absolute_error",
        include_other=False,
    )
    fig.update_layout(
        title=f"<b>Top countries by difference [{year}]</b><br />MDIC versus Customs"
    )
    fig.update_xaxes(range=xrange)

    fig.show()

Supply Chain Fragmentation

In this section we ask: how many states does each exporter source their beef from?

import plotly.express as px

# remove unknown CNPJ and state
data = df[df["source"] == "customs"]
# BUG FIX: the previous filters compared the wrong columns/values and so
# removed nothing — "0" is the placeholder for `exporter.cnpj` (the
# `exporter` label placeholder is "UNKNOWN"), and the unknown-state value is
# "BR-XX" ("BR-XXXXXXX" is the *municipality* placeholder).
# TODO(review): confirm the customs files use these same placeholder values.
data = data[data["exporter.cnpj"] != "0"]
data = data[data["state"] != "BR-XX"]


# count unique states per cnpj
def count_states(df):
    """Number of distinct sourcing states per exporter (one value per
    exporter, index dropped)."""
    df = df.groupby("exporter")["state"].nunique()
    return df.reset_index(drop=True).rename("unique_known_states")


data = data.groupby("year").apply(count_states).reset_index()

px.box(
    data,
    y="unique_known_states",
    x="year",
    title="<b>Unique sourcing states per exporter CNPJ</b><br>Unknown states are excluded",
    labels={"unique_known_states": "Number of distinct states"},
)