View or edit on GitHub
This page is synchronized from trase/models/brazil/beef/QA of new customs data.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
Brazil Beef Customs Data
This script describes the Brazil beef disaggregated-and-matched customs data for 2019 and 2020 and compares it to the CD_COMBINED data for 2015-2017.
import pandas as pd
import numpy as np
import plotly.express as px
from trase.tools import sps
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
# Canonical column set shared by the customs and MDIC dataframes; this list
# also fixes the column order used when concatenating the per-year frames.
COLUMNS = [
    "year",
    "matched",  # record-linkage metadata ("N/A" where matching was not run)
    "matching_stage",
    "message",
    "is_maritime",  # "True"/"False" as strings; "Unknown" for the old data
    "hs4",  # HS commodity-code prefixes, coarsest to finest
    "hs5",
    "hs6",
    "hs8",
    "state",  # Trase IDs of the sourcing region
    "municipality",
    "port",
    "exporter",
    "exporter.cnpj",  # exporter tax ID (CNPJ)
    "importer",
    "country",  # country of destination
    "vol",
    "fob",
]
Now let's read the customs data from 2015 to 2020 inclusive (although currently missing 2018). By doing this we can look at trends.
def read(path, **kwargs):
    """Load a semicolon-separated CSV (cached) and normalise its column names.

    Every value is read as a string and empty fields are kept as "" rather
    than NaN. Extra keyword arguments are forwarded to the reader.
    """
    raw = get_pandas_df_once(path, sep=";", dtype=str, keep_default_na=False, **kwargs)
    # Map source-specific headers onto the shared vocabulary used by COLUMNS.
    canonical_names = {
        "cnpj": "exporter.cnpj",
        "exporter.label": "exporter",
        "municipality.trase_id": "municipality",
        "exporter.municipality.trase_id": "municipality",
        "country_of_destination.name": "country",
        "ncm": "hs8",
        "port_of_export.name": "port",
        "importer.label": "importer",
        "state.trase_id": "state",
    }
    return raw.rename(columns=canonical_names)
def read_old(path):
    """Read a pre-2018 CD_COMBINED file and align it with the new schema.

    Derives the coarser HS levels from the zero-padded 8-digit code, and
    fills the matching metadata with placeholders since the old pipeline
    did not record it.
    """
    df = read(path)
    # Left-pad to the full 8 digits, then slice out the coarser prefixes.
    df["hs8"] = df["hs8"].str.rjust(8, "0")
    for width in (6, 5, 4):
        df[f"hs{width}"] = df["hs8"].str.slice(0, width)
    # Matching metadata does not exist for the old data.
    for placeholder in ("matched", "matching_stage", "message"):
        df[placeholder] = "N/A"
    df["is_maritime"] = "Unknown"
    return df
def read_new(path, year):
    """Read one year of the disaggregated-and-matched customs data."""
    df = read(path)
    # QA print: total volume of frozen bovine meat (HS4 0202) in this file.
    print(sum(df.loc[df["hs4"] == "0202", "vol"].astype(float)))
    # "01" is the transport-mode code for maritime shipments.
    df["is_maritime"] = np.where(df["via"] == "01", "True", "False")
    df["year"] = str(year)
    if "hs8" not in df:
        # Placeholder so selecting COLUMNS later does not fail.
        df["hs8"] = "X" * 8
    return df
# One dataframe per year: 2015-2017 come from the old combined dataset,
# 2018-2020 from the new disaggregated-and-matched one.
_OLD_PATH = "brazil/beef/trade/cd/combined/cleaned/CD_COMBINED_BEEF_{}.csv"
_NEW_PATH = "brazil/beef/trade/cd/disaggregated/CD_DISAGGREGATED_BEEF_{}.csv"
dataframes = {}
for _year in (2015, 2016, 2017):
    dataframes[_year] = read_old(_OLD_PATH.format(_year))
for _year in (2018, 2019, 2020):
    dataframes[_year] = read_new(_NEW_PATH.format(_year), _year)
# Stack the per-year frames into one table (tagging each row with its year),
# coerce the numeric columns, then merge duplicate rows by summing vol/fob.
per_year = (frame.assign(year=y)[COLUMNS] for y, frame in dataframes.items())
df_customs = sps.concat(per_year).astype({"vol": float, "fob": float})
df_customs = sps.consolidate(df_customs, ["vol", "fob"])
Now let's read MDIC (Port)
def read_port(path, year):
    """Read one year of the MDIC port dataset and align it with COLUMNS.

    MDIC has no exporter/importer detail, so those columns are filled with
    explicit "unknown" sentinels; the matching metadata is "N/A" because
    matching only applies to the customs data.
    """
    df = get_pandas_df_once(path, sep=";", keep_default_na=False, dtype=str)
    df = sps.rename(
        df,
        {
            "country_of_destination.name": "country",
            "port.name": "port",
            "state.trase_id": "state",
        },
    )
    # "01" is the transport-mode code for maritime shipments.
    df["is_maritime"] = np.where(df["via"] == "01", "True", "False")
    # Columns present in customs but absent from MDIC.
    placeholders = {
        "matched": "N/A",
        "matching_stage": "N/A",
        "message": "N/A",
        "municipality": "BR-XXXXXXX",
        "exporter": "UNKNOWN",
        "exporter.cnpj": "0",
        "importer": "UNKNOWN",
    }
    for column, value in placeholders.items():
        df[column] = value
    df["hs5"] = df["hs6"].str.slice(0, 5)
    return df.assign(year=year)
def _port_path(year):
    # The 2020 file lives under archive/; all other years are top level.
    prefix = "brazil/trade/mdic/port/"
    if year == 2020:
        prefix += "archive/"
    return f"{prefix}brazil_mdic_port_{year}.csv"


df_port = sps.concat([read_port(_port_path(y), y) for y in range(2015, 2021)])
df_port = df_port.astype({"vol": float, "fob": float, "year": int})
# NOTE: an alternative filter would keep only the HS6 codes present in the
# customs data (all years):
#   df_port = pd.merge(df_port, df_customs["hs6"].drop_duplicates())

# HS6 codes covering the beef supply chain: live cattle, fresh/chilled and
# frozen meat, edible offal, cured meat, preparations, and guts/bladders.
BEEF_HS6 = [
    "010221",  # live cattle, pure-bred breeding animals
    "010229",  # live cattle, other than pure-bred breeding animals
    "010290",  # live bovine animals other than cattle and buffalo
    "020120",  # bovine meat, bone-in cuts, fresh or chilled
    "020130",  # bovine meat, boneless cuts, fresh or chilled
    "020220",  # bovine meat, bone-in cuts, frozen
    "020230",  # bovine meat, boneless cuts, frozen
    "020610",  # edible bovine offal, fresh or chilled
    "020621",  # bovine tongues, frozen
    "020622",  # bovine livers, frozen
    "020629",  # other edible bovine offal, frozen
    "021020",  # bovine meat, salted, in brine, dried or smoked
    "160250",  # prepared/preserved bovine meat or offal
    "050400",  # guts, bladders and stomachs of animals (other than fish)
]
df_port = df_port.loc[df_port["hs6"].isin(BEEF_HS6)].copy()
df_port = sps.consolidate(df_port[COLUMNS], ["vol", "fob"])
Concatenate the two datasets — a single combined frame is easier to plot from.
df = sps.concat([df_customs.assign(source="customs"), df_port.assign(source="mdic")])
Define some useful variables
# Mask selecting the new disaggregated customs years.
new_years = df["year"].isin([2018, 2019, 2020])
# Customs covers all transport modes; MDIC is only compared on maritime rows
# (its plot label below is "MDIC (Only maritime)").
customs_or_maritime_mdic = (df["source"] == "customs") | (df["is_maritime"] == "True")
Volume per HS4
import plotly.express as px

# Restrict to the new years, customs rows plus maritime-only MDIC rows.
subset = df[new_years & customs_or_maritime_mdic]
# Collapse to one row per year/source/HS4 so the plot renders quickly.
subset = sps.consolidate(subset, ["vol"], ["year", "source", "hs4"])
# Human-friendly legend labels.
subset["source"] = subset["source"].map(
    {"customs": "Customs", "mdic": "MDIC (Only maritime)"}
)
# Grouped horizontal bars, one facet per year.
px.bar(
    subset,
    x="vol",
    y="hs4",
    color="source",
    facet_row="year",
    barmode="group",
    orientation="h",
    title="<b>Volume per HS4</b><br />Customs versus MDIC",
    height=600,
)
Row Count
In this section we are simply looking at how many rows the datasets have. The 2018 data is not present (this hasn't been made). The 2019 data has vastly more rows.
import plotly.express as px

# Customs rows only; MDIC lacks exporter/importer detail so its row counts
# would not be comparable.
data = df[df["source"] == "customs"]
over = [
    "year",
    "hs4",
    "state",
    "municipality",
    "port",
    "exporter",
    "importer",
    "country",
]
# Count distinct combinations of the `over` columns per year.
# groupby(...).size() replaces the slower, less idiomatic .apply(len).
data = (
    data[over]
    .drop_duplicates()
    .groupby("year")
    .size()
    .rename("count")
    .reset_index()
)
px.bar(
    data,
    x="year",
    y="count",
    title=f"<b>Number of rows</b><br>When consolidated over {'/'.join(over)}",
)
Volume histogram
import plotly.express as px

# Customs rows only, for the years where the old and new data overlap.
data = df[df["source"] == "customs"]
# BUG FIX: this line previously re-filtered from `df`, silently discarding
# the customs-only filter above and mixing MDIC rows into the histogram.
data = data[data["year"].isin([2017, 2018, 2019, 2020])]
over = [
    "year",
    "hs4",
    "state",
    "municipality",
    "port",
    "exporter",
    "importer",
    "country",
]
data = sps.consolidate(data, ["vol"], over)
# Log-scale histogram of per-row volume, truncated at 5M to keep bins useful.
fig = px.histogram(
    data[(data["vol"] >= 0) & (data["vol"] < 5e6)],
    x="vol",
    color="year",
    log_y=True,
    nbins=50,
)
fig.update_layout(
    title=f"<b>Distribution of volume</b><br />When consolidated over {'/'.join(over)}",
    yaxis_title="count (log scale)",
    xaxis_title="volume, cutoff at 5M",
)
Unknowns
Here we look at how much volume is "UNKNOWN". I previously broke this down by all columns, but actually an unknown in one usually implies an unknown in the others.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

data = df[df["year"].isin([2018, 2019, 2020])].copy()
# "BR-XX" is the sentinel for an unknown state of origin.
data["is_unknown"] = np.where(
    data["state"] == "BR-XX",
    "Unknown state",
    "Known state",
)
data = sps.consolidate(data, ["vol"], ["year", "is_unknown"])
# (Removed an unused leftover `labels` list of country names.)
# Create subplots: the 'domain' type is required for pie subplots.
fig = make_subplots(
    rows=1, cols=3, specs=[[{"type": "domain"}, {"type": "domain"}, {"type": "domain"}]]
)
# BUG FIX: the first (2018) trace was previously mislabelled name="2019".
for col, year in enumerate([2018, 2019, 2020], start=1):
    d = data[data["year"] == year]
    fig.add_trace(
        go.Pie(labels=d["is_unknown"], values=d["vol"], name=str(year)), 1, col
    )
# Use `hole` to create a donut-like pie chart.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name")
fig.update_layout(
    title_text="Percentage of volume with an unknown state of origin",
    # Add annotations in the center of the donut pies.
    annotations=[
        dict(text="2018", x=0.11, y=0.5, font_size=20, showarrow=False),
        dict(text="2019", x=0.50, y=0.5, font_size=20, showarrow=False),
        dict(text="2020", x=0.89, y=0.5, font_size=20, showarrow=False),
    ],
)
fig.show()
# Spot check: which 2018 rows (if any) have a known state of origin?
data_2018 = df[df["year"] == 2018]
data_2018[data_2018["state"] != "BR-XX"]
Top Exporters per Year
import plotly.express as px

n, word = 5, "five"


def top_exporters(df):
    """Total volume per exporter for the top n, lumping the rest into OTHER.

    Exporter names are lightly normalised so "X S/A", "X S.A", "X SA." and
    "X SA" all collapse onto the same company.
    """
    # Work on a copy: mutating the group passed by groupby().apply() raises
    # SettingWithCopyWarning and may not write back reliably.
    df = df.copy()
    # BUG FIX: regex=False — "S.A" and "SA." contain the regex metacharacter
    # ".", which under pandas' old default (regex=True) matched ANY character.
    # These are literal text replacements; order matters.
    for old, new in [("S/A", "SA"), ("S.A", "SA"), ("SA.", "SA"), (" SA", "")]:
        df["exporter"] = df["exporter"].str.replace(old, new, regex=False)
    df = sps.consolidate(df, ["vol"], ["exporter"]).sort_values("vol", ascending=False)
    df = df.reset_index(drop=True)
    # Everything below the top n becomes "OTHER", then re-aggregate.
    df.iloc[n:, list(df.columns).index("exporter")] = "OTHER"
    df = sps.consolidate(df, ["vol"], ["exporter"]).sort_values("vol", ascending=False)
    return df


data = df[df["source"] == "customs"].groupby("year").apply(top_exporters).reset_index()
px.bar(
    data,
    x="year",
    y="vol",
    color="exporter",
    title=f"Top {word} exporting companies",
    height=500,
)
Top Countries of Destination per Year - by volume
import plotly.express as px

# filter to only the new years of data
data = df[new_years & customs_or_maritime_mdic].copy()
# Merge the Hong Kong / mainland rows under a single "CHINA" label.
data["country"] = data["country"].replace(
    {"CHINA (HONG KONG)": "CHINA", "CHINA (MAINLAND)": "CHINA"}
)
# filter to the top countries
n = 7


def top_countries(df):
    # The n countries with the largest total volume within this group.
    top = sps.consolidate(df, ["vol"], ["country"]).sort_values("vol", ascending=False)
    return top[:n]["country"].drop_duplicates()


# Union of each year's top-n destination countries.
countries = (
    data.groupby(["year"]).apply(top_countries).reset_index(drop=True).drop_duplicates()
)
# Left-merge with indicator: rows whose country is not in the top set get
# _merge == "left_only" and are relabelled "OTHER" below.
data = pd.merge(data, countries, how="left", indicator=True)
data["country"] = np.where(data.pop("_merge") == "both", data["country"], "OTHER")
data = sps.consolidate(data, ["vol"], ["year", "source", "country"])
# neater labels
data["source"] = data["source"].map(
    {"customs": "Customs", "mdic": "MDIC (Only maritime)"}
)
# scale volume to 100%: attach each (year, source) total so volumes can be
# shown as a percentage of their own dataset.
data = pd.merge(
    data,
    data.groupby(["year", "source"])["vol"].sum().rename("total_volume").reset_index(),
    on=["year", "source"],
)
data["volume_percent"] = 100 * data["vol"] / data["total_volume"]
# plot: grouped horizontal bars, one facet per year.
fig = px.bar(
    data,
    x="volume_percent",
    y="country",
    color="source",
    facet_row="year",
    barmode="group",
    orientation="h",
    title="<b>Volume per country</b><br />Customs versus MDIC",
)
fig.update_layout(
    height=800,
    xaxis_title="Volume as a percentage of total volume in customs/MDIC respectively",
)
import plotly.express as px

n, word = 5, "five"


def top_countries(df):
    """Total volume for the top-n destination countries; the rest become OTHER."""
    df = sps.consolidate(df, ["vol"], ["country"]).sort_values("vol", ascending=False)
    df = df.reset_index(drop=True)
    df.iloc[n:, list(df.columns).index("country")] = "OTHER"
    df = sps.consolidate(df, ["vol"], ["country"]).sort_values("vol", ascending=False)
    return df


# BUG FIX: .copy() — `data` was a view of `df`, so the CHINA relabelling
# below triggered SettingWithCopyWarning and could fail to apply.
data = df[df["source"] == "customs"].copy()
# Merge the Hong Kong / mainland rows under a single "CHINA" label.
data["country"] = data["country"].replace(
    {"CHINA (HONG KONG)": "CHINA", "CHINA (MAINLAND)": "CHINA"}
)
data = data.groupby("year").apply(top_countries).reset_index()
px.bar(
    data, x="year", y="vol", color="country", title=f"Top {word} importing countries"
)
Top Countries of Destination per Year - by difference in volume
import plotly.subplots

figures = []  # NOTE(review): appears unused — nothing is ever appended to it.
xrange = [0, 60]  # shared x-range so the three yearly figures are comparable
for year in [2018, 2019, 2020]:
    data = df[df["year"] == year]
    # Merge Hong Kong / mainland rows under one "CHINA" label.
    # NOTE(review): this reads df["country"], not data["country"]; assign
    # aligns on index so the result is the same — verify if refactoring.
    data = data.assign(
        country=df["country"].replace(
            {"CHINA (HONG KONG)": "CHINA", "CHINA (MAINLAND)": "CHINA"}
        )
    )
    # a: maritime-only MDIC rows; b: all customs rows. Each side is scaled
    # to percent of its own total volume so the datasets are comparable.
    a = data[(data["source"] == "mdic") & (data["is_maritime"] == "True")].copy()
    a["vol_percentage"] = 100 * a["vol"] / a["vol"].sum()
    b = data[data["source"] == "customs"].copy()
    b["vol_percentage"] = 100 * b["vol"] / b["vol"].sum()
    # Dumbbell plot of the countries with the largest absolute difference.
    fig = sps.dumbbell_compare(
        a,
        b,
        "vol_percentage",
        ["country"],
        labels=("MDIC (Maritime)", "Customs"),
        xaxis_title="Volume as a percentage of respective dataset",
        max_rows=7,
        comparator="absolute_error",
        include_other=False,
    )
    fig.update_layout(
        title=f"<b>Top countries by difference [{year}]</b><br />MDIC versus Customs"
    )
    fig.update_xaxes(range=xrange)
    fig.show()
Supply Chain Fragmentation
In this section we ask: how many states does each exporter source their beef from?
import plotly.express as px

# Customs rows only.
data = df[df["source"] == "customs"]
# BUG FIX: filter on the sentinels this file actually uses. The unknown-CNPJ
# sentinel "0" lives in the exporter.cnpj column (the old code compared the
# exporter *name* against "0", which the "UNKNOWN" placeholder never equals),
# and the unknown-*state* sentinel is "BR-XX" — "BR-XXXXXXX" is the unknown
# *municipality* sentinel. TODO(review): confirm the customs data uses the
# same unknown sentinels as the MDIC placeholders.
data = data[data["exporter.cnpj"] != "0"]
data = data[data["state"] != "BR-XX"]


def count_states(df):
    """Number of distinct sourcing states per exporter (within one year)."""
    df = df.groupby("exporter")["state"].nunique()
    # Drop the exporter names: only the distribution matters for the box plot.
    return df.reset_index(drop=True).rename("unique_known_states")


data = data.groupby("year").apply(count_states).reset_index()
px.box(
    data,
    y="unique_known_states",
    x="year",
    title="<b>Unique sourcing states per exporter CNPJ</b><br>Unknown states are excluded",
    labels={"unique_known_states": "Number of distinct states"},
)