2023BR BOL COMTRADE MDIC
View or edit on GitHub
This page is synchronized from trase/models/brazil/qa_bol/2023BR_BOL_COMTRADE_MDIC.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
import pandas as pd
import numpy as np
def load_data():
"""Load BOL, MDIC, and Comtrade data"""
df_bol = pd.read_parquet(
"s3://trase-storage/brazil/trade/bol/2023/silver/brazil_bol_2023_silver.parquet",
columns=["hs4", "net_weight_kg", "exporter_group"],
)
df_bol = df_bol.rename(
columns={"net_weight_kg": "vol", "exporter_group": "exporter.group"},
errors="raise",
)
df_bol["mass_tonnes"] = df_bol["vol"] / 1_000
df_bol["year"] = 2023
df_comtrade = pd.read_parquet(
"s3://trase-storage/world/trade/statistical_data/comtrade/comtrade_exports_year_exporter_hs6.parquet",
columns=["year", "country_of_export", "commodity_code", "net_weight_kg"],
)
df_comtrade = df_comtrade[
(df_comtrade["country_of_export"] == "Brazil") & (df_comtrade["year"] == 2023)
]
df_comtrade["hs4"] = df_comtrade["commodity_code"].str[:4]
df_comtrade["mass_tonnes"] = df_comtrade["net_weight_kg"] / 1_000
"""
df_mdic = get_pandas_df_once(
"brazil/trade/mdic/port/archive/brazil_mdic_port_2022.csv",
encoding="utf8",
sep=";",
dtype=str,
keep_default_na=False,
)
df_mdic = df_mdic[
["month", "hs6", "country_of_destination.name", "vol", "fob", "via"]
]
df_mdic["vol"] = df_mdic["vol"].astype(float)
df_mdic["hs4"] = df_mdic["hs6"].str[:4]
"""
return df_bol, df_comtrade
df_bol, df_comtrade = load_data()
Compare vol per hs4 between Comtrade, Mdic, and BOL
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
df_bol_hs4 = df_bol.groupby("hs4")["mass_tonnes"].sum().reset_index(name="mass_tonnes")
df_bol_hs4["source"] = "BOL"
hs4_in_bol = df_bol_hs4["hs4"].drop_duplicates().to_list()
# filter and focus on maritime data in the three datasets
df_comtrade_hs4 = df_comtrade[df_comtrade["hs4"].isin(hs4_in_bol)].copy()
# df_comtrade_hs4 = df_comtrade_hs4[df_comtrade_hs4["transportation_method"] == "Sea"]
df_comtrade_hs4 = (
df_comtrade_hs4.groupby("hs4")["mass_tonnes"].sum().reset_index(name="mass_tonnes")
)
df_comtrade_hs4["source"] = "COMTRADE"
"""
df_mdic_hs4 = df_mdic[df_mdic["hs4"].isin(hs4_list)].copy()
df_mdic_hs4 = df_mdic_hs4[df_mdic_hs4["via"] == "01"]
df_mdic_hs4 = df_mdic_hs4.groupby("hs4")["vol"].sum().reset_index(name="vol")
df_mdic_hs4["source"] = "MDIC"
df_comparison_hs4 = pd.concat([df_bol_hs4, df_comtrade_hs4, df_mdic_hs4])
"""
df_comparison_hs4 = pd.concat([df_bol_hs4, df_comtrade_hs4])
# Load hs codes information
df_hs4 = pd.read_parquet(
"s3://trase-storage/postgres_views/postgres_commodities.parquet",
columns=["commodity", "hs_code"],
)
df_hs4["hs4"] = df_hs4["hs_code"].astype(str).str[:4]
df_hs4 = df_hs4.drop_duplicates()
df_hs4_description = get_pandas_df_once(
f"world/metadata/codes/hs/HS2017.csv",
encoding="utf8",
sep=";",
dtype=str,
keep_default_na=False,
usecols=["code", "description"],
)
df_hs4 = df_hs4.merge(
df_hs4_description.rename(columns={"code": "hs4"}), on="hs4", how="left"
)
# Show the commodities available in the BOL
df_hs4[df_hs4["hs4"].isin(hs4_in_bol)][
["commodity", "hs4", "description"]
].drop_duplicates().sort_values(by="commodity").reset_index(drop=True)
# plot
plt.figure(figsize=(10, 15))
ax = sns.barplot(
data=df_comparison_hs4, x="mass_tonnes", y="hs4", hue="source", orient="h"
)
# Format x-axis to use thousands separator
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"{int(x):,}"))
# Add text labels with thousands separator
for container in ax.containers:
ax.bar_label(container, fmt=lambda x: f"{int(x):,}", label_type="edge", padding=3)
# Titles and labels
plt.title("Tonnes per HS4: Comtrade versus BOL", fontsize=14, fontweight="bold")
plt.xlabel("Tonnes")
plt.ylabel("HS4 Code")
plt.legend(title="Source")
plt.show()

interested_commodities = ["SOY", "BEEF"]
lookup = df_comparison_hs4.loc[:,]
for c in interested_commodities:
df_hs4_c = df_hs4[df_hs4["commodity"] == c]
list_hs4_c = list(df_hs4_c["hs4"])
df_comparison_hs4_c = df_comparison_hs4[df_comparison_hs4["hs4"].isin(list_hs4_c)]
plt.figure(figsize=(10, 6))
ax = sns.barplot(
data=df_comparison_hs4_c, x="mass_tonnes", y="hs4", hue="source", orient="h"
)
# Format x-axis with thousands separator
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"{int(x):,}"))
# Add text labels with thousands separator
for container in ax.containers:
ax.bar_label(
container, fmt=lambda x: f"{int(x):,}", label_type="edge", padding=3
)
# Titles and labels
plt.title(
f"Tonnes per {c} HS4: Comtrade versus BOL", fontsize=14, fontweight="bold"
)
plt.xlabel("Tonnes")
plt.ylabel("HS4 Code")
plt.legend(title="Source")
plt.show()


Pending description....
Compare exporter trend in BOL per year
def load_bols():
"""Load bol data in previous years"""
df_list = []
for year in [2018, 2019, 2020, 2021, 2022]:
if (year == 2018) | (year == 2020):
cols = ["hs4", "exporter.label", "vol"]
else:
cols = ["hs4", "exporter.group", "vol"]
df_bol = get_pandas_df_once(
f"brazil/trade/bol/{year}/BRAZIL_BOL_{year}.csv",
encoding="utf8",
sep=";",
dtype=str,
keep_default_na=False,
usecols=cols,
)
if (year == 2018) | (year == 2020):
df_bol["exporter.group"] = df_bol["exporter.label"]
df_bol = df_bol.drop(columns=["exporter.label"])
df_bol["vol"] = df_bol["vol"].astype(float)
df_bol["year"] = year
df_list.append(df_bol)
df_bols = pd.concat(df_list)
return df_bols
df_bols = load_bols()
df_bols = pd.concat([df_bols, df_bol[["exporter.group", "vol", "hs4", "year"]]])
# Adjust some groups so they have the same name
df_bols["exporter.group"] = df_bols["exporter.group"].str.replace(
r"^(ADM|AGROEXPORT TRADING|AMAGGI|BUNGE|BRF|CARGILL|CHS|CJ INTERNATIONAL|COAMO|COFCO|ENGELHART|FRIGOL|GAVILON|GLENCORE|JBS|MARFRIG|MARUBENI|MERCURIO|MINERVA|NATURAFRIG|OLAM|PLENA|SEARA|LOUIS DREYFUS|UNKNOWN).*", # pattern
r"\1", # replace with the captured group
regex=True,
)
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
def plot_stacked_100_bar(
df_bols, df_hs4, interested_commodities, top_n=10, max_legend=20
):
"""
Plots stacked 100% bar charts per commodity (mimicking Plotly's barmode='stack'),
grouping small exporters as 'Others'.
:param df_bols: DataFrame with columns ['hs4', 'year', 'exporter.group', 'vol']
:param df_hs4: DataFrame with columns ['commodity', 'hs_code', 'hs4']
:param interested_commodities: list of commodity names to plot
:param top_n: how many exporters (by share per year) to keep individually
:param max_legend: max number of legend entries (including 'Others')
"""
for commodity in interested_commodities:
# 1) Filter to this commodity
hs4_codes = df_hs4.loc[df_hs4["commodity"] == commodity, "hs4"]
df_commodity = df_bols[df_bols["hs4"].isin(hs4_codes)]
if df_commodity.empty:
print(f"No data for commodity: {commodity}")
continue
# 2) Aggregate volumes by (year, exporter)
df_grouped = df_commodity.groupby(["year", "exporter.group"], as_index=False)[
"vol"
].sum()
# 3) Compute share of total volume per year as percentage
df_grouped["pct"] = df_grouped.groupby("year")["vol"].transform(
lambda x: x / x.sum() * 100
)
# 4) Identify top_n exporters per year, others => "Others"
df_grouped["rank"] = df_grouped.groupby("year")["pct"].rank(
ascending=False, method="first"
)
df_grouped.loc[df_grouped["rank"] > top_n, "exporter.group"] = "Others"
# 5) Re-summarize after grouping into "Others"
df_grouped = df_grouped.groupby(["year", "exporter.group"], as_index=False)[
"pct"
].sum()
# Make sure each year sums to 100 again after merging into "Others"
df_grouped["pct"] = df_grouped.groupby("year")["pct"].transform(
lambda x: x / x.sum() * 100
)
# 6) Pivot so rows = years, columns = exporters, values = pct
df_pivot = df_grouped.pivot(
index="year", columns="exporter.group", values="pct"
).fillna(0)
# 7) If there are too many exporters, keep the top by overall sum, plus "Others"
# (some "Others" might appear in this step – that’s okay)
sums = df_pivot.sum().sort_values(ascending=False)
top_exporters = sums.index[
:max_legend
] # keep up to `max_legend` biggest columns
# anything not in that list => merge into "Others"
columns_not_in_top = df_pivot.columns.difference(top_exporters)
if len(columns_not_in_top) > 0:
df_pivot["Others"] = df_pivot.get("Others", 0) + df_pivot[
columns_not_in_top
].sum(axis=1)
df_pivot.drop(columns=columns_not_in_top, inplace=True)
# Re-sort columns so "Others" is last
cols = [c for c in df_pivot.columns if c != "Others"]
if "Others" in df_pivot.columns:
cols.append("Others")
df_pivot = df_pivot[cols]
# 8) Make a stacked 100% bar chart with pandas
# By default each row becomes a cluster of bars; 'stacked=True' stacks them
plt.figure(figsize=(12, 6))
ax = df_pivot.plot(
kind="bar", stacked=True, ax=plt.gca() # use the current axes
)
# 9) Format the y-axis from 0..100 with percentage labels
ax.set_ylim(0, 100)
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"{x:.0f}%"))
# 10) Label, Legend, Title
ax.set_title(
f"Volume percentage per exporter per year in {commodity}",
fontsize=14,
fontweight="bold",
)
ax.set_xlabel("Year")
ax.set_ylabel("Share of Volume (%)")
ax.legend(title="Exporter", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
print(f"Plotted commodity: {commodity}")
plot_stacked_100_bar(
df_bols, df_hs4, interested_commodities=["BEEF", "SOY"], top_n=15, max_legend=25
)