Skip to content

2023BR BOL COMTRADE MDIC

View or edit on GitHub

This page is synchronized from trase/models/brazil/qa_bol/2023BR_BOL_COMTRADE_MDIC.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
import pandas as pd
import numpy as np
def load_data():
    """Load BOL, MDIC, and Comtrade data"""
    df_bol = pd.read_parquet(
        "s3://trase-storage/brazil/trade/bol/2023/silver/brazil_bol_2023_silver.parquet",
        columns=["hs4", "net_weight_kg", "exporter_group"],
    )
    df_bol = df_bol.rename(
        columns={"net_weight_kg": "vol", "exporter_group": "exporter.group"},
        errors="raise",
    )
    df_bol["mass_tonnes"] = df_bol["vol"] / 1_000
    df_bol["year"] = 2023

    df_comtrade = pd.read_parquet(
        "s3://trase-storage/world/trade/statistical_data/comtrade/comtrade_exports_year_exporter_hs6.parquet",
        columns=["year", "country_of_export", "commodity_code", "net_weight_kg"],
    )

    df_comtrade = df_comtrade[
        (df_comtrade["country_of_export"] == "Brazil") & (df_comtrade["year"] == 2023)
    ]
    df_comtrade["hs4"] = df_comtrade["commodity_code"].str[:4]
    df_comtrade["mass_tonnes"] = df_comtrade["net_weight_kg"] / 1_000

    """
    df_mdic = get_pandas_df_once(
        "brazil/trade/mdic/port/archive/brazil_mdic_port_2022.csv",
        encoding="utf8",
        sep=";",
        dtype=str,
        keep_default_na=False,
    )
    df_mdic = df_mdic[
        ["month", "hs6", "country_of_destination.name", "vol", "fob", "via"]
    ]
    df_mdic["vol"] = df_mdic["vol"].astype(float)
    df_mdic["hs4"] = df_mdic["hs6"].str[:4]
    """
    return df_bol, df_comtrade
df_bol, df_comtrade = load_data()

Compare vol per hs4 between Comtrade, Mdic, and BOL

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

df_bol_hs4 = df_bol.groupby("hs4")["mass_tonnes"].sum().reset_index(name="mass_tonnes")
df_bol_hs4["source"] = "BOL"

hs4_in_bol = df_bol_hs4["hs4"].drop_duplicates().to_list()

# filter and focus on maritime data in the three datasets
df_comtrade_hs4 = df_comtrade[df_comtrade["hs4"].isin(hs4_in_bol)].copy()
# df_comtrade_hs4 = df_comtrade_hs4[df_comtrade_hs4["transportation_method"] == "Sea"]
df_comtrade_hs4 = (
    df_comtrade_hs4.groupby("hs4")["mass_tonnes"].sum().reset_index(name="mass_tonnes")
)
df_comtrade_hs4["source"] = "COMTRADE"

"""
df_mdic_hs4 = df_mdic[df_mdic["hs4"].isin(hs4_list)].copy()
df_mdic_hs4 = df_mdic_hs4[df_mdic_hs4["via"] == "01"]
df_mdic_hs4 = df_mdic_hs4.groupby("hs4")["vol"].sum().reset_index(name="vol")
df_mdic_hs4["source"] = "MDIC"

df_comparison_hs4 = pd.concat([df_bol_hs4, df_comtrade_hs4, df_mdic_hs4])
"""
df_comparison_hs4 = pd.concat([df_bol_hs4, df_comtrade_hs4])

# Load hs codes information
df_hs4 = pd.read_parquet(
    "s3://trase-storage/postgres_views/postgres_commodities.parquet",
    columns=["commodity", "hs_code"],
)
df_hs4["hs4"] = df_hs4["hs_code"].astype(str).str[:4]
df_hs4 = df_hs4.drop_duplicates()
df_hs4_description = get_pandas_df_once(
    f"world/metadata/codes/hs/HS2017.csv",
    encoding="utf8",
    sep=";",
    dtype=str,
    keep_default_na=False,
    usecols=["code", "description"],
)
df_hs4 = df_hs4.merge(
    df_hs4_description.rename(columns={"code": "hs4"}), on="hs4", how="left"
)
# Show the commodities available in the BOL
df_hs4[df_hs4["hs4"].isin(hs4_in_bol)][
    ["commodity", "hs4", "description"]
].drop_duplicates().sort_values(by="commodity").reset_index(drop=True)
# plot
plt.figure(figsize=(10, 15))
ax = sns.barplot(
    data=df_comparison_hs4, x="mass_tonnes", y="hs4", hue="source", orient="h"
)

# Format x-axis to use thousands separator
ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"{int(x):,}"))

# Add text labels with thousands separator
for container in ax.containers:
    ax.bar_label(container, fmt=lambda x: f"{int(x):,}", label_type="edge", padding=3)

# Titles and labels
plt.title("Tonnes per HS4: Comtrade versus BOL", fontsize=14, fontweight="bold")
plt.xlabel("Tonnes")
plt.ylabel("HS4 Code")
plt.legend(title="Source")

plt.show()

png

interested_commodities = ["SOY", "BEEF"]
lookup = df_comparison_hs4.loc[:,]
for c in interested_commodities:
    df_hs4_c = df_hs4[df_hs4["commodity"] == c]
    list_hs4_c = list(df_hs4_c["hs4"])
    df_comparison_hs4_c = df_comparison_hs4[df_comparison_hs4["hs4"].isin(list_hs4_c)]

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=df_comparison_hs4_c, x="mass_tonnes", y="hs4", hue="source", orient="h"
    )

    # Format x-axis with thousands separator
    ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"{int(x):,}"))

    # Add text labels with thousands separator
    for container in ax.containers:
        ax.bar_label(
            container, fmt=lambda x: f"{int(x):,}", label_type="edge", padding=3
        )

    # Titles and labels
    plt.title(
        f"Tonnes per {c} HS4: Comtrade versus BOL", fontsize=14, fontweight="bold"
    )
    plt.xlabel("Tonnes")
    plt.ylabel("HS4 Code")
    plt.legend(title="Source")
    plt.show()

png

png

Pending description....

Compare exporter trend in BOL per year

def load_bols():
    """Load bol data in previous years"""
    df_list = []
    for year in [2018, 2019, 2020, 2021, 2022]:
        if (year == 2018) | (year == 2020):
            cols = ["hs4", "exporter.label", "vol"]
        else:
            cols = ["hs4", "exporter.group", "vol"]
        df_bol = get_pandas_df_once(
            f"brazil/trade/bol/{year}/BRAZIL_BOL_{year}.csv",
            encoding="utf8",
            sep=";",
            dtype=str,
            keep_default_na=False,
            usecols=cols,
        )
        if (year == 2018) | (year == 2020):
            df_bol["exporter.group"] = df_bol["exporter.label"]
            df_bol = df_bol.drop(columns=["exporter.label"])
        df_bol["vol"] = df_bol["vol"].astype(float)
        df_bol["year"] = year
        df_list.append(df_bol)
    df_bols = pd.concat(df_list)
    return df_bols
df_bols = load_bols()
df_bols = pd.concat([df_bols, df_bol[["exporter.group", "vol", "hs4", "year"]]])
# Adjust some groups so they have the same name
df_bols["exporter.group"] = df_bols["exporter.group"].str.replace(
    r"^(ADM|AGROEXPORT TRADING|AMAGGI|BUNGE|BRF|CARGILL|CHS|CJ INTERNATIONAL|COAMO|COFCO|ENGELHART|FRIGOL|GAVILON|GLENCORE|JBS|MARFRIG|MARUBENI|MERCURIO|MINERVA|NATURAFRIG|OLAM|PLENA|SEARA|LOUIS DREYFUS|UNKNOWN).*",  # pattern
    r"\1",  # replace with the captured group
    regex=True,
)
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np


def plot_stacked_100_bar(
    df_bols, df_hs4, interested_commodities, top_n=10, max_legend=20
):
    """
    Plots stacked 100% bar charts per commodity (mimicking Plotly's barmode='stack'),
    grouping small exporters as 'Others'.

    :param df_bols: DataFrame with columns ['hs4', 'year', 'exporter.group', 'vol']
    :param df_hs4: DataFrame with columns ['commodity', 'hs_code', 'hs4']
    :param interested_commodities: list of commodity names to plot
    :param top_n: how many exporters (by share per year) to keep individually
    :param max_legend: max number of legend entries (including 'Others')
    """

    for commodity in interested_commodities:
        # 1) Filter to this commodity
        hs4_codes = df_hs4.loc[df_hs4["commodity"] == commodity, "hs4"]
        df_commodity = df_bols[df_bols["hs4"].isin(hs4_codes)]
        if df_commodity.empty:
            print(f"No data for commodity: {commodity}")
            continue

        # 2) Aggregate volumes by (year, exporter)
        df_grouped = df_commodity.groupby(["year", "exporter.group"], as_index=False)[
            "vol"
        ].sum()

        # 3) Compute share of total volume per year as percentage
        df_grouped["pct"] = df_grouped.groupby("year")["vol"].transform(
            lambda x: x / x.sum() * 100
        )

        # 4) Identify top_n exporters per year, others => "Others"
        df_grouped["rank"] = df_grouped.groupby("year")["pct"].rank(
            ascending=False, method="first"
        )
        df_grouped.loc[df_grouped["rank"] > top_n, "exporter.group"] = "Others"

        # 5) Re-summarize after grouping into "Others"
        df_grouped = df_grouped.groupby(["year", "exporter.group"], as_index=False)[
            "pct"
        ].sum()

        # Make sure each year sums to 100 again after merging into "Others"
        df_grouped["pct"] = df_grouped.groupby("year")["pct"].transform(
            lambda x: x / x.sum() * 100
        )

        # 6) Pivot so rows = years, columns = exporters, values = pct
        df_pivot = df_grouped.pivot(
            index="year", columns="exporter.group", values="pct"
        ).fillna(0)

        # 7) If there are too many exporters, keep the top by overall sum, plus "Others"
        #    (some "Others" might appear in this step – that’s okay)
        sums = df_pivot.sum().sort_values(ascending=False)
        top_exporters = sums.index[
            :max_legend
        ]  # keep up to `max_legend` biggest columns
        # anything not in that list => merge into "Others"
        columns_not_in_top = df_pivot.columns.difference(top_exporters)
        if len(columns_not_in_top) > 0:
            df_pivot["Others"] = df_pivot.get("Others", 0) + df_pivot[
                columns_not_in_top
            ].sum(axis=1)
            df_pivot.drop(columns=columns_not_in_top, inplace=True)

        # Re-sort columns so "Others" is last
        cols = [c for c in df_pivot.columns if c != "Others"]
        if "Others" in df_pivot.columns:
            cols.append("Others")
        df_pivot = df_pivot[cols]

        # 8) Make a stacked 100% bar chart with pandas
        #    By default each row becomes a cluster of bars; 'stacked=True' stacks them
        plt.figure(figsize=(12, 6))
        ax = df_pivot.plot(
            kind="bar", stacked=True, ax=plt.gca()  # use the current axes
        )

        # 9) Format the y-axis from 0..100 with percentage labels
        ax.set_ylim(0, 100)
        ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f"{x:.0f}%"))

        # 10) Label, Legend, Title
        ax.set_title(
            f"Volume percentage per exporter per year in {commodity}",
            fontsize=14,
            fontweight="bold",
        )
        ax.set_xlabel("Year")
        ax.set_ylabel("Share of Volume (%)")

        ax.legend(title="Exporter", bbox_to_anchor=(1.05, 1), loc="upper left")

        plt.tight_layout()
        plt.show()

        print(f"Plotted commodity: {commodity}")
plot_stacked_100_bar(
    df_bols, df_hs4, interested_commodities=["BEEF", "SOY"], top_n=15, max_legend=25
)