View or edit on GitHub

This page is synchronized from trase/models/brazil/customs_2019/feasibility_of_2023_run.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Feasibility of 2023 run

This notebook reviews the BOL, MDIC Municipality and MDIC Ports differences.

Note that for MDIC port we are filtering to only consider via=="01" (maritime trade), which makes it more similar to the BOL, but makes its totals different from MDIC Municipality. In the actual run of the model this filtering is not done.

The most relevant findings are the following. More detail around it is found further down.

There are two HS4 fields that almost don't have values in the BOL, but appear with much more values in MDIC:
- 0102 (live animals) has 3 Mill Tonnes in BOL, and 199 both in MDIC Ports and MDIC Municipalities
- 0210 (Meat and edible meat offal; salted, in brine, dried or smoked) has 2 Mill Tonnes in BOL, 3 in MDIC Port, and 165 in MDIC Municipalities
Minor ports are replaced by major ones, both in BOL and MDIC Ports, based on brazil/logistics/ports/brazil_close_ports.csv
Country of destination:
- In BOL, records with country_of_destination_name with "EUROPEAN UNION" or "UNKNOWN" are filtered out
- CHILE and URUGUAY mostly only appear in MDIC Municipalities (not in MDIC Port, or in BOL, as they include only maritime trade).
States and municipalities:
- SANTA CATARINA state (BR-42) has way more records in MDIC Ports (67) and MDIC Municipalities (180), than BOL (5)

Load the data

import polars as pl

pl.Config.set_tbl_rows(500)

# storage_path = "~/Trase/repos/TRASE/trase/data_pipeline/trase-storage-local"
storage_path = "s3://trase-storage"

# Load BOL
df_bol = pl.read_parquet(
    f"{storage_path}/brazil/trade/bol/2023/gold/brazil_bol_2023_gold.parquet"
)
df_bol = df_bol.rename({"net_weight_kg": "vol"})
df_bol = df_bol.with_columns(pl.col("vol").cast(pl.Int64))

# Load MDIC port
df_port = pl.read_parquet(
    f"{storage_path}/brazil/trade/mdic/port/brazil_mdic_port_2023.parquet"
)
df_port = df_port.rename(
    {
        "port.name": "port_of_export_name",
        "country_of_destination.name": "country_of_destination_name",
        "state.trase_id": "state_trase_id",
    }
)
df_port = df_port.filter(pl.col("via") == "01")  # filter only to sea trade
df_brazil_close_ports = pl.read_csv(
    f"{storage_path}/brazil/logistics/ports/brazil_close_ports.csv"
)

# Load MDIC municipality
df_municipality = pl.read_parquet(
    f"{storage_path}/brazil/trade/mdic/municipality/brazil_mdic_municipality_2023.parquet"
)
df_municipality = df_municipality.rename(
    {
        "exporter.municipality.trase_id": "exporter_municipality_trase_id",
        "country_of_destination.name": "country_of_destination_name",
    }
)

Initial HS filtering for BEEF, and column checking

df_bol_beef = df_bol.filter(pl.col("commodity") == pl.lit("BEEF"))
beef_hs6_codes = set(df_bol_beef.select(pl.col("hs6")).unique().to_series())
beef_hs4_codes = set(df_bol_beef.select(pl.col("hs4")).unique().to_series())

df_port_beef = df_port.filter(pl.col("hs6").is_in(beef_hs6_codes))
df_municipality_beef = df_municipality.filter(pl.col("hs4").is_in(beef_hs4_codes))
del df_bol, df_port, df_municipality


# check needed columns exist
BUCKETS = ["country_of_destination_name", "hs4"]
MDIC_MUNICIPALITY_COLUMNS = [*BUCKETS, "exporter_municipality_trase_id"]
MDIC_PORT_COLUMNS = [*BUCKETS, "port_of_export_name", "hs6"]
MDIC_BOL_BLOCKING = list(
    set([*BUCKETS, *MDIC_MUNICIPALITY_COLUMNS, *MDIC_PORT_COLUMNS])
)
NUMERICAL_COLUMN = "vol"

assert not (set(MDIC_BOL_BLOCKING) | {NUMERICAL_COLUMN}) - set(df_bol_beef.columns)
assert not (set(MDIC_MUNICIPALITY_COLUMNS) | {NUMERICAL_COLUMN}) - set(
    df_municipality_beef.columns
)
assert not (set(MDIC_PORT_COLUMNS) | {NUMERICAL_COLUMN}) - set(df_port_beef.columns)

# consolidate data
# df_port_beef_agg = df_port_beef.group_by(MDIC_PORT_COLUMNS).agg(pl.col(NUMERICAL_COLUMN).sum())
# df_municipality_beef_agg = df_municipality_beef.group_by(MDIC_MUNICIPALITY_COLUMNS).agg(
#     pl.col(NUMERICAL_COLUMN).sum()
# )
# df_bol_beef_agg = df_bol_beef.group_by(MDIC_BOL_BLOCKING).agg(pl.col(NUMERICAL_COLUMN).sum())

Supporting functions

def field_comparison_df(field):
    return pl.sql(
        f"""
WITH mdic_port AS (
    SELECT {field}, SUM(vol) AS port_vol
    FROM df_port_beef
    GROUP BY {field}
),
mdic_mun AS (
    SELECT {field}, SUM(vol) AS mun_vol
    FROM df_municipality_beef
    GROUP BY {field}
),
bol AS (
    SELECT {field}, SUM(vol) AS bol_vol
    FROM df_bol_beef
    GROUP BY {field}
),

SELECT
    COALESCE(p.{field}, m.{field}, b.{field}) AS {field},
    COALESCE(p.port_vol, 0) AS port_vol,
    COALESCE(m.mun_vol, 0) AS mun_vol,
    COALESCE(b.bol_vol, 0) AS bol_vol
FROM mdic_port AS p
FULL JOIN mdic_mun AS m
ON p.{field} = m.{field}
FULL JOIN bol AS b
ON b.{field} = p.{field}
ORDER BY port_vol DESC, mun_vol DESC, bol_vol DESC
    """
    ).collect()


# method to show differences of a specific column between MDIC and BOL
def get_column_diff(column, mdic, bol, sort_by="diff"):
    bol_total = bol.select(pl.col(NUMERICAL_COLUMN).sum()).item()
    mdic_total = mdic.select(pl.col(NUMERICAL_COLUMN).sum()).item()
    mdic_grouped = mdic.group_by(column).agg(pl.col(NUMERICAL_COLUMN).sum())
    bol_grouped = bol.group_by(column).agg(pl.col(NUMERICAL_COLUMN).sum())
    df = mdic_grouped.rename({"vol": "vol_mdic"}).join(
        bol_grouped.rename({"vol": "vol_bol"}),
        on=column,
        validate="1:1",
        coalesce=True,
        how="full",
    )
    df = df.fill_null(0)
    df = df.with_columns(
        vol_mdic=pl.col("vol_mdic").cast(pl.Int64),
        diff=(pl.col("vol_mdic") - pl.col("vol_bol")).cast(pl.Int64),
        bol_to_mdic_proportion=(pl.col("vol_bol") / pl.col("vol_mdic")).cast(
            pl.Float32
        ),
        bol_percentage=((pl.col("vol_bol") / bol_total) * 100)
        .cast(pl.Float32)
        .round(2),
        mdic_percentage=((pl.col("vol_mdic") / mdic_total) * 100)
        .cast(pl.Float32)
        .round(2),
    ).sort(pl.col(sort_by).abs(), descending=True)
    return df


def make_plot_df(result_df, comparison_field_str, top_n=None):
    plot_df = (
        result_df
        # .filter(pl.col(comparison_field_str).is_not_null())
        .fill_null(0)
        .with_columns(
            [
                (pl.col("port_vol") / 1e6).alias("port_vol_kton"),
                (pl.col("mun_vol") / 1e6).alias("mun_vol_kton"),
                (pl.col("bol_vol") / 1e6).alias("bol_vol_kton"),
                # Add a helper column to sort by: port + mun
                ((pl.col("port_vol") + pl.col("mun_vol")) / 1e6).alias("sort_vol_kton"),
            ]
        )
        .select(
            [
                comparison_field_str,
                "port_vol_kton",
                "mun_vol_kton",
                "bol_vol_kton",
                "sort_vol_kton",
            ]
        )
        .sort("sort_vol_kton", descending=True)
        .drop("sort_vol_kton")
    )
    if top_n is not None:
        plot_df = plot_df.head(top_n)
    return plot_df


# method to show barplot of volume aggregates on a specific column between MDIC, Port, and BOL
import seaborn as sns
import matplotlib.pyplot as plt


def plot_bar_comparison(plot_df, comparison_field_str):
    # Melt to long format for seaborn
    pd_df = plot_df.to_pandas().melt(
        id_vars=comparison_field_str,
        var_name="Data Source",
        value_name="Volume (Thousand Tonnes)",
    )

    # Plot
    plt.figure(figsize=(14, 7))
    ax = sns.barplot(
        data=pd_df,
        x=comparison_field_str,
        y="Volume (Thousand Tonnes)",
        hue="Data Source",
    )

    # Add labels
    for container in ax.containers:
        ax.bar_label(container, fmt="%.0f", label_type="edge", padding=3)

    plt.title(
        f"Beef Export Volume by {comparison_field_str} and data source (in Thousand Tonnes)"
    )
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Volume (Thousand Tonnes)")
    plt.xlabel(comparison_field_str)
    plt.legend(title="Data Source")
    plt.tight_layout()
    plt.show()

Initial total volume check

BOL is 90.30% (245,795 tonnes less) of MDIC port and 76.51% (702,348 tonnes less) of MDIC municipality

port_vol_sum_tonnes = df_port_beef.select(pl.sum("vol")).item() / 1_000
print(f"{port_vol_sum_tonnes:,.0f} tonnes exported according to MDIC port")
mun_vol_sum_tonnes = df_municipality_beef.select(pl.sum("vol")).item() / 1_000
print(f"{mun_vol_sum_tonnes:,.0f} tonnes exported according to MDIC municipality")
bol_vol_sum_tonnes = df_bol_beef.select(pl.sum("vol")).item() / 1_000
print(f"{bol_vol_sum_tonnes:,.0f} tonnes exported according to BOL")
print(
    f"BOL is {bol_vol_sum_tonnes/port_vol_sum_tonnes:.2%} ({port_vol_sum_tonnes - bol_vol_sum_tonnes:,.0f} tonnes less) of MDIC port and {bol_vol_sum_tonnes/mun_vol_sum_tonnes:.2%} ({mun_vol_sum_tonnes-bol_vol_sum_tonnes:,.0f} tonnes less) of MDIC municipality"
)

Review hs4's

Main findings * BOL doesn't has much of 0102 (live animals), which was already acknowledged by the vendor * BOL and Port don't have much of 0210 (Meat and edible meat offal; salted, in brine, dried or smoked)

# Prepare and plot data
result_hs4 = field_comparison_df("hs4")
plot_df = make_plot_df(result_hs4, "hs4")
plot_bar_comparison(plot_df, "hs4")

hs_codes_desc_df = pl.read_csv(
    "s3://trase-storage/world/metadata/codes/hs/HS2017.csv",
    separator=";",
    infer_schema=False,
    columns=["code", "description"],
).filter(pl.col("code").is_in(beef_hs4_codes))
hs_codes_desc_df.to_pandas()

Adjust port names and review

# csv with ports close to each other
df_brazil_close_ports = pl.read_csv(
    f"{storage_path}/brazil/logistics/ports/brazil_close_ports.csv",
    separator=";",
    columns=["minor_port", "major_port"],
)

port_mapping = df_brazil_close_ports.rename({"minor_port": "port_of_export_name"})

# Update df_port_beef and df_bol_beef
df_port_beef = (
    df_port_beef.join(
        port_mapping, on="port_of_export_name", how="left", validate="m:1"
    )
    .with_columns(
        [
            # Use major_port when available, otherwise keep the original
            pl.coalesce([pl.col("major_port"), pl.col("port_of_export_name")]).alias(
                "port_of_export_name"
            )
        ]
    )
    .drop("major_port")
)

df_bol_beef = (
    df_bol_beef.join(port_mapping, on="port_of_export_name", how="left", validate="m:1")
    .with_columns(
        [
            pl.coalesce([pl.col("major_port"), pl.col("port_of_export_name")]).alias(
                "port_of_export_name"
            )
        ]
    )
    .drop("major_port")
)

# Convert to thousand tonnes and add a source column
import seaborn as sns
import matplotlib.pyplot as plt

# Group and convert to thousand tonnes
df_port_ports = (
    df_port_beef.group_by("port_of_export_name")
    .agg(pl.sum("vol").alias("vol"))
    .with_columns(
        [(pl.col("vol") / 1e6).alias("vol_kton"), pl.lit("port").alias("source")]
    )
    .select(["port_of_export_name", "vol_kton", "source"])
)

df_bol_ports = (
    df_bol_beef.group_by("port_of_export_name")
    .agg(pl.sum("vol").alias("vol"))
    .with_columns(
        [(pl.col("vol") / 1e6).alias("vol_kton"), pl.lit("bol").alias("source")]
    )
    .select(["port_of_export_name", "vol_kton", "source"])
)

# Combine datasets
combined = pl.concat([df_port_ports, df_bol_ports])

# Compute total volume per port
top_ports = (
    combined.group_by("port_of_export_name")
    .agg(pl.sum("vol_kton").alias("total_vol"))
    .sort("total_vol", descending=True)
    .head(10)
)

# Filter original combined data to keep only top 10 ports
filtered = combined.join(
    top_ports.select("port_of_export_name"), on="port_of_export_name", how="inner"
)

# Convert to pandas for plotting
plot_df = filtered.to_pandas()

# Plot
plt.figure(figsize=(14, 7))
ax = sns.barplot(data=plot_df, x="port_of_export_name", y="vol_kton", hue="source")

# Add labels
for container in ax.containers:
    ax.bar_label(container, fmt="%.1f", label_type="edge", padding=3)

plt.title("Top 10 Ports by Beef Export Volume and Data Source (in Thousand Tonnes)")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Volume (Thousand Tonnes)")
plt.xlabel("Port of Export")
plt.legend(title="Data Source")
plt.tight_layout()
plt.show()

png

# Table with differences, ordered by difference
print(str(get_column_diff("port_of_export_name", df_port_beef, df_bol_beef)))

shape: (22, 7)
┌───────────────┬────────────┬────────────┬───────────┬──────────────┬──────────────┬──────────────┐
│ port_of_expor ┆ vol_mdic   ┆ vol_bol    ┆ diff      ┆ bol_to_mdic_ ┆ bol_percenta ┆ mdic_percent │
│ t_name        ┆ ---        ┆ ---        ┆ ---       ┆ proportion   ┆ ge           ┆ age          │
│ ---           ┆ i64        ┆ i64        ┆ i64       ┆ ---          ┆ ---          ┆ ---          │
│ str           ┆            ┆            ┆           ┆ f32          ┆ f32          ┆ f32          │
╞═══════════════╪════════════╪════════════╪═══════════╪══════════════╪══════════════╪══════════════╡
│ BELEM         ┆ 151120107  ┆ 49329753   ┆ 101790354 ┆ 0.326427     ┆ 2.16         ┆ 5.97         │
│ ITAJAI        ┆ 137354667  ┆ 75019136   ┆ 62335531  ┆ 0.546171     ┆ 3.28         ┆ 5.42         │
│ SANTOS        ┆ 1451782800 ┆ 1494659075 ┆ -42876275 ┆ 1.029534     ┆ 65.339996    ┆ 57.310001    │
│ RIO GRANDE    ┆ 99520845   ┆ 58731417   ┆ 40789428  ┆ 0.590142     ┆ 2.57         ┆ 3.93         │
│ SAO FRANCISCO ┆ 127920113  ┆ 88765303   ┆ 39154810  ┆ 0.693912     ┆ 3.88         ┆ 5.05         │
│ DO SUL        ┆            ┆            ┆           ┆              ┆              ┆              │
│ SAO SEBASTIAO ┆ 39192493   ┆ 2986249    ┆ 36206244  ┆ 0.076194     ┆ 0.13         ┆ 1.55         │
│ PARANAGUA     ┆ 522689694  ┆ 515041226  ┆ 7648468   ┆ 0.985367     ┆ 22.52        ┆ 20.629999    │
│ IMBITUBA      ┆ 822819     ┆ 0          ┆ 822819    ┆ 0.0          ┆ 0.0          ┆ 0.03         │
│ SALVADOR      ┆ 1833504    ┆ 2030928    ┆ -197424   ┆ 1.107676     ┆ 0.09         ┆ 0.07         │
│ SUAPE         ┆ 9619       ┆ 198629     ┆ -189010   ┆ 20.649652    ┆ 0.01         ┆ 0.0          │
│ RIO DE        ┆ 611602     ┆ 515390     ┆ 96212     ┆ 0.842689     ┆ 0.02         ┆ 0.02         │
│ JANEIRO       ┆            ┆            ┆           ┆              ┆              ┆              │
│ SAO LUIS      ┆ 89651      ┆ 0          ┆ 89651     ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ VITORIA       ┆ 71366      ┆ 0          ┆ 71366     ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ FORTALEZA     ┆ 254666     ┆ 237729     ┆ 16937     ┆ 0.933493     ┆ 0.01         ┆ 0.01         │
│ IRF CAMPOS    ┆ 14089      ┆ 0          ┆ 14089     ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ DOS           ┆            ┆            ┆           ┆              ┆              ┆              │
│ GOYTACAZES    ┆            ┆            ┆           ┆              ┆              ┆              │
│ MANAUS        ┆ 6821       ┆ 0          ┆ 6821      ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ PORTO ALEGRE  ┆ 5251       ┆ 0          ┆ 5251      ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ MACEIO        ┆ 3331       ┆ 0          ┆ 3331      ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ SANTANA       ┆ 2916       ┆ 0          ┆ 2916      ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ NEW PORT      ┆ 1615       ┆ 0          ┆ 1615      ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ CABEDELO      ┆ 811        ┆ 0          ┆ 811       ┆ 0.0          ┆ 0.0          ┆ 0.0          │
│ ARACAJU       ┆ 625        ┆ 0          ┆ 625       ┆ 0.0          ┆ 0.0          ┆ 0.0          │
└───────────────┴────────────┴────────────┴───────────┴──────────────┴──────────────┴──────────────┘

from trase.tools.sei_pcs.plotting import dumbbell_compare

dumbbell_compare(
    df_port_beef.to_pandas(),
    df_bol_beef.to_pandas(),
    "vol",
    ["port_of_export_name"],
    labels=("MDIC Port", "BoL"),
    max_rows=15,
)

Adjust countries of destination

Taking away UNKNOWN COUNTRY and EUROPEAN UNION from BOL, as they wouldn't have corresponding values in MDIC

# Filter out when "EUROPEAN UNION" or "UNKNOWN COUNTRY" is as a country in BOL
df_bol_beef = df_bol_beef.filter(
    (pl.col("country_of_destination_name") != "EUROPEAN UNION")
    & (pl.col("country_of_destination_name") != "UNKNOWN COUNTRY")
)

result_country = field_comparison_df("country_of_destination_name")
plot_df = make_plot_df(result_country, "country_of_destination_name", top_n=20)
plot_bar_comparison(plot_df, "country_of_destination_name")

Review states

Although MDIC municipalities does not include state, we take it from the parent trase_id. We can see that SANTA CATARINA (BR-42) has way more records in MDIC Ports (67), and MDIC Municipalities (180), than BOL (5). Will continue to check states when reviewing municipalities.

df_regions = pl.read_parquet(
    f"{storage_path}/postgres_views/postgres_regions.parquet"
).filter(pl.col("country") == "BRAZIL")

# Query to bring in the State in 'df_municipalities_beef', based in its exporter_municipality_trase_id and df_regions
# and then aggregate the volume by state
result = pl.sql(
    """
WITH mdic_port_state AS (
    SELECT "state_trase_id" AS state, SUM(vol) AS port_vol
    FROM df_port_beef
    GROUP BY "state_trase_id"
),
mdic_mun_state AS (
    SELECT
        r.parent_trase_id AS state,
        SUM(vol) AS mun_vol
    FROM df_municipality_beef AS m
    LEFT JOIN df_regions AS r
    ON m."exporter_municipality_trase_id" = r.trase_id
    WHERE r.country = 'BRAZIL'
    AND r.level = 6
    GROUP BY r.parent_trase_id
),
bol_state AS (
    SELECT exporter_state_trase_id AS state,
    SUM(vol) AS bol_vol
    FROM df_bol_beef
    GROUP BY exporter_state_trase_id
),
joined AS (
    SELECT
        COALESCE(p.state, m.state, b.state) AS state_id,
        COALESCE(p.port_vol, 0) AS port_vol,
        COALESCE(m.mun_vol, 0) AS mun_vol,
        COALESCE(b.bol_vol, 0) AS bol_vol
    FROM mdic_port_state AS p
    FULL JOIN mdic_mun_state AS m
    ON p.state = m.state
    FULL JOIN bol_state AS b
    ON b.state = p.state
)
SELECT
    r.name AS state_name,
    j.port_vol,
    j.mun_vol,
    j.bol_vol
FROM joined AS j
LEFT JOIN df_regions AS r
ON j.state_id = r.trase_id
ORDER BY port_vol DESC, mun_vol DESC
"""
).collect()

# Prepare data: fill nulls with 0, drop null states, and convert kg to thousand tonnes
plot_df = (
    result.filter(pl.col("state_name").is_not_null())
    .fill_null(0)
    .with_columns(
        [
            (pl.col("port_vol") / 1e6).alias("port_vol_kton"),
            (pl.col("mun_vol") / 1e6).alias("mun_vol_kton"),
            (pl.col("bol_vol") / 1e6).alias("bol_vol_kton"),
            # Add a helper column to sort by: port + mun
            ((pl.col("port_vol") + pl.col("mun_vol")) / 1e6).alias("sort_vol_kton"),
        ]
    )
    .select(
        ["state_name", "port_vol_kton", "mun_vol_kton", "bol_vol_kton", "sort_vol_kton"]
    )
    .sort("sort_vol_kton", descending=True)
    .drop("sort_vol_kton")
    .head(20)
)

# Melt to long format for seaborn
pd_df = plot_df.to_pandas().melt(
    id_vars="state_name", var_name="Source", value_name="Volume (Thousand Tonnes)"
)

# Plot
plt.figure(figsize=(14, 7))
ax = sns.barplot(data=pd_df, x="state_name", y="Volume (Thousand Tonnes)", hue="Source")

# Add labels
for container in ax.containers:
    ax.bar_label(container, fmt="%.0f", label_type="edge", padding=3)

plt.title("Beef Export Volume by State and source (in Thousand Tonnes)")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Volume (Thousand Tonnes)")
plt.xlabel("State name")
plt.legend(title="Source")
plt.tight_layout()
plt.show()

png

Review municipalities

Differences with MDIC Municipality

Several of the municipalities with most differences are from SANTA CATARINA and PARANA, which makes sense looking at the state differences

dumbbell_compare(
    df_municipality_beef.to_pandas(),
    df_bol_beef.to_pandas(),
    "vol",
    ["exporter_municipality_trase_id"],
    labels=("MDIC Municipalities", "BoL"),
    max_rows=15,
)

dumbbell_compare(
    df_municipality_beef.to_pandas(),
    df_bol_beef.to_pandas(),
    "vol",
    ["exporter_municipality_trase_id"],
    labels=("MDIC Municipalities", "BoL"),
    max_rows=15,
)

get_column_diff(
    "exporter_municipality_trase_id",
    df_municipality_beef,
    df_bol_beef,
    sort_by="vol_mdic",
).head(20)

Final total volume check

BOL is 100.30% (6,876 tonnes more) of MDIC Port and 98.96% (23,943 tonnes less) of MDIC Municipality

port_vol_sum_tonnes = df_port_beef.select(pl.sum("vol")).item() / 1_000
print(f"{port_vol_sum_tonnes:,.0f} tonnes exported according to MDIC port")
mun_vol_sum_tonnes = df_municipality_beef.select(pl.sum("vol")).item() / 1_000
print(f"{mun_vol_sum_tonnes:,.0f} tonnes exported according to MDIC municipality")
bol_vol_sum_tonnes = df_bol_beef.select(pl.sum("vol")).item() / 1_000
print(f"{bol_vol_sum_tonnes:,.0f} tonnes exported according to BOL")
print(
    f"BOL is {bol_vol_sum_tonnes/port_vol_sum_tonnes:.2%} ({port_vol_sum_tonnes - bol_vol_sum_tonnes:,.0f} tonnes less) of MDIC port and {bol_vol_sum_tonnes/mun_vol_sum_tonnes:.2%} ({mun_vol_sum_tonnes-bol_vol_sum_tonnes:,.0f} tonnes less) of MDIC municipality"
)

Quick check of month totals and FOB totals

Have in mind that the date of BOL is the date of arrival at the destination port. I can imagine that in MDIC its the date of departure

# Add a month (int) column in df_bol_beef based on the date field
df_bol_beef = df_bol_beef.with_columns(
    pl.col("date").dt.month().cast(pl.Int64).alias("month")
)
result_months = field_comparison_df("month")
plot_df = make_plot_df(result_months, "month")
plot_bar_comparison(plot_df, "month")

png

# Show the total FOB values
fob_bol = df_bol_beef.select(pl.sum("fob")).item()
fob_port = df_port_beef.select(pl.sum("fob")).item()
fob_municipality = df_municipality_beef.select(pl.sum("fob")).item()
print(f"Total FOB value according to BOL: ${fob_bol:,.0f}")
print(f"Total FOB value according to MDIC port: ${fob_port:,.0f}")
print(f"Total FOB value according to MDIC municipality: ${fob_municipality:,.0f}")