Skip to content

Trase Data Wiki

Main

Main

View or edit on GitHub

This page is synchronized from trase/models/brazil/corn_2019/Soy/main.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

%configure_logging

from trase.tools import sps
import pandas as pd
import numpy as np
from functools import partial

import pandas as pd

supplychain = sps.SupplyChain("brazil/soy_2019", year=2019, commodity="soy")
supplychain.preparation()
supplychain.load()
supplychain.run()
supplychain.flow_report_by_attribute("vol", ["branch"], 2)
# supplychain.export_results()
# supplychain.upload_results()

"""
The result of this is a dataframe that looks like:

    logistics_hub_trase_id  hs6     port_of_export exporter_label                                       
    BR-1506807              120100  BARCARENA      CARGILL         0.020303
                                    PARANAGUA      CARGILL         0.076062
                                    SANTAREM       CARGILL         0.903635
                            120190  PARANAGUA      CARGILL         1.000000
    BR-1720655              120100  SANTOS         FIAGRIL         0.348260
                                                                        ...   

TODO:
   - Exclude non-sticky (should be done in the decision tree..)
   - Should we also break this out per branch?
"""

df_flows = supplychain.get("flows")
df_flows = df_flows[df_flows["status"].str.startswith("TO LP1")]
df_flows = df_flows[df_flows["logistics_hub_trase_id"] != "BR-XXXXXXX"]


def market_shares_to_port(df):
    total_volume = df["vol"].sum()
    return df.groupby(["port_of_export", "exporter_label"])["vol"].sum() / total_volume


df_market_share = df_flows.groupby(["hs6", "logistics_hub_trase_id"]).apply(
    market_shares_to_port
)
display(df_market_share)
# df_flow = consolidate(df_flows, ["vol"], ["exporter_label", "port_of_export", "hs6", "branch", ""]

Investigating the UNKNOWN branch

Now let's investigate the UNKNOWN branch to see what's in there. The vast majority is UNKNOWN volume. We have a possible hope for the 16% of total export volume which is in the UNKNOWN branch but with a known CNPJ.

Let's break down the UNKNOWN branch. Percentages below are relative to total export volume:

df_flows = supplychain.get("flows")

# is the cnpj known?
df_flows["cnpj_known"] = np.where(df_flows["exporter_cnpj"] == "0", "UNKNOWN", "KNOWN")
df_flows["label_known"] = np.where(
    df_flows["exporter_label"].str.contains("UNKNOWN"), "UNKNOWN", "KNOWN"
)

# let's see how much is in the CNPJ 2019 dataset
df_cnpj = supplychain.get("cnpj")
df_cnpj = df_cnpj.rename(columns={"cnpj": "exporter_cnpj"})
df_cnpj = pd.merge(df_flows[["exporter_cnpj"]], df_cnpj, on="exporter_cnpj")
df_cnpj = df_cnpj.drop_duplicates()
df_cnpj = df_cnpj.groupby("exporter_cnpj").agg(list).reset_index()

df = pd.merge(
    df_flows,
    df_cnpj,
    on="exporter_cnpj",
    how="left",
    validate="many_to_one",
    indicator=True,
)
df["in_cnpj_2019"] = np.where(df.pop("_merge") == "both", True, False)

# now let's see how much is in the knowledge base
df_trase = supplychain.get("trase")
df_trase = df_trase.drop(columns="vol").drop_duplicates()
df_trase = df_trase.groupby("exporter_cnpj").agg(list).reset_index()

df = pd.merge(
    df,
    df_trase,
    how="left",
    on="exporter_cnpj",
    validate="many_to_one",
    indicator=True,
    suffixes=("", "_trase"),
)
df["in_old_trase_data"] = np.where(df.pop("_merge") == "both", True, False)
df["level"] = df["level"].apply(
    lambda x: str(tuple(sorted(x if isinstance(x, list) else [])))
)

# report
df_report = report_percentage_by_attribute(
    df,
    "vol",
    by=[
        "branch",
        "cnpj_known",
        "label_known",
        "in_cnpj_2019",
        "in_old_trase_data",
        "level",
    ],
)
df_report = df_report[df_report.index.get_level_values(0) == "UNKNOWN"]
print(
    df_report.to_string(
        formatters={
            "sum": partial(format_float, significant_digits=2),
            "percentage": lambda f: f"{f:.0f}%",
        }
    )
)

Let's print the biggest exporters with a KNOWN label

df_flows[
    ["hs4", "port_of_export_group", "exporter_name", "logistics_hub.trase_id", "vol"]
].sample(n=10).astype({"vol": float})

Feasibility of LP

df_share = supplychain.get("marketshare")
df_flows = supplychain.get("flows")

df_flows_solved = df_flows[
    df_flows["logistics_hub.trase_id"] != "BR-XXXXXXX"
]  # TODO use branch

# find intersection
# filter to intersection with flows
on = [
    "hs4",
    "logistics_hub.trase_id",
    "exporter_name",
    "port_of_export_group",
]
df = pd.merge(
    df_flows_solved[on].drop_duplicates(),
    df_share[on].drop_duplicates(),
    on=on,
    how="outer",
    indicator=True,
)

i = 50

cols = ["exporter_name", "port_of_export_group"]

for (hs4, logistics_hub), g in df.groupby(["hs4", "logistics_hub.trase_id"]):
    a = df_flows[
        (df_flows["hs4"] == hs4) & (df_flows["logistics_hub.trase_id"] == logistics_hub)
    ]
    b = df_share[
        (df_share["hs4"] == hs4) & (df_share["logistics_hub.trase_id"] == logistics_hub)
    ]

    if not a.empty and not b.empty:

        print("\n\n")
        print(hs4, logistics_hub)

        with sps.pandas_display_context(significant_digits=2):
            a = (
                sps.consolidate(a, ["vol"], cols)
                .sort_values(cols)
                .set_index(cols)
                .astype({"vol": float})
            )
            display(a)

        with sps.pandas_display_context(float_format=lambda x: f"{x:.2f}"):
            display(b[[*cols, "share"]].sort_values(cols).set_index(cols))
        print("\n\n")

        i -= 1
        if i < 0:
            break

from trase.tools.sps import _pd_display

list(_pd_display(float_format=lambda x: f"{x:.2f}").items())

df_trase = supplychain.get("trase")
df_trase[df_trase["exporter_name"].str.contains("OLAM")]

df_flows = supplychain.get("flows")
sps.consolidate(df_flows, ["vol"], ["logistics_hub.trase_id"])