Main
View or edit on GitHub
This page is synchronized from trase/models/brazil/corn_2019/Soy/main.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
%configure_logging
from trase.tools import sps
import pandas as pd
import numpy as np
from functools import partial
import pandas as pd
supplychain = sps.SupplyChain("brazil/soy_2019", year=2019, commodity="soy")
supplychain.preparation()
supplychain.load()
supplychain.run()
supplychain.flow_report_by_attribute("vol", ["branch"], 2)
# supplychain.export_results()
# supplychain.upload_results()
Experimenting with market share
"""
The result of this is a dataframe that looks like:
logistics_hub_trase_id hs6 port_of_export exporter_label
BR-1506807 120100 BARCARENA CARGILL 0.020303
PARANAGUA CARGILL 0.076062
SANTAREM CARGILL 0.903635
120190 PARANAGUA CARGILL 1.000000
BR-1720655 120100 SANTOS FIAGRIL 0.348260
...
TODO:
- Exclude non-sticky (should be done in the decision tree..)
- Should we also break this out per branch?
"""
df_flows = supplychain.get("flows")
df_flows = df_flows[df_flows["status"].str.startswith("TO LP1")]
df_flows = df_flows[df_flows["logistics_hub_trase_id"] != "BR-XXXXXXX"]
def market_shares_to_port(df):
total_volume = df["vol"].sum()
return df.groupby(["port_of_export", "exporter_label"])["vol"].sum() / total_volume
df_market_share = df_flows.groupby(["hs6", "logistics_hub_trase_id"]).apply(
market_shares_to_port
)
display(df_market_share)
# df_flow = consolidate(df_flows, ["vol"], ["exporter_label", "port_of_export", "hs6", "branch", ""]
Investigating the UNKNOWN branch
Now let's investigate the UNKNOWN branch to see what's in there. The vast majority is UNKNOWN volume. We have a possible hope for the 16% of total export volume which is in the UNKNOWN branch but with a known CNPJ.
Let's break down the UNKNOWN branch. Percentages below are relative to total export volume:
df_flows = supplychain.get("flows")
# is the cnpj known?
df_flows["cnpj_known"] = np.where(df_flows["exporter_cnpj"] == "0", "UNKNOWN", "KNOWN")
df_flows["label_known"] = np.where(
df_flows["exporter_label"].str.contains("UNKNOWN"), "UNKNOWN", "KNOWN"
)
# let's see how much is in the CNPJ 2019 dataset
df_cnpj = supplychain.get("cnpj")
df_cnpj = df_cnpj.rename(columns={"cnpj": "exporter_cnpj"})
df_cnpj = pd.merge(df_flows[["exporter_cnpj"]], df_cnpj, on="exporter_cnpj")
df_cnpj = df_cnpj.drop_duplicates()
df_cnpj = df_cnpj.groupby("exporter_cnpj").agg(list).reset_index()
df = pd.merge(
df_flows,
df_cnpj,
on="exporter_cnpj",
how="left",
validate="many_to_one",
indicator=True,
)
df["in_cnpj_2019"] = np.where(df.pop("_merge") == "both", True, False)
# now let's see how much is in the knowledge base
df_trase = supplychain.get("trase")
df_trase = df_trase.drop(columns="vol").drop_duplicates()
df_trase = df_trase.groupby("exporter_cnpj").agg(list).reset_index()
df = pd.merge(
df,
df_trase,
how="left",
on="exporter_cnpj",
validate="many_to_one",
indicator=True,
suffixes=("", "_trase"),
)
df["in_old_trase_data"] = np.where(df.pop("_merge") == "both", True, False)
df["level"] = df["level"].apply(
lambda x: str(tuple(sorted(x if isinstance(x, list) else [])))
)
# report
df_report = report_percentage_by_attribute(
df,
"vol",
by=[
"branch",
"cnpj_known",
"label_known",
"in_cnpj_2019",
"in_old_trase_data",
"level",
],
)
df_report = df_report[df_report.index.get_level_values(0) == "UNKNOWN"]
print(
df_report.to_string(
formatters={
"sum": partial(format_float, significant_digits=2),
"percentage": lambda f: f"{f:.0f}%",
}
)
)
Let's print the biggest exporters with a KNOWN label
df_flows[
["hs4", "port_of_export_group", "exporter_name", "logistics_hub.trase_id", "vol"]
].sample(n=10).astype({"vol": float})
Feasibility of LP
df_share = supplychain.get("marketshare")
df_flows = supplychain.get("flows")
df_flows_solved = df_flows[
df_flows["logistics_hub.trase_id"] != "BR-XXXXXXX"
] # TODO use branch
# find intersection
# filter to intersection with flows
on = [
"hs4",
"logistics_hub.trase_id",
"exporter_name",
"port_of_export_group",
]
df = pd.merge(
df_flows_solved[on].drop_duplicates(),
df_share[on].drop_duplicates(),
on=on,
how="outer",
indicator=True,
)
i = 50
cols = ["exporter_name", "port_of_export_group"]
for (hs4, logistics_hub), g in df.groupby(["hs4", "logistics_hub.trase_id"]):
a = df_flows[
(df_flows["hs4"] == hs4) & (df_flows["logistics_hub.trase_id"] == logistics_hub)
]
b = df_share[
(df_share["hs4"] == hs4) & (df_share["logistics_hub.trase_id"] == logistics_hub)
]
if not a.empty and not b.empty:
print("\n\n")
print(hs4, logistics_hub)
with sps.pandas_display_context(significant_digits=2):
a = (
sps.consolidate(a, ["vol"], cols)
.sort_values(cols)
.set_index(cols)
.astype({"vol": float})
)
display(a)
with sps.pandas_display_context(float_format=lambda x: f"{x:.2f}"):
display(b[[*cols, "share"]].sort_values(cols).set_index(cols))
print("\n\n")
i -= 1
if i < 0:
break
from trase.tools.sps import _pd_display
list(_pd_display(float_format=lambda x: f"{x:.2f}").items())
df_trase = supplychain.get("trase")
df_trase[df_trase["exporter_name"].str.contains("OLAM")]
df_flows = supplychain.get("flows")
sps.consolidate(df_flows, ["vol"], ["logistics_hub.trase_id"])