View or edit on GitHub
This page is synchronized from trase/models/brazil/soy_2023_2024_v27/main.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after the midnight build (CET time),
or by manually triggering it with a GitHub action (link).
from trase.tools import sps
import warnings
from trase.tools.aws.metadata import write_csv_for_upload
import pandas as pd
from trase.tools.aws.aws_helpers import read_geojson
warnings.filterwarnings("ignore")
pd.options.display.float_format = "{:,.2f}".format
# Run the full supply-chain pipeline for each model year.
# A failure in one year is reported (with traceback) instead of aborting
# the remaining years.
for year in [2023]:
    try:
        print(f"Running {year}")
        supplychain = sps.SupplyChain("brazil/soy_2023_2024_v27", year=year)
        supplychain.preparation()
        supplychain.load()
        supplychain.run()
        # Volume report broken down by decision-tree branch
        # (the 8 is presumably a display/row limit — confirm in sps docs).
        supplychain.flow_report_by_attribute("vol", ["branch"], 8)
        # supplychain.export_results()
        # quality_assurance(year)
        # supplychain.upload_results()
    except Exception as e:
        # Print the full traceback, not just the message, so the failing
        # pipeline step can be identified from the notebook output.
        import traceback

        traceback.print_exc()
        print(f"Failed {year}: {e}")
Running 2023
Extracting data from source ...[0m [97m took 0.4 seconds[0m
[97mSkipping re-process of Municipality[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of IndustrialCapacity[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of IndustrialCapacityFacilities[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of CommodityRatios[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of ExporterSpecialCases[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of ExtraCnpjs[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of Production[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of Cost[0m
[97mLoading data from disk took 15.0 seconds[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of SilosGeometry[0m
[1mRunning pre-processing for Silos[0m
Report: Before processing[0m
| Row Count: 0[0m
Report: After processing[0m
| Row Count: 8,672[0m
[32mWritten /Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/2023/prepared/silos.csv[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of Hs[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of Flows[0m
Extracting data from source ...[0m [97m took 0.4 seconds[0m
[1mRunning pre-processing for Cnpj[0m
Report: Before processing[0m
| Row Count: 427,313[0m
Report: Drop rows with missing level or geocode[0m
| Row Count: 427,313[0m
Report: Drop duplicates[0m
| Row Count: 351,483[0m
Report: Add extra CNPJs[0m
| Row Count: 352,127[0m
Report: After processing[0m
| Row Count: 318,727[0m
[32mWritten /Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/2023/prepared/cnpj.csv[0m
Extracting data from source ...[0m [97m took 0.3 seconds[0m
[97mSkipping re-process of Sicasq[0m
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:65
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:237
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:253
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:261
sum percentage
branch
UNPROCESSED 31,320,377 27%
4.1.1. office is not located at a port and trades >20,000 tonnes of soy 25,688,919 22%
2.2. link to silo cadastro only 17,630,685 15%
2.2.2 link to silo cadastro only (farm cnpj) 8,929,045 8%
2.1. link to silo list and cadastro 7,085,734 6%
1.1. link to farm municipalities 5,736,324 5%
4.1.2. office is located at a port or is big city 5,676,506 5%
4.1.3 exports from unlikely routes and/or distant from ports 5,326,977 5%
3.1.1 link to crush with silo 3,337,692 3%
2.3 exports from silo list or CNPJ located distant from ports 3,202,632 3%
3.2.2 link to crush no silos in municipality, use supply shed 1,745,307 1%
3.1.2.2 link to crush with > 1 silo in same municipality 374,407 0%
1.2. link to farm municipalities located distant from ports 286,258 0%
3.2.1 link to crush no silos in municipality, use SICASQ 19,867 0%
3.1.2.1 link to crush with 1 silo in same municipality 6,000 0%
# Municipality boundaries layer, used to resolve municipality names
# (and later as the base map for plotting).
br_boundaries_mun = read_geojson(
    "brazil/spatial/boundaries/ibge/2020/br_municipalities_2020_ibge_4326_simplified.geojson",
    bucket="trase-storage",
)

# Columns of interest for the export-flow analysis below.
EXPORT_COLUMNS = [
    "exporter_municipality_trase_id",
    "logistics_hub.trase_id",
    "logistics_hub.name",
    "exporter_group",
    "exporter_name",
    "exporter_cnpj",
    "port_of_export_name",
    "port_of_export_label",
    "port_municipality_trase_id",
    "country_of_destination_name",
    "branch",
    "product_type",
    "vol",
]
df_export = supplychain.get("flows")[EXPORT_COLUMNS]
# Flows from unknown exporters cannot be classified by the decision tree.
unknown_mask = df_export["exporter_name"] == "UNKNOWN"
df_export.loc[unknown_mask, "branch"] = "UNKNOWN"

# Attach the exporter municipality name from the boundaries layer.
mun_names = br_boundaries_mun[["TRASE_ID", "MUN_NAME_C"]]
df_export = df_export.merge(
    mun_names,
    how="left",
    left_on=["exporter_municipality_trase_id"],
    right_on="TRASE_ID",
)
df_export = df_export.drop(columns=["TRASE_ID"])
df_export = df_export.rename(columns={"MUN_NAME_C": "exporter_municipality_name"})

# Give the UNPROCESSED branch a human-readable label.
df_export = df_export.replace(
    {"branch": {"UNPROCESSED": "5. no soy economic activity"}}
)

# Transport cost table (origin hub -> destination port municipality).
df_costs = supplychain.get("cost")
Add the transport cost attribute to each flow.
# Join the transport cost onto each flow by (hub, port-municipality) pair,
# then drop the join keys that came from the cost table.
df_export = df_export.merge(
    df_costs,
    how="left",
    left_on=["logistics_hub.trase_id", "port_municipality_trase_id"],
    right_on=["origin", "destination"],
)
df_export = df_export.drop(columns=["origin", "destination"])
df_export
| exporter_municipality_trase_id | logistics_hub.trase_id | logistics_hub.name | exporter_group | exporter_name | exporter_cnpj | port_of_export_name | port_of_export_label | port_municipality_trase_id | country_of_destination_name | branch | product_type | vol | exporter_municipality_name | cost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BR-4314902 | BR-XXXXXXX | UNKNOWN | CARGILL | CARGILL AGRICOLA SA | 60498706020930 | RIO GRANDE | RIO GRANDE | BR-4315602 | CHINA (MAINLAND) | 5. no soy economic activity | SOYBEANS | 65,280.00 | PORTO ALEGRE | NaN |
| 1 | BR-2100808 | BR-2100808 | ANAPURUS | NOVAAGRI INFRA-ESTRUTURA DE ARMAZENAGEM E ESCO... | NOVAAGRI INFRA-ESTRUTURA DE ARMAZENAGEM E ESCO... | 09077252001831 | BARCARENA | VILA DO CONDE | BR-1501303 | CHINA (MAINLAND) | 2.1. link to silo list and cadastro | SOYBEANS | 63,000.00 | ANAPURUS | 13.30 |
| 2 | BR-1506195 | BR-1506195 | RUROPOLIS | LOUIS DREYFUS | LOUIS DREYFUS COMPANY BRASIL S.A. | 47067525019550 | SANTAREM | SANTAREM | BR-1506807 | PORTUGAL | 2.2. link to silo cadastro only | SOYBEANS | 45,700.00 | RUROPOLIS | 2.89 |
| 3 | BR-XXXXXXX | UNKNOWN | UNKNOWN | UNKNOWN | 00000000000000 | PARANAGUA | PARANAGUA | BR-4118204 | CHINA (MAINLAND) | UNKNOWN | SOYBEANS | 30,000.00 | NaN | NaN | |
| 4 | BR-2919553 | BR-2919553 | LUIS EDUARDO MAGALHAES | ENGELHART | ENGELHART | 14796754001186 | RIO GRANDE | RIO GRANDE | BR-4315602 | IRAN | 4.1.1. office is not located at a port and tra... | SOYBEANS | 67,160.00 | LUIS EDUARDO MAGALHAES | 37.38 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11276 | BR-XXXXXXX | UNKNOWN | UNKNOWN | UNKNOWN | 00000000000000 | SANTOS | SANTOS | BR-3548500 | JAPAN | UNKNOWN | SOYBEAN CAKE | 51,252.70 | NaN | NaN | |
| 11277 | BR-XXXXXXX | UNKNOWN | UNKNOWN | UNKNOWN | 00000000000000 | RIO GRANDE | RIO GRANDE | BR-4315602 | THAILAND | UNKNOWN | SOYBEAN CAKE | 1,500.00 | NaN | NaN | |
| 11278 | BR-XXXXXXX | UNKNOWN | UNKNOWN | UNKNOWN | 00000000000000 | RIO GRANDE | RIO GRANDE | BR-4315602 | THAILAND | UNKNOWN | SOYBEAN CAKE | 2,100.00 | NaN | NaN | |
| 11279 | BR-XXXXXXX | UNKNOWN | UNKNOWN | UNKNOWN | 00000000000000 | RIO GRANDE | RIO GRANDE | BR-4315602 | THAILAND | UNKNOWN | SOYBEAN CAKE | 3,000.00 | NaN | NaN | |
| 11280 | BR-XXXXXXX | UNKNOWN | UNKNOWN | UNKNOWN | 00000000000000 | RIO GRANDE | RIO GRANDE | BR-4315602 | THAILAND | UNKNOWN | SOYBEAN CAKE | 4,000.00 | NaN | NaN |
11281 rows × 15 columns
# Persist a slim branch-level extract for downstream inspection.
branch_columns = [
    "exporter_group",
    "branch",
    "logistics_hub.trase_id",
    "port_of_export_name",
    "port_municipality_trase_id",
    "country_of_destination_name",
    "product_type",
    "vol",
]
df_export[branch_columns].to_csv("branches.csv", sep=";", index=False)
# Consistency check: total volume in the prepared flows file should match
# the total volume carried through into df_export.
flows = pd.read_csv("2023/prepared/flows.csv", sep=";")
flows[["vol"]].sum()
df_export[["vol"]].sum()
# Aggregate volume per exporter / hub / port / branch / product —
# the shape expected by the supply-shed upload.
shed_keys = [
    "exporter_name",
    "exporter_cnpj",
    "logistics_hub.trase_id",
    "port_of_export_name",
    "branch",
    "product_type",
]
data_for_supply_shed = (
    df_export[shed_keys + ["vol"]]
    .groupby(by=shed_keys, as_index=False)["vol"]
    .sum()
)
# write_csv_for_upload(
#     df=data_for_supply_shed,
#     bucket='trase-storage',
#     key='brazil/soy/sei_pcs/v2.7.0/supply_zone/SEIPCS_BRAZIL_SOY_PARTIAL_BRANCHES_V27.csv',
#     do_upload=True
# )
Aggregate sums per branch + unknowns
Below are the aggregations per branch.
# Collapse detailed branch labels (e.g. "2.2.2 ...") to their "X.Y" prefix;
# labels without a numeric prefix (e.g. "UNKNOWN") are kept unchanged.
branch_prefix = df_export["branch"].str.extract(r"(\d+\.\d+)", expand=False)
df_export["branch_high_level"] = branch_prefix.fillna(df_export["branch"])

# Volume totals and shares per high-level branch.
df_summary = df_export.groupby("branch_high_level", as_index=False)["vol"].sum()
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)
df_summary.sort_values(by=["branch_high_level"])
Volumes from UNPROCESSED branches by product
# Volume totals per detailed branch and product type.
df_summary = (
    df_export.groupby(by=["branch", "product_type"], as_index=False)["vol"].sum()
)
# Share of the total exported volume, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)
df_summary
Top 10 Exporters from UNPROCESSED branches
# Volume per exporter within the "no soy economic activity" branch.
unprocessed = df_export.query('branch == "5. no soy economic activity"')
df_summary = unprocessed.groupby(by=["exporter_name"], as_index=False)["vol"].sum()
# Share of the branch total, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)
# Top 10 exporters by volume.
df_summary.sort_values(by=["vol"], ascending=False).head(10)
Economic Activity of UNKNOWN branches
# CNAE activity level per CNPJ, deduplicated.
cnpj_level = supplychain.get("cnpj", subset=["cnpj", "level"]).drop_duplicates()

# Attach the CNAE level to the unprocessed flows; 10000 is used as a
# sentinel for "no soy-related CNAE level found for this CNPJ".
unprocessed_flows = df_export[
    df_export["branch_high_level"] == "5. no soy economic activity"
]
df_export_level = unprocessed_flows.merge(
    cnpj_level, left_on="exporter_cnpj", right_on="cnpj", how="left"
)
df_export_level["level"] = df_export_level["level"].fillna(10000)
30% of the exporters in the UNPROCESSED branches DON'T have CNAE levels related to soy activity, even though they are subsidiaries of big trade groups or weren't present in our CADASTRO database. Two things could increase the number of connections:
- A decision rule based on the CNPJ8 for soybeans
- Trying to manually include the missing CNPJs
# Volume and share per CNAE level among the unprocessed flows.
df_product_level = df_export_level.groupby(by=["level"], as_index=False)["vol"].sum()
df_product_level["percent"] = (
    df_product_level["vol"] / df_product_level["vol"].sum() * 100
).round(2)
df_product_level
Some of the CNAEs from the UNPROCESSED branches could be included as level 4 (wholesale activity):
- 4681801 - Comércio atacadista de álcool carburante, biodiesel, gasolina e demais derivados de petróleo, exceto lubrificantes
- 4637103 - Comércio atacadista de óleos e gorduras
Below are the exporters whose CNPJs are not classified as soy activity.
# Exporters whose CNPJ has no soy-related CNAE level (sentinel 10000),
# aggregated per exporter/municipality and sorted by exported volume.
no_level = df_export_level.query("level == 10000")
no_level = no_level[
    [
        "exporter_name",
        "exporter_cnpj",
        "port_of_export_name",
        "branch",
        "product_type",
        "vol",
        "exporter_municipality_name",
        "level",
    ]
]
grouped = no_level.groupby(
    by=["exporter_cnpj", "exporter_name", "exporter_municipality_name"],
    as_index=False,
)["vol"].sum()
grouped.sort_values(by=["vol"], ascending=False).head(50)
Flow Visual Analysis
Below, we see soy flows by branches and companies as well as the municipalities linking to ports. The decision tree results look reasonable, except for some tiny flows that seem unlikely, such as those in Cargill's branches 4.1 and 2.2: - Branch 2.2 – CNPJ_14 is linked only to a silo facility from Cadastro. - Branch 4.1 – CNPJ_14 is linked to a wholesale/office activity. In general, branch 4.1 shows high uncertainty in origin locations, since it is based on office addresses. One point for discussion would be whether to model sourcing areas from office locations or directly from ports.
import matplotlib.transforms as mtransforms
import matplotlib.ticker as mticker
import numpy as np
from shapely.geometry import LineString
import geopandas as gpd
import matplotlib.pyplot as plt
class LogisticsMapPlotter:
    """Plot trade flows as curved arcs over a municipality base map.

    The figure is built from three layers: a choropleth of sourcing
    municipalities (volume heatmap), volume-scaled arcs from origin to
    port with direction arrows, and anchor glyphs at the export ports.
    """

    def __init__(self, municipalities_gdf: gpd.GeoDataFrame):
        """
        Initializes the plotter with the background map.

        ``municipalities_gdf`` must provide a ``trase_id`` column (any
        casing — columns are lowercased here) and polygon geometries.
        """
        # Store a copy and standardize columns to lowercase immediately
        self.municipalities = municipalities_gdf.copy()
        self.municipalities.columns = [x.lower() for x in self.municipalities.columns]
        # Style Configuration
        self.colors = {
            "background": "#e0e0e0",
            "border": "white",
            "flow": "#2c7fb8",
            "port_text": "#FF0000",
        }

    # --- Helper Methods (formerly standalone functions) ---
    @staticmethod
    def _get_curved_line(p1, p2, num_points=30, curve_intensity=0.2):
        """Generates a curved LineString between two points.

        A straight interpolation from ``p1`` to ``p2`` is displaced along
        the perpendicular by a parabolic offset that is zero at both
        endpoints and maximal at the midpoint.
        """
        x1, y1 = p1.x, p1.y
        x2, y2 = p2.x, p2.y
        t = np.linspace(0, 1, num_points)
        # Linear interpolation
        x = x1 + (x2 - x1) * t
        y = y1 + (y2 - y1) * t
        # Curve offset
        dx, dy = x2 - x1, y2 - y1
        length = np.sqrt(dx**2 + dy**2)
        if length == 0:
            # Degenerate case: coincident endpoints — no curvature possible.
            return LineString([p1, p2])
        offset_magnitude = length * curve_intensity
        # 4 * (t - t^2) is a parabola peaking at 1 when t = 0.5.
        offsets = 4 * offset_magnitude * (t - t**2)
        # Unit normal to the p1 -> p2 direction.
        norm_x = -dy / length
        norm_y = dx / length
        x_curve = x + offsets * norm_x
        y_curve = y + offsets * norm_y
        return LineString(list(zip(x_curve, y_curve)))

    @staticmethod
    def _get_arrow_data(line_geom, position_pct=0.6):
        """Calculates position and rotation angle for an arrow on the line.

        Returns ``(x, y, angle_degrees)`` at ``position_pct`` along the
        line, or ``(0, 0, 0)`` when the geometry is missing or empty.
        """
        if line_geom is None or line_geom.is_empty:
            return 0, 0, 0
        point = line_geom.interpolate(position_pct, normalized=True)
        # Sample a point slightly ahead to estimate the local direction.
        epsilon = 0.01
        point_ahead = line_geom.interpolate(
            min(position_pct + epsilon, 0.99), normalized=True
        )
        dx = point_ahead.x - point.x
        dy = point_ahead.y - point.y
        angle_degrees = np.degrees(np.arctan2(dy, dx))
        return point.x, point.y, angle_degrees

    @staticmethod
    def _format_millions(x, pos):
        """Formatter for the colorbar (1_500_000 -> "1.5M", 20_000 -> "20k")."""
        if x >= 1e6:
            return f"{x/1e6:.1f}M"
        elif x >= 1e3:
            return f"{x/1e3:.0f}k"
        return f"{x:.0f}"

    def _prepare_data(self, trade_data, company, branch, product, source_field):
        """Filters and merges data with geometries.

        Filters by string *prefix* on exporter name, branch and product
        type (empty strings disable the filter), then attaches origin and
        destination centroids from ``self.municipalities``. Rows whose
        origin or destination cannot be resolved are dropped.
        """
        df_flows = trade_data.copy()
        # 1. Filters
        if company:
            df_flows = df_flows[
                df_flows["exporter_name"].str.startswith(company, na=False)
            ]
        if branch:
            # Maintained strict string check from your code
            df_flows = df_flows[df_flows["branch"].str.startswith(branch, na=False)]
        if product:
            df_flows = df_flows[
                df_flows["product_type"].str.startswith(product, na=False)
            ]
        if df_flows.empty:
            return pd.DataFrame()  # Return empty if no data
        # 2. Cleanup & Port Enrichment — drop any stale geometry column
        # so the merges below are the only source of geometries.
        if "geometry" in df_flows.columns:
            df_flows = df_flows.drop(columns=["geometry"])
        # 3. Merge Origins
        df_merged = df_flows.merge(
            self.municipalities[["trase_id", "geometry"]],
            left_on=source_field,
            right_on="trase_id",
            how="left",
        ).rename(columns={"geometry": "origin_geom"})
        # 4. Merge Destinations (Ports)
        df_merged = df_merged.merge(
            self.municipalities[["trase_id", "geometry"]],
            left_on="port_municipality_trase_id",
            right_on="trase_id",
            how="left",
            suffixes=("", "_port"),
        ).rename(columns={"geometry": "dest_geom"})
        # 5. Calculate Centroids & Drop Missing
        df_merged["origin_centroid"] = gpd.GeoSeries(df_merged["origin_geom"]).centroid
        df_merged["dest_centroid"] = gpd.GeoSeries(df_merged["dest_geom"]).centroid
        df_merged = df_merged.dropna(subset=["origin_centroid", "dest_centroid"])
        return df_merged

    # --- Main Plotting Method ---
    def plot_logistic_arcs(
        self,
        trade_data,
        company_name,
        branch,
        product_type,
        source_field,
        chart_title="",
    ):
        """
        Main method to generate the plot.

        ``company_name``, ``branch`` and ``product_type`` are prefix
        filters (empty string matches everything); ``source_field`` is the
        column in ``trade_data`` holding the origin municipality trase_id;
        ``chart_title`` is appended to the figure title. Shows the figure
        as a side effect and returns ``None``.
        """
        # Prepare Data
        df_merged = self._prepare_data(
            trade_data, company_name, branch, product_type, source_field
        )
        if df_merged.empty:
            print(f"No data to display for {company_name} - {product_type}")
            return
        # Generate Arcs (Curves) — self-loops (origin == destination) are skipped.
        lines = []
        volumes = []
        for _, row in df_merged.iterrows():
            if not row["origin_centroid"].equals(row["dest_centroid"]):
                curve = self._get_curved_line(
                    row["origin_centroid"], row["dest_centroid"], num_points=35
                )
                lines.append(curve)
                volumes.append(row["vol"])
        gdf_lines = gpd.GeoDataFrame(
            {"vol": volumes}, geometry=lines, crs=self.municipalities.crs
        )
        # --- Plotting ---
        fig, ax = plt.subplots(figsize=(16, 16))
        # 1. Base Map
        self.municipalities.plot(
            ax=ax,
            color=self.colors["background"],
            edgecolor=self.colors["border"],
            linewidth=0.1,
        )
        # 2. Layer A: Sourcing Areas (Heatmap)
        origin_vols = df_merged.groupby(source_field)["vol"].sum().reset_index()
        origin_map = self.municipalities.merge(
            origin_vols, left_on="trase_id", right_on=source_field
        )
        if not origin_map.empty:
            origin_map.plot(
                column="vol",
                ax=ax,
                cmap="magma",
                legend=True,
                legend_kwds={
                    "label": f"{company_name} Exported Volume (t)",
                    "shrink": 0.4,
                },
                alpha=0.9,
            )
            # Fix Colorbar label — the colorbar is the last axes added to
            # the figure by the legend above.
            cbar = ax.get_figure().axes[-1]
            cbar.yaxis.set_major_formatter(mticker.FuncFormatter(self._format_millions))
        # 3. Layer B: Arcs
        if not gdf_lines.empty:
            max_vol = gdf_lines["vol"].max()
            if max_vol == 0:
                max_vol = 1  # avoid division by zero when scaling widths below
            # Dynamic linewidth
            linewidths = 0.8 + (gdf_lines["vol"] / max_vol) * 2.5
            gdf_lines.plot(
                ax=ax,
                color=self.colors["flow"],
                alpha=0.1,
                linewidth=linewidths,
                zorder=5,
            )
            # 4. Layer B2: Arrows
            for _, row in gdf_lines.iterrows():
                arrow_x, arrow_y, angle = self._get_arrow_data(
                    row.geometry, position_pct=0.6
                )
                # Arrow size scales with relative volume (8 .. 20).
                arrow_size = 8 + (row["vol"] / max_vol) * 12
                # Filter small arrows
                if arrow_size > 8.5:
                    # Rotation Transform — rotate the ">" marker to follow
                    # the local arc direction, in data coordinates.
                    t = (
                        mtransforms.Affine2D().rotate_deg_around(
                            arrow_x, arrow_y, angle
                        )
                        + ax.transData
                    )
                    ax.plot(
                        arrow_x,
                        arrow_y,
                        marker=">",
                        markersize=arrow_size,
                        color=self.colors["flow"],
                        alpha=0.4,
                        markeredgewidth=0.2,
                        transform=t,
                        zorder=6,
                    )
        # 5. Layer C: Ports — anchor glyph at each destination municipality.
        valid_ports_ids = df_merged["port_municipality_trase_id"].unique()
        ports_geo = self.municipalities[
            self.municipalities["trase_id"].isin(valid_ports_ids)
        ].copy()
        for _, row in ports_geo.iterrows():
            centroid = row.geometry.centroid
            ax.text(
                centroid.x,
                centroid.y,
                "⚓",
                fontsize=26,
                ha="center",
                va="center",
                color=self.colors["port_text"],
                zorder=15,
            )
        # Final Formatting
        plt.title(f"{company_name} - {product_type} - {chart_title}", fontsize=15)
        plt.axis("off")
        plt.tight_layout()
        plt.show()
import numpy as np

# NOTE(review): fillna(0) applies to *all* columns, so missing names and
# ids (not just missing volumes/costs) are replaced by 0 — confirm this
# is only needed for plotting and the zeros are not exported downstream.
df_export = df_export.fillna(0)
# 1. Instantiate the plotter with your municipality boundaries
plotter = LogisticsMapPlotter(br_boundaries_mun)
# 2. Run the plot: branch 2.3 flows, origins taken from the logistics hub
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="2.3",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="Located in silo municipalities from facility list",
)
# Branch 2.3 flows exported through Santarem, for closer inspection.
df_export.query('branch.str.startswith("2.3") and port_of_export_name == "SANTAREM"')
# Remaining branch overviews. Each spec is (company filter, branch prefix,
# chart title); all plots source origins from the logistics hub column.
plot_specs = [
    ("", "1", "EXPORTS FROM FARMS"),
    ("", "3", "EXPORTS FROM CRUSH"),
    ("CARGILL", "3.1", "EXPORTS FROM CRUSHING (BRANCH 3.1.1)"),
    ("CARGILL", "3.1.2", "EXPORTS FROM CRUSHING (BRANCH 3.1.2)"),
    ("", "4", "EXPORTS FROM OFFICES/WHOLESALE (BRANCH 4.1)"),
]
for company, branch_prefix, title in plot_specs:
    plotter.plot_logistic_arcs(
        trade_data=df_export,
        company_name=company,
        branch=branch_prefix,
        product_type="",
        source_field="logistics_hub.trase_id",
        chart_title=title,
    )