Skip to content

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy_2023_2024_v27/main.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

from trase.tools import sps
import warnings
from trase.tools.aws.metadata import write_csv_for_upload
import pandas as pd
from trase.tools.aws.aws_helpers import read_geojson

# Silence all warnings for cleaner notebook output (the bills-of-lading
# decision tree otherwise floods the cells with pandas merge warnings).
warnings.filterwarnings("ignore")

# Display floats with thousands separators and two decimals in DataFrame output.
pd.options.display.float_format = "{:,.2f}".format


# Run the full supply-chain model for each target year. A failure in one year
# is logged instead of aborting, so any remaining years still run.
for year in [2023]:
    try:
        print(f"Running {year}")
        supplychain = sps.SupplyChain("brazil/soy_2023_2024_v27", year=year)
        supplychain.preparation()
        supplychain.load()
        supplychain.run()
        # Report volume attribution per decision-tree branch.
        supplychain.flow_report_by_attribute("vol", ["branch"], 8)

        # supplychain.export_results()
        # quality_assurance(year)
        # supplychain.upload_results()
    except Exception as e:
        # Keep the full traceback, not just the message, so a failed year
        # can be diagnosed from the notebook output alone.
        import traceback

        print(f"Failed {year}: {e}")
        traceback.print_exc()
Running 2023
Extracting data from source ...  took 0.4 seconds
Skipping re-process of Municipality
Extracting data from source ...  took 0.3 seconds
Skipping re-process of IndustrialCapacity
Extracting data from source ...  took 0.3 seconds
Skipping re-process of IndustrialCapacityFacilities
Extracting data from source ...  took 0.3 seconds
Skipping re-process of CommodityRatios
Extracting data from source ...  took 0.3 seconds
Skipping re-process of ExporterSpecialCases
Extracting data from source ...  took 0.3 seconds
Skipping re-process of ExtraCnpjs
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Production
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Cost
    Loading data from disk took 15.0 seconds
Extracting data from source ...  took 0.3 seconds
Skipping re-process of SilosGeometry
Running pre-processing for Silos
Report: Before processing
        | Row Count: 0
Report: After processing
        | Row Count: 8,672
Written /Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/2023/prepared/silos.csv
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Hs
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Flows
Extracting data from source ...  took 0.4 seconds
Running pre-processing for Cnpj
Report: Before processing
        | Row Count: 427,313
Report: Drop rows with missing level or geocode
        | Row Count: 427,313
Report: Drop duplicates
        | Row Count: 351,483
Report: Add extra CNPJs
        | Row Count: 352,127
Report: After processing
        | Row Count: 318,727
Written /Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/2023/prepared/cnpj.csv
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Sicasq


Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:65
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:237
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:253
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:261


                                                                               sum percentage
branch                                                                                       
UNPROCESSED                                                             31,320,377        27%
4.1.1. office is not located at a port and trades >20,000 tonnes of soy 25,688,919        22%
2.2. link to silo cadastro only                                         17,630,685        15%
2.2.2 link to silo cadastro only (farm cnpj)                             8,929,045         8%
2.1. link to silo list and cadastro                                      7,085,734         6%
1.1. link to farm municipalities                                         5,736,324         5%
4.1.2. office is located at a port or is big city                        5,676,506         5%
4.1.3 exports from unlikely routes and/or distant from ports             5,326,977         5%
3.1.1 link to crush with silo                                            3,337,692         3%
2.3 exports from silo list or CNPJ located distant from ports            3,202,632         3%
3.2.2 link to crush no silos in municipality, use supply shed            1,745,307         1%
3.1.2.2 link to crush with > 1 silo in same municipality                   374,407         0%
1.2. link to farm municipalities located distant from ports                286,258         0%
3.2.1 link to crush no silos in municipality, use SICASQ                    19,867         0%
3.1.2.1 link to crush with 1 silo in same municipality                       6,000         0%
# Municipality boundaries: used to attach names here and as the base map later.
br_boundaries_mun = read_geojson(
    "brazil/spatial/boundaries/ibge/2020/br_municipalities_2020_ibge_4326_simplified.geojson",
    bucket="trase-storage",
)

# Keep only the columns needed for the branch analysis. `.copy()` makes the
# selection an independent frame, so the `.loc` assignment below cannot hit
# pandas' chained-assignment (SettingWithCopy) pitfall on a view.
df_export = supplychain.get("flows")[
    [
        "exporter_municipality_trase_id",
        "logistics_hub.trase_id",
        "logistics_hub.name",
        "exporter_group",
        "exporter_name",
        "exporter_cnpj",
        "port_of_export_name",
        "port_of_export_label",
        "port_municipality_trase_id",
        "country_of_destination_name",
        "branch",
        "product_type",
        "vol",
    ]
].copy()

# Flows with an unknown exporter cannot be classified by the decision tree.
df_export.loc[df_export["exporter_name"] == "UNKNOWN", "branch"] = "UNKNOWN"

# Add municipality name
df_export = (
    df_export.merge(
        br_boundaries_mun[["TRASE_ID", "MUN_NAME_C"]],
        how="left",
        left_on=["exporter_municipality_trase_id"],
        right_on="TRASE_ID",
    )
    .drop(columns=["TRASE_ID"])
    .rename(columns={"MUN_NAME_C": "exporter_municipality_name"})
)

# Rename the UNPROCESSED branch to its human-readable label.
df_export = df_export.replace(
    {"branch": {"UNPROCESSED": "5. no soy economic activity"}}
)

# Origin -> destination transport-cost table.
df_costs = supplychain.get("cost")

Add the cost attribute to each flow

# Join the per-route cost onto each flow; the cost table's join keys
# ("origin"/"destination") are redundant afterwards and dropped.
_with_cost = df_export.merge(
    df_costs,
    how="left",
    left_on=["logistics_hub.trase_id", "port_municipality_trase_id"],
    right_on=["origin", "destination"],
)
df_export = _with_cost.drop(columns=["origin", "destination"])

df_export
exporter_municipality_trase_id logistics_hub.trase_id logistics_hub.name exporter_group exporter_name exporter_cnpj port_of_export_name port_of_export_label port_municipality_trase_id country_of_destination_name branch product_type vol exporter_municipality_name cost
0 BR-4314902 BR-XXXXXXX UNKNOWN CARGILL CARGILL AGRICOLA SA 60498706020930 RIO GRANDE RIO GRANDE BR-4315602 CHINA (MAINLAND) 5. no soy economic activity SOYBEANS 65,280.00 PORTO ALEGRE NaN
1 BR-2100808 BR-2100808 ANAPURUS NOVAAGRI INFRA-ESTRUTURA DE ARMAZENAGEM E ESCO... NOVAAGRI INFRA-ESTRUTURA DE ARMAZENAGEM E ESCO... 09077252001831 BARCARENA VILA DO CONDE BR-1501303 CHINA (MAINLAND) 2.1. link to silo list and cadastro SOYBEANS 63,000.00 ANAPURUS 13.30
2 BR-1506195 BR-1506195 RUROPOLIS LOUIS DREYFUS LOUIS DREYFUS COMPANY BRASIL S.A. 47067525019550 SANTAREM SANTAREM BR-1506807 PORTUGAL 2.2. link to silo cadastro only SOYBEANS 45,700.00 RUROPOLIS 2.89
3 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 PARANAGUA PARANAGUA BR-4118204 CHINA (MAINLAND) UNKNOWN SOYBEANS 30,000.00 NaN NaN
4 BR-2919553 BR-2919553 LUIS EDUARDO MAGALHAES ENGELHART ENGELHART 14796754001186 RIO GRANDE RIO GRANDE BR-4315602 IRAN 4.1.1. office is not located at a port and tra... SOYBEANS 67,160.00 LUIS EDUARDO MAGALHAES 37.38
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11276 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 SANTOS SANTOS BR-3548500 JAPAN UNKNOWN SOYBEAN CAKE 51,252.70 NaN NaN
11277 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 1,500.00 NaN NaN
11278 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 2,100.00 NaN NaN
11279 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 3,000.00 NaN NaN
11280 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 4,000.00 NaN NaN

11281 rows × 15 columns

# Persist the branch-level view for external inspection.
branch_columns = [
    "exporter_group",
    "branch",
    "logistics_hub.trase_id",
    "port_of_export_name",
    "port_municipality_trase_id",
    "country_of_destination_name",
    "product_type",
    "vol",
]
df_export[branch_columns].to_csv("branches.csv", sep=";", index=False)

# Sanity check: total volume in the prepared flows vs. the analysis frame.
flows = pd.read_csv("2023/prepared/flows.csv", sep=";")
flows[["vol"]].sum()
df_export[["vol"]].sum()
# Aggregate exported volume per exporter / hub / port / branch / product —
# this is the table fed to the supply-shed upload.
_shed_keys = [
    "exporter_name",
    "exporter_cnpj",
    "logistics_hub.trase_id",
    "port_of_export_name",
    "branch",
    "product_type",
]
data_for_supply_shed = (
    df_export[_shed_keys + ["vol"]]
    .groupby(by=_shed_keys, as_index=False)["vol"]
    .sum()
)

# write_csv_for_upload(
#     df=data_for_supply_shed,
#     bucket='trase-storage',
#     key='brazil/soy/sei_pcs/v2.7.0/supply_zone/SEIPCS_BRAZIL_SOY_PARTIAL_BRANCHES_V27.csv',
#     do_upload=True
# )

Aggregated sums per branch + unknowns

Below are the aggregations per branch.

# Collapse detailed branch labels (e.g. "2.2.2 ...") to their "X.Y" prefix;
# labels without a numeric "X.Y" prefix fall back to the full branch string.
_prefix = df_export["branch"].str.extract(r"(\d+\.\d+)", expand=False)
df_export["branch_high_level"] = _prefix.fillna(df_export["branch"])

# Total volume per high-level branch.
df_summary = df_export.groupby("branch_high_level", as_index=False)["vol"].sum()

# Share of the overall volume, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)

df_summary.sort_values(by=["branch_high_level"])

Volumes from UNPROCESSED branches by product

# Total volume per (branch, product) combination.
df_summary = df_export.groupby(
    by=["branch", "product_type"], as_index=False
)["vol"].sum()

# Share of the overall volume, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)

df_summary

Top 10 Exporters from UNPROCESSED branches

# Volume per exporter within the "no soy economic activity" (ex-UNPROCESSED) branch.
_unclassified_flows = df_export.query('branch == "5. no soy economic activity"')
df_summary = _unclassified_flows.groupby(by=["exporter_name"], as_index=False)["vol"].sum()

# Share of the branch total, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)

df_summary.sort_values(by=["vol"], ascending=False).head(10)

Economic activity of exporters in UNPROCESSED branches

# Deduplicated CNPJ -> CNAE activity-level lookup table.
cnpj_level = supplychain.get("cnpj", subset=["cnpj", "level"]).drop_duplicates()

# Restrict to the unclassified branch and attach each exporter's CNAE level.
_no_activity = df_export[df_export["branch_high_level"] == "5. no soy economic activity"]
df_export_level = _no_activity.merge(
    cnpj_level, left_on="exporter_cnpj", right_on="cnpj", how="left"
)

# CNPJs absent from the lookup get the sentinel level 10000 ("not classified").
df_export_level["level"] = df_export_level["level"].fillna(10000)

30% of the exporters in UNPROCESSED branches DON'T have CNAE levels related to soy activity, even though they are subsidiaries of big trade groups or weren't present in our CADASTRO database. Two things could increase the number of connections:

  1. A decision rule based on the CNPJ8 for soybeans
  2. Trying to manually include the missing CNPJs
# Volume (and share) per CNAE level among the unclassified-branch exporters.
df_product_level = df_export_level.groupby(by=["level"], as_index=False)["vol"].sum()
_share = df_product_level["vol"] / df_product_level["vol"].sum() * 100
df_product_level["percent"] = _share.round(2)

df_product_level

Some of the CNAEs from UNPROCESSED branches could be included as level 4 (wholesale activity):

  1. 4681801 - Comércio atacadista de álcool carburante, biodiesel, gasolina e demais derivados de petróleo, exceto lubrificantes
  2. 4637103 - Comércio atacadista de óleos e gorduras

Below the exporters whose CNPJ are not classified as soy activity

# Exporters whose CNPJ carries no soy-related CNAE classification
# (sentinel level 10000), ranked by total exported volume.
_not_classified = df_export_level.query("level == 10000")[
    [
        "exporter_name",
        "exporter_cnpj",
        "port_of_export_name",
        "branch",
        "product_type",
        "vol",
        "exporter_municipality_name",
        "level",
    ]
]
(
    _not_classified.groupby(
        by=["exporter_cnpj", "exporter_name", "exporter_municipality_name"],
        as_index=False,
    )["vol"]
    .sum()
    .sort_values(by=["vol"], ascending=False)
    .head(50)
)

Flow Visual Analysis

Below, we see soy flows by branches and companies, as well as the municipalities linking to ports. The decision tree results look reasonable, except for some tiny flows that seem unlikely, such as those in Cargill's branches 4.1 and 2.2:

  - Branch 2.2 – CNPJ_14 is linked only to a silo facility from Cadastro.
  - Branch 4.1 – CNPJ_14 is linked to a wholesale/office activity.

In general, branch 4.1 shows high uncertainty in origin locations, since it is based on office addresses. One point for discussion is whether to model sourcing areas from office locations or directly from ports.

import matplotlib.transforms as mtransforms
import matplotlib.ticker as mticker
import numpy as np
from shapely.geometry import LineString
import geopandas as gpd
import matplotlib.pyplot as plt


class LogisticsMapPlotter:
    """Render trade flows as curved arcs from origin municipalities to export
    ports, over a municipality base map (geopandas + matplotlib)."""

    def __init__(self, municipalities_gdf: gpd.GeoDataFrame):
        """
        Initializes the plotter with the background map.

        Parameters
        ----------
        municipalities_gdf : gpd.GeoDataFrame
            Municipality polygons. Column names are lowercased on copy, so a
            ``TRASE_ID`` column becomes ``trase_id`` for the merges below.
        """
        # Store a copy and standardize columns to lowercase immediately
        self.municipalities = municipalities_gdf.copy()
        self.municipalities.columns = [x.lower() for x in self.municipalities.columns]

        # Style Configuration
        self.colors = {
            "background": "#e0e0e0",
            "border": "white",
            "flow": "#2c7fb8",
            "port_text": "#FF0000",
        }

    # --- Helper Methods (formerly standalone functions) ---

    @staticmethod
    def _get_curved_line(p1, p2, num_points=30, curve_intensity=0.2):
        """Generates a curved LineString between two points.

        p1/p2 are shapely Points; the result bulges perpendicular to the
        straight p1->p2 segment, with maximum offset at the midpoint.
        """
        x1, y1 = p1.x, p1.y
        x2, y2 = p2.x, p2.y
        t = np.linspace(0, 1, num_points)

        # Linear interpolation
        x = x1 + (x2 - x1) * t
        y = y1 + (y2 - y1) * t

        # Curve offset
        dx, dy = x2 - x1, y2 - y1
        length = np.sqrt(dx**2 + dy**2)

        # Degenerate case: coincident endpoints -> plain (zero-length) segment.
        if length == 0:
            return LineString([p1, p2])

        # Parabolic profile 4*m*(t - t^2): zero at both endpoints, max m at t=0.5,
        # applied along the unit normal (-dy, dx)/length of the segment.
        offset_magnitude = length * curve_intensity
        offsets = 4 * offset_magnitude * (t - t**2)
        norm_x = -dy / length
        norm_y = dx / length

        x_curve = x + offsets * norm_x
        y_curve = y + offsets * norm_y

        return LineString(list(zip(x_curve, y_curve)))

    @staticmethod
    def _get_arrow_data(line_geom, position_pct=0.6):
        """Calculates position and rotation angle for an arrow on the line.

        Returns (x, y, angle_degrees); (0, 0, 0) for empty/missing geometry.
        """
        if line_geom is None or line_geom.is_empty:
            return 0, 0, 0

        # Sample the point at position_pct and a point slightly ahead to
        # estimate the local direction of travel along the curve.
        point = line_geom.interpolate(position_pct, normalized=True)
        epsilon = 0.01
        point_ahead = line_geom.interpolate(
            min(position_pct + epsilon, 0.99), normalized=True
        )

        dx = point_ahead.x - point.x
        dy = point_ahead.y - point.y
        angle_degrees = np.degrees(np.arctan2(dy, dx))

        return point.x, point.y, angle_degrees

    @staticmethod
    def _format_millions(x, pos):
        """Formatter for the colorbar: 1,500,000 -> "1.5M", 20,000 -> "20k"."""
        if x >= 1e6:
            return f"{x/1e6:.1f}M"
        elif x >= 1e3:
            return f"{x/1e3:.0f}k"
        return f"{x:.0f}"

    def _prepare_data(self, trade_data, company, branch, product, source_field):
        """Filters and merges data with geometries.

        Applies prefix filters (empty string / falsy value = no filter), then
        merges origin and port geometries from self.municipalities and computes
        their centroids. Returns an empty DataFrame when nothing matches.
        """
        df_flows = trade_data.copy()

        # 1. Filters (prefix match; NaN values are treated as non-matching)
        if company:
            df_flows = df_flows[
                df_flows["exporter_name"].str.startswith(company, na=False)
            ]
        if branch:
            # Maintained strict string check from your code
            df_flows = df_flows[df_flows["branch"].str.startswith(branch, na=False)]
        if product:
            df_flows = df_flows[
                df_flows["product_type"].str.startswith(product, na=False)
            ]

        if df_flows.empty:
            return pd.DataFrame()  # Return empty if no data

        # 2. Cleanup & Port Enrichment
        # Drop any pre-existing geometry so the merges below control it.
        if "geometry" in df_flows.columns:
            df_flows = df_flows.drop(columns=["geometry"])

        # 3. Merge Origins
        df_merged = df_flows.merge(
            self.municipalities[["trase_id", "geometry"]],
            left_on=source_field,
            right_on="trase_id",
            how="left",
        ).rename(columns={"geometry": "origin_geom"})

        # 4. Merge Destinations (Ports)
        # suffixes keeps the second trase_id as "trase_id_port" (unused later).
        df_merged = df_merged.merge(
            self.municipalities[["trase_id", "geometry"]],
            left_on="port_municipality_trase_id",
            right_on="trase_id",
            how="left",
            suffixes=("", "_port"),
        ).rename(columns={"geometry": "dest_geom"})

        # 5. Calculate Centroids & Drop Missing
        # Rows whose origin or port geometry did not resolve are discarded.
        df_merged["origin_centroid"] = gpd.GeoSeries(df_merged["origin_geom"]).centroid
        df_merged["dest_centroid"] = gpd.GeoSeries(df_merged["dest_geom"]).centroid
        df_merged = df_merged.dropna(subset=["origin_centroid", "dest_centroid"])

        return df_merged

    # --- Main Plotting Method ---

    def plot_logistic_arcs(
        self,
        trade_data,
        company_name,
        branch,
        product_type,
        source_field,
        chart_title="",
    ):
        """
        Main method to generate the plot.

        Parameters
        ----------
        trade_data : pd.DataFrame
            Flow-level rows with a ``vol`` column and the filter columns used
            by ``_prepare_data``.
        company_name, branch, product_type : str
            Prefix filters; an empty string disables the filter.
        source_field : str
            Column holding the origin municipality trase id.
        chart_title : str
            Suffix appended to the figure title.
        """
        # Prepare Data
        df_merged = self._prepare_data(
            trade_data, company_name, branch, product_type, source_field
        )

        if df_merged.empty:
            print(f"No data to display for {company_name} - {product_type}")
            return

        # Generate Arcs (Curves); flows that start and end in the same
        # municipality are skipped (no arc to draw).
        lines = []
        volumes = []
        for _, row in df_merged.iterrows():
            if not row["origin_centroid"].equals(row["dest_centroid"]):
                curve = self._get_curved_line(
                    row["origin_centroid"], row["dest_centroid"], num_points=35
                )
                lines.append(curve)
                volumes.append(row["vol"])

        gdf_lines = gpd.GeoDataFrame(
            {"vol": volumes}, geometry=lines, crs=self.municipalities.crs
        )

        # --- Plotting ---
        fig, ax = plt.subplots(figsize=(16, 16))

        # 1. Base Map
        self.municipalities.plot(
            ax=ax,
            color=self.colors["background"],
            edgecolor=self.colors["border"],
            linewidth=0.1,
        )

        # 2. Layer A: Sourcing Areas (Heatmap)
        # Choropleth of total exported volume per origin municipality.
        origin_vols = df_merged.groupby(source_field)["vol"].sum().reset_index()
        origin_map = self.municipalities.merge(
            origin_vols, left_on="trase_id", right_on=source_field
        )

        if not origin_map.empty:
            origin_map.plot(
                column="vol",
                ax=ax,
                cmap="magma",
                legend=True,
                legend_kwds={
                    "label": f"{company_name} Exported Volume (t)",
                    "shrink": 0.4,
                },
                alpha=0.9,
            )
            # Fix Colorbar label (the colorbar is the last axes added to the figure)
            cbar = ax.get_figure().axes[-1]
            cbar.yaxis.set_major_formatter(mticker.FuncFormatter(self._format_millions))

        # 3. Layer B: Arcs
        if not gdf_lines.empty:
            max_vol = gdf_lines["vol"].max()
            if max_vol == 0:
                max_vol = 1

            # Dynamic linewidth proportional to flow volume
            linewidths = 0.8 + (gdf_lines["vol"] / max_vol) * 2.5
            gdf_lines.plot(
                ax=ax,
                color=self.colors["flow"],
                alpha=0.1,
                linewidth=linewidths,
                zorder=5,
            )

            # 4. Layer B2: Arrows (direction markers along each arc)
            for _, row in gdf_lines.iterrows():
                arrow_x, arrow_y, angle = self._get_arrow_data(
                    row.geometry, position_pct=0.6
                )
                arrow_size = 8 + (row["vol"] / max_vol) * 12

                # Filter small arrows (only flows above ~4% of max get one)
                if arrow_size > 8.5:
                    # Rotation Transform: rotate the ">" marker to point along the arc
                    t = (
                        mtransforms.Affine2D().rotate_deg_around(
                            arrow_x, arrow_y, angle
                        )
                        + ax.transData
                    )
                    ax.plot(
                        arrow_x,
                        arrow_y,
                        marker=">",
                        markersize=arrow_size,
                        color=self.colors["flow"],
                        alpha=0.4,
                        markeredgewidth=0.2,
                        transform=t,
                        zorder=6,
                    )

        # 5. Layer C: Ports (anchor symbol at each destination municipality)
        valid_ports_ids = df_merged["port_municipality_trase_id"].unique()
        ports_geo = self.municipalities[
            self.municipalities["trase_id"].isin(valid_ports_ids)
        ].copy()

        for _, row in ports_geo.iterrows():
            centroid = row.geometry.centroid
            ax.text(
                centroid.x,
                centroid.y,
                "⚓",
                fontsize=26,
                ha="center",
                va="center",
                color=self.colors["port_text"],
                zorder=15,
            )

        # Final Formatting
        plt.title(f"{company_name} - {product_type} - {chart_title}", fontsize=15)
        plt.axis("off")
        plt.tight_layout()
        plt.show()
import numpy as np

# Replace NaNs with 0 across the frame (missing costs / names on UNKNOWN rows).
# NOTE(review): this puts the integer 0 into string columns; the prefix filters
# in the plotter tolerate it (na=False), but confirm later cells don't expect NaN.
df_export = df_export.fillna(0)
# 1. Instantiate the plotter with your municipality boundaries
plotter = LogisticsMapPlotter(br_boundaries_mun)

# 2. Run the plot: branch 2.3 (silo-list / distant-from-port exports), all companies
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="2.3",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="Located in silo municipalities from facility list",
)
# Inspect branch 2.3 flows exported through Santarem.
# NOTE(review): `.str` inside query() relies on pandas falling back to the
# python engine — confirm this works on the pinned pandas version.
df_export.query('branch.str.startswith("2.3") and port_of_export_name == "SANTAREM"')
# 2. Run the plot: all branch-1 flows (farm-linked exports)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="1",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM FARMS",
)
# 2. Run the plot: all branch-3 flows (crush-linked exports)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="3",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM CRUSH",
)
# Cargill only, branch 3.1 (crush with silo)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="CARGILL",
    branch="3.1",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM CRUSHING (BRANCH 3.1.1)",
)
# Cargill only, branch 3.1.2 (crush with silo(s) in the same municipality)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="CARGILL",
    branch="3.1.2",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM CRUSHING (BRANCH 3.1.2)",
)
# All companies, branch 4 (office/wholesale-linked exports)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="4",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM OFFICES/WHOLESALE (BRANCH 4.1)",
)