Skip to content

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy_2023_2024_v27/main.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

from trase.tools import sps
import warnings
from trase.tools.aws.metadata import write_csv_for_upload
import pandas as pd
from trase.tools.aws.aws_helpers import read_geojson

# Silence all warnings for cleaner notebook output (the bills-of-lading
# decision tree otherwise floods the cells with pandas merge warnings).
warnings.filterwarnings("ignore")

# Display floats with thousands separators and two decimals in DataFrame output.
pd.options.display.float_format = "{:,.2f}".format


# Run the full supply-chain model for each target year. A failure in one year
# is logged instead of aborting, so any remaining years still run.
for year in [2023]:
    try:
        print(f"Running {year}")
        supplychain = sps.SupplyChain("brazil/soy_2023_2024_v27", year=year)
        supplychain.preparation()
        supplychain.load()
        supplychain.run()
        # Report volume attribution per decision-tree branch.
        supplychain.flow_report_by_attribute("vol", ["branch"], 8)

        # supplychain.export_results()
        # quality_assurance(year)
        # supplychain.upload_results()
    except Exception as e:
        # Keep the full traceback, not just the message, so a failed year
        # can be diagnosed from the notebook output alone.
        import traceback

        print(f"Failed {year}: {e}")
        traceback.print_exc()
Running 2023
Extracting data from source ...  took 0.4 seconds
Skipping re-process of Municipality
Extracting data from source ...  took 0.3 seconds
Skipping re-process of IndustrialCapacity
Extracting data from source ...  took 0.3 seconds
Skipping re-process of IndustrialCapacityFacilities
Extracting data from source ...  took 0.3 seconds
Skipping re-process of CommodityRatios
Extracting data from source ...  took 0.3 seconds
Skipping re-process of ExporterSpecialCases
Extracting data from source ...  took 0.3 seconds
Skipping re-process of ExtraCnpjs
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Production
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Cost
    Loading data from disk took 15.0 seconds
Extracting data from source ...  took 0.3 seconds
Skipping re-process of SilosGeometry
Running pre-processing for Silos
Report: Before processing
        | Row Count: 0
Report: After processing
        | Row Count: 8,672
Written /Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/2023/prepared/silos.csv
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Hs
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Flows
Extracting data from source ...  took 0.4 seconds
Running pre-processing for Cnpj
Report: Before processing
        | Row Count: 427,313
Report: Drop rows with missing level or geocode
        | Row Count: 427,313
Report: Drop duplicates
        | Row Count: 351,483
Report: Add extra CNPJs
        | Row Count: 352,127
Report: After processing
        | Row Count: 318,727
Written /Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/2023/prepared/cnpj.csv
Extracting data from source ...  took 0.3 seconds
Skipping re-process of Sicasq


Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:65
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:237
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:253
Use the validate option with pd.merge or pass validate=None to suppress this warning
/Users/jailsonsoares/repos/TRASE/trase/models/brazil/soy_2023_2024_v27/bills_of_lading_decision_tree.py:261


                                                                               sum percentage
branch                                                                                       
UNPROCESSED                                                             31,320,377        27%
4.1.1. office is not located at a port and trades >20,000 tonnes of soy 25,688,919        22%
2.2. link to silo cadastro only                                         17,630,685        15%
2.2.2 link to silo cadastro only (farm cnpj)                             8,929,045         8%
2.1. link to silo list and cadastro                                      7,085,734         6%
1.1. link to farm municipalities                                         5,736,324         5%
4.1.2. office is located at a port or is big city                        5,676,506         5%
4.1.3 exports from unlikely routes and/or distant from ports             5,326,977         5%
3.1.1 link to crush with silo                                            3,337,692         3%
2.3 exports from silo list or CNPJ located distant from ports            3,202,632         3%
3.2.2 link to crush no silos in municipality, use supply shed            1,745,307         1%
3.1.2.2 link to crush with > 1 silo in same municipality                   374,407         0%
1.2. link to farm municipalities located distant from ports                286,258         0%
3.2.1 link to crush no silos in municipality, use SICASQ                    19,867         0%
3.1.2.1 link to crush with 1 silo in same municipality                       6,000         0%
# Municipality boundaries: used to attach names here and as the base map later.
br_boundaries_mun = read_geojson(
    "brazil/spatial/boundaries/ibge/2020/br_municipalities_2020_ibge_4326_simplified.geojson",
    bucket="trase-storage",
)

# Keep only the columns needed for the branch analysis. `.copy()` makes the
# selection an independent frame, so the `.loc` assignment below cannot hit
# pandas' chained-assignment (SettingWithCopy) pitfall on a view.
df_export = supplychain.get("flows")[
    [
        "exporter_municipality_trase_id",
        "logistics_hub.trase_id",
        "logistics_hub.name",
        "exporter_group",
        "exporter_name",
        "exporter_cnpj",
        "port_of_export_name",
        "port_of_export_label",
        "port_municipality_trase_id",
        "country_of_destination_name",
        "branch",
        "product_type",
        "vol",
    ]
].copy()

# Flows with an unknown exporter cannot be classified by the decision tree.
df_export.loc[df_export["exporter_name"] == "UNKNOWN", "branch"] = "UNKNOWN"

# Add municipality name
df_export = (
    df_export.merge(
        br_boundaries_mun[["TRASE_ID", "MUN_NAME_C"]],
        how="left",
        left_on=["exporter_municipality_trase_id"],
        right_on="TRASE_ID",
    )
    .drop(columns=["TRASE_ID"])
    .rename(columns={"MUN_NAME_C": "exporter_municipality_name"})
)

# Rename the UNPROCESSED branch to its human-readable label.
df_export = df_export.replace(
    {"branch": {"UNPROCESSED": "5. no soy economic activity"}}
)

# Origin -> destination transport-cost table.
df_costs = supplychain.get("cost")

Add the cost attribute to each flow

# Join the per-route cost onto each flow; the cost table's join keys
# ("origin"/"destination") are redundant afterwards and dropped.
_with_cost = df_export.merge(
    df_costs,
    how="left",
    left_on=["logistics_hub.trase_id", "port_municipality_trase_id"],
    right_on=["origin", "destination"],
)
df_export = _with_cost.drop(columns=["origin", "destination"])

df_export
exporter_municipality_trase_id logistics_hub.trase_id logistics_hub.name exporter_group exporter_name exporter_cnpj port_of_export_name port_of_export_label port_municipality_trase_id country_of_destination_name branch product_type vol exporter_municipality_name cost
0 BR-4314902 BR-XXXXXXX UNKNOWN CARGILL CARGILL AGRICOLA SA 60498706020930 RIO GRANDE RIO GRANDE BR-4315602 CHINA (MAINLAND) 5. no soy economic activity SOYBEANS 65,280.00 PORTO ALEGRE NaN
1 BR-2100808 BR-2100808 ANAPURUS NOVAAGRI INFRA-ESTRUTURA DE ARMAZENAGEM E ESCO... NOVAAGRI INFRA-ESTRUTURA DE ARMAZENAGEM E ESCO... 09077252001831 BARCARENA VILA DO CONDE BR-1501303 CHINA (MAINLAND) 2.1. link to silo list and cadastro SOYBEANS 63,000.00 ANAPURUS 13.30
2 BR-1506195 BR-1506195 RUROPOLIS LOUIS DREYFUS LOUIS DREYFUS COMPANY BRASIL S.A. 47067525019550 SANTAREM SANTAREM BR-1506807 PORTUGAL 2.2. link to silo cadastro only SOYBEANS 45,700.00 RUROPOLIS 2.89
3 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 PARANAGUA PARANAGUA BR-4118204 CHINA (MAINLAND) UNKNOWN SOYBEANS 30,000.00 NaN NaN
4 BR-2919553 BR-2919553 LUIS EDUARDO MAGALHAES ENGELHART ENGELHART 14796754001186 RIO GRANDE RIO GRANDE BR-4315602 IRAN 4.1.1. office is not located at a port and tra... SOYBEANS 67,160.00 LUIS EDUARDO MAGALHAES 37.38
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11276 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 SANTOS SANTOS BR-3548500 JAPAN UNKNOWN SOYBEAN CAKE 51,252.70 NaN NaN
11277 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 1,500.00 NaN NaN
11278 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 2,100.00 NaN NaN
11279 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 3,000.00 NaN NaN
11280 BR-XXXXXXX UNKNOWN UNKNOWN UNKNOWN 00000000000000 RIO GRANDE RIO GRANDE BR-4315602 THAILAND UNKNOWN SOYBEAN CAKE 4,000.00 NaN NaN

11281 rows × 15 columns

# Persist the branch-level view for external inspection.
branch_columns = [
    "exporter_group",
    "branch",
    "logistics_hub.trase_id",
    "port_of_export_name",
    "port_municipality_trase_id",
    "country_of_destination_name",
    "product_type",
    "vol",
]
df_export[branch_columns].to_csv("branches.csv", sep=";", index=False)

# Sanity check: total volume in the prepared flows vs. the analysis frame.
flows = pd.read_csv("2023/prepared/flows.csv", sep=";")
flows[["vol"]].sum()
df_export[["vol"]].sum()
# Aggregate exported volume per exporter / hub / port / branch / product —
# this is the table fed to the supply-shed upload.
_shed_keys = [
    "exporter_name",
    "exporter_cnpj",
    "logistics_hub.trase_id",
    "port_of_export_name",
    "branch",
    "product_type",
]
data_for_supply_shed = (
    df_export[_shed_keys + ["vol"]]
    .groupby(by=_shed_keys, as_index=False)["vol"]
    .sum()
)

# write_csv_for_upload(
#     df=data_for_supply_shed,
#     bucket='trase-storage',
#     key='brazil/soy/sei_pcs/v2.7.0/supply_zone/SEIPCS_BRAZIL_SOY_PARTIAL_BRANCHES_V27.csv',
#     do_upload=True
# )

Aggregated sums per branch + unknowns

Below are the aggregations per branch.

# Collapse detailed branch labels (e.g. "2.2.2 ...") to their "X.Y" prefix;
# labels without a numeric "X.Y" prefix fall back to the full branch string.
_prefix = df_export["branch"].str.extract(r"(\d+\.\d+)", expand=False)
df_export["branch_high_level"] = _prefix.fillna(df_export["branch"])

# Total volume per high-level branch.
df_summary = df_export.groupby("branch_high_level", as_index=False)["vol"].sum()

# Share of the overall volume, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)

df_summary.sort_values(by=["branch_high_level"])

Volumes from UNPROCESSED branches by product

# Total volume per (branch, product) combination.
df_summary = df_export.groupby(
    by=["branch", "product_type"], as_index=False
)["vol"].sum()

# Share of the overall volume, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)

df_summary

Top 10 Exporters from UNPROCESSED branches

# Volume per exporter within the "no soy economic activity" (ex-UNPROCESSED) branch.
_unclassified_flows = df_export.query('branch == "5. no soy economic activity"')
df_summary = _unclassified_flows.groupby(by=["exporter_name"], as_index=False)["vol"].sum()

# Share of the branch total, in percent.
df_summary["percent"] = (df_summary["vol"] / df_summary["vol"].sum() * 100).round(2)

df_summary.sort_values(by=["vol"], ascending=False).head(10)

Economic activity of exporters in UNPROCESSED branches

# Deduplicated CNPJ -> CNAE activity-level lookup table.
cnpj_level = supplychain.get("cnpj", subset=["cnpj", "level"]).drop_duplicates()

# Restrict to the unclassified branch and attach each exporter's CNAE level.
_no_activity = df_export[df_export["branch_high_level"] == "5. no soy economic activity"]
df_export_level = _no_activity.merge(
    cnpj_level, left_on="exporter_cnpj", right_on="cnpj", how="left"
)

# CNPJs absent from the lookup get the sentinel level 10000 ("not classified").
df_export_level["level"] = df_export_level["level"].fillna(10000)

30% of the exporters in UNPROCESSED branches DON'T have CNAE levels related to soy activity, even though they are subsidiaries of big trade groups or weren't present in our CADASTRO database. Two things could increase the number of connections:

  1. A decision rule based on the CNPJ8 for soybeans
  2. Trying to manually include the missing CNPJs
# Volume (and share) per CNAE level among the unclassified-branch exporters.
df_product_level = df_export_level.groupby(by=["level"], as_index=False)["vol"].sum()
_share = df_product_level["vol"] / df_product_level["vol"].sum() * 100
df_product_level["percent"] = _share.round(2)

df_product_level

Some of the CNAEs from UNPROCESSED branches could be included as level 4 (wholesale activity):

  1. 4681801 - Comércio atacadista de álcool carburante, biodiesel, gasolina e demais derivados de petróleo, exceto lubrificantes
  2. 4637103 - Comércio atacadista de óleos e gorduras

Below the exporters whose CNPJ are not classified as soy activity

# Exporters whose CNPJ carries no soy-related CNAE classification
# (sentinel level 10000), ranked by total exported volume.
_not_classified = df_export_level.query("level == 10000")[
    [
        "exporter_name",
        "exporter_cnpj",
        "port_of_export_name",
        "branch",
        "product_type",
        "vol",
        "exporter_municipality_name",
        "level",
    ]
]
(
    _not_classified.groupby(
        by=["exporter_cnpj", "exporter_name", "exporter_municipality_name"],
        as_index=False,
    )["vol"]
    .sum()
    .sort_values(by=["vol"], ascending=False)
    .head(50)
)

Flow Visual Analysis

Below, we see soy flows by branches and companies, as well as the municipalities linking to ports. The decision tree results look reasonable, except for some tiny flows that seem unlikely, such as those in Cargill's branches 4.1 and 2.2:

  - Branch 2.2 – CNPJ_14 is linked only to a silo facility from Cadastro.
  - Branch 4.1 – CNPJ_14 is linked to a wholesale/office activity.

In general, branch 4.1 shows high uncertainty in origin locations, since it is based on office addresses. One point for discussion is whether to model sourcing areas from office locations or directly from ports.

import matplotlib.transforms as mtransforms
import matplotlib.ticker as mticker
import numpy as np
from shapely.geometry import LineString
import geopandas as gpd
import matplotlib.pyplot as plt


class LogisticsMapPlotter:
    """Render trade flows as curved arcs from origin municipalities to export
    ports, over a municipality base map (geopandas + matplotlib)."""

    def __init__(self, municipalities_gdf: gpd.GeoDataFrame):
        """
        Initializes the plotter with the background map.

        Parameters
        ----------
        municipalities_gdf : gpd.GeoDataFrame
            Municipality polygons. Column names are lowercased on copy, so a
            ``TRASE_ID`` column becomes ``trase_id`` for the merges below.
        """
        # Store a copy and standardize columns to lowercase immediately
        self.municipalities = municipalities_gdf.copy()
        self.municipalities.columns = [x.lower() for x in self.municipalities.columns]

        # Style Configuration
        self.colors = {
            "background": "#e0e0e0",
            "border": "white",
            "flow": "#2c7fb8",
            "port_text": "#FF0000",
        }

    # --- Helper Methods (formerly standalone functions) ---

    @staticmethod
    def _get_curved_line(p1, p2, num_points=30, curve_intensity=0.2):
        """Generates a curved LineString between two points.

        p1/p2 are shapely Points; the result bulges perpendicular to the
        straight p1->p2 segment, with maximum offset at the midpoint.
        """
        x1, y1 = p1.x, p1.y
        x2, y2 = p2.x, p2.y
        t = np.linspace(0, 1, num_points)

        # Linear interpolation
        x = x1 + (x2 - x1) * t
        y = y1 + (y2 - y1) * t

        # Curve offset
        dx, dy = x2 - x1, y2 - y1
        length = np.sqrt(dx**2 + dy**2)

        # Degenerate case: coincident endpoints -> plain (zero-length) segment.
        if length == 0:
            return LineString([p1, p2])

        # Parabolic profile 4*m*(t - t^2): zero at both endpoints, max m at t=0.5,
        # applied along the unit normal (-dy, dx)/length of the segment.
        offset_magnitude = length * curve_intensity
        offsets = 4 * offset_magnitude * (t - t**2)
        norm_x = -dy / length
        norm_y = dx / length

        x_curve = x + offsets * norm_x
        y_curve = y + offsets * norm_y

        return LineString(list(zip(x_curve, y_curve)))

    @staticmethod
    def _get_arrow_data(line_geom, position_pct=0.6):
        """Calculates position and rotation angle for an arrow on the line.

        Returns (x, y, angle_degrees); (0, 0, 0) for empty/missing geometry.
        """
        if line_geom is None or line_geom.is_empty:
            return 0, 0, 0

        # Sample the point at position_pct and a point slightly ahead to
        # estimate the local direction of travel along the curve.
        point = line_geom.interpolate(position_pct, normalized=True)
        epsilon = 0.01
        point_ahead = line_geom.interpolate(
            min(position_pct + epsilon, 0.99), normalized=True
        )

        dx = point_ahead.x - point.x
        dy = point_ahead.y - point.y
        angle_degrees = np.degrees(np.arctan2(dy, dx))

        return point.x, point.y, angle_degrees

    @staticmethod
    def _format_millions(x, pos):
        """Formatter for the colorbar: 1,500,000 -> "1.5M", 20,000 -> "20k"."""
        if x >= 1e6:
            return f"{x/1e6:.1f}M"
        elif x >= 1e3:
            return f"{x/1e3:.0f}k"
        return f"{x:.0f}"

    def _prepare_data(self, trade_data, company, branch, product, source_field):
        """Filters and merges data with geometries.

        Applies prefix filters (empty string / falsy value = no filter), then
        merges origin and port geometries from self.municipalities and computes
        their centroids. Returns an empty DataFrame when nothing matches.
        """
        df_flows = trade_data.copy()

        # 1. Filters (prefix match; NaN values are treated as non-matching)
        if company:
            df_flows = df_flows[
                df_flows["exporter_name"].str.startswith(company, na=False)
            ]
        if branch:
            # Maintained strict string check from your code
            df_flows = df_flows[df_flows["branch"].str.startswith(branch, na=False)]
        if product:
            df_flows = df_flows[
                df_flows["product_type"].str.startswith(product, na=False)
            ]

        if df_flows.empty:
            return pd.DataFrame()  # Return empty if no data

        # 2. Cleanup & Port Enrichment
        # Drop any pre-existing geometry so the merges below control it.
        if "geometry" in df_flows.columns:
            df_flows = df_flows.drop(columns=["geometry"])

        # 3. Merge Origins
        df_merged = df_flows.merge(
            self.municipalities[["trase_id", "geometry"]],
            left_on=source_field,
            right_on="trase_id",
            how="left",
        ).rename(columns={"geometry": "origin_geom"})

        # 4. Merge Destinations (Ports)
        # suffixes keeps the second trase_id as "trase_id_port" (unused later).
        df_merged = df_merged.merge(
            self.municipalities[["trase_id", "geometry"]],
            left_on="port_municipality_trase_id",
            right_on="trase_id",
            how="left",
            suffixes=("", "_port"),
        ).rename(columns={"geometry": "dest_geom"})

        # 5. Calculate Centroids & Drop Missing
        # Rows whose origin or port geometry did not resolve are discarded.
        df_merged["origin_centroid"] = gpd.GeoSeries(df_merged["origin_geom"]).centroid
        df_merged["dest_centroid"] = gpd.GeoSeries(df_merged["dest_geom"]).centroid
        df_merged = df_merged.dropna(subset=["origin_centroid", "dest_centroid"])

        return df_merged

    # --- Main Plotting Method ---

    def plot_logistic_arcs(
        self,
        trade_data,
        company_name,
        branch,
        product_type,
        source_field,
        chart_title="",
    ):
        """
        Main method to generate the plot.

        Parameters
        ----------
        trade_data : pd.DataFrame
            Flow-level rows with a ``vol`` column and the filter columns used
            by ``_prepare_data``.
        company_name, branch, product_type : str
            Prefix filters; an empty string disables the filter.
        source_field : str
            Column holding the origin municipality trase id.
        chart_title : str
            Suffix appended to the figure title.
        """
        # Prepare Data
        df_merged = self._prepare_data(
            trade_data, company_name, branch, product_type, source_field
        )

        if df_merged.empty:
            print(f"No data to display for {company_name} - {product_type}")
            return

        # Generate Arcs (Curves); flows that start and end in the same
        # municipality are skipped (no arc to draw).
        lines = []
        volumes = []
        for _, row in df_merged.iterrows():
            if not row["origin_centroid"].equals(row["dest_centroid"]):
                curve = self._get_curved_line(
                    row["origin_centroid"], row["dest_centroid"], num_points=35
                )
                lines.append(curve)
                volumes.append(row["vol"])

        gdf_lines = gpd.GeoDataFrame(
            {"vol": volumes}, geometry=lines, crs=self.municipalities.crs
        )

        # --- Plotting ---
        fig, ax = plt.subplots(figsize=(16, 16))

        # 1. Base Map
        self.municipalities.plot(
            ax=ax,
            color=self.colors["background"],
            edgecolor=self.colors["border"],
            linewidth=0.1,
        )

        # 2. Layer A: Sourcing Areas (Heatmap)
        # Choropleth of total exported volume per origin municipality.
        origin_vols = df_merged.groupby(source_field)["vol"].sum().reset_index()
        origin_map = self.municipalities.merge(
            origin_vols, left_on="trase_id", right_on=source_field
        )

        if not origin_map.empty:
            origin_map.plot(
                column="vol",
                ax=ax,
                cmap="magma",
                legend=True,
                legend_kwds={
                    "label": f"{company_name} Exported Volume (t)",
                    "shrink": 0.4,
                },
                alpha=0.9,
            )
            # Fix Colorbar label (the colorbar is the last axes added to the figure)
            cbar = ax.get_figure().axes[-1]
            cbar.yaxis.set_major_formatter(mticker.FuncFormatter(self._format_millions))

        # 3. Layer B: Arcs
        if not gdf_lines.empty:
            max_vol = gdf_lines["vol"].max()
            if max_vol == 0:
                max_vol = 1

            # Dynamic linewidth proportional to flow volume
            linewidths = 0.8 + (gdf_lines["vol"] / max_vol) * 2.5
            gdf_lines.plot(
                ax=ax,
                color=self.colors["flow"],
                alpha=0.1,
                linewidth=linewidths,
                zorder=5,
            )

            # 4. Layer B2: Arrows (direction markers along each arc)
            for _, row in gdf_lines.iterrows():
                arrow_x, arrow_y, angle = self._get_arrow_data(
                    row.geometry, position_pct=0.6
                )
                arrow_size = 8 + (row["vol"] / max_vol) * 12

                # Filter small arrows (only flows above ~4% of max get one)
                if arrow_size > 8.5:
                    # Rotation Transform: rotate the ">" marker to point along the arc
                    t = (
                        mtransforms.Affine2D().rotate_deg_around(
                            arrow_x, arrow_y, angle
                        )
                        + ax.transData
                    )
                    ax.plot(
                        arrow_x,
                        arrow_y,
                        marker=">",
                        markersize=arrow_size,
                        color=self.colors["flow"],
                        alpha=0.4,
                        markeredgewidth=0.2,
                        transform=t,
                        zorder=6,
                    )

        # 5. Layer C: Ports (anchor symbol at each destination municipality)
        valid_ports_ids = df_merged["port_municipality_trase_id"].unique()
        ports_geo = self.municipalities[
            self.municipalities["trase_id"].isin(valid_ports_ids)
        ].copy()

        for _, row in ports_geo.iterrows():
            centroid = row.geometry.centroid
            ax.text(
                centroid.x,
                centroid.y,
                "⚓",
                fontsize=26,
                ha="center",
                va="center",
                color=self.colors["port_text"],
                zorder=15,
            )

        # Final Formatting
        plt.title(f"{company_name} - {product_type} - {chart_title}", fontsize=15)
        plt.axis("off")
        plt.tight_layout()
        plt.show()
import numpy as np

# Replace NaNs with 0 across the frame (missing costs / names on UNKNOWN rows).
# NOTE(review): this puts the integer 0 into string columns; the prefix filters
# in the plotter tolerate it (na=False), but confirm later cells don't expect NaN.
df_export = df_export.fillna(0)
# 1. Instantiate the plotter with your municipality boundaries
plotter = LogisticsMapPlotter(br_boundaries_mun)

# 2. Run the plot: branch 2.3 (silo-list / distant-from-port exports), all companies
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="2.3",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="Located in silo municipalities from facility list",
)
# Inspect branch 2.3 flows exported through Santarem.
# NOTE(review): `.str` inside query() relies on pandas falling back to the
# python engine — confirm this works on the pinned pandas version.
df_export.query('branch.str.startswith("2.3") and port_of_export_name == "SANTAREM"')
# 2. Run the plot: all branch-1 flows (farm-linked exports)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="1",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM FARMS",
)
# 2. Run the plot: all branch-3 flows (crush-linked exports)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="3",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM CRUSH",
)
# Cargill only, branch 3.1 (crush with silo)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="CARGILL",
    branch="3.1",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM CRUSHING (BRANCH 3.1.1)",
)
# Cargill only, branch 3.1.2 (crush with silo(s) in the same municipality)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="CARGILL",
    branch="3.1.2",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM CRUSHING (BRANCH 3.1.2)",
)
# All companies, branch 4 (office/wholesale-linked exports)
plotter.plot_logistic_arcs(
    trade_data=df_export,
    company_name="",
    branch="4",
    product_type="",
    source_field="logistics_hub.trase_id",
    chart_title="EXPORTS FROM OFFICES/WHOLESALE (BRANCH 4.1)",
)