Add exporter group to s3
View or edit on GitHub
This page is synchronized from trase/models/brazil/beef/add_exporter_group_to_s3.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after the midnight build (CET time),
or after manually triggering it with a GitHub action (link).
Imports
from trase.tools import sps
from tqdm import tqdm
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from trase.tools.pandasdb import query
from trase.tools import CNX
pd.options.display.float_format = "{:,}".format
SEI_PCS v2.2
# Load the v2.2 SEI-PCS beef flows for the overlap years and stack them into
# a single frame.  Everything is read as string first (na_filter=False keeps
# empty cells as "") and only the numeric columns are cast explicitly.
annual_frames = []
for year in tqdm(range(2015, 2018)):
    annual = sps.get_pandas_df_once(
        f"brazil/beef/sei_pcs/v2.2.0/SEIPCS_BRAZIL_BEEF_{year}.csv",
        dtype=str,
        na_filter=False,
    )
    annual = annual.astype({"VOLUME_RAW": float, "YEAR": float})
    annual_frames.append(annual.rename(columns={"VOLUME_RAW": "volume"}))
df = sps.concat(annual_frames, sort=False)
# Scale raw volume down by 1e3 — presumably kg -> tonnes; confirm source unit.
df["volume"] = df["volume"].div(1e3)
# Notebook inspection output: distinct branch codes present.
df["BRANCH"].unique()
exporters_new = df["EXPORTER"].drop_duplicates()
# Resolve each distinct v2.2 exporter label to a canonical trader name and
# company group.  NOTE(review): `query_with_dataframe` presumably exposes the
# passed series as a temporary table named `df` (the SQL joins against it) —
# confirm in trase.tools.pandasdb.  `groups->0->>'group'` takes the first
# element of the JSON `groups` array; assumes the first group is the relevant
# one — TODO confirm ordering in views.traders.
df_lookup_new = query.query_with_dataframe(
    exporters_new,
    """
select * from (
select distinct
trader_name as exporter_name,
unnest(labels) as exporter_label,
groups->0->>'group' exporter_group
from views.traders
) v
join df on df.exporter = exporter_label
""",
)
# Drop exporter labels that resolved to more than one trader: keep=False
# flags every row of a duplicated label, so ambiguous labels vanish entirely
# (they would violate the many_to_one merge below).
df_lookup_new = df_lookup_new.loc[
    ~df_lookup_new["exporter_label"].duplicated(keep=False)
]
# Left-join the trader lookup onto the flows.
df_new = df.merge(
    df_lookup_new,
    how="left",
    left_on="EXPORTER",
    right_on="exporter_label",
    validate="many_to_one",
)
# Exporters with no lookup match fall back to their raw EXPORTER string.
for fallback_col in ("exporter_name", "exporter_group", "exporter_label"):
    df_new[fallback_col] = df_new[fallback_col].fillna(df_new["EXPORTER"])
# Normalize column names / branch values for comparison with v2.1.
df_new.columns = [column.lower() for column in df_new.columns]
df_new["branch"] = df_new["branch"].str.upper()
# Notebook inspection output.
df_new["branch"].unique()
SEI_PCS v2.1
# Load the published v2.1 flows (ref_id 2710) for the overlap years from the
# supply-chains-concatenated table.
df_old = pd.read_sql(
    'select * from splitgraph."supply-chains-concatenated" where ref_id = 2710 and year in (2015,2016,2017)',
    CNX.cnx,
)
exporters_old = df_old["exporter"].drop_duplicates()
# Same trader-name / exporter-group lookup as for v2.2, keyed on the v2.1
# exporter labels.  NOTE(review): presumably the passed series is exposed to
# the SQL as table `df` — confirm in trase.tools.pandasdb.
df_lookup_old = query.query_with_dataframe(
    exporters_old,
    """
select * from (
select distinct
trader_name as exporter_name,
unnest(labels) as exporter_label,
groups->0->>'group' exporter_group
from views.traders
) v
join df on df.exporter = exporter_label
""",
)
# Remove labels that matched more than one trader (keep=False drops every
# member of a duplicated group) so the merge below can validate many_to_one.
df_lookup_old = df_lookup_old[~df_lookup_old["exporter_label"].duplicated(keep=False)]
# Attach the lookup to the v2.1 flows.  The two frames share column names
# (e.g. both apparently carry `exporter` and `exporter_group`), so pandas
# suffixes the clashes with _x (left) / _y (right) — hence the *_y references
# below.  NOTE(review): this relies on df_old's exact column set: `exporter`
# and `exporter_group` must clash (suffixed) while `exporter_name` must not
# (used unsuffixed) — confirm against the supply-chains-concatenated schema.
df_sei_old = pd.merge(
    df_old,
    df_lookup_old,
    how="left",
    left_on=["exporter"],
    right_on=["exporter_label"],
    validate="many_to_one",
)
# Fall back to a raw label when the lookup missed.  NOTE(review): if
# `exporter_y` comes from the lookup (right) frame it is NaN exactly when the
# merge missed, making these fillna calls no-ops; `exporter_x` (the raw v2.1
# label) may have been intended — confirm.
df_sei_old["exporter_name"] = df_sei_old["exporter_name"].fillna(
    df_sei_old["exporter_y"]
)
df_sei_old["exporter_group"] = df_sei_old["exporter_group_y"].fillna(
    df_sei_old["exporter_y"]
)
df_sei_old["exporter_label"] = df_sei_old["exporter_label"].fillna(
    df_sei_old["exporter_y"]
)
# Harmonize unknown logistics hubs with a single placeholder id so they
# aggregate together when comparing against v2.2.
df_sei_old["logistics_hub"] = df_sei_old.logistics_hub_trase_id
df_sei_old["logistics_hub"] = np.where(
    df_sei_old["logistics_hub"].str.contains("XXXX"),
    "BR-XXXXXXX",
    df_sei_old["logistics_hub"],
)
# Map v2.1 branch codes (dotted decision-tree ids) to the v2.2 branch labels
# so the two versions can be compared at branch level.  Several v2.1 codes
# deliberately collapse into the same v2.2 label (the three "2.2*" codes).
branch_between_versions = {
    "0.": "0. LIVE CATTLE",
    "1.1": "1.1 SIF SLAUGHTERHOUSE",
    "1.2.1": "1.2 EXPORTER SIF SLAUGHTERHOUSE IN MUNICIPALITY",
    "1.2.2.1": "1.3 SUBSIDIARY SIF SLAUGHTERHOUSE IN MUNICIPALITY",
    "1.2.2.2.1": "1.4 EXPORTER NON-SIF (GTA) SLAUGHTERHOUSE IN MUNICIPALITY",
    "1.2.2.2.2.1": "1.5 EXPORTER SIF SLAUGHTERHOUSES IN STATE WITHIN 250 KM",
    "1.2.2.2.2.2.1": "1.6 SUBSIDIARY SIF SLAUGHTERHOUSES FOR COUNTRY IN STATE WITHIN 4 HOURS",
    "1.2.2.2.2.2.2.2": "1.7 UNKNOWN SLAUGHTERHOUSE IN STATE",
    "2.1": "2.1 EXPORTER SINGLE COUNTRY SIF SLAUGHTERHOUSE IN STATE OF PRODUCTION",
    "2.2": "2.2 UNKNOWN SLAUGHTERHOUSE IN STATE OF PRODUCTION",
    "2.2.2.1": "2.2 UNKNOWN SLAUGHTERHOUSE IN STATE OF PRODUCTION",
    "2.2.2.2": "2.2 UNKNOWN SLAUGHTERHOUSE IN STATE OF PRODUCTION",
    "3.": "3. UNKNOWN STATE OF PRODUCTION",
}
# Normalize v2.1 branch identifiers, then translate them to v2.2 labels.
raw_branches = df_sei_old["branch"]
df_sei_old["branch"] = np.where(raw_branches == "LIVE_EXPORTS", "0.", raw_branches)
# Keep only the dotted numeric prefix (e.g. "1.2.1_SOMETHING" -> "1.2.1").
df_sei_old["branch"] = df_sei_old["branch"].str.split("_").str[0]
# Dict indexing (not .map) so an unmapped code fails loudly with a KeyError.
df_sei_old["branch"] = df_sei_old["branch"].apply(
    lambda code: branch_between_versions[code]
)
Compare versions
def compare_versions_sei_pcs(merged_df: dict, key_col: str) -> pd.DataFrame:
    """Aggregate two SEI-PCS versions and flag how closely their volumes agree.

    Parameters
    ----------
    merged_df : dict
        Holds two DataFrames under keys ``"sei_pcs_db"`` (v2.2) and
        ``"sei_old_db"`` (v2.1), each with columns ``year``,
        ``exporter_group``, ``key_col``, ``branch`` and a numeric ``volume``.
        (The original annotation said ``pd.DataFrame``, but the call sites
        pass a dict.)
    key_col : str
        Extra grouping column, e.g. ``"logistics_hub"``.

    Returns
    -------
    pd.DataFrame
        One row per (year, exporter_group, key_col, branch) group for
        2015-2017 with ``volume_SEI_PCS``, ``volume_SEI_OLD``, the relative
        difference ``pct`` and 0/1 flags ``five_pct`` / ``ten_pct``
        (within 5% / 10%).
    """
    merged = merged_df.copy()  # shallow copy: don't clobber the caller's dict
    group_cols = ["year", "exporter_group", key_col, "branch"]
    merged["sei_pcs_db"] = (
        merged["sei_pcs_db"].groupby(group_cols).sum(numeric_only=True).reset_index()
    )
    merged["sei_old_db"] = (
        merged["sei_old_db"].groupby(group_cols).sum(numeric_only=True).reset_index()
    )
    # Outer join keeps flows present in only one version; their missing
    # volume becomes 0 via the fillna below.
    plot_df = pd.merge(
        merged["sei_pcs_db"],
        merged["sei_old_db"],
        on=group_cols,
        how="outer",
        suffixes=("_SEI_PCS", "_SEI_OLD"),
    )
    plot_df = plot_df[plot_df["year"].isin([2015, 2016, 2017])].fillna(0)[
        group_cols + ["volume_SEI_PCS", "volume_SEI_OLD"]
    ]
    # Relative difference: |new - old| / min(new, old).
    # Bug fix: the original divided unconditionally, so two identical volumes
    # of 0 gave 0/0 = NaN and were classified as "different" even though they
    # match exactly.  Equal volumes now always yield pct == 0; a flow present
    # in only one version still divides by 0 -> inf ("different"), as before.
    diff = np.abs(plot_df["volume_SEI_PCS"] - plot_df["volume_SEI_OLD"])
    denom = plot_df[["volume_SEI_PCS", "volume_SEI_OLD"]].min(axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        plot_df.loc[:, "pct"] = np.where(diff == 0, 0.0, diff / denom)
    plot_df.loc[:, "five_pct"] = np.where(plot_df["pct"] <= 0.05, 1, 0)
    plot_df.loc[:, "ten_pct"] = np.where(plot_df["pct"] <= 0.1, 1, 0)
    return plot_df
def plot_comparision_versions_sei_pcs(df, group_by):
    """Show a stacked bar chart of v2.2 volume per year, split into flows that
    agree with v2.1 within 10% ("same") vs. those that do not ("different").

    Parameters
    ----------
    df : pd.DataFrame
        Output of ``compare_versions_sei_pcs`` — needs ``year``,
        ``volume_SEI_PCS`` and the 0/1 ``ten_pct`` flag.
    group_by : str
        Name of the key column compared at; used only in the chart title.
    """
    yes_df = df[df["ten_pct"] == 1].groupby("year").sum(numeric_only=True).reset_index()
    no_df = df[df["ten_pct"] == 0].groupby("year").sum(numeric_only=True).reset_index()
    fig = go.Figure(
        data=[
            go.Bar(name="same", x=yes_df["year"], y=yes_df["volume_SEI_PCS"]),
            go.Bar(name="different", x=no_df["year"], y=no_df["volume_SEI_PCS"]),
        ]
    )
    fig.update_xaxes(title_text="Year")
    fig.update_yaxes(title_text="Volume (Tonnes)")
    fig.update_layout(
        # Stack "same" on "different" so the bar height is the year's total.
        barmode="stack",
        # Fix: title typo "vc." -> "vs.".
        title_text=f"Trader-{group_by} volume from v.2.2 vs. 2.1 - ALL TRADERS within 10% volume",
    )
    fig.show()
# Run the version comparison at logistics-hub level and plot the result.
key = "logistics_hub"
dfs_DB = {"sei_pcs_db": df_new, "sei_old_db": df_sei_old}
plot_df = compare_versions_sei_pcs(dfs_DB, key)
plot_comparision_versions_sei_pcs(plot_df, key)
# Notebook inspection: per-year/branch volume totals of the flows that differ
# by more than 10% between versions, rounded for readability.
round(
    plot_df[plot_df["ten_pct"] == 0].groupby(["year", "branch"]).sum(numeric_only=True)
)
# Same table, unrounded.
plot_df[plot_df["ten_pct"] == 0].groupby(["year", "branch"]).sum(numeric_only=True)
# Inspect the >10% discrepancies for the JBS exporter group, oldest year first.
plot_df[
    (plot_df["exporter_group"].str.contains("JBS")) & (plot_df["ten_pct"] == 0)
].sort_values("year").head(60)