View or edit on GitHub
This page is synchronized from trase/models/brazil/beef/Version 2.1 versus 2.2-Copy1.ipynb. Last modified on 2026-03-21 22:30 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
from trase.tools import sps
from tqdm import tqdm
import numpy as np
import pandas as pd
from trase.tools import sps
# Years covered by both the old (v2.1) and new (v2.2) SEI-PCS outputs.
years = [2015, 2016, 2017]
################################################################################################
# Read New (v2.2) data
################################################################################################
# One CSV per year; read everything as strings (na_filter=False keeps empty
# cells as "" rather than NaN) and cast the numeric columns afterwards.
new_frames = [
    sps.get_pandas_df_once(
        f"brazil/beef/sei_pcs/v2.2.0/SEIPCS_BRAZIL_BEEF_{year}.csv",
        dtype=str,
        na_filter=False,
    )
    for year in years
]
df_new = (
    sps.concat(new_frames)
    # Columns absent from the v2.1 data, so they cannot be compared.
    .drop(columns=["PORT_OF_EXPORT", "STATE_OF_EXPORTER"])
    .astype({"YEAR": int, "VOLUME_PRODUCT": float, "VOLUME_RAW": float})
    # Align the column name with what the v2.1 data will be renamed to.
    .rename(columns={"STATE_OF_PRODUCTION": "MDIC_STATE"}, errors="raise")
)
################################################################################################
# Read Old (v2.1) data and align it to the new-style dataframe
################################################################################################
# The v2.1 resubmission is split into 10 CSV parts per year. Generate the key
# list instead of hard-coding all 30 paths; sorting the part numbers with
# key=str yields lexicographic order (1, 10, 2, ..., 9), which preserves the
# exact concatenation order of the original hand-written listing.
old_keys = [
    f"brazil/beef/sei_pcs/v2.1.0/resubmission/BRAZIL_SEIPCS_BEEF_AGGREGATED_{year}_part_{part}.csv"
    for year in years
    for part in sorted(range(1, 11), key=str)
]
df_old = sps.concat(
    [
        sps.get_pandas_df_once(
            key,
            dtype=str,
            na_filter=False,
        )
        for key in old_keys
    ]
)
# Drop v2.1-only columns that have no counterpart in v2.2, then rename the
# remaining ones to the v2.2 naming scheme (errors="raise" guards against a
# schema drift in the input files).
df_old = df_old.drop(
    columns=[
        "BIOME",
        "EXPORTER_GROUP",
        "IMPORTER_GROUP",
        "EQ_FACTOR",
        "BEEF_DEF_5_YEAR_HA",
        "CO2_BEEF_DEF_5_YEAR_TONS",
        "LH_MUNICIPALITY",
        "PRODUCT_DESC",
        # "LH_STATE",
        "STATE",
        "MUNICIPALITY",
    ]
).rename(
    columns={
        "HS6_CODE": "HS6",
        "CNPJ": "EXPORTER_CNPJ",
        "ZDC": "ZERO_DEFORESTATION_BRAZIL_BEEF",
        "VOL": "VOLUME_PRODUCT",
        "CWE": "VOLUME_RAW",
        "LH_CNPJ": "LOGISTICS_HUB_TRASE_ID",
        "LH_GEOCODE": "LOGISTICS_HUB",
        "LH_STATE": "MDIC_STATE",
    },
    errors="raise",
)
# Derive the HS4 code and rebuild the identifier columns with the prefixes
# that the v2.2 output uses, normalising every "UNKNOWN" variant to the
# sentinel values expected downstream.
df_old["HS4"] = df_old["HS6"].str[:4]
df_old["MUNICIPALITY"] = "BR-" + df_old.pop("GEOCODE")

lh_id = "BR-BEEF-SLAUGHTERHOUSE-" + df_old["LOGISTICS_HUB_TRASE_ID"]
df_old["LOGISTICS_HUB_TRASE_ID"] = lh_id.where(
    ~lh_id.str.contains("UNKNOWN"), "BR-BEEF-SLAUGHTERHOUSE-UNKNOWN"
)

hub = "BR-" + df_old["LOGISTICS_HUB"]
df_old["LOGISTICS_HUB"] = hub.where(~hub.str.contains("UNKNOWN"), "BR-XXXXXXX")

muni = df_old["MUNICIPALITY"]
df_old["MUNICIPALITY"] = muni.where(~muni.str.contains("UNKNOWN"), "BR-XXXXXXX")

# v2.1 spells state-level aggregates out by name; v2.2 uses the IBGE
# two-digit state code. Values not present in the mapping pass through.
aggregated_state_codes = {
    "BR-AGGREGATED ACRE": "BR-12-AGGREGATED",
    "BR-AGGREGATED ALAGOAS": "BR-27-AGGREGATED",
    "BR-AGGREGATED AMAPA": "BR-16-AGGREGATED",
    "BR-AGGREGATED AMAZONAS": "BR-13-AGGREGATED",
    "BR-AGGREGATED BAHIA": "BR-29-AGGREGATED",
    "BR-AGGREGATED CEARA": "BR-23-AGGREGATED",
    "BR-AGGREGATED DISTRITO FEDERAL": "BR-53-AGGREGATED",
    "BR-AGGREGATED ESPIRITO SANTO": "BR-32-AGGREGATED",
    "BR-AGGREGATED GOIAS": "BR-52-AGGREGATED",
    "BR-AGGREGATED MARANHAO": "BR-21-AGGREGATED",
    "BR-AGGREGATED MATO GROSSO": "BR-51-AGGREGATED",
    "BR-AGGREGATED MATO GROSSO DO SUL": "BR-50-AGGREGATED",
    "BR-AGGREGATED MINAS GERAIS": "BR-31-AGGREGATED",
    "BR-AGGREGATED PARA": "BR-15-AGGREGATED",
    "BR-AGGREGATED PARAIBA": "BR-25-AGGREGATED",
    "BR-AGGREGATED PARANA": "BR-41-AGGREGATED",
    "BR-AGGREGATED PERNAMBUCO": "BR-26-AGGREGATED",
    "BR-AGGREGATED PIAUI": "BR-22-AGGREGATED",
    "BR-AGGREGATED RIO DE JANEIRO": "BR-33-AGGREGATED",
    "BR-AGGREGATED RIO GRANDE DO NORTE": "BR-24-AGGREGATED",
    "BR-AGGREGATED RIO GRANDE DO SUL": "BR-43-AGGREGATED",
    "BR-AGGREGATED RONDONIA": "BR-11-AGGREGATED",
    "BR-AGGREGATED RORAIMA": "BR-14-AGGREGATED",
    "BR-AGGREGATED SANTA CATARINA": "BR-42-AGGREGATED",
    "BR-AGGREGATED SAO PAULO": "BR-35-AGGREGATED",
    "BR-AGGREGATED SERGIPE": "BR-28-AGGREGATED",
    "BR-AGGREGATED TOCANTINS": "BR-17-AGGREGATED",
}
df_old["MUNICIPALITY"] = df_old["MUNICIPALITY"].replace(aggregated_state_codes)
# Translate the v2.1 decision-tree branch labels into the v2.2 labels so the
# two datasets can be compared on a common BRANCH column. Labels without a
# mapping entry are kept unchanged.
branch_between_versions = {
    "0.": "0. live cattle",
    "1.1": "1.1 sif slaughterhouse",
    "1.2.1": "1.2 exporter sif slaughterhouse in municipality",
    "1.2.2.1": "1.3 subsidiary sif slaughterhouse in municipality",
    "1.2.2.2.1": "1.4 exporter non-sif (gta) slaughterhouse in municipality",
    "1.2.2.2.2.1": "1.5 exporter sif slaughterhouses in state within 250 km",
    "1.2.2.2.2.2.1": "1.6 subsidiary sif slaughterhouses for country in state within 4 hours",
    "1.2.2.2.2.2.2.2": "1.7 unknown slaughterhouse in state",
    "2.1": "2.1 exporter single country sif slaughterhouse in state of production",
    "2.2": "2.2 unknown slaughterhouse in state of production",
    "2.2.2.1": "2.2 unknown slaughterhouse in state of production",
    "2.2.2.2": "2.2 unknown slaughterhouse in state of production",
    "3.": "3. unknown state of production",
}
df_old["BRANCH"] = df_old["BRANCH"].replace(branch_between_versions)
df_old = df_old.astype({"YEAR": int, "VOLUME_PRODUCT": float, "VOLUME_RAW": float})
################################################################################################
# Combine into one dataframe
################################################################################################
# Tag every row with its originating version so the comparison below can
# pivot on the "source" column; keep a pristine copy for the per-year loop.
tagged_frames = [
    df_old.assign(source="old (v2.1)"),
    df_new.assign(source="new (v2.2)"),
]
df = sps.concat(tagged_frames)
df_origin = df.copy()
Compare
Now we analyse how much volume is "different". To do this we choose a step in the path, say logistics hub municipality. Then for each kilogram of volume we categorise it as "different" or "same"
| Was (v2.1) | Now (v2.2) | Verdict |
|---|---|---|
| Known | Known but same | SAME |
| Known | Known but different | DIFFERENT |
| Known | Unknown | DIFFERENT |
| Unknown | * | SAME |
That is, the kilogram of volume is "different" only if the logistics hub municipality was known in v2.1 and has changed in v2.2, either to a different municipality or the unknown municipality.
We can then group the dataframe by exporter CNPJ and order them by how much volume is "different".
# Run the comparison once per year.
# NOTE(review): indentation was lost when this notebook was synchronized;
# the statements below (through the final print) presumably form the loop
# body — confirm against the original .ipynb.
for year in years:
df = df_origin[df_origin["YEAR"] == year]
# Every flow appears twice (once per source), so halve the grand total.
total_volume = df["VOLUME_PRODUCT"].sum() / 2
def pivot(df, on):
    """Aggregate VOLUME_PRODUCT per exporter/source/`on` and spread the two
    sources ("old (v2.1)" / "new (v2.2)") into columns, 0 where absent."""
    consolidated = sps.consolidate(
        df, ["VOLUME_PRODUCT"], ["EXPORTER_CNPJ", "source", *on]
    )
    wide = consolidated.pivot(
        columns="source", index=["EXPORTER_CNPJ", *on], values="VOLUME_PRODUCT"
    )
    return wide.fillna(0).reset_index()
def calculate_same(df, unknowns):
    """Return the volume regarded as unchanged between v2.1 and v2.2.

    Per row, the overlap min(old, new) counts as "same". On top of that,
    left-over *unknown* volume in the old data is optimistically matched
    against left-over *known* volume in the new data, since volume that was
    unknown before cannot have become "different" (see the verdict table).
    `unknowns` maps column name -> sentinel value that marks "unknown".
    """
    overlap = np.minimum(df["old (v2.1)"], df["new (v2.2)"])
    leftover = df.copy()
    leftover["old (v2.1)"] = leftover["old (v2.1)"] - overlap
    leftover["new (v2.2)"] = leftover["new (v2.2)"] - overlap

    shifted = 0
    conditions = [leftover[column] == value for column, value in unknowns.items()]
    if conditions:
        # A row is "unknown" if any of the sentinel columns matches.
        unknown_mask = conditions[0]
        for extra in conditions[1:]:
            unknown_mask = unknown_mask | extra
        # Shift left-over unknown old volume onto left-over known new volume.
        leftover_unknown_old = leftover.loc[unknown_mask, "old (v2.1)"].sum()
        leftover_known_new = leftover.loc[~unknown_mask, "new (v2.2)"].sum()
        shifted = min(leftover_unknown_old, leftover_known_new)
    return overlap.sum() + shifted
# Sentinel value that marks an "unknown" location, per column.
UNKNOWNS = dict(
    LOGISTICS_HUB="BR-XXXXXXX",
    MUNICIPALITY="BR-XXXXXXX",
    STATE="BR-XX",
    MDIC_STATE="UNKNOWN STATE",
)
# The path step to compare on (any subset of the location columns works).
on = ["LOGISTICS_HUB"]
# Total amount different, overall
################################################################
df_pivoted = pivot(df, on)
unchanged = calculate_same(df_pivoted, {key: UNKNOWNS[key] for key in on})
changed = total_volume - unchanged
print(f"Different (overall) on {on}: {100 * changed / total_volume:.2f}%")
# Breakdown by exporter CNPJ
################################################################
grouped = df_pivoted.groupby("EXPORTER_CNPJ")
per_exporter = pd.DataFrame(
    {
        # "same" volume per exporter, with the same optimistic unknown shift.
        "same": grouped.apply(calculate_same, {k: UNKNOWNS[k] for k in on}),
        # Average of the old and new totals per exporter. `.sum()` replaces
        # `.apply(sum)`: passing the builtin relied on pandas translating it
        # internally and raises a FutureWarning on pandas >= 2.x, while the
        # result is identical.
        "total": grouped[["new (v2.2)", "old (v2.1)"]].sum().mean(axis=1),
    }
)
per_exporter["different"] = per_exporter["total"] - per_exporter["same"]
# Sanity check: the per-exporter totals must add up to the overall total.
assert np.isclose(per_exporter["total"].sum(), total_volume)
df_proportions = 100 * per_exporter / total_volume
print(
    f"Different (summed per exporter) on {on}: {df_proportions['different'].sum():.2f}%, probably different to previous number due to rounding errors"
)