CompareNewOldCNPJ
View or edit on GitHub
This page is synchronized from trase/models/brazil/soy/CompareNewOldCNPJ.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
import pandas as pd
from trase.tools import sps
import plotly.express as px
def get(key):
return sps.get_pandas_df_once(key, dtype=str, na_filter=False)
df_old = sps.concat(
[
get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021_old_cnpj.csv"),
get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2022_old_cnpj.csv"),
]
).astype({"VOLUME_RAW": float})
df_new = sps.concat(
[
get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021.csv"),
get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2022.csv"),
]
).astype({"VOLUME_RAW": float})
df_old = (
df_old[["CNPJ", "BRANCH", "VOLUME_RAW", "YEAR"]]
.groupby(["CNPJ", "BRANCH", "YEAR"])["VOLUME_RAW"]
.sum()
.reset_index()
.rename(columns={"VOLUME_RAW": "VOLUME_OLD"})
)
df_new = (
df_new[["CNPJ", "BRANCH", "VOLUME_RAW", "YEAR"]]
.groupby(["CNPJ", "BRANCH", "YEAR"])["VOLUME_RAW"]
.sum()
.reset_index()
.rename(columns={"VOLUME_RAW": "VOLUME_NEW"})
)
df_old["BRANCH"] = df_old["BRANCH"].apply(lambda x: x[:3] if x[:1] == "1" else x[:1])
df_new["BRANCH"] = df_new["BRANCH"].apply(lambda x: x[:3] if x[:1] == "1" else x[:1])
df = pd.merge(
df_old,
df_new,
on=["CNPJ", "YEAR", "BRANCH"],
how="outer",
).fillna(0)
df["VOL_DIFF"] = df["VOLUME_NEW"] - df["VOLUME_OLD"]
df_cnpj_vols = df.groupby(["CNPJ", "YEAR"])["VOLUME_NEW"].sum().reset_index()
df_branches_old = (
df_old.sort_values(["CNPJ", "BRANCH"])
.groupby("CNPJ")["BRANCH"]
.apply(sorted)
.apply(set)
.apply(list)
.reset_index()
.rename(columns={"BRANCH": "BRANCHES_OLD"})
)
df_branches_new = (
df_new.sort_values(["CNPJ", "BRANCH"])
.groupby("CNPJ")["BRANCH"]
.apply(sorted)
.apply(set)
.apply(list)
.reset_index()
.rename(columns={"BRANCH": "BRANCHES_NEW"})
)
df_branches = pd.merge(df_branches_old, df_branches_new, on="CNPJ")
df_branches["BRANCHES_OLD_UNIQUE"] = df_branches.apply(
lambda x: str(sorted([y for y in x["BRANCHES_OLD"] if y not in x["BRANCHES_NEW"]])),
axis=1,
)
df_branches["BRANCHES_NEW_UNIQUE"] = df_branches.apply(
lambda x: str(sorted([y for y in x["BRANCHES_NEW"] if y not in x["BRANCHES_OLD"]])),
axis=1,
)
df_branches = df_branches[df_branches["BRANCHES_OLD"] != df_branches["BRANCHES_NEW"]]
df_branches["BRANCHES_OLD"] = df_branches["BRANCHES_OLD"].apply(str)
df_branches["BRANCHES_NEW"] = df_branches["BRANCHES_NEW"].apply(str)
df_branches = pd.merge(df_branches, df_cnpj_vols, on="CNPJ").sort_values(
["CNPJ", "YEAR"]
)
df_branches["TYPE"] = ""
df_branches["TYPE"] = df_branches["TYPE"].mask(
(df_branches["BRANCHES_NEW_UNIQUE"].str.contains("1.1"))
& (df_branches["TYPE"] == ""),
"only_new_has_farm",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(df_branches["BRANCHES_OLD_UNIQUE"].str.contains("1.1"))
& (df_branches["TYPE"] == ""),
"only_old_has_farm",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(df_branches["BRANCHES_OLD"].str.contains("1."))
& (df_branches["BRANCHES_NEW"].str.contains("1."))
& (df_branches["TYPE"] == ""),
"both_have_branch_1_excess_handled_differently",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(~df_branches["BRANCHES_OLD"].str.contains("1."))
& (df_branches["BRANCHES_NEW"].str.contains("1.3"))
& (df_branches["TYPE"] == ""),
"only_new_large_producer",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(~df_branches["BRANCHES_OLD"].str.contains("1."))
& (df_branches["BRANCHES_NEW"].str.contains("1.2"))
& (df_branches["TYPE"] == ""),
"only_new_large_exporter",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(~df_branches["BRANCHES_NEW"].str.contains("1."))
& (df_branches["BRANCHES_OLD"].str.contains("1.3"))
& (df_branches["TYPE"] == ""),
"only_old_large_producer",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(~df_branches["BRANCHES_NEW"].str.contains("1."))
& (df_branches["BRANCHES_OLD"].str.contains("1.2"))
& (df_branches["TYPE"] == ""),
"only_old_large_exporter",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
(df_branches["TYPE"] == ""), "other (differences in branches 2-3)"
)
# df_branches[df_branches["TYPE"] == ""][["BRANCHES_NEW_UNIQUE", "BRANCHES_OLD_UNIQUE"]].drop_duplicates()
df_branches.groupby(["TYPE", "YEAR"])["VOLUME_NEW"].sum().apply(
int
) # .reset_index().sort_values(["TYPE", "YEAR"])
df_branches.groupby(["TYPE"])["VOLUME_NEW"].sum().apply(
int
) # .reset_index().sort_values(["TYPE", "YEAR"])