CompareNewOldCNPJ

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy/CompareNewOldCNPJ.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import pandas as pd
from trase.tools import sps
import plotly.express as px


def get(key):
    return sps.get_pandas_df_once(key, dtype=str, na_filter=False)


df_old = sps.concat(
    [
        get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021_old_cnpj.csv"),
        get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2022_old_cnpj.csv"),
    ]
).astype({"VOLUME_RAW": float})
df_new = sps.concat(
    [
        get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021.csv"),
        get("brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2022.csv"),
    ]
).astype({"VOLUME_RAW": float})

df_old = (
    df_old[["CNPJ", "BRANCH", "VOLUME_RAW", "YEAR"]]
    .groupby(["CNPJ", "BRANCH", "YEAR"])["VOLUME_RAW"]
    .sum()
    .reset_index()
    .rename(columns={"VOLUME_RAW": "VOLUME_OLD"})
)
df_new = (
    df_new[["CNPJ", "BRANCH", "VOLUME_RAW", "YEAR"]]
    .groupby(["CNPJ", "BRANCH", "YEAR"])["VOLUME_RAW"]
    .sum()
    .reset_index()
    .rename(columns={"VOLUME_RAW": "VOLUME_NEW"})
)

df_old["BRANCH"] = df_old["BRANCH"].apply(lambda x: x[:3] if x[:1] == "1" else x[:1])
df_new["BRANCH"] = df_new["BRANCH"].apply(lambda x: x[:3] if x[:1] == "1" else x[:1])

df = pd.merge(
    df_old,
    df_new,
    on=["CNPJ", "YEAR", "BRANCH"],
    how="outer",
).fillna(0)

df["VOL_DIFF"] = df["VOLUME_NEW"] - df["VOLUME_OLD"]

df_cnpj_vols = df.groupby(["CNPJ", "YEAR"])["VOLUME_NEW"].sum().reset_index()

df_branches_old = (
    df_old.sort_values(["CNPJ", "BRANCH"])
    .groupby("CNPJ")["BRANCH"]
    .apply(sorted)
    .apply(set)
    .apply(list)
    .reset_index()
    .rename(columns={"BRANCH": "BRANCHES_OLD"})
)
df_branches_new = (
    df_new.sort_values(["CNPJ", "BRANCH"])
    .groupby("CNPJ")["BRANCH"]
    .apply(sorted)
    .apply(set)
    .apply(list)
    .reset_index()
    .rename(columns={"BRANCH": "BRANCHES_NEW"})
)

df_branches = pd.merge(df_branches_old, df_branches_new, on="CNPJ")

df_branches["BRANCHES_OLD_UNIQUE"] = df_branches.apply(
    lambda x: str(sorted([y for y in x["BRANCHES_OLD"] if y not in x["BRANCHES_NEW"]])),
    axis=1,
)
df_branches["BRANCHES_NEW_UNIQUE"] = df_branches.apply(
    lambda x: str(sorted([y for y in x["BRANCHES_NEW"] if y not in x["BRANCHES_OLD"]])),
    axis=1,
)

df_branches = df_branches[df_branches["BRANCHES_OLD"] != df_branches["BRANCHES_NEW"]]

df_branches["BRANCHES_OLD"] = df_branches["BRANCHES_OLD"].apply(str)
df_branches["BRANCHES_NEW"] = df_branches["BRANCHES_NEW"].apply(str)

df_branches = pd.merge(df_branches, df_cnpj_vols, on="CNPJ").sort_values(
    ["CNPJ", "YEAR"]
)

df_branches["TYPE"] = ""
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (df_branches["BRANCHES_NEW_UNIQUE"].str.contains("1.1"))
    & (df_branches["TYPE"] == ""),
    "only_new_has_farm",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (df_branches["BRANCHES_OLD_UNIQUE"].str.contains("1.1"))
    & (df_branches["TYPE"] == ""),
    "only_old_has_farm",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (df_branches["BRANCHES_OLD"].str.contains("1."))
    & (df_branches["BRANCHES_NEW"].str.contains("1."))
    & (df_branches["TYPE"] == ""),
    "both_have_branch_1_excess_handled_differently",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (~df_branches["BRANCHES_OLD"].str.contains("1."))
    & (df_branches["BRANCHES_NEW"].str.contains("1.3"))
    & (df_branches["TYPE"] == ""),
    "only_new_large_producer",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (~df_branches["BRANCHES_OLD"].str.contains("1."))
    & (df_branches["BRANCHES_NEW"].str.contains("1.2"))
    & (df_branches["TYPE"] == ""),
    "only_new_large_exporter",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (~df_branches["BRANCHES_NEW"].str.contains("1."))
    & (df_branches["BRANCHES_OLD"].str.contains("1.3"))
    & (df_branches["TYPE"] == ""),
    "only_old_large_producer",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (~df_branches["BRANCHES_NEW"].str.contains("1."))
    & (df_branches["BRANCHES_OLD"].str.contains("1.2"))
    & (df_branches["TYPE"] == ""),
    "only_old_large_exporter",
)
df_branches["TYPE"] = df_branches["TYPE"].mask(
    (df_branches["TYPE"] == ""), "other (differences in branches 2-3)"
)

# df_branches[df_branches["TYPE"] == ""][["BRANCHES_NEW_UNIQUE", "BRANCHES_OLD_UNIQUE"]].drop_duplicates()
df_branches.groupby(["TYPE", "YEAR"])["VOLUME_NEW"].sum().apply(
    int
)  # .reset_index().sort_values(["TYPE", "YEAR"])
df_branches.groupby(["TYPE"])["VOLUME_NEW"].sum().apply(
    int
)  # .reset_index().sort_values(["TYPE", "YEAR"])