CNPJ validity analysis

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy/CNPJ validity analysis.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import pandas as pd
import stdnum.br.cnpj as cnpj

from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.pandasdb import query

pd.options.display.float_format = "{:.1f}".format
# BOL 2022
bol = get_pandas_df_once(
    "brazil/trade/bol/2022/BRAZIL_BOL_2022.csv",
    encoding="utf8",
    sep=";",
    dtype={"exporter.cnpj": str, "hs4": str, "vol": float},
    keep_default_na=False,
)[["exporter.label", "exporter.cnpj", "hs4", "vol"]]

# Commodities
beef_comm = pd.DataFrame(
    ["0201", "0202", "0206", "0504", "1602", "0210", "0102"], columns=["hs4"]
)
beef_comm["commodity"] = "BEEF"

soy_comm = pd.DataFrame(["1507", "2304", "1201", "2302"], columns=["hs4"])
soy_comm["commodity"] = "SOY"

comm = pd.concat([beef_comm, soy_comm]).set_index(keys=["hs4"])

# Join BOL and commodities
bol_comm = bol.join(comm, on="hs4", how="inner")

# check if bol cnpj is valid
bol_comm["cnpj_is_valid"] = bol_comm["exporter.cnpj"].map(cnpj.is_valid)
# Total vol per commodity
bol_comm.groupby("commodity")["vol"].sum()
commodity
BEEF    2314494089.0
SOY    93084652600.0
Name: vol, dtype: float64
# Distinct CNPJs by commodity
bol_comm.groupby(by=["commodity"])["exporter.cnpj"].nunique()
commodity
BEEF    364
SOY     403
Name: exporter.cnpj, dtype: int64
# Count and total volume of BOL records by commodity and valid cnpj
bol_comm.groupby(by=["commodity", "cnpj_is_valid"]).agg({"vol": ["count", "sum"]})
vol
count sum
commodity cnpj_is_valid
BEEF False 172 4469682.0
True 89619 2310024407.0
SOY False 704 13861657691.0
True 8903 79222994909.0
# Vol percentage by commodity and valid CNPJ
(
    bol_comm.groupby(by=["commodity", "cnpj_is_valid"])
    .agg({"vol": "sum"})
    .join(
        bol_comm.groupby(by=["commodity"])["vol"].sum(),
        on="commodity",
        rsuffix="_total",
    )
    .assign(perc_vol=lambda x: round(x["vol"] / x["vol_total"] * 100, 1))[["perc_vol"]]
)
perc_vol
commodity cnpj_is_valid
BEEF False 0.2
True 99.8
SOY False 14.9
True 85.1
# Total valid/invalid distinct CNPJs
bol_comm.groupby(by=["cnpj_is_valid"])["exporter.cnpj"].nunique()
cnpj_is_valid
False      1
True     756
Name: exporter.cnpj, dtype: int64
# List of invalid CNPJs
bol_comm[bol_comm["cnpj_is_valid"] == False]["exporter.cnpj"].unique()
array(['00000000000000'], dtype=object)
# Invalid CNPJs Vol percentage by commodity and exporter label
(
    bol_comm[bol_comm["cnpj_is_valid"] == False][["commodity", "exporter.label", "vol"]]
    .groupby(by=["commodity", "exporter.label"])
    .agg({"vol": "sum"})
    .join(
        bol_comm[bol_comm["cnpj_is_valid"] == False]
        .groupby(by=["commodity"])["vol"]
        .sum(),
        on="commodity",
        rsuffix="_total",
    )
    .assign(perc_vol=lambda x: round(x["vol"] / x["vol_total"] * 100, 2))
    .reset_index()
    .sort_values(by=["commodity", "perc_vol"], ascending=[True, False])
)
commodity exporter.label vol vol_total perc_vol
5 BEEF FRIGORIFICO PUL PULSA SA 3510679.0 4469682.0 78.5
2 BEEF FRIGORIFICO CANELONES SA 486016.0 4469682.0 10.9
3 BEEF FRIGORIFICO CARRASCO SA 124673.0 4469682.0 2.8
12 BEEF UNKNOWN CUSTOMER 120061.0 4469682.0 2.7
1 BEEF FRIBOI LTDA 28954.0 4469682.0 0.7
7 BEEF GANESHAA GLOBAL LTD 29092.0 4469682.0 0.7
11 BEEF SA DES EAUX MINERALES DEVIAN 29109.0 4469682.0 0.7
9 BEEF OOO TD PERVOMAISKYI 28232.0 4469682.0 0.6
4 BEEF FRIGORIFICO MATADERO PANDO ONTILCOR SA 26670.0 4469682.0 0.6
8 BEEF MINERVA SA 26957.0 4469682.0 0.6
6 BEEF FRIGORIFICO TACUAREMBO SA 24843.0 4469682.0 0.6
10 BEEF PRIVATE PERSON 21600.0 4469682.0 0.5
0 BEEF BEEF PARAGUAY SA 12796.0 4469682.0 0.3
26 SOY UNKNOWN CUSTOMER 13329541487.0 13861657691.0 96.2
16 SOY BIOSEV SA 170769270.0 13861657691.0 1.2
24 SOY RAIZEN CENTRO SUL PAULISTA SA 82020206.0 13861657691.0 0.6
25 SOY RISA SA 69070792.0 13861657691.0 0.5
19 SOY CARGILL 57774680.0 13861657691.0 0.4
18 SOY BUNGE ASIA PTE LTD 42096876.0 13861657691.0 0.3
15 SOY BIOSEV BIOENERGIA SA 33600000.0 13861657691.0 0.2
13 SOY ARASCO TRADING BV 28000000.0 13861657691.0 0.2
22 SOY ENGELHART CTP BRASIL SA 22759380.0 13861657691.0 0.2
20 SOY CARGILL URUGUAY SA 13000000.0 13861657691.0 0.1
17 SOY BSBIOS INDUSTRIA & COMERCIO DE BIODIESEL SUL B... 5000000.0 13861657691.0 0.0
23 SOY PRIVATE PERSON 5000000.0 13861657691.0 0.0
14 SOY BATAVO COOPERATIVA AGROINDUSTRIAL 3000000.0 13861657691.0 0.0
21 SOY DEL CORONA & SCARDIGLI LOGISTICS 25000.0 13861657691.0 0.0
# BOL CNPJs not in RF CNPJs
bol_rf_cnpj = query.query_with_dataframe(
    pd.DataFrame(bol_comm["exporter.cnpj"].unique(), columns=["cnpj"]),
    """
    SELECT
        bol.cnpj AS cnpj_bol,
        rf.cnpj AS cnpj_rf,
        company_name,
        trade_name
    FROM df AS bol
    LEFT JOIN cnpj.cnpj AS rf USING (cnpj)
    ORDER BY company_name
    """,
)

bol_rf_cnpj[pd.isna(bol_rf_cnpj.cnpj_rf)]
cnpj_bol cnpj_rf company_name trade_name
756 00000000000000 None None None