CNPJ validity analysis
View or edit on GitHub
This page is synchronized from trase/models/brazil/soy/CNPJ validity analysis.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
import pandas as pd
import stdnum.br.cnpj as cnpj
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.pandasdb import query
pd.options.display.float_format = "{:.1f}".format
# BOL 2022
bol = get_pandas_df_once(
"brazil/trade/bol/2022/BRAZIL_BOL_2022.csv",
encoding="utf8",
sep=";",
dtype={"exporter.cnpj": str, "hs4": str, "vol": float},
keep_default_na=False,
)[["exporter.label", "exporter.cnpj", "hs4", "vol"]]
# Commodities
beef_comm = pd.DataFrame(
["0201", "0202", "0206", "0504", "1602", "0210", "0102"], columns=["hs4"]
)
beef_comm["commodity"] = "BEEF"
soy_comm = pd.DataFrame(["1507", "2304", "1201", "2302"], columns=["hs4"])
soy_comm["commodity"] = "SOY"
comm = pd.concat([beef_comm, soy_comm]).set_index(keys=["hs4"])
# Join BOL and commodities
bol_comm = bol.join(comm, on="hs4", how="inner")
# check if bol cnpj is valid
bol_comm["cnpj_is_valid"] = bol_comm["exporter.cnpj"].map(cnpj.is_valid)
# Total vol per commodity
bol_comm.groupby("commodity")["vol"].sum()
commodity
BEEF 2314494089.0
SOY 93084652600.0
Name: vol, dtype: float64
# Distinct CNPJs by commodity
bol_comm.groupby(by=["commodity"])["exporter.cnpj"].nunique()
commodity
BEEF 364
SOY 403
Name: exporter.cnpj, dtype: int64
# Count and total volume of BOL records by commodity and valid cnpj
bol_comm.groupby(by=["commodity", "cnpj_is_valid"]).agg({"vol": ["count", "sum"]})
|
|
vol |
|
|
count |
sum |
| commodity |
cnpj_is_valid |
|
|
| BEEF |
False |
172 |
4469682.0 |
| True |
89619 |
2310024407.0 |
| SOY |
False |
704 |
13861657691.0 |
| True |
8903 |
79222994909.0 |
# Vol percentage by commodity and valid CNPJ
(
bol_comm.groupby(by=["commodity", "cnpj_is_valid"])
.agg({"vol": "sum"})
.join(
bol_comm.groupby(by=["commodity"])["vol"].sum(),
on="commodity",
rsuffix="_total",
)
.assign(perc_vol=lambda x: round(x["vol"] / x["vol_total"] * 100, 1))[["perc_vol"]]
)
|
|
perc_vol |
| commodity |
cnpj_is_valid |
|
| BEEF |
False |
0.2 |
| True |
99.8 |
| SOY |
False |
14.9 |
| True |
85.1 |
# Total valid/invalid distinct CNPJs
bol_comm.groupby(by=["cnpj_is_valid"])["exporter.cnpj"].nunique()
cnpj_is_valid
False 1
True 756
Name: exporter.cnpj, dtype: int64
# List of invalid CNPJs
bol_comm[bol_comm["cnpj_is_valid"] == False]["exporter.cnpj"].unique()
array(['00000000000000'], dtype=object)
# Invalid CNPJs Vol percentage by commodity and exporter label
(
bol_comm[bol_comm["cnpj_is_valid"] == False][["commodity", "exporter.label", "vol"]]
.groupby(by=["commodity", "exporter.label"])
.agg({"vol": "sum"})
.join(
bol_comm[bol_comm["cnpj_is_valid"] == False]
.groupby(by=["commodity"])["vol"]
.sum(),
on="commodity",
rsuffix="_total",
)
.assign(perc_vol=lambda x: round(x["vol"] / x["vol_total"] * 100, 2))
.reset_index()
.sort_values(by=["commodity", "perc_vol"], ascending=[True, False])
)
|
commodity |
exporter.label |
vol |
vol_total |
perc_vol |
| 5 |
BEEF |
FRIGORIFICO PUL PULSA SA |
3510679.0 |
4469682.0 |
78.5 |
| 2 |
BEEF |
FRIGORIFICO CANELONES SA |
486016.0 |
4469682.0 |
10.9 |
| 3 |
BEEF |
FRIGORIFICO CARRASCO SA |
124673.0 |
4469682.0 |
2.8 |
| 12 |
BEEF |
UNKNOWN CUSTOMER |
120061.0 |
4469682.0 |
2.7 |
| 1 |
BEEF |
FRIBOI LTDA |
28954.0 |
4469682.0 |
0.7 |
| 7 |
BEEF |
GANESHAA GLOBAL LTD |
29092.0 |
4469682.0 |
0.7 |
| 11 |
BEEF |
SA DES EAUX MINERALES DEVIAN |
29109.0 |
4469682.0 |
0.7 |
| 9 |
BEEF |
OOO TD PERVOMAISKYI |
28232.0 |
4469682.0 |
0.6 |
| 4 |
BEEF |
FRIGORIFICO MATADERO PANDO ONTILCOR SA |
26670.0 |
4469682.0 |
0.6 |
| 8 |
BEEF |
MINERVA SA |
26957.0 |
4469682.0 |
0.6 |
| 6 |
BEEF |
FRIGORIFICO TACUAREMBO SA |
24843.0 |
4469682.0 |
0.6 |
| 10 |
BEEF |
PRIVATE PERSON |
21600.0 |
4469682.0 |
0.5 |
| 0 |
BEEF |
BEEF PARAGUAY SA |
12796.0 |
4469682.0 |
0.3 |
| 26 |
SOY |
UNKNOWN CUSTOMER |
13329541487.0 |
13861657691.0 |
96.2 |
| 16 |
SOY |
BIOSEV SA |
170769270.0 |
13861657691.0 |
1.2 |
| 24 |
SOY |
RAIZEN CENTRO SUL PAULISTA SA |
82020206.0 |
13861657691.0 |
0.6 |
| 25 |
SOY |
RISA SA |
69070792.0 |
13861657691.0 |
0.5 |
| 19 |
SOY |
CARGILL |
57774680.0 |
13861657691.0 |
0.4 |
| 18 |
SOY |
BUNGE ASIA PTE LTD |
42096876.0 |
13861657691.0 |
0.3 |
| 15 |
SOY |
BIOSEV BIOENERGIA SA |
33600000.0 |
13861657691.0 |
0.2 |
| 13 |
SOY |
ARASCO TRADING BV |
28000000.0 |
13861657691.0 |
0.2 |
| 22 |
SOY |
ENGELHART CTP BRASIL SA |
22759380.0 |
13861657691.0 |
0.2 |
| 20 |
SOY |
CARGILL URUGUAY SA |
13000000.0 |
13861657691.0 |
0.1 |
| 17 |
SOY |
BSBIOS INDUSTRIA & COMERCIO DE BIODIESEL SUL B... |
5000000.0 |
13861657691.0 |
0.0 |
| 23 |
SOY |
PRIVATE PERSON |
5000000.0 |
13861657691.0 |
0.0 |
| 14 |
SOY |
BATAVO COOPERATIVA AGROINDUSTRIAL |
3000000.0 |
13861657691.0 |
0.0 |
| 21 |
SOY |
DEL CORONA & SCARDIGLI LOGISTICS |
25000.0 |
13861657691.0 |
0.0 |
# BOL CNPJs not in RF CNPJs
bol_rf_cnpj = query.query_with_dataframe(
pd.DataFrame(bol_comm["exporter.cnpj"].unique(), columns=["cnpj"]),
"""
SELECT
bol.cnpj AS cnpj_bol,
rf.cnpj AS cnpj_rf,
company_name,
trade_name
FROM df AS bol
LEFT JOIN cnpj.cnpj AS rf USING (cnpj)
ORDER BY company_name
""",
)
bol_rf_cnpj[pd.isna(bol_rf_cnpj.cnpj_rf)]
|
cnpj_bol |
cnpj_rf |
company_name |
trade_name |
| 756 |
00000000000000 |
None |
None |
None |