Skip to content

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy/brazil_soy_sicarm.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Brazil Soy: including SICARM in the decision tree

6th May 2024, by Harry

This notebook evaluates whether we can make a significant impact to our model by including SICARM and SICASQ in the decision tree for the 2021 and 2022 update.

SICASQ, or rather the newer incarnation called "CGC/MAPA", stands for "Cadastro Geral de Contribuintes do Ministério da Agricultura, Pecuária e Abastecimento" — "General Registry of Taxpayers of the Ministry of Agriculture, Livestock, and Supply" in English. It is a registration system managed by the Brazilian Ministry of Agriculture, Livestock, and Supply. This registry is used to keep track of individuals and entities involved in agricultural activities in Brazil. It helps the government monitor and regulate agricultural production, ensuring compliance with relevant laws and regulations.

SICARM, which stands for "Sistema de Cadastro e Acompanhamento de Armazéns" (System for Registration and Monitoring of Warehouses), is part of CONAB (Companhia Nacional de Abastecimento), which is the National Supply Company in Brazil. SICARM is the system used to register storage facilities in Brazil, where the entire life cycle of the storage facility can be tracked and monitored. This includes the initial registration of the warehouse, inspections, certifications, and any updates or changes to the facility over time. By using SICARM, CONAB and other relevant authorities can maintain a comprehensive database of storage facilities across the country. This helps to ensure the quality and safety of stored agricultural products, facilitates logistics planning, and supports the efficient management of Brazil's agricultural supply chain.

Datasets

This notebook uses the following datasets:

  • SICASQ data from s3://trase-storage/brazil/logistics/sicasq/out/SICASQ_2023.csv
  • SICARM data from s3://trase-storage/brazil/logistics/sicarm/out/SICARM_16_19_21_22_23.csv. We ignore entries with a fully 0 CNPJ. Otherwise we assume all entries have valid municipalities (I did a quick visual check and seems OK).
  • Bills of lading (BOL) data from e.g. s3://trase-storage/brazil/trade/bol/2021/BRAZIL_BOL_2021.csv. We filter to Soy HS4 codes.
  • SEI-PCS results from e.g. s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021.csv
  • Brazilian Federal Tax Agency (RFB) data is from the database, specifically the cnpj.cnpj table.
from trase.tools import sps
import numpy as np
import pandas as pd
import stdnum.br.cnpj
import stdnum.br.cpf
from trase.tools.sei_pcs.plotting import sankey
import plotly.express as px
from trase.tools.pandasdb.query import query_with_dataframe
from trase.models.brazil.soy.preparation import SOY_CNAE, SOY_HS4

import plotly.io as pio

# Render plotly figures as static PNG images (rather than interactive HTML),
# so they survive the notebook-to-docs export.
pio.renderers.default = "png"
def clean_cnpjs(df, column):
    """Normalise Brazilian tax IDs from `column` into a 'cnpj' column.

    Works on a copy of `df` and adds two columns:
      - 'cnpj': the digits of `column`, left-padded with zeros to 14 chars
        when they validate as a CNPJ, to 11 chars when they validate as a
        CPF; all-zero/empty IDs become 14 zeros; otherwise the raw digit
        string is kept unchanged.
      - 'cnpj_type': 'cnpj', 'cpf' or 'unknown' according to which
        check-digit validation passed (CNPJ takes precedence over CPF).
    """
    df = df.copy()

    # Strip every non-digit character (punctuation like "." "/" "-").
    df["cnpj"] = df[column].apply(lambda cnpj: "".join(c for c in cnpj if c.isdigit()))
    df["cnpj"] = df["cnpj"].replace("", "0")

    # Zero-pad to the canonical widths and run check-digit validation.
    cnpj = df["cnpj"].str.rjust(14, "0")
    cnpj_valid = cnpj.apply(stdnum.br.cnpj.is_valid)

    cpf = df["cnpj"].str.rjust(11, "0")
    cpf_valid = cpf.apply(stdnum.br.cpf.is_valid)

    df["cnpj_type"] = "unknown"
    df.loc[cpf_valid, "cnpj_type"] = "cpf"
    df.loc[cnpj_valid, "cnpj_type"] = "cnpj"

    # Apply the CPF padding first and the CNPJ padding second, so that an ID
    # passing BOTH validations ends up stored in its 14-char CNPJ form —
    # consistent with 'cnpj_type' above, where 'cnpj' also wins.
    # (Previously the order was reversed, so such rows were labelled 'cnpj'
    # but stored in the 11-char CPF form.)
    df["cnpj"] = np.where(cpf_valid, cpf, df["cnpj"])
    df["cnpj"] = np.where(cnpj_valid, cnpj, df["cnpj"])
    df.loc[df["cnpj"] == "0", "cnpj"] = "0" * 14

    return df


# ------------------------------------------------------------ #
# SICASQ
# ------------------------------------------------------------ #

df_sicasq = sps.get_pandas_df_once(
    "brazil/logistics/sicasq/out/SICASQ_2023.csv",
    dtype=str,
    na_filter=False,
)
df_sicasq = clean_cnpjs(df_sicasq, "CNPJ")

# ------------------------------------------------------------ #
# SICARM
# ------------------------------------------------------------ #

df_sicarm = sps.get_pandas_df_once(
    "brazil/logistics/sicarm/out/SICARM_16_19_21_22_23.csv",
    dtype=str,
    na_filter=False,
)
df_sicarm = clean_cnpjs(df_sicarm, "CPF_CNPJ")
# Drop entries with a fully-zero CNPJ. clean_cnpjs normalises empty/zero IDs
# to 14 zeros, so one string comparison suffices. (The previous additional
# check `df_sicarm["cnpj"] != 0` compared a string column to an int and was
# always True — a no-op.)
df_sicarm = df_sicarm[df_sicarm["cnpj"] != "00000000000000"]
# Translate the Portuguese warehouse types to English; any value not in the
# mapping passes through unchanged.
df_sicarm["TYPE"] = df_sicarm["TYPE"].apply(
    lambda x: {
        "BATERIA DE SILOS": "SILO BATTERY",
        "CHAPÉU CHINÊS": "CHINESE HAT",
        "CONVENCIONAL": "CONVENTIONAL",
        "DEPÓSITO": "DEPOSIT",
        "ESTRUTURAL": "STRUCTURAL",
        "GRANELEIRO": "BULK CARRIER",
        "SILO": "SILO",
    }.get(x, x)
)

# ------------------------------------------------------------ #
# BOL
# ------------------------------------------------------------ #

_bol_frames = [
    sps.get_pandas_df_once(
        f"brazil/trade/bol/{year}/BRAZIL_BOL_{year}.csv",
        dtype=str,
        na_filter=False,
        usecols=["exporter.cnpj", "vol", "year", "exporter.type", "hs4"],
    )
    for year in [2021, 2022]
]
df_bol = sps.concat(_bol_frames).drop_duplicates()
# Keep soy flows only, then normalise the exporter tax IDs.
df_bol = df_bol.loc[df_bol["hs4"].isin(SOY_HS4)].copy()
df_bol = clean_cnpjs(df_bol, "exporter.cnpj")
# NOTE(review): this overwrites the type derived by clean_cnpjs with the raw
# BoL field, whose casing ("CNPJ") differs from SEI-PCS ("cnpj") — confirm
# that is intended before comparing the two datasets on cnpj_type.
df_bol["cnpj_type"] = df_bol["exporter.type"]
df_bol["vol"] = df_bol["vol"].astype(float)
df_bol = sps.consolidate(df_bol, ["vol"], ["year", "cnpj", "cnpj_type"])

# ------------------------------------------------------------ #
# SEI-PCS results
# ------------------------------------------------------------ #

_seipcs_frames = (
    sps.get_pandas_df_once(
        f"brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_{year}.csv",
        dtype=str,
        na_filter=False,
        usecols=["CNPJ", "VOLUME_RAW", "YEAR", "EXPORTER_TYPE", "BRANCH"],
    )
    for year in [2021, 2022]
)
df_seipcs = sps.concat(_seipcs_frames)
df_seipcs = df_seipcs.drop_duplicates()
# Align the column names with the BoL dataset.
df_seipcs = df_seipcs.rename(
    columns={
        "EXPORTER_TYPE": "cnpj_type",
        "VOLUME_RAW": "vol",
        "BRANCH": "branch",
        "YEAR": "year",
    },
    errors="raise",
)
# The first character of the branch code identifies the major branch; spell
# out the two non-numeric codes.
df_seipcs["major_branch"] = (
    df_seipcs["branch"].str.slice(0, 1).replace({"U": "UNKNOWN", "D": "DOMESTIC"})
)
df_seipcs["vol"] = df_seipcs["vol"].astype(float)
df_seipcs = clean_cnpjs(df_seipcs, "CNPJ")
df_seipcs = sps.consolidate(
    df_seipcs, ["vol"], ["branch", "major_branch", "cnpj", "year", "cnpj_type"]
)

Here are some sample rows to illustrate what we have:

# Show one transposed sample row from each dataset, in the same order
# as they were loaded above.
for _name, _frame in [
    ("SICARM", df_sicarm),
    ("SICASQ", df_sicasq),
    ("BOL", df_bol),
    ("SEI-PCS", df_seipcs),
]:
    print(_name)
    display(_frame.sample(1).T)
SICARM
4942
index 5023
CDA 56.0239.0015-1
CPF_CNPJ 60.498.706/0316-21
COMPANY CARGILL AGRICOLA S A
STATUS CADASTRO EFETIVADO
TYPE BULK CARRIER
CAPACITY 18060
UF MT
MUNICIPALITY ALTO TAQUARI
ADDRESS ROD MT 100 KM 60 SAÍDA PARA ALTO ARAGUAIA - 7...
LAT -17.81153
LONG -53.28889
GEOCODE 5100607
YEAR 2016
MUNICIPALITY_CLEAN nan
LAT_NORTH False
cnpj 60498706031621
cnpj_type cnpj
SICASQ
11456
COMPANY LOUIS DREYFUS COMPANY BRASIL S.A.
CNPJ 47.067.525/0097-50
PRODUCT SOJA
STATUS Ativo
ADDRESS PR,Ponta Grossa,Colônia Dona Luíza,Rodovia BR-...
UF PR
DESTINATION CHINA, REPUBLICA POPULAR
EMAIL paulo.conti@ldc.com
RT PAULO HENRIQUE FERREIRA CONTI
DATE OF VALIDITY 28/02/2028
NUMBER OF ASSETS 1
cnpj 47067525009750
cnpj_type cnpj
BOL
686
year 2022
cnpj 13547850000156
cnpj_type CNPJ
vol 40256.0
SEI-PCS
1448
branch UNKNOWN
major_branch UNKNOWN
cnpj 13797986000114
year 2022
cnpj_type cnpj
vol 687628.848175

Matching SICARM to SEI-PCS by CNPJ

We first do a quick check to match the SICARM dataset to our SEI-PCS results, based on CNPJ. This will allow us to see how much volume from each branch would match with SICARM.

We see that indeed a significant volume matches SICARM. In particular, around 10–13% of the "Unknown" branch and 7–9% of branch 3 (see the table below):

# Collapse SICARM to one row per CNPJ; multi-valued fields become sets.
right = (
    df_sicarm.groupby("cnpj")[["STATUS", "TYPE", "MUNICIPALITY"]].agg(set).reset_index()
)
# Left-join SEI-PCS onto SICARM so every SEI-PCS row is kept; the merge
# indicator tells us which rows found a SICARM match.
df = pd.merge(
    df_seipcs,
    right,
    on=["cnpj"],
    suffixes=["_seipcs", "_sicarm"],
    validate="many_to_one",
    how="left",
    indicator=True,
)
df["_merge"] = df["_merge"].map(
    {"left_only": "not_in_sicarm", "both": "in_sicarm"}
)
data = sps.consolidate(df, ["vol"], ["year", "_merge", "major_branch"])

# display as table: share of each major branch's volume that matched SICARM
branch_totals = data.groupby("major_branch")["vol"].transform("sum")
data["percent"] = 100 * data["vol"] / branch_totals
with sps.pandas_display_context(float_format=sps.format_float):
    matched = data[data["_merge"] == "in_sicarm"]
    table = matched[["year", "major_branch", "percent"]].rename(
        columns={"percent": "percentage of branch matching SICARM"}
    )
    display(table.set_index(["year", "major_branch"]))

# display as sankey
fig = sankey(
    sps.consolidate(data, ["vol"], ["_merge", "major_branch"]),
    value_column="vol",
    categorical_columns=["_merge", "major_branch"],
)
fig.show()
percentage of branch matching SICARM
year major_branch
2021 1 23
2 11
3 9
UNKNOWN 10
2022 1 18
2 10
3 7
UNKNOWN 13

png

Let's print out the distinct CNPJs which matched, just to confirm that they are all valid CNPJs, and not some unknown CNPJ or something.

Looks good!

# Distinct CNPJs that matched SICARM, to eyeball that they look valid.
df.loc[df["_merge"] == "in_sicarm", "cnpj"].unique()

This looks promising! The matches in branch 1 are not so interesting, because in that branch the municipality of the exporter has already been taken to be the logistics hub municipality. (Although, it is interesting whether the municipality of the exporter matches the municipality in SICARM.)

Let's have a look at the SICARM matches in the other branches (i.e. not branch 1). First we should just check that the entry in SICARM is active. This is in the field "status".

We see that it is mostly CADASTRO EFETIVADO, which looks good.

We also check the "type". A CNPJ8 can be associated with many types, so we are looking at lists of types. One concern I see here is that there is a lot of listing of "bulk carrier". Does this actually mean there is a physical processing facility?

# filter to SICARM matches outside branch 1
data = df[df["major_branch"] != "1"]
# .copy() fix: we assign new columns below; writing into a chained-filtered
# slice raises SettingWithCopyWarning and can silently fail to stick.
data = data[data["_merge"] == "in_sicarm"].copy()


def _join_set(values):
    """Render a set of strings as a sorted, comma-separated list ('unknown' if empty)."""
    return ", ".join(sorted(v for v in values if v)) if values else "unknown"


# The SICARM fields were aggregated to sets per CNPJ; flatten for plotting.
data["status"] = data["STATUS"].apply(_join_set)
data["type"] = data["TYPE"].apply(_join_set)

# status: volume per combination of SICARM registration statuses
data_for_status = sps.consolidate(data, ["vol"], ["status"])
fig = px.bar(
    data_for_status,
    x="vol",
    y="status",
)
fig.show()

# type: volume per combination of SICARM warehouse types
data_for_type = sps.consolidate(data, ["vol"], ["type"]).sort_values("vol")
fig = px.bar(
    data_for_type,
    x="vol",
    y="type",
)
fig = fig.update_layout(
    yaxis={"dtick": 1},
    height=800,
)
fig.show()

png

png

Matching SICARM to BOL using CNPJ

Now let's match to BoL. As expected, we find a decent match.

# One SICARM row per CNPJ, with the multi-valued fields collapsed to sets.
right = (
    df_sicarm.groupby("cnpj")[["STATUS", "TYPE", "MUNICIPALITY"]].agg(set).reset_index()
)
# Keep every BoL row; tag whether the exporter CNPJ appears in SICARM.
df = pd.merge(
    df_bol,
    right,
    on=["cnpj"],
    suffixes=["_bol", "_sicarm"],
    validate="many_to_one",
    how="left",
    indicator=True,
)
df["_merge"] = df["_merge"].map(
    {"left_only": "not_in_sicarm", "both": "in_sicarm"}
)
data = sps.consolidate(df, ["vol"], ["_merge"])
years_label = ", ".join(sorted(df["year"].unique()))
px.bar(
    data,
    y="_merge",
    x="vol",
    labels={"vol": "Total export volume in BOL (all years summed)", "_merge": ""},
    title=f"Match on CNPJ between BoL and SICARM ({years_label})",
)

png

Let's see what RFB has to say about these CNPJs that match (warning - this cell can take a while).

Here is a reminder of the asset levels

  • Level 1. This type of asset is geographically and tightly linked to the exact location where the commodity is originated in the first place. Examples of this type of activity include, but are not limited to, the following assets: farms, plantations, mines, quarries, fisheries, aquaculture ponds, production concessions of any kind (e.g. forestry or mining concessions) etc. Soy farms and oil palm plantations are examples of Level 1 assets we have dealt with in current Trase commodities.
  • Level 2. This type of asset is geographically and tightly linked to the local region where the commodity is originated in the first place, usually where it is gathered for the first time and becomes part of an international supply chain, not under the control of direct producers of the commodity. This has been traditionally named in Trase as “logistic hubs”, and constitute the targeted level of sourcing detail we aim for in Trase. This acceptable level or geographic resolution needs to be determined by the SEIPCs developer, after context and data scoping. Examples of this type of activity include the following assets: silos and other storage forms, mills, and in general any form of activity that involves the reception, collection, preparation, conditioning, selection and classification of raw materials in the vicinity of the areas where they are produced. These include a variety of facilities for washing, drying, cooling, hulling, husking, milling (depending on the commodity), peeling, slaughtering, fish-cutting, sawing, collecting, smelting, heating etc…, which SEIPCS developers need to identify for each supply chain. Soy silos, palm oil mills or cattle slaughterhouses are examples of Level 2 assets.
  • Level 3. This type of asset is generally geographically disentangled from the production area. As a result, relevant flow constrains data needs to be cross with these level 3 assets in the appropriate decision tree branch, in order to establish unambiguous or plausible relationships between these assets and the assets in levels 1 and 2. Generally speaking, these assets provide with added value to the raw material and/or prepare the product to meet internal trade and market requirements. Examples of these assets include, but are not limited to: oil refineries, meat factories, packaging and canning facilities, timber processing plants, roasters, tanneries, cooking preparation, chemical facilities etc…These activities are often concentrated in large industrial poles geographically removed from production areas and closer to national and international markets, but they retain the ownership by the same companies as level 1 and level 2 assets for the more vertically integrated companies. For supply chains with limited vertical integration these assets will not be useful to determine level 1 or 2 assets of origin, leading to unknown origin records. The quality and detail of the material flow constrains data is key to avoid this.
  • Level 4. This type of asset is completely or partially removed from any physical handling of the commodity of interest, and/or fits a role considerably downstream the supply chain of interest, and therefore its location is completely independent geographically from assets described in level 1, 2 and 3. Examples of these assets include, but are not limited to: cargo handling facilities in ports, wholesale retailing, import/export trading, international transport services etc… These assets can be of importance for the development of a SEIPCS context because they often allow to circumscribe material flows observed in the detailed trade data to specific locations by deriving conclusions on logistics and economic feasibility of potential supply chain configurations. However, as with level 3 assets, to link these assets with sourcing regions upstream relevant flow constrains data needs to be included.
  • Nominally a level 5 is used to refer to the activity type of trade records for which no information is available.

Below, it is interesting to see that the asset level in the RFB data is often 3 or higher, which indicates something that isn't a silo...

# NOTE: a filtered 'in_sicarm' subset used to be assigned here but was never
# read again (dead store) — removed. The lookups below use all BoL CNPJs.
df_cnpjs = df[["cnpj"]].drop_duplicates()
# Exclude the all-zero placeholder CNPJ.
df_cnpjs = df_cnpjs[df_cnpjs["cnpj"].astype(int) != 0]

# lookup CNPJs in RFB data (cnpj.cnpj table); the join is on the integer
# form of the CNPJ to sidestep zero-padding differences.
df_db_lookup = query_with_dataframe(
    df_cnpjs,
    """
    , _df as (
        select cnpj, cnpj::bigint as cnpj_int 
        from df
    ), _lookup as (
        select *, cnpj::bigint as cnpj_int
        from cnpj.cnpj
    )
    select distinct 
        _df.cnpj,
        cnae,
        cnae_name,
        cnae_secondary,
        municipality
    from _df join _lookup using (cnpj_int)
    """,
)
# Suffix database-derived columns with '_db' to distinguish them from the
# SICARM-derived columns merged in later.
df_db_lookup.columns = [c if c == "cnpj" else f"{c}_db" for c in df_db_lookup.columns]


# add activity levels to CNPJs from database
def cnae_to_activity_level(cnae):
    """Map a CNAE code to its soy activity level, or 'other' if unmapped."""
    return str(SOY_CNAE.get(int(cnae), "other"))


df_db_lookup["level_db"] = df_db_lookup["cnae_db"].apply(cnae_to_activity_level)
# Secondary CNAEs come as a comma-separated list; summarise the distinct levels.
# Bug fix: previously the `if x else "none"` conditional sat INSIDE the join()
# call, so an empty value produced "n, o, n, e" (join iterating the characters
# of the string "none") instead of "none".
df_db_lookup["secondary_level_db"] = df_db_lookup["cnae_secondary_db"].apply(
    lambda x: ", ".join(sorted(set(cnae_to_activity_level(c) for c in x.split(","))))
    if x
    else "none"
)

# lookup CNPJs in SICARM data (one row per CNPJ, types collapsed to a set;
# the CNPJ stays in the index and is joined on by name)
sicarm_types = df_sicarm.groupby("cnpj")[["TYPE"]].agg(set)
df_sicarm_lookup = pd.merge(
    df_cnpjs,
    sicarm_types,
    on="cnpj",
    validate="one_to_one",
)
df_sicarm_lookup.columns = [
    c if c == "cnpj" else f"{c}_sicarm" for c in df_sicarm_lookup.columns
]

# add both lookups (RFB database and SICARM) onto the CNPJ list
df_cnpjs = pd.merge(
    df_cnpjs,
    df_db_lookup,
    on="cnpj",
    how="left",
    validate="one_to_one",
)
df_cnpjs = pd.merge(
    df_cnpjs,
    df_sicarm_lookup,
    on="cnpj",
    how="left",
    validate="one_to_one",
)


# add minimum CNAE activity
def minimum_activity(row):
    """Return the lowest (most upstream) digit activity level found in the
    row's primary or secondary CNAE levels, as a string; fall back to the
    primary value unchanged when no digit level is present."""
    levels = []
    primary = row["level_db"]
    if primary and str(primary).isdigit():
        levels.append(int(primary))
    secondary = row["secondary_level_db"]
    if secondary and not pd.isna(secondary):
        for part in secondary.split(","):
            part = part.strip()
            if part.isdigit():
                levels.append(int(part))
    return str(min(levels)) if levels else primary


df_cnpjs["min_level_db"] = df_cnpjs.apply(minimum_activity, axis=1)

# add BoL volumes
df_cnpjs = pd.merge(
    df_cnpjs,
    sps.consolidate(df_bol, ["vol"], ["cnpj"]),
    on="cnpj",
    how="left",
    validate="one_to_one",
    indicator=True,
)
# Every CNPJ in df_cnpjs came from the BoL merge above, so the join must be total.
assert all(df_cnpjs.pop("_merge") == "both")
df_cnpjs = df_cnpjs.fillna("unknown")
fig = px.bar(
    sps.consolidate(df_cnpjs, ["vol"], ["level_db"]),
    x="vol",
    y="level_db",
    title="RFB CNAE primary activity level of CNPJS which are in both SICARM and BOL",
    labels={
        "vol": "Total export volume in BoL (all years summed)",
        "level_db": "CNAE primary activity level in RFB",
    },
)
fig.show()

fig = px.bar(
    sps.consolidate(df_cnpjs, ["vol"], ["min_level_db"]),
    x="vol",
    y="min_level_db",
    title="RFB CNAE minimum activity level of CNPJS which are in both SICARM and BOL",
    labels={
        "vol": "Total export volume in BoL (all years summed)",
        # bug fix: the axis being labelled here is "min_level_db"; the old
        # key "level_db" meant this label was silently ignored by plotly.
        "min_level_db": "lowest primary or secondary CNAE level in RFB",
    },
)
fig.show()

png

png