Skip to content

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy/brazil_soy_v2_6_1_round_two_quality_assurance.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Brazil Soy v2.6.1: Round Two Quality Assurance

This notebook attempts to measure the quality of the Brazil soy v2.6.1 results.

Another analysis with a similar goal (but slightly different graphs) can be found in trase/data/brazil/soy/brazil_soy_2_6_1_qa.html.

We use the following data sources:

  • SEI-PCS results from S3, with 2017-18 coming from v2.6.0 and 2019-22 coming from v2.6.1
  • MDIC from S3
  • Comtrade from S3
from trase.tools import sps
import plotly.express as px
import pandas as pd

########################################################################################
# Read SEI-PCS results
########################################################################################
# (version, year) pairs: 2017-18 come from model v2.6.0, 2019-22 from v2.6.1.
SEI_PCS_FILES = [
    ("2.6.0", 2017),
    ("2.6.0", 2018),
    ("2.6.1", 2019),
    ("2.6.1", 2020),
    ("2.6.1", 2021),
    ("2.6.1", 2022),
]
dfs = []
for version, year in SEI_PCS_FILES:
    key = f"brazil/soy/sei_pcs/v{version}/SEIPCS_BRAZIL_SOY_{year}.csv"
    dfs.append(
        sps.get_pandas_df_once(
            key,
            dtype=str,
            na_filter=False,
            print_version_id=True,
        )
    )

# The column sets differ slightly between versions, so keep only the columns
# that are present in every year before concatenating.
common_columns = set.intersection(*(set(frame.columns) for frame in dfs))
df = sps.concat(frame[list(common_columns)] for frame in dfs)
df = df.astype({"VOLUME_RAW": float, "YEAR": int, "FOB": float})

########################################################################################
# read MDIC data
########################################################################################
########################################################################################
# read MDIC data
########################################################################################
# Soy HS4 commodity codes. SOYBEAN RESIDUAL is deliberately excluded because it
# has no equivalence factor.
SOY_HS4 = [
    "2304",
    "1208",
    "1507",
    "1201",
]
df_mdic = sps.concat(
    sps.get_pandas_df_once(
        f"brazil/trade/mdic/port/brazil_mdic_port_{year}.csv",
        dtype=str,
        na_filter=False,
        print_version_id=True,
    )
    for year in range(2017, 2023)
)
# Keep soy products only, then cast/rename so the columns line up with SEI-PCS.
is_soy = df_mdic["hs4"].isin(SOY_HS4)
df_mdic = (
    df_mdic.loc[is_soy]
    .astype({"vol": float, "year": int, "fob": float})
    .rename(columns={"vol": "VOLUME_RAW", "year": "YEAR", "fob": "FOB"})
)

########################################################################################
# read comtrade data
########################################################################################
########################################################################################
# read comtrade data
########################################################################################
df_comtrade = sps.concat(
    sps.get_pandas_df_once(
        f"world/trade/statistical_data/comtrade/COMTRADE_{year}.csv",
        dtype=str,
        na_filter=False,
        print_version_id=True,
    )
    for year in range(2020, 2023)
)
# Drop rows with no reported volume, and keep only the "TOTAL MOT" aggregate so
# that per-transport-method rows are not double counted.
df_comtrade = df_comtrade[df_comtrade["volume"] != ""]
df_comtrade = df_comtrade[df_comtrade["transportation_method"] == "TOTAL MOT"]
# Cast/rename so the columns line up with the SEI-PCS results.
df_comtrade = df_comtrade.astype(
    {"year": int, "volume": float, "fob": float}
).rename(columns={"year": "YEAR", "volume": "VOLUME_RAW", "fob": "FOB"})
# Brazilian exports of soy products only (HS6 codes starting with a soy HS4).
is_brazil = df_comtrade["reporter.iso3"] == "BRA"
is_soy = df_comtrade["hs6"].str.slice(0, 4).isin(SOY_HS4)
df_comtrade = df_comtrade[is_brazil]
df_comtrade = df_comtrade[is_soy[is_brazil]]

########################################################################################
# read CNAE
########################################################################################
########################################################################################
# read CNAE
########################################################################################
# CNAE economic-activity codes and their descriptions, from Receita Federal.
CNAE_KEY = "brazil/logistics/cnpj/receita_federal_do_brasil/cnaes-2022-11-24.csv"
df_cnae = sps.get_pandas_df_once(
    CNAE_KEY,
    dtype=str,
    na_filter=False,
    usecols=["cnae_code", "description"],
    print_version_id=True,
)
/Users/harrybiddle/dev/TRASE/trase/tools/sei_pcs/pandas_utilities.py:18: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  from pandas.util.testing import assert_frame_equal


Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.0/SEIPCS_BRAZIL_SOY_2017.csv version_id=qKtDR2xK7EEjeI_6eRsVmJa.9A1qfH_b
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.0/SEIPCS_BRAZIL_SOY_2018.csv version_id=j4zTGNLsjTIts0AAg8J.LGP4ycc6KEt9
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2019.csv version_id=a1fTkYc0FghpLMTtpD3_wAsHFE1U6FBi
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2020.csv version_id=nuArlOqWRXyEzXZHeIcaHEs0QNEH8i_c
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021.csv version_id=ClAQ9M4tm7u4teAyw3ChcpEdjxOFxgef
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2022.csv version_id=Pr5.DDGjAGVkZo8pVrz0qO3kkLhqsX_I
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2017.csv version_id=hdQvOdLaODJ8xwq0WxXYuvUtdEn36.PC
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2018.csv version_id=nD6mLszb4r32dAEWLNFOpPpIAijfD1g1
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2019.csv version_id=ACAVzUQA80DVl3_WZqoIMp.2ypjE9dWf
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2020.csv version_id=gWLODG.8P4L0sYSI3wlLGsxrO3VsezYd
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2021.csv version_id=KQoKTsS0RWRhm3TUdNQsaD.FzA9yfWTZ
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2022.csv version_id=ypYKNepCepg2OWXxy.EIjtgtJnGoGhZl
Accessing s3://trase-storage/world/trade/statistical_data/comtrade/COMTRADE_2020.csv version_id=GiifhPYv.rngEirbSQPS97IqbVgJmD3k
Accessing s3://trase-storage/world/trade/statistical_data/comtrade/COMTRADE_2021.csv version_id=JeuLzy6q.k9IFC5PKbUE_VQDhCLoMQLm
Accessing s3://trase-storage/world/trade/statistical_data/comtrade/COMTRADE_2022.csv version_id=Q9iJbhJlvxQriG6gXnEOkqmt0S1aBMaj
Accessing s3://trase-storage/brazil/logistics/cnpj/receita_federal_do_brasil/cnaes-2022-11-24.csv version_id=WJnPRXxQVHeCJ4xynlunGd1UC_Zfy2ru

Volume sourced from the unknown municipality

# Percentage of each year's volume that the model sourced from the placeholder
# "unknown" municipality (trase id BR-XXXXXXX).
df_unknown = df.copy()
yearly_volume = df_unknown.groupby(["YEAR"])["VOLUME_RAW"].transform("sum")
df_unknown["VOLUME_YEAR"] = yearly_volume
df_unknown["PERCENTAGE"] = 100 * df_unknown["VOLUME_RAW"] / yearly_volume
# Restrict to rows attributed to the unknown municipality; summing their
# per-row percentages gives the unknown share of the year.
df_unknown = df_unknown[df_unknown["LVL6_TRASE_ID_PROD"] == "BR-XXXXXXX"]
data = sps.consolidate(df_unknown, ["PERCENTAGE"], ["YEAR"])
fig = px.bar(
    data,
    x="YEAR",
    y="PERCENTAGE",
    title="Percentage of volume sourced from unknown municipality",
)
# Label each bar with its value, formatted as a percentage.
fig.data[0].text = [f"{sps.format_float(v)}%" for v in fig.data[0].y.tolist()]
fig.show("png")

png

Volume exported by unknown exporter

# Percentage of each year's volume handled by a placeholder "unknown" exporter,
# split by whether the flow is domestic.
df_unknown2 = df.copy()
df_unknown2["IS_DOMESTIC"] = df_unknown2["BRANCH"] == "DOMESTIC"
yearly_volume2 = df_unknown2.groupby(["YEAR"])["VOLUME_RAW"].transform("sum")
df_unknown2["VOLUME_YEAR"] = yearly_volume2
df_unknown2["PERCENTAGE"] = 100 * df_unknown2["VOLUME_RAW"] / yearly_volume2
# Both trase ids below are placeholders for an unidentified exporter.
unknown_exporters = ["BR-TRADER-00000000", "BR-TRADER-XXXXXXXX"]
df_unknown2 = df_unknown2[df_unknown2["EXPORTER_TRASE_ID"].isin(unknown_exporters)]
data = sps.consolidate(df_unknown2, ["PERCENTAGE"], ["YEAR", "IS_DOMESTIC"])
fig = px.bar(
    data,
    x="YEAR",
    y="PERCENTAGE",
    color="IS_DOMESTIC",
    title="Percentage of volume exported by unknown exporter",
)
fig.show("png")

png

Supply Chain Fragmentation

In this section we ask the question: for each exporter, how many municipalities of production did our model identify?

This gives a sense of how widely our model distributes the trade shipments across Brazil.

The following data is excluded from this analysis:

  • Trade shipments to unknown exporters (meaning that domestic consumption is also excluded)
  • Trade shipments from the unknown municipality
# Fragmentation of the supply chain: for each exporter and year, how many
# distinct municipalities of production did the model identify?
#
# Excluded: the unknown municipality of production and unknown exporters
# (which also excludes domestic consumption).
df_known = df[
    (df["LVL6_TRASE_ID_PROD"] != "BR-XXXXXXX")
    & ~(df["EXPORTER_TRASE_ID"].isin(["BR-TRADER-00000000", "BR-TRADER-XXXXXXXX"]))
].copy()

data = (
    df_known.groupby(["YEAR", "EXPORTER_TRASE_ID"])
    .agg(
        {
            # number of unique municipalties of production per exporter/year
            "LVL6_TRASE_ID_PROD": "nunique",
            # total volume from known municipalities of production per exporter/year
            "VOLUME_RAW": "sum",
        }
    )
    .reset_index()
)

# mark if the exporter is in the top ten exporters (by volume) for that year:
# the cutoff is the volume of the tenth-largest exporter in the year.
# (Previously this was computed via groupby().apply() + sort + merge, which
# took a column-wise min over ALL aggregated columns and relied on merging
# against an index level; transform() + nlargest() is direct and unambiguous.)
data["CUTOFF"] = data.groupby("YEAR")["VOLUME_RAW"].transform(
    lambda volumes: volumes.nlargest(10).min()
)
is_top_ten = data["VOLUME_RAW"] >= data["CUTOFF"]


def _average_municipalities(frame, label):
    # Mean number of municipalities of production per exporter, per year,
    # tagged with a label for the plot legend.
    return (
        frame.groupby("YEAR")["LVL6_TRASE_ID_PROD"]
        .mean()
        .rename("average")
        .reset_index()
        .assign(filter=label)
    )


# add average, for all exporters and for the top-ten exporters only
data = sps.concat(
    [
        _average_municipalities(data, "all exporters"),
        _average_municipalities(data[is_top_ten], "top ten exporters"),
    ]
)

fig = px.line(
    data,
    color="filter",
    x="YEAR",
    y="average",
    title="<b>Fragmentation of supply chain</b><br />Defined as the number of municipalities per exporter<br />Unknown exporters and municipalities excluded",
    labels={"YEAR": "year", "average": "average # of municipalities of production"},
)
fig.show("png")

png

Breakdown of decision tree

# plot major branch as a percentage of volume in each year
def parse_branch(branch):
    """Map a raw decision-tree BRANCH code to a human-readable major-branch label.

    "UNKNOWN"/"DOMESTIC" get title-cased labels; otherwise the leading number
    before the first "." is looked up (1-3 have descriptive names), falling
    back to the bare number for any other branch.
    """
    special = {"UNKNOWN": "Unknown", "DOMESTIC": "Domestic"}
    if branch in special:
        return special[branch]
    major = branch.split(".", 1)[0]
    labels = {
        "1": "1 (Direct link)",
        "2": "2 (Stickiness)",
        "3": "3 (MDIC)",
    }
    return labels.get(major, major)


# Plot each major decision-tree branch as a percentage of volume in each year.
df_branch = df.copy()
df_branch["MAJOR_BRANCH"] = df_branch["BRANCH"].apply(parse_branch)

df_branch = sps.consolidate(df_branch, ["VOLUME_RAW"], ["YEAR", "MAJOR_BRANCH"])
# Share of each year's total volume that went through each major branch.
# (A VOLUME_RAW_PER_BRANCH column used to be computed here too, but it was
# never used, so it has been removed.)
df_branch["VOLUME_RAW_PER_YEAR"] = df_branch.groupby("YEAR")["VOLUME_RAW"].transform(
    "sum"
)
df_branch["PERCENTAGE_PER_YEAR"] = (
    100 * df_branch["VOLUME_RAW"] / df_branch["VOLUME_RAW_PER_YEAR"]
)

# exclude 2017, since the branch numbers change in 2018, so comparison
# is not easy
df_branch = df_branch[df_branch["YEAR"] > 2017]
fig = px.bar(
    df_branch.sort_values(["MAJOR_BRANCH", "YEAR"]).astype({"YEAR": str}),
    x="YEAR",
    y="PERCENTAGE_PER_YEAR",
    labels={"PERCENTAGE_PER_YEAR": "% volume"},
    color="MAJOR_BRANCH",
    # facet by cols with a wrap of one
    # this essentially facets by rows but with headers _above_ the graphs
    # rather than to the right
    facet_col="MAJOR_BRANCH",
    facet_col_wrap=3,
    height=800,
    width=1000,
    title="Percentage volume in each decision tree branch",
)
# Label every bar in every facet with its percentage value.
for facet in fig.data:
    facet.text = [f"{sps.format_float(v)}%" for v in facet.y.tolist()]
fig.show("png")

png

Row count

A simple count of how many rows our results CSV file has. This is another, cruder measure of fragmentation.

# Number of rows in the results per year: a crude proxy for fragmentation.
# GroupBy.size() replaces the previous groupby().apply(len), which is slower
# and triggers a FutureWarning on recent pandas.
fig = px.bar(
    df.groupby("YEAR").size().rename("row_count").reset_index(),
    x="YEAR",
    y="row_count",
    title="Number of rows in model results CSV file",
)
fig.show("png")

png

# Compare yearly totals (volume and FOB value) across the three data sources.
# Domestic flows are dropped from SEI-PCS so it is comparable with trade data.
df_export = df[df["BRANCH"] != "DOMESTIC"]


def _yearly_total(frame, metric, source):
    # Total of `metric` per year, tagged with the data-source name for the plot.
    return (
        frame.groupby("YEAR")[metric]
        .sum()
        .rename("value")
        .reset_index()
        .assign(source=source, metric=metric)
    )


df_totals = sps.concat(
    sps.concat(
        [
            _yearly_total(df_mdic, metric, "MDIC"),
            _yearly_total(df_export, metric, "SEI-PCS"),
            _yearly_total(df_comtrade, metric, "Comtrade"),
        ]
    )
    for metric in ["VOLUME_RAW", "FOB"]
)
fig = px.line(
    df_totals,
    x="YEAR",
    y="value",
    color="source",
    facet_col="metric",
    facet_col_wrap=1,
    # plain string: the previous f-string had no placeholders
    title="Comparing SEI-PCS, Comtrade and MDIC",
)

# make the facets have independent axes (volume and FOB are on different scales)
fig.update_yaxes(matches=None)
fig.show("png")

png