View or edit on GitHub
This page is synchronized from trase/models/brazil/soy/brazil_soy_v2_6_1_round_two_quality_assurance.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
Brazil Soy v2.6.1: Round Two Quality Assurance
This notebook attempts to measure the quality of the Brazil soy v2.6.1 results.
Another analysis with a similar goal (but slightly different graphs) can be found in trase/data/brazil/soy/brazil_soy_2_6_1_qa.html.
We use the following data sources:
- SEI-PCS results from S3, with 2017-18 coming from v2.6.0 and 2019-22 coming from v2.6.1
- MDIC from S3
- Comtrade from S3
from trase.tools import sps
import plotly.express as px
import pandas as pd
########################################################################################
# Read SEI-PCS results
########################################################################################
# 2017-18 come from model v2.6.0, 2019-22 from v2.6.1.
SEI_PCS_FILES = [
    ("2.6.0", 2017),
    ("2.6.0", 2018),
    ("2.6.1", 2019),
    ("2.6.1", 2020),
    ("2.6.1", 2021),
    ("2.6.1", 2022),
]
dfs = []
for version, year in SEI_PCS_FILES:
    dfs.append(
        sps.get_pandas_df_once(
            f"brazil/soy/sei_pcs/v{version}/SEIPCS_BRAZIL_SOY_{year}.csv",
            dtype=str,
            na_filter=False,
            print_version_id=True,
        )
    )
# Keep only the columns present in every year so the frames can be stacked.
common_columns = set(dfs[0].columns)
for frame in dfs[1:]:
    common_columns &= set(frame.columns)
df = sps.concat(frame[list(common_columns)] for frame in dfs)
# Everything was read as strings; restore numeric types for the metric columns.
df = df.astype({"VOLUME_RAW": float, "YEAR": int, "FOB": float})
########################################################################################
# read MDIC data
########################################################################################
# HS4 codes covering soy products. SOYBEAN RESIDUAL is excluded because it has
# no soy-equivalence factor.
SOY_HS4 = ["2304", "1208", "1507", "1201"]
mdic_frames = (
    sps.get_pandas_df_once(
        f"brazil/trade/mdic/port/brazil_mdic_port_{year}.csv",
        dtype=str,
        na_filter=False,
        print_version_id=True,
    )
    for year in range(2017, 2023)
)
df_mdic = sps.concat(mdic_frames)
df_mdic = df_mdic[df_mdic["hs4"].isin(SOY_HS4)]
# Convert metric columns to numbers and align the names with the SEI-PCS frame.
df_mdic = df_mdic.astype({"vol": float, "year": int, "fob": float})
df_mdic = df_mdic.rename(columns={"vol": "VOLUME_RAW", "year": "YEAR", "fob": "FOB"})
########################################################################################
# read comtrade data
########################################################################################
df_comtrade = sps.concat(
    sps.get_pandas_df_once(
        f"world/trade/statistical_data/comtrade/COMTRADE_{year}.csv",
        dtype=str,
        na_filter=False,
        print_version_id=True,
    )
    for year in (2020, 2021, 2022)
)
# Drop rows with no reported volume and keep only the all-transport totals,
# then restore numeric types and align column names with the SEI-PCS frame.
mask = (df_comtrade["volume"] != "") & (
    df_comtrade["transportation_method"] == "TOTAL MOT"
)
df_comtrade = df_comtrade[mask]
df_comtrade = df_comtrade.astype({"year": int, "volume": float, "fob": float}).rename(
    columns={"year": "YEAR", "volume": "VOLUME_RAW", "fob": "FOB"}
)
# Brazilian exports of soy products only (match on the first four HS digits).
df_comtrade = df_comtrade[df_comtrade["reporter.iso3"] == "BRA"]
df_comtrade = df_comtrade[df_comtrade["hs6"].str.slice(0, 4).isin(SOY_HS4)]
########################################################################################
# read CNAE
########################################################################################
# Lookup table of CNAE economic-activity codes and their descriptions.
df_cnae = sps.get_pandas_df_once(
    "brazil/logistics/cnpj/receita_federal_do_brasil/cnaes-2022-11-24.csv",
    usecols=["cnae_code", "description"],
    dtype=str,
    na_filter=False,
    print_version_id=True,
)
/Users/harrybiddle/dev/TRASE/trase/tools/sei_pcs/pandas_utilities.py:18: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
from pandas.util.testing import assert_frame_equal
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.0/SEIPCS_BRAZIL_SOY_2017.csv version_id=qKtDR2xK7EEjeI_6eRsVmJa.9A1qfH_b
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.0/SEIPCS_BRAZIL_SOY_2018.csv version_id=j4zTGNLsjTIts0AAg8J.LGP4ycc6KEt9
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2019.csv version_id=a1fTkYc0FghpLMTtpD3_wAsHFE1U6FBi
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2020.csv version_id=nuArlOqWRXyEzXZHeIcaHEs0QNEH8i_c
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2021.csv version_id=ClAQ9M4tm7u4teAyw3ChcpEdjxOFxgef
Accessing s3://trase-storage/brazil/soy/sei_pcs/v2.6.1/SEIPCS_BRAZIL_SOY_2022.csv version_id=Pr5.DDGjAGVkZo8pVrz0qO3kkLhqsX_I
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2017.csv version_id=hdQvOdLaODJ8xwq0WxXYuvUtdEn36.PC
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2018.csv version_id=nD6mLszb4r32dAEWLNFOpPpIAijfD1g1
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2019.csv version_id=ACAVzUQA80DVl3_WZqoIMp.2ypjE9dWf
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2020.csv version_id=gWLODG.8P4L0sYSI3wlLGsxrO3VsezYd
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2021.csv version_id=KQoKTsS0RWRhm3TUdNQsaD.FzA9yfWTZ
Accessing s3://trase-storage/brazil/trade/mdic/port/brazil_mdic_port_2022.csv version_id=ypYKNepCepg2OWXxy.EIjtgtJnGoGhZl
Accessing s3://trase-storage/world/trade/statistical_data/comtrade/COMTRADE_2020.csv version_id=GiifhPYv.rngEirbSQPS97IqbVgJmD3k
Accessing s3://trase-storage/world/trade/statistical_data/comtrade/COMTRADE_2021.csv version_id=JeuLzy6q.k9IFC5PKbUE_VQDhCLoMQLm
Accessing s3://trase-storage/world/trade/statistical_data/comtrade/COMTRADE_2022.csv version_id=Q9iJbhJlvxQriG6gXnEOkqmt0S1aBMaj
Accessing s3://trase-storage/brazil/logistics/cnpj/receita_federal_do_brasil/cnaes-2022-11-24.csv version_id=WJnPRXxQVHeCJ4xynlunGd1UC_Zfy2ru
Volume sourced from the unknown municipality
df_unknown = df.copy()
# Each row's share of the total volume in its year.
yearly_total = df_unknown.groupby(["YEAR"])["VOLUME_RAW"].transform("sum")
df_unknown["VOLUME_YEAR"] = yearly_total
df_unknown["PERCENTAGE"] = 100 * df_unknown["VOLUME_RAW"] / yearly_total
# Restrict to rows whose production municipality could not be identified.
df_unknown = df_unknown[df_unknown["LVL6_TRASE_ID_PROD"] == "BR-XXXXXXX"]
data = sps.consolidate(df_unknown, ["PERCENTAGE"], ["YEAR"])
fig = px.bar(
    data,
    x="YEAR",
    y="PERCENTAGE",
    title="Percentage of volume sourced from unknown municipality",
)
# Annotate each bar with its value, e.g. "1.2%".
fig.data[0].text = [f"{sps.format_float(v)}%" for v in fig.data[0].y.tolist()]
fig.show("png")

Volume exported by unknown exporter
df_unknown2 = df.copy()
df_unknown2["IS_DOMESTIC"] = df_unknown2["BRANCH"] == "DOMESTIC"
# Each row's share of the total volume in its year.
yearly_total = df_unknown2.groupby(["YEAR"])["VOLUME_RAW"].transform("sum")
df_unknown2["VOLUME_YEAR"] = yearly_total
df_unknown2["PERCENTAGE"] = 100 * df_unknown2["VOLUME_RAW"] / yearly_total
# Keep only rows attributed to a placeholder ("unknown") exporter.
unknown_exporters = ["BR-TRADER-00000000", "BR-TRADER-XXXXXXXX"]
df_unknown2 = df_unknown2[df_unknown2["EXPORTER_TRASE_ID"].isin(unknown_exporters)]
data = sps.consolidate(df_unknown2, ["PERCENTAGE"], ["YEAR", "IS_DOMESTIC"])
fig = px.bar(
    data,
    x="YEAR",
    y="PERCENTAGE",
    color="IS_DOMESTIC",
    title="Percentage of volume exported by unknown exporter",
)
fig.show("png")

Supply Chain Fragmentation
In this section we ask the question: for each exporter, how many municipalities of production did our model identify?
This gives a sense of how widely our model distributes the trade shipments across Brazil.
The following data is excluded from this analysis:
- Trade shipments to unknown exporters (meaning that domestic consumption is also excluded)
- Trade shipments from the unknown municipality
# fragmentation of supply chain
# remove unknown municipality and trader
df_known = df[
    (df["LVL6_TRASE_ID_PROD"] != "BR-XXXXXXX")
    & ~(df["EXPORTER_TRASE_ID"].isin(["BR-TRADER-00000000", "BR-TRADER-XXXXXXXX"]))
].copy()
data = df_known.groupby(["YEAR", "EXPORTER_TRASE_ID"]).agg(
    {
        # number of unique municipalities of production per exporter/year
        "LVL6_TRASE_ID_PROD": "nunique",
        # total volume from known municipalities of production per exporter/year
        "VOLUME_RAW": "sum",
    }
)
# mark if the exporter is in the top ten producers for that year: the cutoff is
# the smallest volume among each year's ten biggest exporters (or among all of
# them when a year has fewer than ten)
cutoffs = (
    data.groupby("YEAR")["VOLUME_RAW"]
    .apply(lambda volumes: volumes.nlargest(10).min())
    .rename("CUTOFF")
    .reset_index()
)
data = pd.merge(data, cutoffs, on="YEAR")
is_top_ten = data["VOLUME_RAW"] >= data["CUTOFF"]
# average number of municipalities per exporter, computed once over all
# exporters and once over just the top-ten exporters of each year
data = sps.concat(
    [
        data.groupby("YEAR")["LVL6_TRASE_ID_PROD"]
        .mean()
        .rename("average")
        .reset_index()
        .assign(filter="all exporters"),
        data[is_top_ten]
        .groupby("YEAR")["LVL6_TRASE_ID_PROD"]
        .mean()
        .rename("average")
        .reset_index()
        .assign(filter="top ten exporters"),
    ]
)
fig = px.line(
    data,
    color="filter",
    x="YEAR",
    y="average",
    title="<b>Fragmentation of supply chain</b><br />Defined as the number of municipalities per exporter<br />Unknown exporters and municipalities excluded",
    labels={"YEAR": "year", "average": "average # of municipalities of production"},
)
fig.show("png")

Breakdown of decision tree
# plot major branch as a percentage of volume in each year
def parse_branch(branch):
    """Collapse a full decision-tree branch code into its major-branch label.

    "UNKNOWN" and "DOMESTIC" map to title-case labels; numeric codes such as
    "1.2.3" are reduced to their leading number, with a description appended
    when one is known (e.g. "1 (Direct link)"), otherwise the bare number is
    returned.
    """
    if branch == "UNKNOWN":
        return "Unknown"
    if branch == "DOMESTIC":
        return "Domestic"
    major, _, _ = branch.partition(".")
    labels = {
        "1": "1 (Direct link)",
        "2": "2 (Stickiness)",
        "3": "3 (MDIC)",
    }
    return labels.get(major, major)
df_branch = df.copy()
df_branch["MAJOR_BRANCH"] = df_branch["BRANCH"].apply(parse_branch)
df_branch = sps.consolidate(df_branch, ["VOLUME_RAW"], ["YEAR", "MAJOR_BRANCH"])
# Totals per year and per branch. The per-branch total is not used by the plot
# below but is kept for interactive inspection.
df_branch["VOLUME_RAW_PER_YEAR"] = df_branch.groupby("YEAR")["VOLUME_RAW"].transform(
    "sum"
)
df_branch["VOLUME_RAW_PER_BRANCH"] = df_branch.groupby("MAJOR_BRANCH")[
    "VOLUME_RAW"
].transform("sum")
df_branch["PERCENTAGE_PER_YEAR"] = (
    100 * df_branch["VOLUME_RAW"] / df_branch["VOLUME_RAW_PER_YEAR"]
)
# exclude 2017: the branch numbering changed in 2018, so comparison against
# 2017 is not easy
df_branch = df_branch[df_branch["YEAR"] > 2017]
fig = px.bar(
    df_branch.sort_values(["MAJOR_BRANCH", "YEAR"]).astype({"YEAR": str}),
    x="YEAR",
    y="PERCENTAGE_PER_YEAR",
    labels={"PERCENTAGE_PER_YEAR": "% volume"},
    color="MAJOR_BRANCH",
    # one small bar chart per major branch, laid out in rows of three with the
    # branch name shown above each panel
    facet_col="MAJOR_BRANCH",
    facet_col_wrap=3,
    height=800,
    width=1000,
    title="Percentage volume in each decision tree branch",
)
# Annotate every bar of every facet with its value, e.g. "1.2%".
for facet in fig.data:
    facet.text = [f"{sps.format_float(v)}%" for v in facet.y.tolist()]
fig.show("png")

Row count
A simple count of how many rows our results file CSV has. This is another, cruder measure of fragmentation.
# Count rows per year. GroupBy.size() counts group sizes directly; it yields
# the same Series as the previous groupby(...).apply(len) but is the idiomatic
# (and faster) form.
fig = px.bar(
    df.groupby("YEAR").size().rename("row_count").reset_index(),
    x="YEAR",
    y="row_count",
    title="Number of rows in model results CSV file",
)
fig.show("png")

# Compare yearly totals of volume and FOB value across the three data sources.
# Domestic rows are excluded from SEI-PCS because MDIC and Comtrade cover
# exports only.
df_export = df[df["BRANCH"] != "DOMESTIC"]


def _yearly_total(frame, metric, source):
    """Sum `metric` per YEAR in `frame`, labelled with its `source` name."""
    return (
        frame.groupby("YEAR")[metric]
        .sum()
        .rename("value")
        .reset_index()
        .assign(source=source, metric=metric)
    )


df_totals = sps.concat(
    _yearly_total(frame, metric, source)
    for metric in ["VOLUME_RAW", "FOB"]
    for frame, source in [
        (df_mdic, "MDIC"),
        (df_export, "SEI-PCS"),
        (df_comtrade, "Comtrade"),
    ]
)
fig = px.line(
    df_totals,
    x="YEAR",
    y="value",
    color="source",
    facet_col="metric",
    facet_col_wrap=1,
    # plain string: the previous f-string had no placeholders
    title="Comparing SEI-PCS, Comtrade and MDIC",
)
# make the facets have independent axes
fig.update_yaxes(matches=None)
fig.show("png")
