import os

import numpy as np
import pandas as pd

from trase.models.brazil.customs_2019 import constants
from trase.tools import sps
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.sps import (
    Compare,
    compare_dataframes,
    compare_dataframes_single,
    consolidate,
    grouped_proportion,
    print_report_by_attribute,
)
BEEF_HS4 = [
"0102", # Bovine animals; live
"0201", # Meat of bovine animals; fresh or chilled
"0202", # Meat of bovine animals; frozen
"0206", # Edible offal of bovine + other animals; fresh, chilled or frozen
"0210", # Meat and edible meat offal; salted/brine/etc. (does not exist in BoL)
"0504", # Guts, bladders and stomachs of animals (does not exist in BoL)
"1602", # Prepared or preserved meat, meat offal or blood
]
def compare_vol(key="results", year="2015"):
"""Compare the CD and disaggregated MDIC vol per HS4 for beef"""
def is_unknown(series: pd.Series) -> pd.Series:
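        # Boolean mask: True where the value equals the column's sentinel
        # "unknown" code (from constants.UNKNOWNS); all-False for columns
        # that have no sentinel defined.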
if series.name in constants.UNKNOWNS:
return series == constants.UNKNOWNS[series.name]
else:
series = series.copy()
series[:] = False
return series
solved = pd.read_csv(
f"{year}/results/{key}.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
actual = pd.read_csv(
f"{year}/prepared/customs_declaration.csv",
sep=";",
dtype=str,
keep_default_na=False,
).astype({"vol": float})
# add commodity names
df_commodities = pd.read_csv("2019/prepared/commodity.csv", sep=";", dtype=str)
solved = pd.merge(solved, df_commodities, on="hs6")
actual = pd.merge(actual, df_commodities, on="hs6")
# filter both datasets to only those buckets which solved *and* don't have unknowns in them
solved = solved[solved["success"] == "True"]
has_unknowns = solved.groupby(constants.BUCKETS).apply(
lambda df: df.apply(is_unknown).any(axis=1).any()
)
no_unknowns = has_unknowns[~has_unknowns].reset_index()
solved = pd.merge(solved, no_unknowns, on=constants.BUCKETS, validate="many_to_one")
actual = pd.merge(
actual,
solved[constants.BUCKETS].drop_duplicates(),
on=constants.BUCKETS,
validate="many_to_one",
)
# consolidate to only the columns of interest in the SEI-PCS model
columns = [
"hs4",
"hs6",
"commodity",
"country_of_destination.name",
"exporter.municipality.trase_id",
"state.trase_id",
"port.name",
]
solved = consolidate(solved, ["vol"], columns)
actual = consolidate(actual, ["vol"], columns)
# first check that the buckets are consistent with one another
# (this is a simple QA check on the solver itself)
df = compare_dataframes(
solved,
actual,
{"vol": [Compare.absolute_error]},
["hs4", "country_of_destination.name"],
)
assert df[df[("vol", "absolute_error")].abs() > 1e-1].empty
# now classify the solution itself
def classify(solved_vol, actual_vol):
error = np.abs(solved_vol - actual_vol) / actual_vol
answer = pd.Series("(uncategorised)", index=solved_vol.index)
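        # Bands are applied from coarsest to finest: each stricter threshold
        # overwrites the previous label, so the tightest matching band wins.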
answer = np.where(error >= 0.5, "greater than 50%", answer)
answer = np.where(error < 0.5, "within 50%", answer)
answer = np.where(error < 0.4, "within 40%", answer)
answer = np.where(error < 0.3, "within 30%", answer)
answer = np.where(error < 0.2, "within 20%", answer)
answer = np.where(error < 0.1, "within 10%", answer)
answer = np.where(
(solved_vol < 1) & (actual_vol >= 1),
"missing from solved",
answer,
)
        answer = np.where(
            (solved_vol >= 1) & (actual_vol < 1),
            "missing from actual",
            answer,
        )
return answer
df = compare_dataframes(
solved[solved["vol"] > 0.1],
actual[actual["vol"] > 0.1],
{"vol": [classify]},
columns,
)
df.columns = ("total_solved", "total_solution", "error_margin")
df = consolidate(
df.reset_index(), ["total_solution"], ["commodity", "error_margin"]
)
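    # express each error band as a share of the commodity's total actual volume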
df["percentage"] = grouped_proportion(df, "total_solution", ["commodity"])
df = df.drop(columns=["total_solution"])
df = df[df["commodity"] == "BEEF"]
return df
def compare_states(key="results", year="2015"):
df_disaggregated = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/customs_2019/{year}/results/{key}.csv",
dtype=str,
sep=";",
)
df_disaggregated["vol"] = df_disaggregated["vol"].astype(float)
df_disaggregated = df_disaggregated[df_disaggregated["hs4"].isin(BEEF_HS4)]
# df_disaggregated = df_disaggregated[df_disaggregated["success"] == "True"]
df_disaggregated = df_disaggregated[df_disaggregated["state.trase_id"] != "BR-XX"]
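    # "equal states": the declared state of production matches the state of the
    # exporter municipality (the first 5 characters of its trase_id, e.g. "BR-51")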
a = df_disaggregated[
df_disaggregated["state.trase_id"]
== df_disaggregated["exporter.municipality.trase_id"].str[:5]
]["vol"].sum()
b = df_disaggregated["vol"].sum()
print(f"The original disaggregated {year} MDIC with equal states:")
print(a / b)
Original
Vol comparison
compare_vol(key="results_0")
State comparison
compare_states(key="results_0")
df_cd = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/customs_2019/2015/prepared/customs_declaration.csv",
dtype=str,
sep=";",
)
df_cd_beef = df_cd[df_cd["hs4"].isin(BEEF_HS4)].copy()
df_cd_beef["vol"] = df_cd_beef["vol"].astype(float)
df_cd_beef = df_cd_beef[df_cd_beef["state.trase_id"] != "BR-XX"]
a = df_cd_beef[
df_cd_beef["state.trase_id"] == df_cd_beef["exporter.municipality.trase_id"].str[:5]
]["vol"].sum()
b = df_cd_beef["vol"].sum()
print("CD 2015 with equal states:")
a / b
l=0.1 result
compare_vol(key="results_01")
compare_states(key="results_01")
l=0.18 result
compare_vol(key="results_018")
compare_states(key="results_018")
l=0.2 result
compare_vol(key="results_02")
compare_states(key="results_02")
l=0.21 result
compare_vol(key="results_021")
compare_states(key="results_021")
l=0.22 result
compare_vol(key="results_022")
compare_states(key="results_022")
l=0.3 result
compare_vol(key="results")
compare_states(key="results")
l=0.4 result
compare_vol(key="results")
compare_states(key="results")
l=0.6 result
compare_vol(key="results_06")
compare_states(key="results_06")
Look at a particular example (one country and one HS code)
"""Compare the CD and disaggregated MDIC vol per HS4 for beef"""
def is_unknown(series: pd.Series) -> pd.Series:
if series.name in constants.UNKNOWNS:
return series == constants.UNKNOWNS[series.name]
else:
series = series.copy()
series[:] = False
return series
solved = pd.read_csv(
f"2015/results/results_0.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
actual = pd.read_csv(
"2015/prepared/customs_declaration.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
# add commodity names
df_commodities = pd.read_csv("2019/prepared/commodity.csv", sep=";", dtype=str)
solved = pd.merge(solved, df_commodities, on="hs6")
actual = pd.merge(actual, df_commodities, on="hs6")
# filter both datasets to only those buckets which solved *and* don't have unknowns in them
solved = solved[solved["success"] == "True"]
has_unknowns = solved.groupby(constants.BUCKETS).apply(
lambda df: df.apply(is_unknown).any(axis=1).any()
)
no_unknowns = has_unknowns[~has_unknowns].reset_index()
solved = pd.merge(solved, no_unknowns, on=constants.BUCKETS, validate="many_to_one")
actual = pd.merge(
actual,
solved[constants.BUCKETS].drop_duplicates(),
on=constants.BUCKETS,
validate="many_to_one",
)
# consolidate to only the columns of interest in the SEI-PCS model
columns = [
"hs4",
"hs6",
"commodity",
"country_of_destination.name",
"exporter.municipality.trase_id",
"state.trase_id",
"port.name",
]
solved = consolidate(solved, ["vol"], columns)
actual = consolidate(actual, ["vol"], columns)
# first check that the buckets are consistent with one another
# (this is a simple QA check on the solver itself)
df = compare_dataframes(
solved,
actual,
{"vol": [Compare.absolute_error]},
columns,
)
# assert df[df[("vol", "absolute_error")].abs() > 1e-1].empty
df.sort_values(by=("vol", "absolute_error"))
df = df.reset_index()
df_spe = df[
(df["hs6"] == "160250")
& (df["country_of_destination.name"] == "UNITED KINGDOM")
& (df["port.name"] == "SANTOS")
& (df["exporter.municipality.trase_id"] == "BR-4208203")
]
df_spe[("vol", "absolute_error")].sum()
The same bucket (UNITED KINGDOM, HS6 160250, SANTOS, BR-4208203) with l=0.2
"""Compare the CD and disaggregated MDIC vol per HS4 for beef"""
def is_unknown(series: pd.Series) -> pd.Series:
if series.name in constants.UNKNOWNS:
return series == constants.UNKNOWNS[series.name]
else:
series = series.copy()
series[:] = False
return series
solved = pd.read_csv(
f"2015/results/results_02.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
actual = pd.read_csv(
"2015/prepared/customs_declaration.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
# add commodity names
df_commodities = pd.read_csv("2019/prepared/commodity.csv", sep=";", dtype=str)
solved = pd.merge(solved, df_commodities, on="hs6")
actual = pd.merge(actual, df_commodities, on="hs6")
# filter both datasets to only those buckets which solved *and* don't have unknowns in them
solved = solved[solved["success"] == "True"]
has_unknowns = solved.groupby(constants.BUCKETS).apply(
lambda df: df.apply(is_unknown).any(axis=1).any()
)
no_unknowns = has_unknowns[~has_unknowns].reset_index()
solved = pd.merge(solved, no_unknowns, on=constants.BUCKETS, validate="many_to_one")
actual = pd.merge(
actual,
solved[constants.BUCKETS].drop_duplicates(),
on=constants.BUCKETS,
validate="many_to_one",
)
# consolidate to only the columns of interest in the SEI-PCS model
columns = [
"hs4",
"hs6",
"commodity",
"country_of_destination.name",
"exporter.municipality.trase_id",
"state.trase_id",
"port.name",
]
solved = consolidate(solved, ["vol"], columns)
actual = consolidate(actual, ["vol"], columns)
# first check that the buckets are consistent with one another
# (this is a simple QA check on the solver itself)
df = compare_dataframes(
solved,
actual,
{"vol": [Compare.absolute_error]},
columns,
)
df = df.reset_index()
df_spe = df[
(df["hs6"] == "160250")
& (df["country_of_destination.name"] == "UNITED KINGDOM")
& (df["port.name"] == "SANTOS")
& (df["exporter.municipality.trase_id"] == "BR-4208203")
]
df_spe[("vol", "absolute_error")].sum()
# -6316718 is this bucket's absolute-error sum from the baseline run (results_0);
# -383448 is the same sum with l=0.2 (results_02), both taken from the cells above
p = -383448 * 100 / -6316718
print(f"The absolute error in this case decreases to {p:.2f}% of the baseline.")
Test: 2017
df_cd = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/customs_2019/2017/prepared/customs_declaration.csv",
dtype=str,
sep=";",
)
df_cd_beef = df_cd[df_cd["hs4"].isin(BEEF_HS4)].copy()
df_cd_beef["vol"] = df_cd_beef["vol"].astype(float)
df_cd_beef = df_cd_beef[df_cd_beef["state.trase_id"] != "BR-XX"]
a = df_cd_beef[
df_cd_beef["state.trase_id"] == df_cd_beef["exporter.municipality.trase_id"].str[:5]
]["vol"].sum()
b = df_cd_beef["vol"].sum()
print("CD 2017 with equal states:")
a / b
Baseline
compare_vol(key="results", year="2017")
compare_states(key="results", year="2017")
l=0.2
compare_vol(key="results_02", year="2017")
compare_states(key="results_02", year="2017")
Use CD to replace BoL
compare_vol(key="results")
compare_states(key="results")
df_seipcs = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/customs_2019/2015/results/results_06.csv",
dtype=str,
sep=";",
)
df_seipcs["vol"] = df_seipcs["vol"].astype(float)
# df_seipcs.groupby('message')['vol'].sum().reset_index().sort_values('vol')
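# share of total volume sitting in buckets the solver marked as successful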
df_seipcs[df_seipcs["success"] == "True"]["vol"].sum() / df_seipcs["vol"].sum()
# df_seipcs[(df_seipcs['hs4']=='0206') & (df_seipcs['country_of_destination.name']=='CHINA (HONG KONG)')]['message'].drop_duplicates()
# df_seipcs[df_seipcs['message']=='number of elements exceeds INT_MAX'].groupby('hs4')['vol'].sum().reset_index()
Missing CNPJs
CNPJ 03853896006857
df = get_pandas_df_once(
"brazil/trade/bol/2019/BRAZIL_BOL_2019.csv",
encoding="utf8",
sep=";",
dtype=str,
keep_default_na=False,
)
df[df["exporter.cnpj"] == "03853896006857"].columns
df_matched = get_pandas_df_once(
"brazil/beef/trade/cd/disaggregated/CD_DISAGGREGATED_BEEF_2019.csv",
encoding="utf8",
sep=";",
dtype=str,
keep_default_na=False,
)
df_matched["vol"] = df_matched["vol"].astype(float)
df_matched[df_matched["exporter.cnpj"] == "03853896006857"].groupby("state.trase_id")[
"vol"
].sum()
df_seipcs = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/beef/2019/results/results.csv",
dtype=str,
sep=";",
)
df_seipcs["VOLUME_PRODUCT"] = df_seipcs["VOLUME_PRODUCT"].astype(float)
df_seipcs[df_seipcs["BRANCH"] != "3. unknown state of production"]
df_seipcs = (
df_seipcs.groupby("STATE_OF_PRODUCTION")["VOLUME_PRODUCT"].sum().reset_index()
)
df_mdic = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/customs_2019/2019/prepared/mdic_port.csv",
dtype=str,
sep=";",
)
df_mdic["vol"] = df_mdic["vol"].astype(float)
# get state names
df_state = pd.read_csv(
f"/usr/share/TRASE/trase/models/brazil/beef/2019/prepared/state.csv",
dtype=str,
sep=";",
)
df_state = df_state[["state_name", "trase_id"]].drop_duplicates()
df_mdic = pd.merge(
df_mdic,
df_state,
left_on="state.trase_id",
right_on="trase_id",
how="left",
validate="many_to_one",
)
df_mdic = df_mdic.groupby("state_name")["vol"].sum().reset_index()
df_mdic = df_mdic.rename(columns={"state_name": "state"})
df_seipcs = df_seipcs.rename(
columns={"STATE_OF_PRODUCTION": "state", "VOLUME_PRODUCT": "vol"}
)
df_comparison = compare_dataframes(
df_mdic,
df_seipcs,
{"vol": [Compare.absolute_error]},
["state"],
suffixes=("_mdic", "_seipcs"),
)
df_comparison[("vol", "absolute_error")].sum()