
import numpy as np
import pandas as pd

from trase.models.brazil.customs_2019 import constants
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.sps import Compare, compare_dataframes, consolidate, grouped_proportion
BEEF_HS4 = [
    "0102",  # Bovine animals; live
    "0201",  # Meat of bovine animals; fresh or chilled
    "0202",  # Meat of bovine animals; frozen
    "0206",  # Edible offal of bovine + other animals; fresh, chilled or frozen
    "0210",  # Meat and edible meat offal; salted/brine/etc. (does not exist in BoL)
    "0504",  # Guts, bladders and stomachs of animals (does not exist in BoL)
    "1602",  # Prepared or preserved meat, meat offal or blood
]
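
In the HS nomenclature a six-digit subheading always begins with its four-digit heading, so where only hs6 is present the hs4 column can be recovered by slicing. A minimal sketch (toy data; assumes hs6 is a zero-padded string, as in the prepared CSVs):

# Sketch: derive hs4 from hs6 and restrict a frame to the beef headings above.
toy = pd.DataFrame({"hs6": ["020230", "160250", "120100"]})
toy["hs4"] = toy["hs6"].str[:4]
print(toy[toy["hs4"].isin(BEEF_HS4)])  # keeps the 0202 and 1602 rows
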
def compare_vol(key="results", year="2015"):
    """Compare the CD and disaggregated MDIC vol per HS4 for beef"""

    def is_unknown(series: pd.Series) -> pd.Series:
        if series.name in constants.UNKNOWNS:
            return series == constants.UNKNOWNS[series.name]
        else:
            series = series.copy()
            series[:] = False
            return series

    solved = pd.read_csv(
        f"{year}/results/{key}.csv", sep=";", dtype=str, keep_default_na=False
    ).astype({"vol": float})
    actual = pd.read_csv(
        f"{year}/prepared/customs_declaration.csv",
        sep=";",
        dtype=str,
        keep_default_na=False,
    ).astype({"vol": float})

    # add commodity names
    df_commodities = pd.read_csv("2019/prepared/commodity.csv", sep=";", dtype=str)
    solved = pd.merge(solved, df_commodities, on="hs6")
    actual = pd.merge(actual, df_commodities, on="hs6")

    # filter both datasets to only those buckets which solved *and* don't have unknowns in them
    solved = solved[solved["success"] == "True"]
    has_unknowns = solved.groupby(constants.BUCKETS).apply(
        lambda df: df.apply(is_unknown).any(axis=1).any()
    )
    no_unknowns = has_unknowns[~has_unknowns].reset_index()
    solved = pd.merge(solved, no_unknowns, on=constants.BUCKETS, validate="many_to_one")
    actual = pd.merge(
        actual,
        solved[constants.BUCKETS].drop_duplicates(),
        on=constants.BUCKETS,
        validate="many_to_one",
    )

    # consolidate to only the columns of interest in the SEI-PCS model
    columns = [
        "hs4",
        "hs6",
        "commodity",
        "country_of_destination.name",
        "exporter.municipality.trase_id",
        "state.trase_id",
        "port.name",
    ]
    solved = consolidate(solved, ["vol"], columns)
    actual = consolidate(actual, ["vol"], columns)

    # first check that the buckets are consistent with one another
    # (this is a simple QA check on the solver itself)
    df = compare_dataframes(
        solved,
        actual,
        {"vol": [Compare.absolute_error]},
        ["hs4", "country_of_destination.name"],
    )
    assert df[df[("vol", "absolute_error")].abs() > 1e-1].empty

    # now classify the solution itself
    def classify(solved_vol, actual_vol):
        error = np.abs(solved_vol - actual_vol) / actual_vol

        # Cascading np.where: each later, tighter band overwrites the earlier
        # label, so every row ends up in the tightest band its error fits.
        answer = pd.Series("(uncategorised)", index=solved_vol.index)
        answer = np.where(error >= 0.5, "greater than 50%", answer)
        answer = np.where(error < 0.5, "within 50%", answer)
        answer = np.where(error < 0.4, "within 40%", answer)
        answer = np.where(error < 0.3, "within 30%", answer)
        answer = np.where(error < 0.2, "within 20%", answer)
        answer = np.where(error < 0.1, "within 10%", answer)

        # Finally, flag rows that are effectively present in only one dataset.
        answer = np.where(
            (solved_vol < 1) & (actual_vol >= 1),
            "missing from solved",
            answer,
        )
        answer = np.where(
            (solved_vol >= 1) & (actual_vol < 1),
            "missing from actual",
            answer,
        )
        return answer

    df = compare_dataframes(
        solved[solved["vol"] > 0.1],
        actual[actual["vol"] > 0.1],
        {"vol": [classify]},
        columns,
    )

    df.columns = ("total_solved", "total_solution", "error_margin")
    df = consolidate(
        df.reset_index(), ["total_solution"], ["commodity", "error_margin"]
    )
    df["percentage"] = grouped_proportion(df, "total_solution", ["commodity"])
    df = df.drop(columns=["total_solution"])
    df = df[df["commodity"] == "BEEF"]
    return df
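
To make the cascading classification in compare_vol concrete, here is a toy run (assuming classify were lifted out to module scope; illustration only, not part of the model):

# Later np.where calls overwrite earlier labels, so each row lands in the
# tightest band its relative error fits, and the "missing" checks win last.
solved_toy = pd.Series([95.0, 75.0, 0.0])
actual_toy = pd.Series([100.0, 100.0, 100.0])
print(classify(solved_toy, actual_toy))
# -> ['within 10%' 'within 30%' 'missing from solved']
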
def compare_states(key="results", year="2015"):
    df_disaggregated = pd.read_csv(
        f"/usr/share/TRASE/trase/models/brazil/customs_2019/{year}/results/{key}.csv",
        dtype=str,
        sep=";",
    )
    df_disaggregated["vol"] = df_disaggregated["vol"].astype(float)
    df_disaggregated = df_disaggregated[df_disaggregated["hs4"].isin(BEEF_HS4)]
    #     df_disaggregated = df_disaggregated[df_disaggregated["success"] == "True"]
    df_disaggregated = df_disaggregated[df_disaggregated["state.trase_id"] != "BR-XX"]
    a = df_disaggregated[
        df_disaggregated["state.trase_id"]
        == df_disaggregated["exporter.municipality.trase_id"].str[:5]
    ]["vol"].sum()
    b = df_disaggregated["vol"].sum()
    print(f"The original disaggregated {year} MDIC with equal states:")
    print(a / b)
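
The share printed above relies on the trase_id layout: municipality ids are "BR-" plus the seven-digit IBGE code, whose first two digits identify the state, so the first five characters line up with the state-level trase_id. A toy check (invented volumes, illustration only):

# BR-4208203 lies in BR-42; BR-4106902 lies in BR-41, not BR-35,
# so 60 of the 100 units have a matching state of production.
toy = pd.DataFrame(
    {
        "state.trase_id": ["BR-42", "BR-35"],
        "exporter.municipality.trase_id": ["BR-4208203", "BR-4106902"],
        "vol": [60.0, 40.0],
    }
)
match = toy["state.trase_id"] == toy["exporter.municipality.trase_id"].str[:5]
print(toy.loc[match, "vol"].sum() / toy["vol"].sum())  # 0.6
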

Original

Vol comparison

compare_vol(key="results_0")

State comparison

compare_states(key="results_0")
df_cd = pd.read_csv(
    f"/usr/share/TRASE/trase/models/brazil/customs_2019/2015/prepared/customs_declaration.csv",
    dtype=str,
    sep=";",
)
df_cd_beef = df_cd[df_cd["hs4"].isin(BEEF_HS4)].copy()
df_cd_beef["vol"] = df_cd_beef["vol"].astype(float)
df_cd_beef = df_cd_beef[df_cd_beef["state.trase_id"] != "BR-XX"]
a = df_cd_beef[
    df_cd_beef["state.trase_id"] == df_cd_beef["exporter.municipality.trase_id"].str[:5]
]["vol"].sum()
b = df_cd_beef["vol"].sum()
print("CD 2015 with equal states:")
a / b

l=0.1 result

compare_vol(key="results_01")
compare_states(key="results_01")

l=0.18 result

compare_vol(key="results_018")
compare_states(key="results_018")

l=0.2 result

compare_vol(key="results_02")
compare_states(key="results_02")

l=0.21 result

compare_vol(key="results_021")
compare_states(key="results_021")

l=0.22 result

compare_vol(key="results_022")
compare_states(key="results_022")

l=0.3 result

compare_vol(key="results")
compare_states(key="results")

l=0.4 result

compare_vol(key="results")
compare_states(key="results")

l=0.6 result

compare_vol(key="results_06")
compare_states(key="results_06")

Look at a particular example (one country and one HS code)

"""Compare the CD and disaggregated MDIC vol per HS4 for beef"""


def is_unknown(series: pd.Series) -> pd.Series:
    if series.name in constants.UNKNOWNS:
        return series == constants.UNKNOWNS[series.name]
    else:
        series = series.copy()
        series[:] = False
        return series


solved = pd.read_csv(
    f"2015/results/results_0.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
actual = pd.read_csv(
    "2015/prepared/customs_declaration.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})

# add commodity names
df_commodities = pd.read_csv("2019/prepared/commodity.csv", sep=";", dtype=str)
solved = pd.merge(solved, df_commodities, on="hs6")
actual = pd.merge(actual, df_commodities, on="hs6")

# filter both datasets to only those buckets which solved *and* don't have unknowns in them
solved = solved[solved["success"] == "True"]
has_unknowns = solved.groupby(constants.BUCKETS).apply(
    lambda df: df.apply(is_unknown).any(axis=1).any()
)
no_unknowns = has_unknowns[~has_unknowns].reset_index()
solved = pd.merge(solved, no_unknowns, on=constants.BUCKETS, validate="many_to_one")
actual = pd.merge(
    actual,
    solved[constants.BUCKETS].drop_duplicates(),
    on=constants.BUCKETS,
    validate="many_to_one",
)

# consolidate to only the columns of interest in the SEI-PCS model
columns = [
    "hs4",
    "hs6",
    "commodity",
    "country_of_destination.name",
    "exporter.municipality.trase_id",
    "state.trase_id",
    "port.name",
]
solved = consolidate(solved, ["vol"], columns)
actual = consolidate(actual, ["vol"], columns)

# first check that the buckets are consistent with one another
# (this is a simple QA check on the solver itself)
df = compare_dataframes(
    solved,
    actual,
    {"vol": [Compare.absolute_error]},
    columns,
)
# assert df[df[("vol", "absolute_error")].abs() > 1e-1].empty
df.sort_values(by=("vol", "absolute_error"))
df = df.reset_index()
df_spe = df[
    (df["hs6"] == "160250")
    & (df["country_of_destination.name"] == "UNITED KINGDOM")
    & (df["port.name"] == "SANTOS")
    & (df["exporter.municipality.trase_id"] == "BR-4208203")
]
df_spe[("vol", "absolute_error")].sum()

UNITED KINGDOM 160250 SANTOS BR-4208203

"""Compare the CD and disaggregated MDIC vol per HS4 for beef"""


def is_unknown(series: pd.Series) -> pd.Series:
    if series.name in constants.UNKNOWNS:
        return series == constants.UNKNOWNS[series.name]
    else:
        series = series.copy()
        series[:] = False
        return series


solved = pd.read_csv(
    f"2015/results/results_02.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})
actual = pd.read_csv(
    "2015/prepared/customs_declaration.csv", sep=";", dtype=str, keep_default_na=False
).astype({"vol": float})

# add commodity names
df_commodities = pd.read_csv("2019/prepared/commodity.csv", sep=";", dtype=str)
solved = pd.merge(solved, df_commodities, on="hs6")
actual = pd.merge(actual, df_commodities, on="hs6")

# filter both datasets to only those buckets which solved *and* don't have unknowns in them
solved = solved[solved["success"] == "True"]
has_unknowns = solved.groupby(constants.BUCKETS).apply(
    lambda df: df.apply(is_unknown).any(axis=1).any()
)
no_unknowns = has_unknowns[~has_unknowns].reset_index()
solved = pd.merge(solved, no_unknowns, on=constants.BUCKETS, validate="many_to_one")
actual = pd.merge(
    actual,
    solved[constants.BUCKETS].drop_duplicates(),
    on=constants.BUCKETS,
    validate="many_to_one",
)

# consolidate to only the columns of interest in the SEI-PCS model
columns = [
    "hs4",
    "hs6",
    "commodity",
    "country_of_destination.name",
    "exporter.municipality.trase_id",
    "state.trase_id",
    "port.name",
]
solved = consolidate(solved, ["vol"], columns)
actual = consolidate(actual, ["vol"], columns)

# first check that the buckets are consistent with one another
# (this is a simple QA check on the solver itself)
df = compare_dataframes(
    solved,
    actual,
    {"vol": [Compare.absolute_error]},
    columns,
)
df = df.reset_index()
df_spe = df[
    (df["hs6"] == "160250")
    & (df["country_of_destination.name"] == "UNITED KINGDOM")
    & (df["port.name"] == "SANTOS")
    & (df["exporter.municipality.trase_id"] == "BR-4208203")
]
df_spe[("vol", "absolute_error")].sum()
# -383448 is the df_spe error sum from this results_02 run; -6316718 is the
# corresponding baseline sum from the results_0 run above.
p = -383448 * 100 / -6316718
print(f"The absolute error in this case decreases to {p:.2f}% of the baseline.")

Test: 2017

df_cd = pd.read_csv(
    f"/usr/share/TRASE/trase/models/brazil/customs_2019/2017/prepared/customs_declaration.csv",
    dtype=str,
    sep=";",
)
df_cd_beef = df_cd[df_cd["hs4"].isin(BEEF_HS4)].copy()
df_cd_beef["vol"] = df_cd_beef["vol"].astype(float)
df_cd_beef = df_cd_beef[df_cd_beef["state.trase_id"] != "BR-XX"]
a = df_cd_beef[
    df_cd_beef["state.trase_id"] == df_cd_beef["exporter.municipality.trase_id"].str[:5]
]["vol"].sum()
b = df_cd_beef["vol"].sum()
print("CD 2017 with equal states:")
a / b

Baseline

compare_vol(key="results", year="2017")
compare_states(key="results", year="2017")

l=0.2

compare_vol(key="results_02", year="2017")
compare_states(key="results_02", year="2017")

Use CD to replace BoL

compare_vol(key="results")
compare_states(key="results")
df_seipcs = pd.read_csv(
    f"/usr/share/TRASE/trase/models/brazil/customs_2019/2015/results/results_06.csv",
    dtype=str,
    sep=";",
)
df_seipcs["vol"] = df_seipcs["vol"].astype(float)
# df_seipcs.groupby('message')['vol'].sum().reset_index().sort_values('vol')
df_seipcs[df_seipcs["success"] == "True"]["vol"].sum() / df_seipcs["vol"].sum()
# df_seipcs[(df_seipcs['hs4']=='0206') & (df_seipcs['country_of_destination.name']=='CHINA (HONG KONG)')]['message'].drop_duplicates()

# df_seipcs[df_seipcs['message']=='number of elements exceeds INT_MAX'].groupby('hs4')['vol'].sum().reset_index()

Missing CNPJs

# CNPJ 03853896006857
df = get_pandas_df_once(
    "brazil/trade/bol/2019/BRAZIL_BOL_2019.csv",
    encoding="utf8",
    sep=";",
    dtype=str,
    keep_default_na=False,
)
df[df["exporter.cnpj"] == "03853896006857"].columns
df_matched = get_pandas_df_once(
    "brazil/beef/trade/cd/disaggregated/CD_DISAGGREGATED_BEEF_2019.csv",
    encoding="utf8",
    sep=";",
    dtype=str,
    keep_default_na=False,
)
df_matched["vol"] = df_matched["vol"].astype(float)
df_matched[df_matched["exporter.cnpj"] == "03853896006857"].groupby("state.trase_id")[
    "vol"
].sum()
df_seipcs = pd.read_csv(
    f"/usr/share/TRASE/trase/models/brazil/beef/2019/results/results.csv",
    dtype=str,
    sep=";",
)
df_seipcs["VOLUME_PRODUCT"] = df_seipcs["VOLUME_PRODUCT"].astype(float)
df_seipcs[df_seipcs["BRANCH"] != "3. unknown state of production"]
df_seipcs = (
    df_seipcs.groupby("STATE_OF_PRODUCTION")["VOLUME_PRODUCT"].sum().reset_index()
)
df_mdic = pd.read_csv(
    f"/usr/share/TRASE/trase/models/brazil/customs_2019/2019/prepared/mdic_port.csv",
    dtype=str,
    sep=";",
)
df_mdic["vol"] = df_mdic["vol"].astype(float)

# get state names
df_state = pd.read_csv(
    f"/usr/share/TRASE/trase/models/brazil/beef/2019/prepared/state.csv",
    dtype=str,
    sep=";",
)
df_state = df_state[["state_name", "trase_id"]].drop_duplicates()
df_mdic = pd.merge(
    df_mdic,
    df_state,
    left_on="state.trase_id",
    right_on="trase_id",
    how="left",
    validate="many_to_one",
)

df_mdic = df_mdic.groupby("state_name")["vol"].sum().reset_index()
df_mdic = df_mdic.rename(columns={"state_name": "state"})
df_seipcs = df_seipcs.rename(
    columns={"STATE_OF_PRODUCTION": "state", "VOLUME_PRODUCT": "vol"}
)
df_comparison = compare_dataframes(
    df_mdic,
    df_seipcs,
    {"vol": [Compare.absolute_error]},
    ["state"],
    suffixes=("_mdic", "_seipcs"),
)
df_comparison[("vol", "absolute_error")].sum()