Skip to content

Main 2023

View or edit on GitHub

This page is synchronized from trase/models/brazil/customs_2019/main_2023.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after the midnight build (CET), or sooner by manually triggering the GitHub Action (link).

# %configure_logging

from trase.tools.sps import SupplyChain
import mosek  # NOTE(review): not referenced below -- presumably imported so the MOSEK solver is available; confirm

# Run the full Brazil customs 2019 model for the 2023 year:
# prepare inputs, load them, solve, and export the results locally.
supplychain = SupplyChain("brazil/customs_2019", year=2023)
supplychain.preparation()
supplychain.load()
supplychain.run()
supplychain.export_results(suffix="_01")
# Earlier (2018) upload call, kept for reference:
# supplychain.upload_results(
#     key_prefix="brazil/trade/mdic/disaggregated/",
#     filename="brazil_mdic_disaggregated_2018_beef_01.csv",
#     suffix="_01",
#     create_ingest_metadata=False,
# )
# Upload the 2023 disaggregated beef results.
supplychain.upload_results(
    key_prefix="brazil/trade/mdic/disaggregated/",
    filename="brazil_mdic_disaggregated_2023_beef_01.csv",
    suffix="_01",
)
import os

# Notebook sanity checks: current working directory and the model year in use.
os.getcwd()
supplychain.context.year

# Total bill-of-lading volume, restricted to rows where the HS6 / HS4 code
# is known ("XXXXXX" / "XXXX" are the unknown-code placeholders used here).
bol = supplychain.get("bill_of_lading")
# Use the vectorised Series.sum() instead of the builtin sum(), which
# iterates the Series element-by-element in Python.
print(bol.loc[bol["hs6"] != "XXXXXX", "vol"].sum())
print(bol.loc[bol["hs4"] != "XXXX", "vol"].sum())
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools import sps

# NOTE(review): these load the *2018* MDIC tables even though this notebook
# runs the 2023 model -- confirm whether that is intentional (e.g. a
# historical comparison) or a stale year left in the paths.
df_port = get_pandas_df_once(
    "brazil/trade/mdic/port/brazil_mdic_port_2018.csv",
    encoding="utf8",
    sep=";",
    dtype=str,
    keep_default_na=False,  # keep empty strings instead of NaN
)
df_mun = get_pandas_df_once(
    "brazil/trade/mdic/municipality/brazil_mdic_municipality_2018.csv",
    encoding="utf8",
    sep=";",
    dtype=str,
    keep_default_na=False,
)
df_port  # displayed in the notebook
import pandas as pd
import os
# NOTE(review): this import shadows the get_pandas_df_once imported earlier
# from trase.tools.aws.aws_helpers_cached with a same-named function from
# trase.tools.sps. It is not called in this cell -- verify which of the two
# later cells are meant to use.
from trase.tools.sps import get_pandas_df_once

os.getcwd()
# HS4 chapters that make up the beef supply chain.
HS_LIST = ["0102", "0201", "0202", "0206", "0210", "0504", "1602"]

# df = pd.read_csv('/usr/share/TRASE/trase/models/brazil/customs_2019/2019/results/results.csv', dtype=str, sep=';')
# df['vol'] = df['vol'].astype(float)
# df = df[df['hs4'].isin(HS_LIST)].copy()
# a = df[df['state.trase_id']=='BR-XX']['vol'].sum()
# b = df[df['state.trase_id']!='BR-XX']['vol'].sum()
# print(a/b)

# Load both result files of the 2018 run; everything is read as str and must
# be cast before any arithmetic.
df1 = pd.read_csv(
    "/usr/share/TRASE/trase/models/brazil/customs_2019/2018/results/result_1.csv",
    dtype=str,
    sep=";",
)
df2 = pd.read_csv(
    "/usr/share/TRASE/trase/models/brazil/customs_2019/2018/results/results.csv",
    dtype=str,
    sep=";",
)
df = pd.concat([df1, df2], ignore_index=True)
# df['vol'] = df['vol'].astype(float)
# df = df[df['hs4'].isin(HS_LIST)].copy()
# a = df[df['state.trase_id']=='BR-XX']['vol'].sum()
# b = df[df['state.trase_id']!='BR-XX']['vol'].sum()
df["fob"].astype(float).sum()  # total FOB value (displayed in the notebook)
df["state.trase_id"].drop_duplicates().to_list()
# NOTE(review): assumes every "vol" string is integral -- astype(int) raises
# on decimal values; confirm against the MDIC extracts.
df_port["vol"] = df_port["vol"].astype(int)
df_mun["vol"] = df_mun["vol"].astype(int)
# Compare total "vol" between the port- and municipality-level MDIC tables,
# grouped by destination country, reporting the relative error per group.
group_columns = ["country_of_destination.name"]
sps.compare_dataframes_single(df_port, df_mun, "vol", group_columns, "relative_error")
# Quick QA on the 2018 results.
# NOTE(review): the original comment said "2020 results", but the files read
# below are the 2018 ones.
import pandas as pd

result_2018_1 = pd.read_csv("2018/results/result_1.csv", sep=";", dtype="str")
result_2018_2 = pd.read_csv("2018/results/results.csv", sep=";", dtype="str")
result_2018 = pd.concat([result_2018_1, result_2018_2], ignore_index=True)
# "vol" is read as str and must be cast before any arithmetic.
sum(result_2018[result_2018["hs4"] == "0202"]["vol"].astype(float))
result_2018.columns
a = "20200112"
a[4:6]  # month slice of a yyyymmdd date string
BEEF_HS4 = [
    "0102",  # Bovine animals; live
    "0201",  # Meat of bovine animals; fresh or chilled
    "0202",  # Meat of bovine animals; frozen
    "0206",  # Edible offal of bovine + other animals; fresh, chilled or frozen
    "0210",  # Meat and edible meat offal; salted/brine/etc. (does not exist in BoL)
    "0504",  # Guts, bladders and stomachs of animals (does not exist in BoL)
    "1602",  # Prepared or preserved meat, meat offal or blood
]
# Share of rows (by count) per "success" value, beef chapters only.
result_2018[result_2018["hs4"].isin(BEEF_HS4)]["success"].value_counts(
    normalize=True
) * 100


def _beef_success_volume_pct(frame):
    """Percentage of beef volume per "success" value for one result frame."""
    beef = frame[frame["hs4"].isin(BEEF_HS4)].copy()
    # BUG FIX: the original called builtin sum() on the raw str column, which
    # raises TypeError (0 + str), and the grouped .sum() on str concatenates
    # strings instead of adding volumes. Cast to float first.
    beef["vol"] = beef["vol"].astype(float)
    return 100 * beef.groupby(["success"])["vol"].sum() / beef["vol"].sum()


# Volume-weighted success share: combined, then each result file separately.
_beef_success_volume_pct(result_2018)
_beef_success_volume_pct(result_2018_1)
_beef_success_volume_pct(result_2018_2)
pd.set_option("display.float_format", lambda x: "%.2f" % x)
# NOTE(review): this cell referenced `result_2020`, which is never defined in
# this notebook; the frames loaded above are for 2018, so use those. Both
# "vol" and "success" were read with dtype="str", hence the cast and the
# string comparisons below (comparing a str column to the boolean True/False
# would never match).
foo = (
    result_2018.assign(vol=result_2018["vol"].astype(float))
    .groupby(["success"])
    .agg({"vol": "sum"})
    .reset_index()
)
foo["vol_pct"] = 100 * foo["vol"] / foo["vol"].sum()
foo
success = result_2018["country_of_destination.name"][
    result_2018["success"] == "True"
].unique()
failure = result_2018["country_of_destination.name"][
    result_2018["success"] == "False"
].unique()
# Destination countries that only ever appear with successful solves.
main_list = list(set(success) - set(failure))
unique_successful_countries_of_destination = pd.Series(main_list)
display(unique_successful_countries_of_destination.nunique())
display(unique_successful_countries_of_destination.unique())
# Drill into a single (destination, HS4) bucket -- HS4 "0102" shipped to
# EGYPT -- and compare the port-level and municipality-level MDIC tables.
df_port = supplychain.get("mdic_port")
df_municipality = supplychain.get("mdic_municipality")

# Months present in each source.
display(sorted(df_municipality["month"].unique()))
display(sorted(df_port["month"].unique()))

mun_mask = (df_municipality["country_of_destination.name"] == "EGYPT") & (
    df_municipality["hs4"] == "0102"
)
df_municipality = df_municipality.loc[mun_mask]

port_mask = (df_port["country_of_destination.name"] == "EGYPT") & (
    df_port["hs4"] == "0102"
)
df_port = df_port.loc[port_mask]


# Columns the two tables share.
common = set(df_municipality.columns) & set(df_port.columns)
common

from trase.tools.sps import consolidate

# Port-level FOB and volume per (destination, month) for the selected bucket.
display(consolidate(df_port, ["fob", "vol"], ["country_of_destination.name", "month"]))
df_municipality
import itertools
from dataclasses import dataclass
from functools import partial
from itertools import chain
from textwrap import indent
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import scipy
import scipy.optimize
from more_itertools import one
from scipy.sparse import dok_matrix
from termcolor import colored
from trase.models.BrazilCustoms2019 import constants
from trase.tools.sps import (
    consolidate,
    SupplyChain,
    assert_frame_like,
    concat,
    full_merge,
    group_by_and_zip,
)


# Scratch cell for stepping through the solver inputs by hand.
df_bol = supplychain.get("bill_of_lading")
df_port = supplychain.get("mdic_port")
df_municipality = supplychain.get("mdic_municipality")
# One row per unique combination of the bucket key columns.
buckets = df_port[constants.BUCKETS].drop_duplicates()
# NOTE(review): `filter_to`, `country`, `hs4`, `df_port_` and
# `construct_variables` are not defined anywhere in this notebook -- these
# lines only run if they were created interactively in an earlier session.
a = filter_to(df_municipality, country, hs4)
b = df_port_
a
b
construct_variables(
    b[[*constants.COMMON, "via", "port.name", "state.trase_id", "hs6", "hs8"]],
    a[[*constants.COMMON, "exporter.municipality.trase_id"]],
    constants.COMMON,
)

QA Results

Check that they are consistent with MDIC (Port) and MDIC (Municipality)

import pandas as pd

from trase.tools.sps import (
    consolidate,
    full_merge,
    concat,
    compare_dataframes,
    Compare,
    grouped_proportion,
)
from trase.tools.sps import print_report_by_attribute
# Load the solver results written by the run above (everything as str;
# empty strings are kept rather than becoming NaN).
df = pd.read_csv(
    # "All/2019/POST/results_2021-08-16.csv",
    supplychain.results_path,
    sep=";",
    dtype=str,
    keep_default_na=False,
)
df = df.astype({"vol": float})
# "success" is serialized as text; restore it to real booleans.
df = df.replace({"success": {"True": True, "False": False}})

df_port = supplychain.get("mdic_port")
df_municipality = supplychain.get("mdic_municipality")

Check total volume and per-bucket

# Solution volume as a percentage of the MDIC (Port) total.
coverage_pct = 100 * df["vol"].sum() / df_port["vol"].sum()
print(f"{coverage_pct:,.1f}")

# Per (destination country, HS4) volume difference; the lambda reports
# (solution - mdic) for the "vol" column.
group_keys = ["country_of_destination.name", "hs4"]
report = compare_dataframes(
    df,
    df_port,
    {"vol": [lambda a, b: b - a]},
    group_keys,
    suffixes=("_mdic", "_solution"),
)
# Rows where the two volumes disagree by more than 1.
report[report[("vol", "<lambda>")].abs() > 1]

Check consistency with MDIC (Port)

# TODO handle unknowns!
#
# columns = set(df_port_.columns) - {"vol", "fob"}
# report = compare_dataframes(
#     df_port_,
#     consolidate(df, ["vol"], columns),
#     {"vol": [lambda a, b: b - a]},
#     columns,
#     suffixes=("_mdic", "_solution"),
# )
# report
# report[report[("vol", "<lambda>")].abs() > 1]

Explore Results

Report a summary of how the solver did:

import pandas as pd


# HS6 -> commodity lookup table.
df_commodities = pd.read_csv("All/2019/IN/commodity.csv", sep=";", dtype=str)

# Attach a commodity to every result row, then total volume per
# (commodity, success) pair and pivot success values into columns.
report = pd.merge(df, df_commodities, on=["hs6"], validate="many_to_one")
report = consolidate(report, ["vol"], ["commodity", "success"])
report = report.pivot(index="commodity", columns="success", values="vol")
# Guarantee both outcome columns exist even when every row solved (or every
# row failed): .fillna(0) fills missing cells but cannot create a missing
# column, and report[True] / report[False] would raise KeyError.
report = report.reindex(columns=[True, False], fill_value=0).fillna(0)
report["success_percentage"] = 100 * report[True] / (report[True] + report[False])
report = report.sort_values("success_percentage", ascending=False)
report = report[["success_percentage"]]
# Restrict to the commodities of interest.
report = report[
    report.index.isin(
        ["CORN", "COTTON", "SOY", "COCOA", "PORK", "BEEF", "CHICKEN", "COFFEE"]
    )
]
with pd.option_context("display.float_format", lambda x: f"{x:,.1f}%"):
    display(report)

Failed solves

# Bucket the free-text failure messages into a few canonical labels, then
# report the failed volume per label.
x = df[~df["success"]].copy()
relabels = [
    ("Invalid dimensions", "Invalid dimension"),
    ("Unable to handle", "Too many unknowns"),
    ("Try another solver", "Solver failed"),
]
for pattern, label in relabels:
    x.loc[x["message"].str.contains(pattern), "message"] = label
print_report_by_attribute(x, "vol", ["message"])

Length/Size

from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.models.BrazilCustoms2019.constants import BUCKETS

# Disaggregation columns used to compare row granularity across years.
columns = [
    "month",
    "hs4",
    "hs8",
    "via",
    "hs6",
    "country_of_destination.name",
    "state.trase_id",
    "port.name",
]

interesting = ["CORN", "COTTON", "SOY", "COCOA", "PORK", "BEEF", "CHICKEN", "COFFEE"]

# MDIC (Port) for 2019
# Successfully solved rows with "via" == "01" (presumably maritime transport
# -- confirm against the MDIC codebook), consolidated and restricted to the
# commodities of interest.
df_port19 = df[df["success"] & (df["via"] == "01")]
df_port19 = consolidate(df_port19, ["vol"], columns)
df_port19 = pd.merge(df_port19, df_commodities, on="hs6", validate="many_to_one")
df_port19 = df_port19[df_port19["commodity"].isin(interesting)]

# MDIC (Port) for 2017 - filtered to what we have for 2019
df_port17 = get_pandas_df_once(
    "brazil/trade/mdic/port/brazil_mdic_port_2017.csv",
    sep=";",
    dtype=str,
    keep_default_na=False,
)
df_port17 = df_port17[df_port17["via"] == "01"]
df_port17 = df_port17.astype({"vol": int})
df_port17 = consolidate(df_port17, ["vol"], columns)
# Keep only (destination, HS4) pairs that also occur in the solved 2019
# data, so both years cover the same slice of trade.
df_port17 = pd.merge(
    df_port17,
    df_port19[["country_of_destination.name", "hs4"]].drop_duplicates(),
    on=["country_of_destination.name", "hs4"],
    validate="many_to_one",
)
df_port17 = pd.merge(df_port17, df_commodities, on="hs6", validate="many_to_one")
df_port17 = df_port17[df_port17["commodity"].isin(interesting)]

# Row counts per commodity: how much finer-grained is the solved 2019 table
# than the official 2017 one? GroupBy.size() counts rows per group directly
# and replaces the slower, deprecated-pattern .apply(len).
report17 = df_port17.groupby("commodity").size().rename("count").reset_index()
report19 = df_port19.groupby("commodity").size().rename("count").reset_index()
# The lambda reports the 2019/2017 row-count ratio per commodity.
report = compare_dataframes(
    report17,
    report19,
    {"count": [lambda a, b: b / a]},
    ["commodity"],
    suffixes=("_2017", "_2019"),
)
report.sort_values(("count", "total_2017"), ascending=False)
# Distribution of per-row exported beef volume: official 2017 vs solved 2019.
vols_2017 = df_port17[df_port17["commodity"] == "BEEF"]["vol"]
vols_2019 = df_port19[df_port19["commodity"] == "BEEF"]["vol"]

import numpy as np
import matplotlib.pyplot as plt

# 20 evenly spaced bin edges from 0 to 100,000 kg.
bin_edges = np.linspace(0, 100000, 20)

plt.figure(figsize=(8, 8), dpi=100)
plt.hist(
    [vols_2017, vols_2019],
    bin_edges,
    label=["2017 (official)", "2019 (solved)"],
    color=("#00d19a", "#ff6a5f"),
)
plt.rcParams.update({"font.size": 14})
plt.xlabel("Exported beef (kg)")
plt.ylabel("Count of rows")
plt.legend(loc="upper right")
plt.show()
# NOTE(review): `df_2019` and `cutoff` are not defined anywhere in this
# notebook -- this cell only runs if they were created in an earlier session.
df_2019[df_2019["vol"] > cutoff]