TradeDataComparisons
View or edit on GitHub
This page is synchronized from trase/models/bolivia/soy/TradeDataComparisons.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
from trase.tools.aws import *
from trase.tools.jupyter.observable import notebook, sankey
# Create dataset
df_2019 = get_pandas_df(
"bolivia/soy/trade/cd/export/out/bolivia_soy_export_2019.csv"
) # ['amendment_date', 'packages'] columns not present in 2018 dataset
df_2018 = get_pandas_df("bolivia/soy/trade/cd/export/out/bolivia_soy_export_2018.csv")
print(df_2018.columns)
sankey(df_2018, "total_fob_usd_first", ["port", "exporter", "country"], limit=50)
"""
IMPEXP
QUESTIONS:
- "Unnamed: 35", # TODO: what are the codes in this column?
- Recognize some of the importer names as shrimp importers in Ecuador: EUROFISH S.A., NIRSA
- Routing_Country = re exports?
- Unknown traders form a much larger proportion for hscode 230400
LATER:
- Fix HS codes for brazil_nuts_hscodes = ["080122", "080121", "080122"] # 801220, and deal with bag units
"""
import pandas as pd
import numpy as np
from trase.tools.aws import *
from trase.tools.jupyter.observable import notebook
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.mode.chained_assignment = None
path = "bolivia/trade/impex/ORIGINALS/4hs_code_exp_Bolivia_2018.csv"
df = get_pandas_df(path, sep=";")
soy_hscodes = ["230400", "150790", "150710"]
df_soy = df[df["HS_Code"].isin(soy_hscodes)]
total_sum = df_soy["Net_Weight"].sum() / 1000
def clean_country(country):
country = country.split("(")[0]
if country.upper().strip() == "MYANMAR - BURMA":
return "MYANMAR"
return country.upper().strip()
def clean_importer_name(trader_name):
trader_name = trader_name.strip()
trader_name = trader_name.replace(" S.A.", " SA")
trader_name = trader_name.replace(" S.A.C.", " SAC")
trader_name = trader_name.replace(" S A", " SA")
return trader_name
def clean_trader_name(trader_name):
trader_name = trader_name.strip()
trader_name = trader_name.replace(" S.A.", " SA")
trader_name = trader_name.replace(" S A", " SA")
trader_name = trader_name.replace(" INC.", " INC")
trader_name = trader_name.replace(", INCORPORATED", " INC")
trader_name = trader_name.replace(", S. DE R.L.", "")
trader_name = trader_name.replace(" S. DE R.L", "")
trader_name = trader_name.replace(" S.DE R.L.", "")
trader_name = trader_name.replace(" , S.A.1", " SA")
trader_name = trader_name.replace(" SA1", " SA")
trader_name = trader_name.replace(", SA1", " SA")
trader_name = trader_name.replace(", SA", " SA")
trader_name = trader_name.replace(" ", " ")
trader_name = trader_name.replace(",", " ")
return trader_name
def create_exporter_group(trader_name):
if trader_name in [
"ADM AMERICAS",
"ADM AMERICAS ",
"ADM AMERICAS S DE R L",
"ADM SAO SA",
]:
return "ADM"
if trader_name in [
"CARGILL AMERICAS INC",
"CARGILL AMERICAS INC",
"CARGILL BOLIVIA SA",
]:
return "CARGILL"
if trader_name in [
"INDUSTRIAS ACEITE SA",
"INDUSTRIAS DE ACEITE SA",
"INDUSTRIAS DE ACEITE SA POR CUENTA Y ORDEN DE COLCUN SA",
"INDUSTRIAS DEL ACEITE SA",
"INDUSTRIAS DEL ACEITE SA POR CUENTA Y ORIGEN DE COLCUN SA",
]:
return "INDUSTRIAS ACEITE SA"
if trader_name in ["LDC TRADING&SERVICE CO SA", "LDC TRADING&SERVICES CO SA"]:
return "LDC TRADING % SERVICES"
return trader_name
# Country
df_soy["COUNTRY_OF_DESTINATION"] = df_soy.Country_of_Importers.apply(clean_country)
assert total_sum == df_soy["Net_Weight"].sum() / 1000
# Volume: Traded volume (Quantity, Gross_Weight, Net_Weight, Metric_Tons)
# df_soy[df_soy.Gross_Weight != df_soy.Net_Weight]...
df_soy["VOLUME_TONNES"] = df_soy["Net_Weight"] / 1000
assert total_sum == df_soy["Net_Weight"].sum() / 1000
assert total_sum == df_soy["Net_Weight"].sum() / 1000
# Traders
df_soy["Exporter_LL"] = df_soy["Exporter_LL"].fillna("UNKNOWN")
df_soy["EXPORTER"] = df_soy["Exporter_LL"].apply(clean_trader_name) # 38 unique
df_soy["EXPORTER_GROUP"] = df_soy["EXPORTER"].apply(create_exporter_group)
df_soy["Importer_LL"] = df_soy["Importer_LL"].fillna("UNKNOWN")
df_soy["IMPORTER"] = df_soy["Importer_LL"].apply(clean_importer_name) # 163 unique
assert total_sum == df_soy["Net_Weight"].sum() / 1000
# Ports
df_soy["Local_Port"] = df_soy["Local_Port"].fillna("UNKNOWN PORT")
df_soy["PORT_OF_EXPORT"] = df_soy.Local_Port
df_soy["PORT_OF_EXPORT"] = np.where(
df_soy["PORT_OF_EXPORT"].isin(
[
"AMERICA OTROS PUERTOS",
"AMERICA OTROS PUERTOS",
"OTROS AMERICA NO ESPECIFI",
"OTROS PTOS.AMERICA",
"OTROS ARGENTINA",
]
),
"AMERICA OTROS PUERTOS",
df_soy["PORT_OF_EXPORT"],
)
df_soy["PORT_OF_EXPORT"] = np.where(
df_soy["PORT_OF_EXPORT"].isin(["ILO (CODE CHANGE IATA)"]),
"ILO",
df_soy["PORT_OF_EXPORT"],
)
assert total_sum == df_soy["Net_Weight"].sum() / 1000
df_soy["HS6"] = df_soy["HS_Code"].astype(str)
assert total_sum == df_soy["Net_Weight"].sum() / 1000
df_soy = df_soy[
[
"Month",
"Date",
"PORT_OF_EXPORT",
"EXPORTER",
"EXPORTER_GROUP",
"IMPORTER",
"COUNTRY_OF_DESTINATION",
"Transport",
"VOLUME_TONNES",
"HS_Code",
"HS6",
"Routing_Country",
]
]
# Table up differences between trade datasets
FLOUR = ["1208100000"]
SEED = ["1201100000"]
OIL = ["1507100000", "1507909000"]
CAKE = ["2304000000"]
comex_df = get_pandas_df(
"bolivia/soy/trade/cd/export/out/comex_bolivia_soy_2018.csv", dtype=str
)
comex_df["Valor_FOB_Sus"] = comex_df["Valor_FOB_Sus"].astype(float)
comex_df["Peso_Bruto_Kg"] = comex_df["Peso_Bruto_Kg"].astype(float)
comex_df["Peso_Neto_Kg"] = comex_df["Peso_Neto_Kg"].astype(float)
comex_summary_df = (
comex_df[comex_df.NANDINA.isin(OIL + CAKE)]
.groupby(["des_Via_Sal", "COUNTRY_OF_DESTINATION"])[["Peso_Neto_Kg"]]
.sum()
.reset_index()
)
comex_summary_df = (
comex_df[comex_df.NANDINA.isin(OIL + CAKE)]
.groupby(["des_cuci3", "COUNTRY_OF_DESTINATION", "HS6"])[
["Valor_FOB_Sus", "Peso_Bruto_Kg", "Peso_Neto_Kg"]
]
.sum()
.reset_index()
)
ALL = ["150790", "150710", "230400"]
diff = pd.merge(
comex_summary_df,
df_soy[df_soy.HS6.isin(ALL)]
.groupby(["COUNTRY_OF_DESTINATION", "HS6"])["VOLUME_TONNES"]
.sum()
.reset_index(),
how="outer",
left_on=["COUNTRY_OF_DESTINATION", "HS6"],
right_on=["COUNTRY_OF_DESTINATION", "HS6"],
)
diff["IMPEX_COMEX_DIFF"] = (diff.VOLUME_TONNES / (diff.Peso_Neto_Kg / 1000)) * 100
# SICEX Dataset
sicex = get_pandas_df("bolivia/trade/cd/export/OLD/2018/sicex25/CD_BOLIVIA_2018.csv")
sicex["HS6"] = sicex["Product.Schedule.B.Code"].astype(str).str[0:6]
sicex["COUNTRY_OF_DESTINATION"] = sicex["Country.of.Destiny"].apply(clean_country)
sicex_df = (
sicex[sicex["HS6"].isin(ALL)]
.groupby(["Country.of.Destiny", "HS6"])["TOTAL.Net.Weight..Kg."]
.sum()
.reset_index()
)
diff = pd.merge(
diff,
sicex_df,
how="outer",
left_on=["COUNTRY_OF_DESTINATION", "HS6"],
right_on=["Country.of.Destiny", "HS6"],
)
diff = diff.rename(
columns={
"VOLUME_TONNES": "IMPEX_TN",
"Peso_Neto_Kg": "COMEX_KG",
"TOTAL.Net.Weight..Kg.": "SICEX_KG",
}
)
diff["IMPEX_SICEX_DIFF"] = (diff.IMPEX_TN / (diff.SICEX_KG) * 1000) * 100
diff["SICEX_COMEX_DIFF"] = (diff.SICEX_KG / diff.COMEX_KG) * 100
diff["COMEX_TN"] = diff["COMEX_KG"] / 1_000
diff["SICEX_TN"] = diff["SICEX_KG"] / 1_000
# Latest dataset
latest_df = df_2018
latest_df.hs6 = latest_df.hs6.astype(str)
latest_df = (
latest_df[latest_df["hs6"].isin(ALL)]
.groupby(["country", "hs6"])["weight_kg"]
.sum()
.reset_index()
)
diff = pd.merge(
diff,
latest_df,
how="left",
left_on=["COUNTRY_OF_DESTINATION", "HS6"],
right_on=["country", "hs6"],
)
diff = diff.rename(columns={"weight_kg": "LATEST_KG"})
diff["LATEST_TN"] = diff["LATEST_KG"] / 1_000
diff["LATEST_COMEX_DIFF"] = (diff.LATEST_KG / diff.COMEX_KG) * 100
diff = diff[
[
"COUNTRY_OF_DESTINATION",
"HS6",
"COMEX_TN",
"IMPEX_TN",
"LATEST_TN",
"IMPEX_COMEX_DIFF",
"LATEST_COMEX_DIFF",
]
].sort_values(["HS6", "COUNTRY_OF_DESTINATION"])
diff
# Difference with COMEX
ALL = ["150790", "150710", "230400"]
print(df_soy["COUNTRY_OF_DESTINATION"].unique())
diff = pd.merge(
comex_summary_df,
df_soy[df_soy.HS6.isin(ALL)]
.groupby(["COUNTRY_OF_DESTINATION", "HS6"])["VOLUME_TONNES"]
.sum()
.reset_index(),
how="outer",
left_on=["COUNTRY_OF_DESTINATION", "HS6"],
right_on=["COUNTRY_OF_DESTINATION", "HS6"],
)
diff["IMPEX_COMEX_DIFF"] = (diff.VOLUME_TONNES / (diff.Peso_Neto_Kg / 1000)) * 100
# SICEX Dataset
sicex = get_pandas_df("bolivia/trade/cd/export/OLD/2018/sicex25/CD_BOLIVIA_2018.csv")
sicex["HS6"] = sicex["Product.Schedule.B.Code"].astype(str).str[0:6]
sicex["COUNTRY_OF_DESTINATION"] = sicex["Country.of.Destiny"].apply(clean_country)
sicex_df = (
sicex[sicex["HS6"].isin(ALL)]
.groupby(["Country.of.Destiny", "HS6"])["TOTAL.Net.Weight..Kg."]
.sum()
.reset_index()
)
diff = pd.merge(
diff,
sicex_df,
how="outer",
left_on=["COUNTRY_OF_DESTINATION", "HS6"],
right_on=["Country.of.Destiny", "HS6"],
)
diff = diff.rename(
columns={
"VOLUME_TONNES": "IMPEX_TN",
"Peso_Neto_Kg": "COMEX_KG",
"TOTAL.Net.Weight..Kg.": "SICEX_KG",
}
)
diff["IMPEX_SICEX_DIFF"] = (diff.IMPEX_TN / (diff.SICEX_KG) * 1000) * 100
diff["SICEX_COMEX_DIFF"] = (diff.SICEX_KG / diff.COMEX_KG) * 100
diff["COMEX_TN"] = diff["COMEX_KG"] / 1000
diff["SICEX_TN"] = diff["SICEX_KG"] / 1000
diff = diff[
[
"COUNTRY_OF_DESTINATION",
"HS6",
"COMEX_TN",
"SICEX_TN",
"IMPEX_TN",
"SICEX_COMEX_DIFF",
"IMPEX_COMEX_DIFF",
"IMPEX_SICEX_DIFF",
]
].sort_values(["HS6", "COUNTRY_OF_DESTINATION"])
diff
# HYBRID dataset
# Get proportions of per country sourcing from Cochabamba + Santa Cruz or Tarija + Santa Cruz
# TODO: ask javi about imports in SICEX for panama and peru?
cdf = comex_df[
[
"MES",
"HS6",
"des",
"des_Via_Sal",
"COUNTRY_OF_DESTINATION",
"Peso_Neto_Kg",
"Valor_FOB_Sus",
]
]
months = {
"Jan": 1,
"Feb": 2,
"Mar": 3,
"Apr": 4,
"May": 5,
"Jun": 6,
"Jul": 7,
"Aug": 8,
"Sep": 9,
"Oct": 10,
"Nov": 11,
"Dec": 12,
}
df_2018["MES"] = df_2018["month"].map(months)
sdf = (
df_2018.groupby(["MES", "hs6", "exporter", "importer", "country"])["weight_kg"]
.sum()
.reset_index()
)
monthly_hscode_country_department_sourcing = (
cdf.groupby(["HS6", "MES", "COUNTRY_OF_DESTINATION", "des", "des_Via_Sal"])[
"Peso_Neto_Kg"
]
.sum()
.reset_index()
)
monthly_hscode_country_department_sourcing["MES"] = (
monthly_hscode_country_department_sourcing.MES.astype(float).astype(int)
)
def get_sourcing_department(row):
matches = monthly_hscode_country_department_sourcing[
(monthly_hscode_country_department_sourcing.HS6 == row.hs6)
& (monthly_hscode_country_department_sourcing.MES == row.MES)
& (
monthly_hscode_country_department_sourcing.COUNTRY_OF_DESTINATION
== row.country
)
]
if len(matches) == 1:
row["department"] = matches.des.item()
row["port_of_export"] = matches.des_Via_Sal.item()
return [row.to_dict()]
elif len(matches) > 1:
sums = matches.Peso_Neto_Kg.sum()
ratios = {
(r.des, r.des_Via_Sal): (r.Peso_Neto_Kg / sums)
for _, r in matches.iterrows()
}
rows = []
for (dep, port), ratio in ratios.items():
new_row = row.copy()
new_row["department"] = dep
new_row["port_of_export"] = port
new_row["weight_kg"] = new_row.weight_kg * ratio
rows.append(new_row.to_dict())
return rows
elif len(matches) == 0:
row["department"] = "UNKNOWN"
row["port_of_export"] = "UNKNOWN"
return [row.to_dict()]
# Match
data = []
for index, row in df_2018.iterrows():
returned_rows = get_sourcing_department(row)
for rrow in returned_rows:
data.append(rrow)
hybrid_df = pd.DataFrame(data)
assert (hybrid_df["weight_kg"].sum()) / 1_000 == (df_2018["weight_kg"].sum() / 1_000)