View or edit on GitHub
This page is synchronized from trase/data/brazil/coffee/trade/y2020/tmp_brazil_coffee_traders_processing.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
Temporary file - Brazil coffee exporters 2020
Notebook for exploration of the Brazil coffee exporters (traders) 2020 identification.
-
New exporters get ingested in a separate script
- First identify the new (labels not found)
- Save them in raw text
- Order to see if some are representing the same trader (and if yes later integrate as a label)
- Ingest them (see if we can add
trase-idat the same time). Currently the process is like:missing = ["ZIGOTI COFFEE WORKS LTD", ..., "ZZZ"] new_ids_from_missing = [] for label in tqdm(missing): new_ids_from_missing.append(insert_trader_tree_if_missing(label, autocommit=False)) # Take note of the ids generated.., and then # Add new labels (alternative names) to recently added traders LABELS_TO_NEW_TRADER_IDS = { "ALDWAMI COMPANY LIMITED": 15597083, "ALDWAMI COMPANY LTD": 15597083, .. }
-
This logic also goes to the main pre-processing script to validate all exporters are in the Database
- Exporters with
trase-idpointing to a different name and tax number will go to a sample file for further exploration
Currently clarifying: * Should the gold file contain both the trader's node_id, the trader's group name and the trader's group node_id? * Or maybe only the trader 'official' name, the original name, and the trader id? * At least in the tanzania flows, it only shows the exporter name. So guess it's ok to have the name and id without group, and report this.
import duckdb
import pandas as pd
from trase.tools import CNX, get_country_id
from trase.tools.pandasdb.find import find_traders_and_groups_by_label
from trase.tools.pandasdb.find import find_traders_and_groups_by_trase_id
from psycopg2 import sql
# Configure duckdb and connect to data
conn = duckdb.connect()
conn.execute("""
SET s3_region='eu-west-1';
INSTALL httpfs; LOAD httpfs;
CALL load_aws_credentials('default', set_region= true);
""")
bol_clean = conn.sql(f"""
SELECT
hs6,
vol::int64 AS vol,
\"exporter.label\" AS exporter_name,
\"exporter.cnpj\" AS exporter_taxn,
\"country_of_destination.trase_id\" AS country_iso2
FROM 's3://trase-storage/brazil/coffee/trade/2020/bronze/bronze_brazil_bol_coffee_2020.parquet'
""").df()
# Find the trader information - first try with the ones with tax number
country_id = get_country_id("BRAZIL")
bol_data = bol_clean.copy()
# Save the records with '0' in 'exporter_taxn' in a separate dataframe and remove them
bol_data_null_taxn = bol_data[bol_data['exporter_taxn'] == '0']
bol_data = bol_data[bol_data['exporter_taxn'] != '0']
# Concatenate 'BR-' to the first 8 characters of the exporter tax number
bol_data['trial_exporter_trase_id'] = 'BR-TRADER-' + bol_data['exporter_taxn'].str[:8]
# Find exporter names against the database
bol_data[
["exporter_group_name", "exporter_group_id", "exporter_trader_id", "count"]
] = find_traders_and_groups_by_trase_id(
bol_data.rename(columns={"trial_exporter_trase_id": "trase_id"}, errors="raise"),
returning=["group_name", "group_id", "trader_id", "count"],
year=sql.Literal(2020),
on_extra_columns="ignore",
)
# Save the records with 0 in 'count' in a separate dataframe and remove them
bol_data_null_trase_id = bol_data[bol_data['count'] == 0]
bol_data = bol_data[bol_data['count'] != 0]
bol_data_null_trase_id.drop(columns='count', inplace=True)
count = bol_data.pop("count")
assert(all(count == 1))
#bol_data.rename(columns={"trase_id": "exporter_trase_id"}, inplace=True)
# rename column 'trial_exporter_trase_id' to 'exporter_trase_id'
bol_data.rename(columns={"trial_exporter_trase_id": "exporter_trase_id"}, inplace=True)
print(f"{bol_data.shape[0]:,} of {bol_clean.shape[0]:,} records have traders identified with trase-id. Here a sample:")
bol_data[['exporter_name', 'exporter_taxn', 'exporter_trase_id', 'exporter_trader_id', 'exporter_group_name']].head(10)
# Now try to find the remaining traders by their names
# Append the records from bol_data_null_taxn to bol_data_null_trase
bol_data_null_trader = pd.concat([bol_data_null_trase_id, bol_data_null_taxn], ignore_index=True)
bol_data_null_trader["exporter_name_original"] = bol_data_null_trader["exporter_name"]
# Find exporter names against the database
bol_data_null_trader[
["exporter_name", "exporter_trader_id", "exporter_group_name", "exporter_group_id", "trase_id", "count"]
] = find_traders_and_groups_by_label(
bol_data_null_trader.rename(columns={"exporter_name": "trader_label"}, errors="raise"),
returning=["trader_name", "trader_id", "group_name", "group_id", "trase_id", "count"],
country_id=sql.Literal(country_id),
year=sql.Literal(2020),
on_extra_columns="ignore",
)
traders_found_by_name_cnt = bol_data_null_trader[bol_data_null_trader['count'] == 1].shape[0]
traders_not_found_cnt = bol_data_null_trader[bol_data_null_trader['count'] == 0].shape[0]
print(f"{traders_found_by_name_cnt} records have traders identified by label, and {traders_not_found_cnt} have not been found. Here a sample of the ones identified:")
traders_found_by_name = bol_data_null_trader[bol_data_null_trader['count'] == 1]
traders_found_by_name[['exporter_name', 'exporter_name_original', 'exporter_taxn', 'trase_id', 'trial_exporter_trase_id', 'exporter_trader_id', 'exporter_group_name', 'exporter_group_id']].head(10)
existing_traders_wout_trase_id = traders_found_by_name[traders_found_by_name['trase_id'].isnull()]
existing_traders_with_diff_trase_id = traders_found_by_name[traders_found_by_name['trase_id'].notnull()]
print(f"Exporters found in the database but without trase_id assigned: {existing_traders_wout_trase_id.shape[0]}")
print(f"Exporters found in the database but with different trase_id assigned: {existing_traders_with_diff_trase_id.shape[0]}")
# Save the results to a csv file
existing_traders_wout_trase_id.to_csv('s3://trase-temp/diet_trase/existing_traders_wout_trase_id.csv', index=False)
existing_traders_with_diff_trase_id.to_csv('s3://trase-temp/diet_trase/existing_traders_with_diff_trase_id.csv', index=False)
print(f"Here the ones not identified:")
traders_not_found_by_name = bol_data_null_trader[bol_data_null_trader['count'] == 0]
traders_not_found_by_name[['exporter_name', 'exporter_name_original', 'exporter_taxn', 'trase_id', 'trial_exporter_trase_id', 'exporter_trader_id', 'exporter_group_name']]
traders_not_found_by_name[['exporter_name_original', 'trial_exporter_trase_id']].drop_duplicates().sort_values('exporter_name_original')
# Count the records with 0 in 'count'
bol_data_null_trader[bol_data_null_trader['count'] == 0].shape[0]