Full coops pipeline
View or edit on GitHub
This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/full_coops_pipeline.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
# =============================================================================
# PHASE 0: SETUP AND FILE PATHS
# =============================================================================
import pandas as pd
import numpy as np
import geopandas as gpd
import re
import unicodedata
from rapidfuzz import fuzz, process
from shapely.geometry import Point
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Define all paths here to make switching environments easier
PATHS = {
'messy_2024_combined': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/messy_2024_combined.csv',
'cam_coopyear_2023': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/CAM_coopyear.csv',
'civ_departements': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/ci_departments.geojson',
'cam_seipcs_2023': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/CAM_seipcs.csv',
# Export Paths
'out_cam_long': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_long_march30.csv',
'out_cam_coopyear': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_coopyear_march30.csv',
'out_coops_traseearth': '/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_long_march30_traseearth.geojson'
}
# =============================================================================
# PHASE 1: COLUMN CLEANING & NORMALIZATION (2024 Data Only)
# =============================================================================
print("--- Phase 1: Column Cleaning ---")
df = pd.read_csv(PATHS['messy_2024_combined'], low_memory=False)
df['YEAR'] = 2024
# 1. Standardize Country Name
country_cols = ['COUNTRY', 'DISCL_COUNTRY', 'DISCL_COUNTRY_NAME', 'COUNTRY_NAME']
df[country_cols] = df[country_cols].replace(r'^\s*$', pd.NA, regex=True)
df['COUNTRY'] = df[country_cols].bfill(axis=1).iloc[:, 0]
def normalize_country(val):
if pd.isna(val): return val
s = str(val).strip()
s = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore').decode('ASCII')
return s.upper()
df['COUNTRY'] = df['COUNTRY'].apply(normalize_country)
df.drop(columns=['DISCL_COUNTRY', 'DISCL_COUNTRY_NAME', 'COUNTRY_NAME'], inplace=True, errors='ignore')
# Filter for Ivory Coast
df = df[df['COUNTRY'].isin(["IVORY COAST", "COTE D'IVOIRE", "CIV", "COTE DIVOIRE"])].copy()
df['COUNTRY'] = "IVORY_COAST"
# 2. Capitalize all string data and column names
df = df.map(lambda v: v.upper() if isinstance(v, str) else v)
df.columns = df.columns.str.upper()
# 3. Consolidate '.1' duplicated columns
dup_cols = ['DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT']
for col in dup_cols:
col_1 = col + '.1'
if col_1 in df.columns:
mask = df[col].isna() & df[col_1].notna()
df.loc[mask, col] = df.loc[mask, col_1]
df.drop(columns=[col_1], inplace=True)
# =============================================================================
# PHASE 2: HYBRID IMPUTATION (Clean Names & Fuzzy Match)
# =============================================================================
print("--- Phase 2: Hybrid Imputation ---")
def move_long_abrvname_to_fullname(row):
abrv = row.get('DISCL_SUPPLIER_ABRVNAME')
fullname = row.get('DISCL_SUPPLIER_FULLNAME')
if isinstance(abrv, str) and len(abrv) > 35 and (pd.isna(fullname) or fullname == ""):
row['DISCL_SUPPLIER_FULLNAME'] = abrv
row['DISCL_SUPPLIER_ABRVNAME'] = np.nan
return row
def clean_abrvname1(col_name):
if not isinstance(col_name, str): return col_name
patterns = {
r"UNITE COOPERATIVE AGRICOLE DE DANANE": "COOP UDAN", r"COOPERATIVE AGRICOLE BACON ESPOIR": "CABES",
r"COOPERATIVE AGRICOLE ABOTRE DE NIABLE": "COAANI", r"COOPERATIVE AGRICOLE BENKADI": "COOPABENKADI",
r"COOPERATIVE AGRICOLE D'AGNANFOUTOU": "COESAG", r"COOPERATIVE AGRICOLE ZEMESS TAABA DE GBABAM": "COOPAZEG",
r"COOPERATIVE DES PRODUCTEURS DE YAKASSE ATTOBROU": "COOPROYA", r"SOCIETE AGRICOLE BINKADI DE BROUDOUGOU PENDA CA": "SOCABB",
r"SCOOPAO SOCIETE COOPERATIVE DES AGRICULTEURS DE PETIT OUAGA": "SCOOPAO", r"SOCIETE COOPERATIVE SIMPLIFIEE AGRICOLE KAMBONOU DE ARRAH": "SCOAKA",
r"SOCIETE COOPERATIVE ESPERENCE DE KPELEKRO": "SOCAEK", r"SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRATION ENTENTE DE SEGUELA": "COOP CA ES",
r"COOPERATIVE DES PRODUCTEURS AGRICOLES DE NIABLE": "COOP CA PAN", r"COOPERATIVE YEYONIAN DU CANTON ANIASSUE": "COOPYCA",
r"SOCIETE COOPERATIVE AGRICOLE DE GNATO AVEC CONSEIL D'ADMINISTRATION": "SOCAG", r"SOCIETE COOPERATIVE AGRICOLE DE BAYOTA": "SOCABA COOP CA",
r"SPAD GAGNOA": "SPAD GAGNOA", r"SCAT I|SCAT 1": "SCAT 1", r"\(CA\)$": " CA ", r"\(COOP\)$": " COOP ", r"\(SCOOP\)$": " SCOOP "
}
for pattern, replacement in patterns.items():
if re.search(pattern, col_name): return replacement
return col_name
def clean_abrvname2(col_name):
if not isinstance(col_name, str): return col_name
c = col_name.strip()
c = re.sub(r"\.|[(]|[)]| WAREHOUSE$", "", c)
c = re.sub(r"\n|_|/|-", " ", c)
c = re.sub(r"Ô", "O", c)
return " ".join(c.split())
def clean_abrvname3(col_name):
if not isinstance(col_name, str): return col_name
pattern = r"\bCOOP CA\b|\bSAND BOX\b|\bCOOP\b|\bWAREHOUSE\b|\bAVEC CONSEIL D'ADMINISTRATION\b|\bAGRICOLE\b|\bUNION DES\b|\bDES PRODUCTEURS\b|\bSCOOPS?\b|- CA$"
cleaned = re.sub(pattern, "", col_name)
return " ".join(cleaned.split())
def clean_fullname(col_name):
if not isinstance(col_name, str): return col_name
# Strip all accents
col_name = unicodedata.normalize('NFKD', col_name).encode('ascii', 'ignore').decode('utf-8')
# 1. Remove standalone 'ERATIVE' completely
fn = re.sub(r"\bERATIVE\b", "", col_name, flags=re.IGNORECASE)
# 2. Standardize 'COOPERATIVE' (Removed ERATIVE from this list)
coop_pattern = r"(?:SOCIETE COOPERATIVE|STE COOP|COOPERATIVE|COPERATIVE|COOPRATIVE|SOCIETES? AGRICOLE COOPERATIVE|ENTREPRISE COOPERATIVE|ENTREPRISE AGRICOLE COOPERATIVE|ENTREP COOPERA)"
fn = re.sub(coop_pattern, " COOPERATIVE ", fn, flags=re.IGNORECASE)
# 3. Standardize 'CAFE ET CACAO'
cafe_pattern = r"(?:DU CAFE&CACAO|DU CAFE & CACAO|DU CAFE& CACAO|DU CAFE &CACAO|DE CAFE&CACAO|DE CAFE & CACAO|DE CAFE& CACAO|DE CAFE &CACAO|DU CAFE-CACAO|DU CAFE - CACAO|DU CAFE- CACAO|DU CAFE -CACAO|DE CAFE-CACAO|DE CAFE - CACAO|DE CAFE- CACAO|DE CAFE- CACAO|CAFE&CACAO|CAFE & CACAO|CAFE& CACAO|CAFE &CACAO|CAFE-CACAO|CAFE - CACAO|CAFE- CACAO|CAFE -CACAO|DU CAFE ET DU CACAO|DE CAFE ET DE CACAO|CAFE ET DU CACAO|CAFE ET DE CACAO|DU CAFE ET CACAO|DE CAFE ET CACAO|CAFE ET CACAO|DU CAFE CACAO|DE CAFE CACAO|CAFE CACAO|CAFECACAO|DU CAFE COCOA)"
fn = re.sub(cafe_pattern, " CAFE ET CACAO ", fn, flags=re.IGNORECASE)
return " ".join(fn.split())
def clean_fullname_manual(col_name):
if not isinstance(col_name, str): return col_name
if re.search(r"COOPERATIVE AGRICOLE SOLIDARITE DE BLOLEQUIN", col_name): return "ENTREPRISE COOPERATIVE AGRICOLE DE BLOLEQUIN"
if re.search(r"COOPERATIVE POUR L'AMELIORAT DU REVENU DU PLANTEUR DE CI", col_name): return "COOPERATIVE POUR L'AMELIORATION DU REVENU DU PLANTEUR"
if re.search(r"COOPERATIVE SABABOUGNOUMAN DAGADJI", col_name): return "COOPERATIVE SABABOUGNOUMAN DE DAGADJI"
if re.search(r"ROBERT", col_name) and re.search(r"PORTE", col_name): return "ENTREPRISE COOPERATIVE AGRICOLE DES PRODUCTEURS CAFE ET CACAO DE ROBERT-PORTE"
if re.search(r"MAN EDI ANOUANZE", col_name): return "SOCIETE COOPERATIVE AGRICOLE MAN EDI ANOUANZE"
return col_name
df = df.apply(move_long_abrvname_to_fullname, axis=1)
if 'DISCL_SUPPLIER_ABRVNAME' in df.columns:
df['SIMPLIF_ABRVNAME'] = df['DISCL_SUPPLIER_ABRVNAME'].apply(clean_abrvname1).apply(clean_abrvname2).apply(clean_abrvname3)
if 'DISCL_SUPPLIER_FULLNAME' in df.columns:
df['SUPPLIER_FULLNAME'] = df['DISCL_SUPPLIER_FULLNAME'].apply(clean_fullname).apply(clean_abrvname2).apply(clean_abrvname3).apply(clean_fullname_manual)
df['SUPPLIER_ABRVNAME'] = df.get('SIMPLIF_ABRVNAME', df.get('SUPPLIER_ABRVNAME'))
if 'SUPPLIER_FULLNAME' in df.columns:
df.loc[df['SUPPLIER_FULLNAME'].str.contains("COOPERATIVE ", na=False), 'DISCL_SUPPLIER_TYPE'] = "COOPERATIVE"
# Load Historical Master for Fuzzy Matching & Syncing
df_master = pd.read_csv(PATHS['cam_coopyear_2023'], sep=';', low_memory=False)
print("Running secondary historical fuzzy imputation...")
abrv_2023_clean = df_master['DISCL_SUPPLIER_ABRVNAME'].dropna().unique().tolist()
abrv_2024_clean = df['SUPPLIER_ABRVNAME'].dropna().unique().tolist()
fuzzy_match_map = {}
for val in abrv_2024_clean:
match, score, _ = process.extractOne(val, abrv_2023_clean, scorer=fuzz.token_sort_ratio)
if score >= 90 and match is not None: fuzzy_match_map[val] = match
lookup_2023 = df_master.dropna(subset=['DISCL_SUPPLIER_ABRVNAME']).drop_duplicates('DISCL_SUPPLIER_ABRVNAME').set_index('DISCL_SUPPLIER_ABRVNAME')
cols_to_fill = ['DISCL_LONGITUDE', 'DISCL_LATITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE']
def apply_fuzzy_fill(row):
abrv = row['SUPPLIER_ABRVNAME']
if pd.notna(abrv) and abrv in fuzzy_match_map:
match_val = fuzzy_match_map[abrv]
if match_val in lookup_2023.index:
match_row = lookup_2023.loc[match_val]
for col in cols_to_fill:
if col in row and pd.isna(row.get(col)): row[col] = match_row.get(col, np.nan)
return row
df = df.apply(apply_fuzzy_fill, axis=1)
df['DISCL_SUPPLIER_ABRVNAME_CLEANED'] = df['SUPPLIER_ABRVNAME']
df['DISCL_SUPPLIER_FULLNAME_CLEANED'] = df['SUPPLIER_FULLNAME']
# =============================================================================
# PHASE 2.5: ADVANCED GEOGRAPHIC RECOVERY
# =============================================================================
print("--- Phase 2.5: Advanced Geographic Recovery ---")
def clean_text_geo(text):
if pd.isna(text) or text == "" or str(text).lower() == 'nan': return None
return unicodedata.normalize('NFKD', str(text)).encode('ascii', 'ignore').decode('utf-8').upper().strip()
def aggressive_clean(text):
if pd.isna(text): return None
text = str(text).upper().strip()
for noise in ['SAND BOX', 'SANS BOX', 'SAND_BOX', 'SANDBOX', ' IN PROD', ' TEST', ' COOP CA', ' COOP-CA', ' SCOOPS']: text = text.replace(noise, '')
return text.strip()
gdf = gpd.read_file(PATHS['civ_departements'])
if gdf.crs is None or gdf.crs.to_string() != "EPSG:4326": gdf = gdf.to_crs("EPSG:4326")
gdf['LVL_4_NAME_CLEAN'] = gdf['LVL_4_NAME'].apply(clean_text_geo)
name_to_geocode_gdf = dict(zip(gdf['LVL_4_NAME_CLEAN'], gdf['LVL_4_CODE']))
geocode_to_name_gdf = dict(zip(gdf['LVL_4_CODE'], gdf['LVL_4_NAME_CLEAN']))
print("Running initial spatial join...")
mask_has_coords = df['DISCL_LONGITUDE'].notna() & df['DISCL_LATITUDE'].notna()
if 'DISTRICT_GEOCODE' not in df.columns: df['DISTRICT_GEOCODE'] = np.nan
if 'DISTRICT_NAME' not in df.columns: df['DISTRICT_NAME'] = np.nan
if mask_has_coords.sum() > 0:
temp_gdf = gpd.GeoDataFrame(df.loc[mask_has_coords], geometry=gpd.points_from_xy(df.loc[mask_has_coords, 'DISCL_LONGITUDE'], df.loc[mask_has_coords, 'DISCL_LATITUDE']), crs="EPSG:4326")
joined = gpd.sjoin(temp_gdf, gdf[['LVL_4_CODE', 'LVL_4_NAME', 'geometry']], how='left', predicate='intersects')
joined = joined[~joined.index.duplicated(keep='first')]
mask_misses = joined['LVL_4_CODE'].isna() & joined['DISTRICT_GEOCODE'].isna()
miss_indices = joined[mask_misses].index
if len(miss_indices) > 0:
df.loc[miss_indices, 'DISCL_LONGITUDE'] = np.nan
df.loc[miss_indices, 'DISCL_LATITUDE'] = np.nan
update_mask = joined['DISTRICT_GEOCODE'].isna() & joined['LVL_4_CODE'].notna()
update_indices = joined[update_mask].index
df.loc[update_indices, 'DISTRICT_GEOCODE'] = joined.loc[update_indices, 'LVL_4_CODE']
df.loc[update_indices, 'DISTRICT_NAME'] = df.loc[update_indices, 'DISTRICT_NAME'].fillna(joined.loc[update_indices, 'LVL_4_NAME'])
print("Running Two-Way Geographic Sync...")
# Backward Sync: Heal historical df_master using 2024's recent geography
recent_district_map = df.dropna(subset=['SUPPLIER_FULLNAME', 'DISTRICT_NAME']).groupby('SUPPLIER_FULLNAME')['DISTRICT_NAME'].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan).to_dict()
missing_dist_mask = (df_master['DISTRICT_NAME'].isna()) | (df_master['DISTRICT_NAME'].astype(str).str.strip() == '') | (df_master['DISTRICT_NAME'].astype(str).str.upper() == 'NAN')
if 'SUPPLIER_FULLNAME' in df_master.columns:
df_master.loc[missing_dist_mask, 'DISTRICT_NAME'] = df_master.loc[missing_dist_mask, 'SUPPLIER_FULLNAME'].map(recent_district_map)
# Forward Sync: Heal 2024 data using historical baseline
df_master['DISTRICT_NAME_CLEAN'] = df_master['DISTRICT_NAME'].apply(clean_text_geo)
if 'SUPPLIER_FULLNAME' in df_master.columns:
ref_coops = df_master[df_master['YEAR'] == 2023].dropna(subset=['SUPPLIER_FULLNAME', 'DISTRICT_NAME_CLEAN']).drop_duplicates('SUPPLIER_FULLNAME')
coop_to_name_2023 = dict(zip(ref_coops['SUPPLIER_FULLNAME'], ref_coops['DISTRICT_NAME_CLEAN']))
coop_to_code_2023 = dict(zip(ref_coops['SUPPLIER_FULLNAME'], ref_coops['DISTRICT_GEOCODE']))
else:
ref_coops = df_master[df_master['YEAR'] == 2023].dropna(subset=['SUPPLIER_ABRVNAME', 'DISTRICT_NAME_CLEAN']).drop_duplicates('SUPPLIER_ABRVNAME')
coop_to_name_2023 = dict(zip(ref_coops['SUPPLIER_ABRVNAME'], ref_coops['DISTRICT_NAME_CLEAN']))
coop_to_code_2023 = dict(zip(ref_coops['SUPPLIER_ABRVNAME'], ref_coops['DISTRICT_GEOCODE']))
df['CLEANED_RESIDUAL_NAME'] = df.get('DISCL_SUPPLIER_ABRVNAME', df['SUPPLIER_ABRVNAME']).apply(aggressive_clean)
manual_fix_map = {
'BAKAYO': 'SASSANDRA', "N'DOUCI": 'TIASSALE', 'BINHOUYE': 'ZOUAN-HOUNIEN', 'KPOUEBO (TOUMODI)': 'TOUMODI',
'HERMANKONO DIES (GUITRY)': 'GUITRY', 'BONON': 'BOUAFLE', 'BAYOTA (GAGNOA)': 'GAGNOA',
'MENE CENTRE (SAN-PEDRO)': 'SAN-PEDRO', 'DAGADJI': 'SAN-PEDRO', 'ABONGOUA': 'ARRAH', 'SAN PEDRO': 'SAN-PEDRO',
'GNATO': 'TABOU', 'BIANKOUMAN': 'BIANKOUMA', 'GOUDI': 'GAGNOA', 'MENEE CENTRE/GABIADJI': 'SAN-PEDRO',
'GRAND-BEREBY': 'TABOU', 'TINHOU': 'BLOLEQUIN', 'YAKASSE ATTOBROU': 'YAKASSE-ATTOBROU', 'AFFERY': 'AKOUPE',
'GUIBEROUA': 'GAGNOA', 'GRAND LAHOU': 'GRAND-LAHOU', 'GRABO': 'TABOU', 'DIEGONEFLA': 'OUME',
'SEGUELLA': 'SEGUELA', 'TOULEPLEU': 'TOULEUPLEU', 'GRAND-ZATTRY': 'SOUBRE', 'DIBY': 'SAKASSOU',
'ECAKOOG': 'LAKOTA', 'SCOOPAKAM': 'DIVO', 'SCOOWEND COOP-CA': 'SAN-PEDRO', 'CASIB COOP-CA': 'BLOLEQUIN',
'COOP-CA ADA': 'SAN-PEDRO', 'BARA COOP-CA': 'BANGOLO', 'AGRIAL COOP-CA': 'DALOA', 'COPAWEB COOP-CA': 'SOUBRE',
'ECAPA GZ COOP-CA': 'SOUBRE', 'SOCAAN COOP-CA': 'ADZOPE', 'SUHUM MUNICIPAL': None, 'MBANGASSINA': None,
'NTUI': None, 'LOCATION': None, 'AVERAGE PER FARMER': None,
'CENTRE-EST': '', 'EST': '', 'SUD-EST': '', 'OUEST': '', 'CENTRE-OUEST': '', 'SUD-OUEST': '', 'NORD': '', 'SUD': '', 'CENTRE': ''
}
dict_broad_names = {
'CAAA': 'ABENGOUROU', 'CAABRO': 'ABENGOUROU', 'CAAN': 'ABENGOUROU', 'CAMAP': 'AGNIBILEKRO', 'CAPA': 'ABENGOUROU',
'CAPRESSA': 'ABENGOUROU', 'CEAA COOP CA': 'ABENGOUROU', 'COAKA': 'ABENGOUROU', 'COOAAB-COOP CA': 'ABENGOUROU',
'COOP CA ABOTRE': 'ABENGOUROU', 'COOP CA ESPOIR': 'ABENGOUROU', 'COOP CA NOBIELTO': 'ABENGOUROU', 'COOPAYAF': 'ABENGOUROU',
'COOPAZA': 'ABENGOUROU', 'COOPRAZO': 'ABENGOUROU', 'COOPYCA': 'ABENGOUROU', 'CP2A': 'ABENGOUROU', 'SCAANIAS': 'ABENGOUROU',
'SCOAPANI': 'ABENGOUROU', 'SOLIDARITE': 'ABENGOUROU', 'CABA': 'MAN', 'CADB': 'BIANKOUMA', 'CADT': 'MAN',
'CAMOBIAN': 'BIANKOUMA', 'CAPB': 'MAN', 'CAPROD': 'MAN', 'COOP CA SPAM MAN': 'MAN', 'COOP CA WEND SONGDA': 'DANANE',
'COOP-CA-UDAN': 'DANANE', 'COOP-CA-PAN': 'DANANE', 'COOPAHK': 'MAN', 'COOPANAM': 'MAN', 'LAFI BEBE DE MAN': 'MAN',
'NWK': 'MAN', 'SCAB': 'SIPILOU', 'SCAG COOP CA': 'MAN', 'SCAPCC': 'MAN', 'SCAW COOP-CA': 'MAN', 'SCOOP CAYEM': 'MAN',
'SCOOPS JST': 'MAN', 'SCOOPS LA PAIX': 'MAN', 'SCOOPS SCAG': 'MAN', 'SCOOPS-AH': 'MAN', 'SCOOPS-APRAB': 'MAN',
'SINIKAN-SCOOPAS': 'MAN', 'SOCAET COOP CA': 'MAN', 'CABG SCOOPS': 'DALOA', 'CAEG': 'DALOA', 'CAKB': 'DALOA',
'CEDA': 'DALOA', 'COAM': 'VAVOUA', 'COOBADI': 'DALOA', 'COOP-CA-CABS': 'DALOA', 'COOPADEF': 'DALOA', 'COOPAGARO': 'DALOA',
'COOPAGES': 'DALOA', 'COOPAGO': 'DALOA', 'COOPAKCODI': 'DALOA', 'COOPAMDI': 'DALOA', 'COOPATESA': 'DALOA',
'COOPINA': 'DALOA', 'CPACD': 'DALOA', 'CPI': 'ZUENOULA', 'ECODEBO': 'DALOA', 'ECOPADI': 'DALOA', 'FLRI COOPADIM': 'DALOA',
'IQJ': 'VAVOUA', 'LE PROGRES': 'GAGNOA', 'S-COOP COOPRADA': 'VAVOUA', 'SC WLSA COOP CA': 'DALOA', 'SCAAO': 'DALOA',
'SCABK': 'DALOA', 'SCABODI': 'DALOA', 'SCAC': 'DALOA', 'SCAGB': 'DALOA', 'SCAGG': 'DALOA', 'SCAM': 'DALOA', 'SCANG': 'DALOA',
'SCAPEKO': 'DALOA', 'SCARDI': 'DALOA', 'SCBPAG': 'DALOA', 'SCELA COOP-CA': 'DALOA', 'SCOOPA': 'DALOA', 'SCOOPAAS': 'DALOA',
'SCOOPAKAM': 'DALOA', 'SCOOPAKOKO': 'DALOA', 'SCOOPECO': 'DALOA', 'SCOOPNIA SCOOPS': 'DALOA', 'SCOOPS AGRI.BIO': 'DALOA',
'SCOOPS-CODEMA': 'DALOA', 'SCOOPS-LTDV': 'DALOA', 'SCOOPS-UPRAD': 'GAGNOA', 'SOCABB': 'DALOA', 'SOCAGS': 'DALOA',
'SOCAHI': 'DALOA', 'SOCOOPELA': 'DALOA', 'SOCOPADOU': 'DALOA', 'SOCOPAL SCOOPS': 'DALOA', 'SOCOPDAL SCOOPS': 'DALOA',
'CAFUVA': 'VAVOUA', 'COOP CA CAPLI': 'ZUENOULA', 'CABGB': 'SAN-PEDRO', 'CABID': 'SOUBRE', 'CABING': 'SOUBRE', 'CAC': 'SOUBRE',
'CADA': 'SASSANDRA', 'CADEDO': 'SOUBRE', 'CAL2B': 'SOUBRE', 'CAPL': 'SOUBRE', 'CAPO': 'SOUBRE', 'CATBK': 'SOUBRE',
'CJPK': 'SOUBRE', 'COOP CA ANZUE II': 'SOUBRE', 'COOP CA BIRBA': 'SAN-PEDRO', 'COOP CA COANI': 'SOUBRE',
'COOP CA COOPAA': 'SOUBRE', 'COOP JCAS': 'SOUBRE', 'COOP-CA CAGNAN': 'SOUBRE', 'COOP-CA COANGNA': 'SOUBRE',
'COOP-CA COOPABIN': 'SOUBRE', 'COOP-CA COOPADO': 'SOUBRE', 'COOP-CA NECAB': 'SOUBRE', 'COOP-CA-ABO': 'SOUBRE',
'COOPALBA': 'SOUBRE', 'COOPAOU': 'SOUBRE', 'COOPASM': 'SOUBRE', 'COOPAZEG': 'SOUBRE', 'CPSL': 'SOUBRE', 'ECAB': 'SOUBRE',
'ECAMOG': 'SOUBRE', 'ECEAGN COOP-CA': 'TABOU', 'ECOPAS': 'SOUBRE', 'ECSP': 'SAN-PEDRO', 'ECSP PURATOS': 'SAN-PEDRO',
'FLRI SOCAMA': 'SOUBRE', 'SCAACI': 'SOUBRE', 'SCAEG': 'SOUBRE', 'SCAEPGY': 'SOUBRE', 'SCAND': 'SOUBRE', 'SCAPD': 'SOUBRE',
'SCAPUD': 'SOUBRE', 'SCAS COOP-CA': 'SOUBRE', 'SCASOU': 'SOUBRE', 'SCO SACI': 'SOUBRE', 'SCOOP CAES': 'BONDOUKOU',
'SCOOPA CA': 'SAN-PEDRO', 'SCOOPAC': 'SOUBRE', 'SCOOPADJI': 'SOUBRE', 'SCOOPS AL': 'SOUBRE', 'SCOOPS COOP.A.SAM': 'SOUBRE',
'SCOOPS COOPAA': 'SOUBRE', 'SCOOPS SCAT 1': 'MEAGUI', 'SCOOPS SCSAG': 'TABOU', 'SCOOPS SNCC': 'SOUBRE', 'SCOOPS WEBE': 'SOUBRE',
'SCOOWEND': 'SOUBRE', 'SCSPA': 'SOUBRE', 'SCUW': 'SOUBRE', 'SO.CO.SA.DA': 'SOUBRE', 'SO.COA.MO': 'SOUBRE', 'SOCADJO': 'SOUBRE',
'SOCAG': 'SOUBRE', 'SOCAGNIPI': 'SOUBRE', 'SOCAGRI2': 'SOUBRE', 'SOCAJ': 'SOUBRE', 'SOCOPADOGBO': 'SOUBRE', 'SOCOPEK': 'SOUBRE',
'SOCOSOM': 'SOUBRE', 'UPAS': 'SOUBRE', 'CADESA': 'SOUBRE', 'CANK': 'DUEKOUE', 'CASODU': 'DUEKOUE', 'COOP CA KWS': 'DUEKOUE',
'COOP CA GCT': 'DUEKOUE', 'COOP TEEGA WENDE': 'DUEKOUE', 'SCOOPASG': 'DUEKOUE', 'SCOOPS AKY': 'DUEKOUE',
'SCOOPS BADEMAYA': 'DUEKOUE', 'SCOOPS EPG': 'DUEKOUE', 'SC2WK COOP CA': 'DUEKOUE', 'COOP-CA INCHALA': 'DUEKOUE',
'CABES': 'ABOISSO', 'CNAD': 'ABOISSO', 'COABIA': 'ADIAKE', 'COESAG': 'ABOISSO', 'COOPRAS': 'ABOISSO', 'COOPROYA': 'ABOISSO',
'ECAAF': 'ABOISSO', 'SCAK': 'ADZOPE', 'SCOOPS 3A': 'ABOISSO', 'SOCAEK': 'ABOISSO', 'SO.CC.NA': 'ABOISSO',
'CABOHGA COOP CA': 'LAKOTA', 'CIPACCA': 'GAGNOA', 'CODERLACS': 'YAMOUSSOUKRO', 'COOP CA COPACOL': 'LAKOTA',
'COOPRADI': 'LAKOTA', 'ECAESB': 'TOUMODI', 'ECOOPAK': 'DIVO', 'SCOAWWASI': 'DIVO', 'SCOOPS ABOKA': 'DIVO',
'SCOOPS BAD': 'AGBOVILLE', 'SCOOPS YAT': 'TIASSALE', 'SOCABA': 'AGBOVILLE', 'SOCABA SCOOPS': 'AGBOVILLE',
'COOPABENKADI': 'BONDOUKOU', 'FLRI NANTEBESSOU': 'BONDOUKOU', 'CARET': 'DAOUKRO', 'SCAESG': 'BONGOUANOU'
}
residual_fix_map = {
'BMC VAVOUA': 'VAVOUA', 'SPAD GAGNOA': 'GAGNOA', 'ECO LAKOTA': 'LAKOTA', 'SPAD GUIGLO': 'GUIGLO',
'BARRA GABIADJI': 'SAN-PEDRO', 'SCOOPS SCOEL BONGOUANOU': 'BONGOUANOU', 'ESPOIR VAVOUA': 'VAVOUA',
'SCAW MAN': 'MAN', 'APM GAGNOA': 'GAGNOA', 'COOPABE BOKASSO': 'BOUAFLE', 'CABID II': 'SOUBRE', 'SCAP-CC': 'MAN',
'ECEPAG': 'GAGNOA', 'SOCBPI': 'BEOUMI', 'SOCAET2': 'MAN', 'CALTDV': 'DALOA', 'ASETENAPA': None, 'ASUNAFO': None,
'KUKUOM': None, 'DAKUA': None, 'ABOCFA': None, 'ASANKRAGWA': None, 'TARKWA HUNI': None, 'ASSIN': None,
'ESCOPAG': None, 'CDI FG': None, 'FARMER GROUP': None, 'FG': None, 'AAT': None, 'IIAP': None, 'HKF 7': None
}
final_patch_map = {
'SCANDI WAREHOUSE': 'SOUBRE', 'SCAND-COOP CA': 'SOUBRE', 'COOP CA SCAND': 'SOUBRE', 'SOCOAMO': 'SOUBRE',
'COOPALBA COOP-CA': 'SOUBRE', 'SOCONEL-SCOOPS': 'SOUBRE', 'COOP CATBK': 'SOUBRE', 'SCAEPGY': 'SOUBRE',
'CABING': 'SOUBRE', 'SCSPA': 'SOUBRE', 'SCOOPS COOP.A.SAM': 'SOUBRE', 'SCOOPNAS': 'SOUBRE', 'COOBADI': 'DALOA',
'COOBADI COOP-CA': 'DALOA', 'COOPINA COOP-CA': 'DALOA', 'SCOOPS-AGRIOBIO': 'DALOA', 'SOCAHI': 'DALOA',
'SOCOPADOU': 'DALOA', 'CAREBO': 'DALOA', 'CAPEDIG (PART OF ECOOKIM)': 'DALOA', 'UCA': 'DALOA', 'ECAKOOG': 'LAKOTA',
'SCOOP ABOKA': 'DIVO', 'IQJ': 'VAVOUA', 'SCOOPAVI': 'VAVOUA', 'COOPAGG': 'GUEYO', 'ECAPR': 'SINFRA',
'COOP-CA-ECAPR': 'SINFRA', 'SOCAET2': 'MAN', 'BECIDA': 'ADZOPE', 'COOKADI': 'KANI', 'COOPAEEN COOP-CA': 'ABENGOUROU',
'SCOOPRAZA COOP-CA': 'ABENGOUROU', 'CAEZ-COOP-CA': 'ABENGOUROU', 'COOP CA ABO': 'ABOISSO', 'COOPRAMA-CA': 'ABOISSO',
'SCAND': 'SOUBRE', 'CAAA': 'ABENGOUROU', 'SOCOAMO COOP-CA': 'SOUBRE', 'COOPAEEN': 'ABENGOUROU', 'COOPALBA': 'SOUBRE',
'SOCONPAYAT COOP-CA': 'YAKASSE-ATTOBROU', 'SCAA CI': 'ABENGOUROU', 'COOPASM': 'SOUBRE', 'COOPRAS': 'ABOISSO'
}
if 'DISCL_AREA_NAME' in df.columns:
misplaced_coops = [k for k in manual_fix_map.keys() if k in df['DISCL_AREA_NAME'].unique()]
mask_misplaced = df['DISCL_AREA_NAME'].isin(misplaced_coops)
abrv_col = 'DISCL_SUPPLIER_ABRVNAME' if 'DISCL_SUPPLIER_ABRVNAME' in df.columns else 'SUPPLIER_ABRVNAME'
df.loc[mask_misplaced, abrv_col] = df.loc[mask_misplaced, 'DISCL_AREA_NAME']
df['DISCL_AREA_NAME'] = df['DISCL_AREA_NAME'].replace(manual_fix_map)
mask_missing = (df['DISCL_AREA_NAME'] == '') | (df['DISCL_AREA_NAME'].isna())
df.loc[mask_missing, 'DISCL_AREA_NAME'] = df.loc[mask_missing, 'CLEANED_RESIDUAL_NAME'].map(dict_broad_names)
mask_missing = (df['DISCL_AREA_NAME'] == '') | (df['DISCL_AREA_NAME'].isna())
df.loc[mask_missing, 'DISCL_AREA_NAME'] = df.loc[mask_missing, 'CLEANED_RESIDUAL_NAME'].map(residual_fix_map)
mask_missing = (df['DISCL_AREA_NAME'] == '') | (df['DISCL_AREA_NAME'].isna())
df.loc[mask_missing, 'DISCL_AREA_NAME'] = df.loc[mask_missing, abrv_col].map(final_patch_map)
df['DISCL_AREA_NAME_CLEAN'] = df['DISCL_AREA_NAME'].apply(clean_text_geo)
df['DISTRICT_GEOCODE'] = df.get('DISTRICT_GEOCODE', df['DISCL_AREA_NAME_CLEAN'].map(name_to_geocode_gdf)).fillna(df['DISCL_AREA_NAME_CLEAN'].map(name_to_geocode_gdf))
# CRITICAL FIX: Overwrite 2024 district with historical 2023 district using FULLNAME
use_col = 'SUPPLIER_FULLNAME' if 'SUPPLIER_FULLNAME' in df.columns else 'SUPPLIER_ABRVNAME'
mask_has_hist = df[use_col].map(coop_to_name_2023).notna()
df.loc[mask_has_hist, 'DISTRICT_GEOCODE'] = df.loc[mask_has_hist, use_col].map(coop_to_code_2023)
df.loc[mask_has_hist, 'DISCL_AREA_NAME_CLEAN'] = df.loc[mask_has_hist, use_col].map(coop_to_name_2023)
df.loc[mask_has_hist, 'DISTRICT_NAME'] = df.loc[mask_has_hist, use_col].map(coop_to_name_2023)
# Spatial Imputation for missing coordinates
mask_needs_spatial = df.get('DISTRICT_GEOCODE', pd.Series(dtype=float)).isna() & df.get('DISCL_LATITUDE', pd.Series(dtype=float)).notna() & df.get('DISCL_LONGITUDE', pd.Series(dtype=float)).notna()
if mask_needs_spatial.sum() > 0:
temp_gdf = gpd.GeoDataFrame(
df.loc[mask_needs_spatial],
geometry=gpd.points_from_xy(df.loc[mask_needs_spatial, 'DISCL_LONGITUDE'], df.loc[mask_needs_spatial, 'DISCL_LATITUDE']),
crs="EPSG:4326"
)
joined = gpd.sjoin(temp_gdf, gdf[['LVL_4_CODE', 'LVL_4_NAME_CLEAN', 'geometry']], how='left', predicate='within')
joined = joined[~joined.index.duplicated(keep='first')]
df.loc[joined.index, 'DISTRICT_GEOCODE'] = df.loc[joined.index, 'DISTRICT_GEOCODE'].fillna(joined['LVL_4_CODE'])
df.loc[joined.index, 'DISCL_AREA_NAME_CLEAN'] = df.loc[joined.index, 'DISCL_AREA_NAME_CLEAN'].fillna(joined['LVL_4_NAME_CLEAN'])
if 'DISCL_AREA_NAME_CLEAN' in df.columns:
df['DISTRICT_NAME'] = df.get('DISTRICT_NAME', df['DISCL_AREA_NAME_CLEAN']).fillna(df['DISTRICT_GEOCODE'].map(geocode_to_name_gdf))
df['DISTRICT_GEOCODE'] = df.get('DISTRICT_GEOCODE', pd.Series(dtype=float)).fillna(df['DISTRICT_NAME'].map(name_to_geocode_gdf))
df.drop(columns=['CLEANED_RESIDUAL_NAME', 'DISCL_AREA_NAME_CLEAN'], inplace=True, errors='ignore')
# =============================================================================
# PHASE 3: CAM v4 STRUCTURE & MANUAL FIXES
# =============================================================================
print("--- Phase 3: CAM v4 Structure Alignment ---")
if 'SOURCE_FILE' in df.columns:
lindt_mask = df['SOURCE_FILE'] == '2024_lindt_sprungli.csv'
df.loc[lindt_mask, 'DISCL_TRADER_NAME'] = df.loc[lindt_mask, 'COMPANY']
df.loc[lindt_mask, 'COMPANY'] = "LINDT SPRUNGLI"
bc_mask = df['SOURCE_FILE'] == '2024_barry_callebaut.csv'
df.loc[bc_mask, 'DISCL_TRADER_NAME'] = "BARRY CALLEBAUT AG"
df.loc[bc_mask, 'COMPANY'] = "BARRY CALLEBAUT AG"
ritter_mask = df['SOURCE_FILE'] == '2024_ritter_sport.csv'
df.loc[ritter_mask, ['DISCL_VOLUMES', 'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS']] = None
df.loc[ritter_mask, 'COMPANY'] = "ALFRED RITTER"
df.loc[ritter_mask & df['SUPPLIER_ABRVNAME'].isin(['CEMOI_COOP_1', 'CEMOI_COOP_2']), 'DISCL_TRADER_NAME'] = "CEMOI"
df.loc[ritter_mask & df['SUPPLIER_ABRVNAME'].isin(['SUSCOM_COOP_1']), 'DISCL_TRADER_NAME'] = "SUSCOM"
df.loc[ritter_mask & df['SUPPLIER_ABRVNAME'].isin(['CARGILL_COOP_1']), 'DISCL_TRADER_NAME'] = "CARGILL"
else:
known_lindt_traders = ['BARRY CALLEBAUT AG', 'ECOM', 'SUCDEN']
lindt_mask = df['COMPANY'].isin(known_lindt_traders) & df['DISCL_TRADER_NAME'].isna()
df.loc[lindt_mask, 'DISCL_TRADER_NAME'] = df.loc[lindt_mask, 'COMPANY']
df.loc[lindt_mask, 'COMPANY'] = "LINDT SPRUNGLI"
bc_mask = df['COMPANY'].isna() & df['DISCL_TRADER_NAME'].isna() & df['SUPPLIER_FULLNAME'].str.contains('BENKADI DE SIAKAKRO', na=False)
df.loc[bc_mask, 'DISCL_TRADER_NAME'] = "BARRY CALLEBAUT AG"
df.loc[bc_mask, 'COMPANY'] = "BARRY CALLEBAUT AG"
ritter_coops_cemoi = ['COOP_A', 'COOP_B']
ritter_coops_suscom = ['COOP_C', 'COOP_D']
ritter_coops_cargill = ['COOP_E', 'COOP_F']
all_ritter_coops = ritter_coops_cemoi + ritter_coops_suscom + ritter_coops_cargill
ritter_mask = df['SUPPLIER_ABRVNAME'].isin(all_ritter_coops)
df.loc[ritter_mask, ['DISCL_VOLUMES', 'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS']] = None
df.loc[ritter_mask, 'COMPANY'] = "ALFRED RITTER"
df.loc[ritter_mask & df['SUPPLIER_ABRVNAME'].isin(ritter_coops_cemoi), 'DISCL_TRADER_NAME'] = "CEMOI"
df.loc[ritter_mask & df['SUPPLIER_ABRVNAME'].isin(ritter_coops_suscom), 'DISCL_TRADER_NAME'] = "SUSCOM"
df.loc[ritter_mask & df['SUPPLIER_ABRVNAME'].isin(ritter_coops_cargill), 'DISCL_TRADER_NAME'] = "CARGILL"
replacements = {
'BARRY CALLEBAUT AG': 'BARRY CALLEBAUT',
'BARRY CALLEBEAUT': 'BARRY CALLEBAUT',
'ECOM AGROTRADE LTD': 'ECOM',
'CEMOI VIA TRANSPARANCE COCOA': 'CEMOI',
'OLAM INTERNATIONAL LIMITED': 'OLAM',
'OLAM COCOA PROCESSING': 'OLAM',
'OFI': 'OLAM',
'ETG': 'ETC GROUP',
'BEYOND BEANS': 'ETC GROUP',
'OCEAN': 'COCOASOURCE',
'BLOMMER CHOCOLATE COMPANY': 'BLOMMER',
'BLOMMER CHOCOLATE': 'BLOMMER',
'SUCDEN CÔTE DIVOIRE': 'SUCDEN',
'SUCDEN COTE DIVOIRE': 'SUCDEN',
'SOCIÉTÉ AFRICAINE DE CACAO': 'SACC',
'IVORY COCOA PRODUCTS (ICP)': 'IVORY COCOA PRODUCTS',
'JB COCOA': 'JB FOODS',
'GCB': 'GUAN CHONG COCOA',
'ONEM': 'FILDISI'
}
df['DISCL_TRADER_NAME'] = df['DISCL_TRADER_NAME'].replace(replacements)
# FIX 1: Use singular TRADER_NAME to match 2023 schema expectations
df['TRADER_NAME'] = df['DISCL_TRADER_NAME']
# FIX 2: Backfill TRADER_NAME if the COMPANY is a known trader
known_traders = [
'BARRY CALLEBAUT', 'BLOMMER', 'CARGILL', 'CEMOI', 'ECOM', 'ETC GROUP',
'OLAM', 'SUCDEN', 'TOUTON', 'COCOASOURCE', 'SACC', 'IVORY COCOA PRODUCTS',
'JB FOODS', 'GUAN CHONG COCOA', 'FILDISI'
]
mask_missing_trader = df['TRADER_NAME'].isna() & df['COMPANY'].isin(known_traders)
df.loc[mask_missing_trader, 'TRADER_NAME'] = df.loc[mask_missing_trader, 'COMPANY']
# --- FIX 2.5 - SCRUB MANUFACTURERS FROM TRADER_NAME ---
known_manufacturers = [
'NESTLE', 'MONDELEZ', 'FERRERO', 'MARS', 'HERSHEY', 'LINDT SPRUNGLI',
'ALFRED RITTER', 'UNILEVER', 'GENERAL MILLS', 'GODIVA', "SAINSBURY'S",
'VALRHONA', 'HALBA', 'COCOCO CHOCOLATIERS', 'MEIJI', 'TOMS GROUP',
'WHITTAKER', 'WHITTAKERS', # UPDATED
'MARKS & SPENCER', 'STARBUCKS', 'PURATOS',
'TONYS CHOCOLONELY', "TONY'S CHOCOLONELY" # NEW
]
# Identify rows where a manufacturer is incorrectly assigned to TRADER_NAME
mask_is_manufacturer = df['TRADER_NAME'].isin(known_manufacturers)
# Ensure the manufacturer is retained in the COMPANY column before scrubbing
df.loc[mask_is_manufacturer, 'COMPANY'] = df.loc[mask_is_manufacturer, 'COMPANY'].fillna(df.loc[mask_is_manufacturer, 'TRADER_NAME'])
# Nullify the TRADER_NAME for these manufacturers
df.loc[mask_is_manufacturer, 'TRADER_NAME'] = np.nan
# --------------------------------------------------------------------
# FIX 3: Define BUYER using the populated TRADER_NAME
df['BUYER'] = np.where(df['TRADER_NAME'].isna(), df['COMPANY'], df['TRADER_NAME'])
df['DISCLOSURE_SOURCES'] = df['COMPANY']
df['DISCL_SUPPLIER_TYPE'] = np.where(df.get('DISCL_SUPPLIER_ABRVNAME_CLEANED', df['SUPPLIER_ABRVNAME']).str.contains('WAREHOUSE', case=False, na=False), 'WAREHOUSE', 'COOPERATIVE')
# =============================================================================
# PHASE 4: SPLITTING FLOWS & DEDUPLICATION
# =============================================================================
print("--- Phase 4: Splitting Flows ---")
def unroll_connections_safe(dataframe):
unrolled_rows = []
for _, row in dataframe.iterrows():
# Skip empty coops
if pd.isna(row.get('SUPPLIER_ABRVNAME')) and pd.isna(row.get('SUPPLIER_FULLNAME')):
continue
entities = set()
for col in ['DISCLOSURE_SOURCES', 'BUYER', 'TRADER_NAME']:
val = row.get(col)
if pd.isna(val):
continue
# 1. Handle if it's an actual Python list object
if isinstance(val, list):
parsed_parts = [str(v).strip() for v in val if pd.notna(v)]
else:
# 2. Handle if it's a string
v_str = str(val).strip()
# Strip out Python stringified list characters: [, ], ', "
if v_str.startswith('[') and v_str.endswith(']'):
v_str = v_str[1:-1]
v_str = v_str.replace("'", "").replace('"', "")
# Normalize separators (turn '+' into commas) and split
v_str = v_str.replace('+', ',')
parsed_parts = [x.strip() for x in v_str.split(',')]
# Add to our clean set, ignoring empties/nans
entities.update([p for p in parsed_parts if p and p.upper() not in ['NAN', 'NONE', '']])
# If we found multiple entities, unroll them into separate rows
if len(entities) > 1:
for entity in sorted(list(entities)):
new_row = row.copy()
new_row['DISCLOSURE_SOURCES'] = entity
new_row['BUYER'] = entity
# Conditional assignment to prevent manufacturer bleed
new_row['TRADER_NAME'] = np.nan if entity in known_manufacturers else entity
unrolled_rows.append(new_row)
# Even if there's only 1 entity, reassign it just in case it was a single item
elif len(entities) == 1:
entity = list(entities)[0]
new_row = row.copy()
new_row['DISCLOSURE_SOURCES'] = entity
new_row['BUYER'] = entity
# Conditional assignment to prevent manufacturer bleed
new_row['TRADER_NAME'] = np.nan if entity in known_manufacturers else entity
unrolled_rows.append(new_row)
else:
unrolled_rows.append(row)
return pd.DataFrame(unrolled_rows)
df = unroll_connections_safe(df)
# Catch any entities that were extracted from comma-separated lists during unrolling
df['TRADER_NAME'] = df['TRADER_NAME'].replace(replacements)
df['BUYER'] = df['BUYER'].replace(replacements)
df = df.drop_duplicates(subset=['YEAR', 'SUPPLIER_ABRVNAME', 'BUYER', 'TRADER_NAME'], keep='first')
# =============================================================================
# PHASE 5: BUYING STATION & STABLE COOP ID GENERATION
# =============================================================================
print("--- Phase 5: Buying Station & Stable Coop ID Generation ---")
ref_hist = df_master[df_master['YEAR'] == 2023].copy()
# 1. Build Full Name Keys
ref_hist['MATCH_NAME'] = ref_hist['SUPPLIER_FULLNAME'].fillna('MISSING_M').astype(str).str.strip().str.upper()
ref_hist['MATCH_DISTRICT'] = ref_hist['DISTRICT_NAME'].fillna('MISSING_M').astype(str).str.strip().str.upper()
ref_hist['MATCH_KEY'] = ref_hist['MATCH_NAME'] + "_" + ref_hist['MATCH_DISTRICT']
# 2. Build Abbreviation Keys
ref_hist['MATCH_ABRV'] = ref_hist['SUPPLIER_ABRVNAME'].fillna('MISSING_M').astype(str).str.strip().str.upper()
ref_hist['MATCH_ABRV_KEY'] = ref_hist['MATCH_ABRV'] + "_" + ref_hist['MATCH_DISTRICT']
# 3. Create Mapping Dictionaries
key_to_id_map = ref_hist.dropna(subset=['COOP_ID']).groupby('MATCH_KEY')['COOP_ID'].min().to_dict() if 'COOP_ID' in ref_hist.columns else {}
name_to_id_map = ref_hist.dropna(subset=['COOP_ID']).groupby('MATCH_NAME')['COOP_ID'].min().to_dict() if 'COOP_ID' in ref_hist.columns else {}
abrv_key_to_id_map = ref_hist.dropna(subset=['COOP_ID']).groupby('MATCH_ABRV_KEY')['COOP_ID'].min().to_dict() if 'COOP_ID' in ref_hist.columns else {}
abrv_to_id_map = ref_hist.dropna(subset=['COOP_ID']).groupby('MATCH_ABRV')['COOP_ID'].min().to_dict() if 'COOP_ID' in ref_hist.columns else {}
# 4. Create Matching Columns in 2024 Data
df['MATCH_NAME'] = df['SUPPLIER_FULLNAME'].fillna('MISSING_D').astype(str).str.strip().str.upper()
df['MATCH_DISTRICT'] = df['DISTRICT_NAME'].fillna('MISSING_D').astype(str).str.strip().str.upper()
df['MATCH_KEY'] = df['MATCH_NAME'] + "_" + df['MATCH_DISTRICT']
df['MATCH_ABRV'] = df['SUPPLIER_ABRVNAME'].fillna('MISSING_D').astype(str).str.strip().str.upper()
df['MATCH_ABRV_KEY'] = df['MATCH_ABRV'] + "_" + df['MATCH_DISTRICT']
def assign_stable_id(row):
# Try Full Name + District
if row['MATCH_KEY'] != 'MISSING_D_MISSING_D' and row['MATCH_KEY'] in key_to_id_map:
return key_to_id_map[row['MATCH_KEY']]
# Try Abbreviation + District
if row['MATCH_ABRV_KEY'] != 'MISSING_D_MISSING_D' and row['MATCH_ABRV_KEY'] in abrv_key_to_id_map:
return abrv_key_to_id_map[row['MATCH_ABRV_KEY']]
# Try Full Name only
if row['MATCH_NAME'] != 'MISSING_D' and row['MATCH_NAME'] in name_to_id_map:
return name_to_id_map[row['MATCH_NAME']]
# Try Abbreviation only
if row['MATCH_ABRV'] != 'MISSING_D' and row['MATCH_ABRV'] in abrv_to_id_map:
return abrv_to_id_map[row['MATCH_ABRV']]
return np.nan
df['COOP_ID'] = df.apply(assign_stable_id, axis=1)
max_id_raw = pd.to_numeric(df_master.get('COOP_ID', pd.Series(dtype=float)), errors='coerce').max()
max_id = int(max_id_raw) if pd.notna(max_id_raw) else 0
new_coops_mask = df['COOP_ID'].isna()
# NEW: Flag coops as repeated if they already existed in the historical master
df['REPEATED_FROM_PAST_YEAR'] = ~new_coops_mask
if new_coops_mask.any():
# Fallback to creating new IDs using abbreviation if full name is missing
df['NEW_ID_KEY'] = np.where(df['MATCH_KEY'] == 'MISSING_D_MISSING_D', df['MATCH_ABRV_KEY'], df['MATCH_KEY'])
unique_new_keys = df.loc[new_coops_mask, 'NEW_ID_KEY'].unique()
unique_new_keys = [k for k in unique_new_keys if not k.startswith('MISSING')]
new_id_map = {key: (max_id + i + 1) for i, key in enumerate(unique_new_keys)}
df.loc[new_coops_mask, 'COOP_ID'] = df.loc[new_coops_mask, 'NEW_ID_KEY'].map(new_id_map)
# FIX: Catch any stragglers that still don't have an ID and assign them a new one
final_missing_mask = df['COOP_ID'].isna()
if final_missing_mask.any():
current_max = int(df['COOP_ID'].max(skipna=True)) if pd.notna(df['COOP_ID'].max(skipna=True)) else 0
df.loc[final_missing_mask, 'COOP_ID'] = range(current_max + 1, current_max + 1 + final_missing_mask.sum())
# NO DROPNA ALLOWED HERE. Just convert to integer safely.
df['COOP_ID'] = df['COOP_ID'].astype(int)
df['COORD_KEY'] = df.get('DISCL_LONGITUDE', df.get('LONGITUDE')).astype(str) + "_" + df.get('DISCL_LATITUDE', df.get('LATITUDE')).astype(str)
df['BS_RANK'] = df.groupby('COOP_ID')['COORD_KEY'].transform(lambda x: pd.factorize(x)[0] + 1)
df['COOP_BS_ID'] = "COOP-" + df['COOP_ID'].astype(str) + "_BS-" + df['BS_RANK'].astype(str)
df.drop(columns=['MATCH_NAME', 'MATCH_DISTRICT', 'MATCH_KEY', 'MATCH_ABRV', 'MATCH_ABRV_KEY', 'NEW_ID_KEY', 'COORD_KEY', 'BS_RANK'], inplace=True, errors='ignore')
# =============================================================================
# PHASE 6: FARMER BASE IMPUTATION & CALCULATION
# =============================================================================
print("--- Phase 6: Farmer Base Imputation & Calculation ---")
exper_cln = [
'BARRY CALLEBAUT', 'BLOMMER', 'CARGILL', 'CEMOI', 'ECOM', 'ETC GROUP',
'OLAM', 'SUCDEN', 'TOUTON', 'COCOASOURCE', 'SACC', 'IVORY COCOA PRODUCTS',
'JB FOODS', 'GUAN CHONG COCOA', 'FILDISI'
]
df['NOT_RFA'] = df['COMPANY'] != 'RAINFOREST ALLIANCE'
if df.get('DISCL_NUMBER_FARMERS', pd.Series(dtype=float)).dtype == object:
df['DISCL_NUMBER_FARMERS'] = df['DISCL_NUMBER_FARMERS'].astype(str).str.replace(',', '').str.replace(' ', '')
df['DISCL_NUMBER_FARMERS'] = pd.to_numeric(df.get('DISCL_NUMBER_FARMERS'), errors='coerce')
df['NOT_RFA'] = (df['COMPANY'].fillna('UNKNOWN') != 'RAINFOREST ALLIANCE')
df['NB_FARMERS_COMPANY_YEAR'] = df.groupby(['COOP_ID', 'YEAR', 'COMPANY', 'TRADER_NAME'])['DISCL_NUMBER_FARMERS'].transform('mean')
# FIX 1: Removed global_mean fill. Missing values remain NaN to trigger 2023 imputation.
df['NUM_FARMERS'] = df['NB_FARMERS_COMPANY_YEAR'].round(0)
df['NON_TRADER'] = df['COMPANY'].notna() & (~df['COMPANY'].isin(exper_cln)) & df['NOT_RFA'] & (df['TRADER_NAME'].isna() | (df['COMPANY'] == df['TRADER_NAME']))
df['TRADER'] = df['COMPANY'].notna() & (df['COMPANY'].isin(exper_cln) | df['TRADER_NAME'].isin(exper_cln)) & df['NOT_RFA']
df['NO_DISCLOSING'] = df['COMPANY'].isna()
def sum_unique_links(group_df, condition_col, value_col):
filtered = group_df[group_df[condition_col]]
unique_links = filtered.drop_duplicates(subset=['COMPANY'])
# FIX 2: min_count=1 ensures that summing all NaNs returns NaN instead of 0.0
return unique_links[value_col].sum(min_count=1)
def calculate_coop_sums(g):
return pd.Series({
'TOTAL_FARMERS_NONTRADER': sum_unique_links(g, 'NON_TRADER', 'NUM_FARMERS'),
'TOTAL_FARMERS_TRADER': sum_unique_links(g, 'TRADER', 'NUM_FARMERS'),
'TOTAL_FARMERS_RFA': sum_unique_links(g, 'NOT_RFA', 'NUM_FARMERS') if not g['NOT_RFA'].all() else np.nan,
'TOTAL_FARMERS_NODISCL': sum_unique_links(g, 'NO_DISCLOSING', 'NUM_FARMERS')
})
sums_df = df.groupby(['COOP_ID', 'YEAR']).apply(calculate_coop_sums, include_groups=False).reset_index()
expected_cols = ['TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_NONTRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS_NODISCL']
for col in expected_cols:
if col not in sums_df.columns: sums_df[col] = np.nan
df = df.drop(columns=expected_cols, errors='ignore')
df = df.merge(sums_df, on=['COOP_ID', 'YEAR'], how='left')
# Calculate maximum across the subgroups
df['TOTAL_FARMERS'] = df[expected_cols].max(axis=1)
# =============================================================================
# NEW: IMPUTE MISSING 2024 FARMERS FROM 2023 MASTER (UPDATED)
# =============================================================================
if 'TOTAL_FARMERS' in df_master.columns:
print("Imputing missing 2024 farmer counts from 2023 history...")
# 1. Force strict integer matching to prevent float/string dictionary misses
df_master['COOP_ID_CLEAN'] = pd.to_numeric(df_master['COOP_ID'], errors='coerce').fillna(-1).astype(int)
df['COOP_ID_CLEAN'] = pd.to_numeric(df['COOP_ID'], errors='coerce').fillna(-1).astype(int)
# 2. Build the dictionaries from 2023
farmers_2023_map = df_master[df_master['YEAR'] == 2023].groupby('COOP_ID_CLEAN')['TOTAL_FARMERS'].max().to_dict()
# FIX 3: Remove the -1 key to prevent mapping invalid IDs
farmers_2023_map.pop(-1, None)
# 3. Target rows missing TOTAL_FARMERS
mask_missing_total = df['TOTAL_FARMERS'].isna() | (df['TOTAL_FARMERS'] == 0)
df.loc[mask_missing_total, 'TOTAL_FARMERS'] = df.loc[mask_missing_total, 'COOP_ID_CLEAN'].map(farmers_2023_map)
# 4. Target rows missing NUM_FARMERS (For cam_long)
mask_missing_extra = df['NUM_FARMERS'].isna() | (df['NUM_FARMERS'] == 0)
df.loc[mask_missing_extra, 'NUM_FARMERS'] = df.loc[mask_missing_extra, 'COOP_ID_CLEAN'].map(farmers_2023_map)
# 5. Cleanup the temporary matching column
df.drop(columns=['COOP_ID_CLEAN'], inplace=True)
df_master.drop(columns=['COOP_ID_CLEAN'], inplace=True, errors='ignore')
# =============================================================================
# NEW: CONVERT 0 FARMERS TO NaN
# =============================================================================
farmer_cols = [
'TOTAL_FARMERS_NONTRADER', 'TOTAL_FARMERS_TRADER',
'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS_NODISCL',
'TOTAL_FARMERS', 'NUM_FARMERS'
]
valid_farmer_cols = [col for col in farmer_cols if col in df.columns]
df[valid_farmer_cols] = df[valid_farmer_cols].replace({0: np.nan, 0.0: np.nan})
# Resume existing logic...
df = df.reset_index(drop=True)
df['FLOW_ID'] = df['YEAR'].astype(str) + '-' + (df.index + 1).astype(str)
# =============================================================================
# NEW: IMPUTE MISSING 2024 FARMERS FROM 2023 MASTER (UPDATED)
# =============================================================================
if 'TOTAL_FARMERS' in df_master.columns:
print("Imputing missing 2024 farmer counts from 2023 history...")
# 1. Force strict integer matching to prevent float/string dictionary misses
df_master['COOP_ID_CLEAN'] = pd.to_numeric(df_master['COOP_ID'], errors='coerce').fillna(-1).astype(int)
df['COOP_ID_CLEAN'] = df['COOP_ID'].astype(int)
# 2. Build the dictionaries from 2023
farmers_2023_map = df_master[df_master['YEAR'] == 2023].groupby('COOP_ID_CLEAN')['TOTAL_FARMERS'].max().to_dict()
# 3. Target rows missing TOTAL_FARMERS
mask_missing_total = df['TOTAL_FARMERS'].isna() | (df['TOTAL_FARMERS'] == 0)
df.loc[mask_missing_total, 'TOTAL_FARMERS'] = df.loc[mask_missing_total, 'COOP_ID_CLEAN'].map(farmers_2023_map)
# 4. Target rows missing NUM_FARMERS (For cam_long)
mask_missing_extra = df['NUM_FARMERS'].isna() | (df['NUM_FARMERS'] == 0)
df.loc[mask_missing_extra, 'NUM_FARMERS'] = df.loc[mask_missing_extra, 'COOP_ID_CLEAN'].map(farmers_2023_map)
# 5. Cleanup the temporary matching column
df.drop(columns=['COOP_ID_CLEAN'], inplace=True)
df_master.drop(columns=['COOP_ID_CLEAN'], inplace=True, errors='ignore')
# =============================================================================
# NEW: CONVERT 0 FARMERS TO NaN
# =============================================================================
farmer_cols = [
'TOTAL_FARMERS_NONTRADER', 'TOTAL_FARMERS_TRADER',
'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS_NODISCL',
'TOTAL_FARMERS', 'NUM_FARMERS'
]
valid_farmer_cols = [col for col in farmer_cols if col in df.columns]
df[valid_farmer_cols] = df[valid_farmer_cols].replace({0: np.nan, 0.0: np.nan})
# Resume existing logic...
df = df.reset_index(drop=True)
df['FLOW_ID'] = df['YEAR'].astype(str) + '-' + (df.index + 1).astype(str)
# =============================================================================
# PHASE 7: FINAL RECOVERY
# =============================================================================
print("--- Phase 7: Final Historical Recovery ---")
# Guarantee that every row has a standard coordinate column to survive GeoJSON
if 'LONGITUDE' not in df.columns: df['LONGITUDE'] = np.nan
if 'LATITUDE' not in df.columns: df['LATITUDE'] = np.nan
df['LONGITUDE'] = df['LONGITUDE'].fillna(df.get('DISCL_LONGITUDE'))
df['LATITUDE'] = df['LATITUDE'].fillna(df.get('DISCL_LATITUDE'))
if 'REPEATED_FROM_PAST_YEAR' in df.columns:
df['REPEATED_FROM_PAST_YEAR'] = df['REPEATED_FROM_PAST_YEAR'].replace({'NA': False, 'nan': False, 'NaN': False}).fillna(False).astype(bool)
else:
df['REPEATED_FROM_PAST_YEAR'] = False
# =============================================================================
# PHASE 7.5: ROLL-FORWARD MISSING 2023 LINKS INTO 2024
# =============================================================================
print("--- Phase 7.5: Rolling forward missing 2023 links into 2024 ---")
# Load historical LONG format data (which contains the TRADER_NAME column)
df_hist_long = pd.read_csv(PATHS['cam_seipcs_2023'], low_memory=False, sep=';')
# 1. Create unique identifiers for every flow based on Coop ID and Trader
df_hist_long['FLOW_KEY'] = df_hist_long['COOP_ID'].astype(str) + "_" + df_hist_long['TRADER_NAME'].fillna('UNKNOWN').astype(str)
df['FLOW_KEY'] = df['COOP_ID'].astype(str) + "_" + df['TRADER_NAME'].fillna('UNKNOWN').astype(str)
# 2. Identify flows that existed in 2023 but are completely missing in 2024
flows_2023 = df_hist_long[df_hist_long['YEAR'] == 2023].copy()
missing_flow_keys = set(flows_2023['FLOW_KEY']) - set(df['FLOW_KEY'])
if missing_flow_keys:
print(f"Identified {len(missing_flow_keys)} historical links from 2023 missing in 2024. Rolling them forward...")
# 3. Extract the missing rows from the 2023 history
rolled_forward_df = flows_2023[flows_2023['FLOW_KEY'].isin(missing_flow_keys)].copy()
# 4. Update the temporal metadata to flag them as 2024 roll-forwards
rolled_forward_df['YEAR'] = 2024
rolled_forward_df['REPEATED_FROM_PAST_YEAR'] = True
# Clean up flow keys before merging
rolled_forward_df.drop(columns=['FLOW_KEY'], inplace=True, errors='ignore')
df.drop(columns=['FLOW_KEY'], inplace=True, errors='ignore')
df_hist_long.drop(columns=['FLOW_KEY'], inplace=True, errors='ignore')
# 5. Append the rolled-forward rows to the main 2024 dataframe
df = pd.concat([df, rolled_forward_df], ignore_index=True)
# Re-sequence the FLOW_ID to prevent duplicates
df = df.reset_index(drop=True)
df['FLOW_ID'] = df['YEAR'].astype(str) + '-' + (df.index + 1).astype(str)
print("Successfully appended historical links to the 2024 dataset.")
else:
print("No missing historical links found. 2024 data contains all 2023 flows.")
df.drop(columns=['FLOW_KEY'], inplace=True, errors='ignore')
df_hist_long.drop(columns=['FLOW_KEY'], inplace=True, errors='ignore')
# =============================================================================
# PHASE 8: EXPORTING FORMATTED DATASETS
# =============================================================================
print("\n--- Phase 8: Generating Output Datasets ---")
# Ensure base types are clean before export
df['YEAR'] = df['YEAR'].fillna(2024).astype(int)
df['COOP_ID'] = df['COOP_ID'].astype(int)
# -------------------------------------------------------------------
# FORMAT 1: CAM_LONG
# -------------------------------------------------------------------
print("Exporting cam_long format...")
# Load historical LONG format data
df_seipcs_2023 = pd.read_csv(PATHS['cam_seipcs_2023'], low_memory=False, sep=';')
# Stack into massive ledger
cam_long_full = pd.concat([df_seipcs_2023, df], ignore_index=True)
# NEW: Impute missing names back and forth
for col in ['SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME']:
cam_long_full[col] = cam_long_full[col].replace(['', 'nan', 'NaN', 'None', 'NA'], np.nan)
cam_long_full['SUPPLIER_ABRVNAME'] = cam_long_full['SUPPLIER_ABRVNAME'].fillna(cam_long_full['SUPPLIER_FULLNAME'])
cam_long_full['SUPPLIER_FULLNAME'] = cam_long_full['SUPPLIER_FULLNAME'].fillna(cam_long_full['SUPPLIER_ABRVNAME'])
cam_long_full.to_csv(PATHS['out_cam_long'], sep=';', index=False)
print(f"-> cam_long saved to: {PATHS['out_cam_long']}")
# -------------------------------------------------------------------
# FORMAT 2: CAM_COOPYEAR
# -------------------------------------------------------------------
print("Exporting cam_coopyear format...")
def combine_unique_strings(series):
items = sorted(list(set(series.dropna().astype(str))))
items = [i for i in items if i.lower() not in ['nan', 'none', 'na', '']]
return ", ".join(items) if items else None
# AGGREGATE THE 2024 DATA
df['BUYER_AGG'] = df['BUYER'].fillna(df['TRADER_NAME']).fillna(df['DISCLOSURE_SOURCES'])
agg_dict = {
'SUPPLIER_ABRVNAME': 'first', 'SUPPLIER_FULLNAME': 'first', 'LATITUDE': 'first', 'LONGITUDE': 'first',
'DISTRICT_NAME': 'first', 'DISTRICT_GEOCODE': 'first', 'CERTIFICATIONS': 'first',
'TOTAL_FARMERS_NONTRADER': 'first', 'TOTAL_FARMERS_TRADER': 'first', 'TOTAL_FARMERS_RFA': 'first',
'TOTAL_FARMERS_NODISCL': 'first', 'TOTAL_FARMERS': 'first', 'REPEATED_FROM_PAST_YEAR': 'first',
'SIMPLIF_ABRVNAME': 'first', 'IS_ALL_CAM_V3': 'max', 'IS_ANY_CAM_V3': 'max', 'CERTIFIED': 'max',
'NB_FARMERS_COMPANY_YEAR': 'mean', '!NOT_RFA': 'max', 'unique_rfa_link': 'max', 'LVL_4_NAME': 'first',
'BUYER_AGG': combine_unique_strings, 'COMPANY': combine_unique_strings, 'DISCLOSURE_SOURCES': combine_unique_strings,
'FARMERS_IMPUTED_FROM_2023': 'max'
}
agg_dict = {k: v for k, v in agg_dict.items() if k in df.columns}
df_2024_short = df.groupby(['COOP_ID', 'YEAR'], as_index=False).agg(agg_dict)
if 'BUYER_AGG' in df_2024_short.columns:
df_2024_short = df_2024_short.rename(columns={'BUYER_AGG': 'BUYER'})
df_2024_short['CCTN_COOP_ID'] = (
df_2024_short.get('SUPPLIER_ABRVNAME', pd.Series(dtype=str)).fillna('NA') + "_" +
df_2024_short.get('LONGITUDE', pd.Series(dtype=float)).round(2).astype(str).replace('nan', 'NA') + "_" +
df_2024_short.get('LATITUDE', pd.Series(dtype=float)).round(2).astype(str).replace('nan', 'NA') + "_" +
df_2024_short.get('SUPPLIER_FULLNAME', pd.Series(dtype=str)).fillna('NA')
)
# EXTRACT HISTORICAL BUYERS FROM SEIPCS (To fix missing buyers in df_master!)
# Create a safe, empty series that matches the length of the dataframe
empty_fallback = pd.Series(np.nan, index=df_seipcs_2023.index)
# Safely extract columns, defaulting to the empty series if they are missing
buyer_col = df_seipcs_2023.get('BUYER', empty_fallback)
trader_col = df_seipcs_2023.get('TRADER_NAME', empty_fallback)
source_col = df_seipcs_2023.get('DISCLOSURE_SOURCES', empty_fallback)
# Now fillna will safely evaluate actual pandas Series instead of 'None'
df_seipcs_2023['BUYER_AGG'] = buyer_col.fillna(trader_col).fillna(source_col)
hist_buyers = df_seipcs_2023.groupby(['COOP_ID', 'YEAR'])['BUYER_AGG'].apply(combine_unique_strings).reset_index()
hist_buyers = hist_buyers.rename(columns={'BUYER_AGG': 'BUYER'})
# INJECT HISTORICAL BUYERS INTO MASTER
df_master = df_master.drop(columns=['BUYER'], errors='ignore').merge(hist_buyers, on=['COOP_ID', 'YEAR'], how='left')
# Guarantee df_master has valid coordinate columns to survive the GeoJSON dropna
df_master['LATITUDE'] = df_master.get('LATITUDE', df_master.get('DISCL_LATITUDE'))
df_master['LONGITUDE'] = df_master.get('LONGITUDE', df_master.get('DISCL_LONGITUDE'))
# Concatenate Historical Short Data with 2024 Short Data
cam_coopyear_full = pd.concat([df_master, df_2024_short], ignore_index=True)
# NEW: Impute missing names back and forth
for col in ['SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME']:
cam_coopyear_full[col] = cam_coopyear_full[col].replace(['', 'nan', 'NaN', 'None', 'NA'], np.nan)
cam_coopyear_full['SUPPLIER_ABRVNAME'] = cam_coopyear_full['SUPPLIER_ABRVNAME'].fillna(cam_coopyear_full['SUPPLIER_FULLNAME'])
cam_coopyear_full['SUPPLIER_FULLNAME'] = cam_coopyear_full['SUPPLIER_FULLNAME'].fillna(cam_coopyear_full['SUPPLIER_ABRVNAME'])
cam_coopyear_full.to_csv(PATHS['out_cam_coopyear'], sep=';', index=False)
print(f"-> cam_coopyear saved to: {PATHS['out_cam_coopyear']}")
# -------------------------------------------------------------------
# FORMAT 3: COOPS_TRASEEARTH
# -------------------------------------------------------------------
print("Exporting coops_traseearth GeoJSON format...")
rename_map = {
'YEAR': 'year', 'REPEATED_FROM_PAST_YEAR': 'repeated_from_past_year', 'SUPPLIER_ABRVNAME': 'abbreviated_name',
'SUPPLIER_FULLNAME': 'full_name', 'LATITUDE': 'latitude', 'LONGITUDE': 'longitude',
'DISTRICT_NAME': 'department_name', 'DISTRICT_GEOCODE': 'department_id', 'COOP_ID': 'coop_stable_id',
'BUYER': 'buyers', 'TOTAL_FARMERS': 'buyers_estimated_farmer_base'
}
df_trase = cam_coopyear_full.rename(columns=rename_map).copy()
# =============================================================================
# NEW: CLOSEST-YEAR IMPUTATION FOR GEOJSON
# =============================================================================
print("Imputing missing values in GeoJSON based on closest year...")
# Ensure 'year' is strictly numeric so we can calculate absolute distances
df_trase['year'] = pd.to_numeric(df_trase['year'], errors='coerce')
def impute_closest_year(group):
# If the cooperative only has one year of data, there's nothing to impute from
if len(group) < 2:
return group
# Columns we shouldn't attempt to impute
skip_cols = ['year', 'coop_stable_id']
cols_to_check = [c for c in group.columns if c not in skip_cols]
for col in cols_to_check:
# Treat NaN and empty strings as missing data
is_missing = group[col].isna() | (group[col] == "")
# If some values are missing, but valid data exists in other years
if is_missing.any() and not is_missing.all():
# Grab all rows in this group that have valid data AND a known year
valid_rows = group[~is_missing & group['year'].notna()]
if not valid_rows.empty:
# Iterate over the indices of the missing rows
for idx in group[is_missing].index:
target_year = group.loc[idx, 'year']
if pd.notna(target_year):
# Calculate the absolute difference in years and grab the index of the minimum
closest_idx = (valid_rows['year'] - target_year).abs().idxmin()
# Impute the value from the closest year
group.loc[idx, col] = valid_rows.loc[closest_idx, col]
return group
# Apply the closest-year imputation per coop_stable_id
df_trase = df_trase.groupby('coop_stable_id', group_keys=False).apply(impute_closest_year)
# =============================================================================
if 'repeated_from_past_year' not in df_trase.columns:
df_trase['repeated_from_past_year'] = False
else:
df_trase['repeated_from_past_year'] = df_trase['repeated_from_past_year'].replace({'NA': False, 'nan': False, 'NaN': False}).fillna(False).astype(bool)
# NEW: Impute missing names back and forth
for col in ['abbreviated_name', 'full_name']:
df_trase[col] = df_trase[col].replace(['', 'nan', 'NaN', 'None', 'NA'], np.nan)
df_trase['abbreviated_name'] = df_trase['abbreviated_name'].fillna(df_trase['full_name'])
df_trase['full_name'] = df_trase['full_name'].fillna(df_trase['abbreviated_name'])
def format_trase_id(geocode):
if pd.isna(geocode): return None
match = re.search(r'CI-(\d+)\.(\d+)\.(\d+)_', str(geocode))
if match: return f"CI-{int(match.group(1)):02d}{int(match.group(2)):02d}{int(match.group(3)):02d}"
return None
df_trase['department_trase_id'] = df_trase.get('department_id', pd.Series(dtype=str)).apply(format_trase_id)
df_trase['buyers_list'] = df_trase.get('buyers', pd.Series(dtype=str)).apply(
lambda x: [b.strip() for b in str(x).split(', ')] if pd.notna(x) and str(x).lower() not in ['nan', 'none', ''] else []
)
df_trase['buyers'] = df_trase.get('buyers', pd.Series(dtype=str)).replace(["", "nan", "None", "NaN"], None)
if 'buyers_estimated_farmer_base' in df_trase.columns:
df_trase['buyers_estimated_farmer_base'] = pd.to_numeric(df_trase['buyers_estimated_farmer_base'], errors='coerce')
# Drop rows missing coordinates right before converting to GeoDataFrame
df_trase_geo = df_trase.dropna(subset=['latitude', 'longitude']).copy()
gdf_trase = gpd.GeoDataFrame(
df_trase_geo,
geometry=gpd.points_from_xy(df_trase_geo.longitude, df_trase_geo.latitude),
crs="EPSG:4326"
)
final_trase_cols = [col for col in ['year', 'repeated_from_past_year', 'abbreviated_name', 'full_name', 'latitude', 'longitude', 'department_name', 'department_id', 'department_trase_id', 'coop_stable_id', 'buyers', 'buyers_estimated_farmer_base', 'buyers_list', 'geometry'] if col in gdf_trase.columns]
gdf_trase[final_trase_cols].to_file(PATHS['out_coops_traseearth'], driver='GeoJSON')
print(f"-> coops_traseearth saved to: {PATHS['out_coops_traseearth']}")
print("\n✅ All formats successfully generated!")
# ==========================================
# VALIDATION CHECKS
# ==========================================
print("\n--- Running Final Validation Checks ---")
df_2024_val = cam_coopyear_full[cam_coopyear_full['YEAR'] == 2024].copy()
ref_2023_val = df_master[df_master['YEAR'] == 2023].copy()
cols_2024 = set(df_2024_val.columns)
cols_2023 = set(ref_2023_val.columns)
missing_in_2024 = cols_2023 - cols_2024
if not missing_in_2024:
print("✅ Success: All mandatory 2023 columns are present in the 2024 output.")
else:
print(f"⚠️ Warning: Missing columns in 2024: {missing_in_2024}")
# =============================================================================
# NEW: VALIDATE MISSING FARMER YEARS ACROSS FORMATS
# =============================================================================
print("\n--- Checking for Years with Completely Missing Farmer Data ---")
# Dataframes to check and their respective farmer columns
dfs_to_check = {
"cam_long": (cam_long_full, 'NUM_FARMERS'), # or whatever the core farmer col is in long format
"cam_coopyear (and GeoJSON)": (cam_coopyear_full, 'TOTAL_FARMERS')
}
for name, (check_df, farmer_col) in dfs_to_check.items():
print(f"\nEvaluating {name}...")
if farmer_col not in check_df.columns:
print(f"⚠️ Alert: Column '{farmer_col}' not found in {name}.")
continue
for year in sorted(check_df['YEAR'].dropna().unique()):
year_mask = check_df['YEAR'] == year
if check_df.loc[year_mask, farmer_col].isna().all():
print(f" ⚠️ ALERT: All '{farmer_col}' values for the year {int(year)} are NaN or 0!")
else:
valid_count = check_df.loc[year_mask, farmer_col].notna().sum()
print(f" ✅ Year {int(year)} has {valid_count} valid farmer records.")
map_2024_ids = df_2024_val.groupby('SUPPLIER_FULLNAME')['COOP_ID'].unique().to_dict()
map_2023_ids = ref_2023_val.groupby('SUPPLIER_FULLNAME')['COOP_ID'].unique().to_dict()
map_2024_dists = df_2024_val.groupby('SUPPLIER_FULLNAME')['DISTRICT_NAME'].unique().to_dict()
map_2023_dists = ref_2023_val.groupby('SUPPLIER_FULLNAME')['DISTRICT_NAME'].unique().to_dict()
id_drift = []
for name, old_ids in map_2023_ids.items():
if name in map_2024_ids:
new_ids = map_2024_ids[name]
if not set(new_ids).issubset(set(old_ids)):
id_drift.append({'Name': name, '2023_IDs': old_ids.tolist(), '2024_IDs': new_ids.tolist(), '2023_Districts': map_2023_dists.get(name, []).tolist(), '2024_Districts': map_2024_dists.get(name, []).tolist()})
if not id_drift:
print("✅ Success: No ID drift detected. 2024 successfully inherited 2023 IDs.")
else:
drift_df = pd.DataFrame(id_drift)
print(f"⚠️ Alert: {len(id_drift)} 2024 coops have matched to multiple coops with different IDs from previous years!")
print("\n--- Top 50 Inconsistent Coops ---")
print(drift_df.head(50).to_string(index=False))
new_coops_mask = ~df_2024_val['COOP_ID'].isin(ref_2023_val['COOP_ID'])
new_coops_df = df_2024_val[new_coops_mask].drop_duplicates(subset=['COOP_ID'])
print(f"\nFound {len(new_coops_df)} brand new cooperatives for 2024.")
if len(new_coops_df) > 0:
print(new_coops_df.sort_values(by='TOTAL_FARMERS', ascending=False)[['SUPPLIER_FULLNAME', 'TOTAL_FARMERS', 'BUYER']].head())
# =============================================================================
# NEW: VALIDATE TRADER_NAME VALUES FOR 2024
# =============================================================================
print("\n--- Checking TRADER_NAME values for 2024 ---")
# Filter the long format dataframe for 2024 and grab unique trader names
traders_2024 = cam_long_full.loc[cam_long_full['YEAR'] == 2024, 'TRADER_NAME'].dropna().unique()
traders_2024_sorted = sorted(list(traders_2024))
print(f"Found {len(traders_2024_sorted)} unique disclosing traders in 2024:")
for t in traders_2024_sorted:
print(f" - {t}")
# Optional: Check how many rows are completely missing a TRADER_NAME
missing_trader_count = cam_long_full.loc[cam_long_full['YEAR'] == 2024, 'TRADER_NAME'].isna().sum()
print(f"\nNote: There are {missing_trader_count} flows in 2024 where TRADER_NAME is completely blank.")
--- Phase 1: Column Cleaning ---
--- Phase 2: Hybrid Imputation ---
Running secondary historical fuzzy imputation...
--- Phase 2.5: Advanced Geographic Recovery ---
Running initial spatial join...
Running Two-Way Geographic Sync...
--- Phase 3: CAM v4 Structure Alignment ---
--- Phase 4: Splitting Flows ---
--- Phase 5: Buying Station & Stable Coop ID Generation ---
--- Phase 6: Farmer Base Imputation & Calculation ---
Imputing missing 2024 farmer counts from 2023 history...
Imputing missing 2024 farmer counts from 2023 history...
--- Phase 7: Final Historical Recovery ---
--- Phase 7.5: Rolling forward missing 2023 links into 2024 ---
Identified 5454 historical links from 2023 missing in 2024. Rolling them forward...
Successfully appended historical links to the 2024 dataset.
--- Phase 8: Generating Output Datasets ---
Exporting cam_long format...
-> cam_long saved to: /Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_long_march30.csv
Exporting cam_coopyear format...
-> cam_coopyear saved to: /Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_coopyear_march30.csv
Exporting coops_traseearth GeoJSON format...
Imputing missing values in GeoJSON based on closest year...
-> coops_traseearth saved to: /Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_long_march30_traseearth.geojson
✅ All formats successfully generated!
--- Running Final Validation Checks ---
✅ Success: All mandatory 2023 columns are present in the 2024 output.
--- Checking for Years with Completely Missing Farmer Data ---
Evaluating cam_long...
✅ Year 2009 has 5 valid farmer records.
✅ Year 2010 has 13 valid farmer records.
✅ Year 2011 has 29 valid farmer records.
✅ Year 2012 has 72 valid farmer records.
✅ Year 2013 has 93 valid farmer records.
✅ Year 2014 has 93 valid farmer records.
✅ Year 2015 has 96 valid farmer records.
✅ Year 2016 has 112 valid farmer records.
✅ Year 2017 has 123 valid farmer records.
✅ Year 2018 has 134 valid farmer records.
✅ Year 2019 has 909 valid farmer records.
✅ Year 2020 has 805 valid farmer records.
✅ Year 2021 has 725 valid farmer records.
✅ Year 2022 has 730 valid farmer records.
✅ Year 2023 has 720 valid farmer records.
✅ Year 2024 has 1324 valid farmer records.
Evaluating cam_coopyear (and GeoJSON)...
✅ Year 2009 has 5 valid farmer records.
✅ Year 2010 has 15 valid farmer records.
✅ Year 2011 has 32 valid farmer records.
✅ Year 2012 has 76 valid farmer records.
✅ Year 2013 has 97 valid farmer records.
✅ Year 2014 has 97 valid farmer records.
✅ Year 2015 has 100 valid farmer records.
✅ Year 2016 has 115 valid farmer records.
✅ Year 2017 has 126 valid farmer records.
✅ Year 2018 has 137 valid farmer records.
✅ Year 2019 has 843 valid farmer records.
✅ Year 2020 has 926 valid farmer records.
✅ Year 2021 has 922 valid farmer records.
✅ Year 2022 has 1848 valid farmer records.
✅ Year 2023 has 1124 valid farmer records.
✅ Year 2024 has 1127 valid farmer records.
⚠️ Alert: 77 2024 coops have matched to multiple coops with different IDs from previous years!
--- Top 50 Inconsistent Coops ---
Name 2023_IDs 2024_IDs 2023_Districts 2024_Districts
3A [12] [10, 12] [nan] [None]
AGBS [80] [79, 80] [nan] [SIKENSI, None]
AGREL [89] [88, 89] [nan] [LAKOTA, None]
AHONDJO [103, 102] [102, 103, 104] [ABENGOUROU] [ABENGOUROU, None]
BARA [259] [258, 259, 261] [BANGOLO] [None, BANGOLO]
BIRBA [321] [321, 324] [SASSANDRA] [SASSANDRA, None]
CABA [470] [472] [DUEKOUE] [DUEKOUE]
CABD [493, 494, 487] [487, 493, 494, 496] [DUEKOUE, nan] [OUME, DUEKOUE, None]
CABG [521] [520, 521] [nan] [DALOA, None]
CABID [526] [526, 527, 529] [nan] [None, SAN-PEDRO]
CAES [705] [709] [AGNIBILEKRO] [None]
CANWG [923] [922, 923] [nan] [GAGNOA, None]
CAPEDIG [964] [966] [DUEKOUE] [DUEKOUE]
CAPKA [987] [985, 987] [nan] [None]
CAYAT [1199] [1198, 1199] [nan] [None]
CEXPAG [1300] [1298, 1300] [nan] [BLOLEQUIN, None]
COALIDAN [1403] [1403, 1405] [VAVOUA] [VAVOUA, None]
COASI [1433] [1433, 1434] [nan] [None, MEAGUI]
COOJAB [1548, 1550, 1551] [1548, 1550, 1551, 1553] [BETTIE, nan] [BETTIE, None]
COOPAC [1654] [1649, 1654] [nan] [ABENGOUROU, None]
COOPADO [1691] [1691, 1692] [nan] [None, SASSANDRA]
COOPASAM [1937] [1937, 1940] [SAN-PEDRO] [SASSANDRA, None]
COOPERATIVE CAPRESSA AGNITIE D'APPROMPRON [1006] [1006, 1009] [nan] [None, ABENGOUROU]
COOPERATIVE DES AGRICULTEURS DE DAHORO [14] [14, 60] [nan] [None, SAN-PEDRO]
COOPERATIVE KANWORI DE YAOU [3291] [3290, 3291] [nan] [ABOISSO, None]
COOPERATIVE LE ROCHER [3364] [3364, 3365] [nan] [None, SINFRA]
COOPINA [2462] [2462, 2466] [GAGNOA] [GAGNOA]
COOPYCA [2529] [2528, 2529] [ABENGOUROU] [ABENGOUROU]
CPAGB [2645] [2643, 2645, 2648] [nan] [SAN-PEDRO, None]
CPB [2667] [2667, 2668] [nan] [None, GAGNOA]
ECAMOM [2865] [2864, 2865, 2867] [SOUBRE] [MEAGUI, SOUBRE, None]
ECAO [2886, 2887] [2886, 2887, 2889] [SOUBRE, TABOU] [SOUBRE, TABOU, None]
ECATIE [2913] [2912, 2913] [nan] [None]
ECOAS [2948] [2947, 2948] [nan] [SAN-PEDRO, None]
ECOPAS [3045] [3045, 3047] [SAN-PEDRO] [SAN-PEDRO, None]
ECSP [3064] [3064, 3067] [SAN-PEDRO] [SAN-PEDRO, None]
ESPOIR [3148, 3149] [3148, 3149, 3156] [ABENGOUROU, nan] [ABENGOUROU, None]
ETC TAABO [3167] [3167, 3169] [TAABO] [TAABO, None]
FAHO [3192] [3192, 3195] [nan] [None]
GCT [3216] [3215, 3216] [MAN] [DUEKOUE, MAN]
KRIHIRI [3322] [3322, 3325] [GAGNOA] [GAGNOA, None]
NOUVELLE SCOOPRANYD [3499] [3498, 3499] [nan] [DUEKOUE, None]
PUSAH [3641] [3640, 3641] [nan] [SASSANDRA, None]
SAMA HS [3691] [3690, 3691] [nan] [ISSIA, None]
SCAB [3767] [3763, 3767] [SAN-PEDRO] [ALEPE, SAN-PEDRO]
SCAEB [3818] [3818, 3822] [BOUAFLE] [BOUAFLE, None]
SCAMG [3930] [3930, 3932] [nan] [None]
SCAND [3954, 3957, 3956] [3953, 3954, 3956, 3957] [GAGNOA, SAN-PEDRO, nan] [SOUBRE, GAGNOA, None, SAN-PEDRO]
SCANDI [3952] [3951, 3952] [nan] [OUME, None]
SCAT [4091, 4087] [4087, 4091, 4093] [MEAGUI, SAN-PEDRO] [SAN-PEDRO, MEAGUI, None]
Found 157 brand new cooperatives for 2024.
SUPPLIER_FULLNAME TOTAL_FARMERS \
31788 CENTRE D'ACHAT HASSAN KAMEL FTOUNI HKF6 NaN
31789 COOPERATIVE DES JEUNES PRODUCTEURS AGRICOLES D... NaN
31790 COOPERATIVE DES AMIS DE LA NATURE DE MAN COOPC... NaN
31791 CABID II NaN
31792 SCJPAB I NaN
BUYER
31788 SUCDEN
31789 COMPAGNIE CACAOYERE DU BANDAMA (CCB)
31790 IVORY COCOA PRODUCTS
31791 BARRY CALLEBAUT, NESTLE
31792 BARRY CALLEBAUT, NESTLE
--- Checking TRADER_NAME values for 2024 ---
Found 23 unique disclosing traders in 2024:
- BARONIE
- BARRY CALLEBAUT
- BLOMMER
- CARGILL
- CEMOI
- COCOASOURCE
- COMPAGNIE CACAOYERE DU BANDAMA (CCB)
- ECOM
- ECOOKIM
- ETC GROUP
- ETHIQUABLE
- FARMSTRONG
- FILDISI
- GUAN CHONG COCOA
- IVORY COCOA PRODUCTS
- JB FOODS
- KRUGER
- OLAM
- SACC
- SO B GREEN
- SUCDEN
- SUSCOM
- TOUTON
Note: There are 5448 flows in 2024 where TRADER_NAME is completely blank.
QA
import pandas as pd
import numpy as np
import geopandas as gpd
# example cam_long
cam_long_ex = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_coops_like_camlong_marchfixes.csv")
print(cam_long_ex)
# example cam_coopyear
cam_coopyear_ex = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_coops_like_camcoopyear_marchfixes.csv")
print(cam_coopyear_ex)
# example coops_for_traseearth
coops_traseearth_ex = gpd.read_file("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/cote-d-ivoire-cocoa-cooperatives.geojson")
print(coops_traseearth_ex)
<positron-console-cell-27>:6: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME \
0 2009-1 448 2009 CAAP
1 2009-2 726 2009 CAFHS
2 2009-3 1734 2009 COOPAGA
3 2009-4 2906 2009 ECASO
4 2009-5 4783 2009 SOCAB
... ... ... ... ...
35081 2024-7421 5792 2024 ECAM SASSANDRA
35082 2024-7422 6000 2024 SUF
35083 2024-7423 6185 2024 TOULEPLEU
35084 2024-7424 5789 2024 SANAB
35085 2024-7425 6339 2024 CCAPUO
SUPPLIER_FULLNAME LATITUDE LONGITUDE \
0 COOPERATIVE ANOUANZE DE PETIT BOUAKE 5.890 -6.376
1 COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA 6.891 -6.461
2 COOPERATIVE AGRICOLE GENERALE DE GAGNOA 5.022 -6.550
3 COOPERATIVE DES AGRICULTEURS DU SUD-OUEST 5.785 -6.597
4 COOPERATIVE DES AGRICULTEURS DE BOUAFLE 6.263 -6.999
... ... ... ...
35081 ENTREPRISE COOPERATIVE DES AGRICULTEURS DE MEA... NaN NaN
35082 COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... 6.739 -7.353
35083 SOCIETE COOPERATIVE AVEC CONSEIL DADMINISTRATI... 6.492 -6.585
35084 SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... 4.957 -6.085
35085 SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747 -7.362
DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 \
0 BOUAKE CI-11.1.3_1 CARGILL ... False
1 SASSANDRA CI-2.1.2_1 CARGILL ... False
2 GAGNOA CI-5.1.1_1 CARGILL ... False
3 SOUBRE CI-2.2.5_1 CARGILL ... False
4 BOUAFLE CI-9.2.1_1 CARGILL ... False
... ... ... ... ... ...
35081 MEAGUI CI-2.2.3_1 ECOM ... NaN
35082 DUEKOUE CI-8.2.2_1 ECOM ... NaN
35083 TOULEUPLEU CI-8.1.4_1 ECOM ... NaN
35084 BANGOLO CI-8.2.1_1 ECOM ... NaN
35085 DUEKOUE CI-8.2.2_1 ECOM ... NaN
IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER \
0 True False 277.0 False True
1 True True 838.0 False True
2 True True 1190.0 False True
3 True True 735.0 False True
4 False False 237.0 False True
... ... ... ... ... ...
35081 NaN NaN NaN NaN False
35082 NaN NaN NaN NaN False
35083 NaN NaN NaN NaN False
35084 NaN NaN NaN NaN False
35085 NaN NaN NaN NaN False
!NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
0 False False SOUBRE False
1 False False DALOA False
2 False False SAN-PEDRO False
3 False False SOUBRE False
4 False False BUYO False
... ... ... ... ...
35081 NaN NaN CI-2.1.2_1 False
35082 NaN NaN CI-8.2.2_1 False
35083 NaN NaN CI-8.1.4_1 False
35084 NaN NaN CI-8.2.1_1 False
35085 NaN NaN CI-8.2.2_1 False
[35086 rows x 43 columns]
COOP_ID YEAR SUPPLIER_ABRVNAME \
0 1 2019 0UBE
1 1 2020 0UBE
2 1 2021 0UBE
3 1 2022 0UBE
4 1 2023 0UBE
... ... ... ...
23814 6329 2024 ANOU
23815 6330 2024 SCAETHS
23816 6331 2024 SS_DAKUA
23817 6332 2024 SORANHS
23818 6339 2024 CCAPUO
SUPPLIER_FULLNAME LATITUDE LONGITUDE \
0 COOPERATIVE AGRICOLE OUBE DE BANOUFLA 6.878 -6.450
1 COOPERATIVE AGRICOLE OUBE DE BANOUFLA 6.878 -6.450
2 COOPERATIVE AGRICOLE OUBE DE BANOUFLA 6.878 -6.450
3 COOPERATIVE AGRICOLE OUBE DE BANOUFLA 6.878 -6.450
4 COOPERATIVE AGRICOLE OUBE DE BANOUFLA 6.878 -6.450
... ... ... ...
23814 COOPERATIVE AGRICOLE DE COOPANOU NaN NaN
23815 SOCIETE COOPERATIVE AGRICOLE ELIE TIKPA DE DALOA 6.874 -6.458
23816 SCOOP-DAKUA NaN NaN
23817 SOCIETE COOPERATIVE DES PRODUCTEURS AGRICOLES ... 6.880 -6.448
23818 SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747 -7.362
DISTRICT_NAME DISTRICT_GEOCODE CERTIFICATIONS TOTAL_FARMERS_NONTRADER \
0 DALOA CI-9.1.1_1 NaN 0.0
1 DALOA CI-9.1.1_1 NaN 0.0
2 DALOA CI-9.1.1_1 NaN 0.0
3 DALOA CI-9.1.1_1 NaN 0.0
4 DALOA CI-9.1.1_1 NaN 0.0
... ... ... ... ...
23814 TABOU CI-2.3.2_1 NaN NaN
23815 DALOA CI-9.1.1_1 NaN NaN
23816 GUIGLO CI-8.1.2_1 NaN NaN
23817 SASSANDRA CI-2.1.2_1 NaN NaN
23818 DUEKOUE CI-8.2.2_1 NaN NaN
... SIMPLIF_ABRVNAME \
0 ... 0UBE
1 ... 0UBE
2 ... 0UBE
3 ... 0UBE
4 ... 0UBE
... ... ...
23814 ... COOPANOU
23815 ... NaN
23816 ... DAKUA
23817 ... SOCOOPRANHS
23818 ... NaN
CCTN_COOP_ID IS_ALL_CAM_V3 \
0 0UBE_-6.4_6.9_COOPERATIVE AGRICOLE OUBE DE BAN... True
1 NaN True
2 NaN True
3 NaN True
4 NaN True
... ... ...
23814 COOPANOU_NA_NA_COOPERATIVE AGRICOLE DE COOPANOU False
23815 NaN NaN
23816 DAKUA_NA_NA_SCOOP-DAKUA False
23817 SOCOOPRANHS_NA_NA_COOPERATIVE DES PRODUCTEURS ... False
23818 NaN NaN
IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR !NOT_RFA \
0 True False NaN NaN
1 True NaN NaN NaN
2 True NaN NaN NaN
3 True NaN NaN NaN
4 True NaN NaN NaN
... ... ... ... ...
23814 True False NaN False
23815 NaN NaN NaN NaN
23816 False False NaN False
23817 False False NaN False
23818 NaN NaN NaN NaN
unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
0 NaN DALOA False
1 NaN NaN True
2 NaN NaN True
3 NaN NaN True
4 NaN NaN True
... ... ... ...
23814 False CI-2.3.2_1 False
23815 NaN CI-9.1.1_1 False
23816 False CI-8.1.2_1 False
23817 False CI-9.1.1_1 False
23818 NaN CI-8.2.2_1 False
[23819 rows x 34 columns]
year repeated_from_past_year abbreviated_name \
0 2019 False 0UBE
1 2019 False 2 AD
2 2019 False 2A SCCOPS
3 2019 False 2AP
4 2019 False 2A
... ... ... ...
26017 2023 True ZOUAN KOUALOU
26018 2023 True ZOUEUDO
26019 2023 True ZOUTOKEGUI
26020 2023 True ZRANLEU
26021 2023 True ZSB
full_name latitude longitude \
0 COOPERATIVE AGRICOLE OUBE DE BANOUFLA 6.878 -6.450
1 2 AD 6.720 -3.503
2 COOPERATIVE LES AGRICULTEURS D'AKOUPE 5.477 -4.151
3 COOPERATIVE AGRICOLE ANOUANZE DE PARA NaN NaN
4 2A 5.480 -4.155
... ... ... ...
26017 COOPERATIVE AGRICOLE DE ZOUAN-HOUNIEN 6.915 -8.211
26018 COOPERATIVE AGRICOLE ZOUEUDO DE MAHAPLEU 7.256 -8.156
26019 COOPERATIVE AGRICOLE ZOUTOKEGUI DE GOURANE 7.734 -7.618
26020 COOPERATIVE ZRANLEU DE MAN 7.408 -7.549
26021 ZSB 7.404 -7.554
department_name department_id department_trase_id coop_stable_id \
0 DALOA CI-9.1.1_1 CI-090101 1
1 ABENGOUROU CI-3.1.1_1 CI-030101 2
2 ABIDJAN CI-1.1.1_1 CI-010101 3
3 None None None 4
4 ABIDJAN CI-1.1.1_1 CI-010101 5
... ... ... ... ...
26017 ZOUAN-HOUNIEN CI-8.3.5_1 CI-080305 5761
26018 DANANE CI-8.3.2_1 CI-080302 5762
26019 BIANKOUMA CI-8.3.1_1 CI-080301 5763
26020 MAN CI-8.3.3_1 CI-080303 5764
26021 MAN CI-8.3.3_1 CI-080303 5766
buyers buyers_estimated_farmer_base buyers_list geometry
0 None NaN [] POINT (-6.45 6.878)
1 None NaN [] POINT (-3.503 6.72)
2 None NaN [] POINT (-4.151 5.477)
3 None NaN [] None
4 None NaN [] POINT (-4.155 5.48)
... ... ... ... ...
26017 None NaN [] POINT (-8.211 6.915)
26018 None NaN [] POINT (-8.156 7.256)
26019 None NaN [] POINT (-7.618 7.734)
26020 None NaN [] POINT (-7.549 7.408)
26021 None NaN [] POINT (-7.554 7.404)
[26022 rows x 14 columns]
import geopandas as gpd
gdf = gpd.read_file("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/csvs/2024_civ_coops_like_cam_long_march30_traseearth.geojson")
gdf
| year | repeated_from_past_year | abbreviated_name | full_name | latitude | longitude | department_name | department_id | department_trase_id | coop_stable_id | buyers | buyers_estimated_farmer_base | buyers_list | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019 | False | 0UBE | COOPERATIVE AGRICOLE OUBE DE BANOUFLA | 6.878000 | -6.450000 | DALOA | CI-9.1.1_1 | CI-090101 | 1 | None | NaN | [] | POINT (-6.45 6.878) |
| 1 | 2019 | False | 2 AD | 2 AD | 6.720000 | -3.503000 | ABENGOUROU | CI-3.1.1_1 | CI-030101 | 2 | None | NaN | [] | POINT (-3.503 6.72) |
| 2 | 2019 | False | 2A SCCOPS | COOPERATIVE LES AGRICULTEURS D'AKOUPE | 5.477000 | -4.151000 | ABIDJAN | CI-1.1.1_1 | CI-010101 | 3 | None | NaN | [] | POINT (-4.151 5.477) |
| 3 | 2019 | False | 2A | 2A | 5.480000 | -4.155000 | ABIDJAN | CI-1.1.1_1 | CI-010101 | 5 | None | NaN | [] | POINT (-4.155 5.48) |
| 4 | 2019 | False | 3AS | COOPERATIVE AGRICOLE LES 3 AS | 6.106000 | -3.866000 | ADZOPE | CI-7.3.1_1 | CI-070301 | 6 | None | NaN | [] | POINT (-3.866 6.106) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27423 | 2024 | False | ZAGTOULI | ZAGTOULI | 7.262500 | -8.120300 | DANANE | CI-8.3.2_1 | CI-080302 | 5914 | ETC GROUP | NaN | [ETC GROUP] | POINT (-8.1203 7.2625) |
| 27424 | 2024 | False | ECOOKIM COOPAPIG | ECOOKIM COOPAPIG | 7.050014 | -7.386513 | BANGOLO | CI-8.2.1_1 | CI-080201 | 5915 | MARS | NaN | [MARS] | POINT (-7.38651 7.05001) |
| 27425 | 2024 | False | ECOOKIM CAVA | ECOOKIM CAVA | 7.375600 | -6.476274 | VAVOUA | CI-9.1.3_1 | CI-090103 | 5916 | MARS | NaN | [MARS] | POINT (-6.47627 7.3756) |
| 27426 | 2024 | False | ECOOKIM CAUD | ECOOKIM CAUD | 5.825625 | -5.340961 | DIVO | CI-5.2.1_1 | CI-050201 | 5917 | MARS | NaN | [MARS] | POINT (-5.34096 5.82563) |
| 27427 | 2024 | False | ECOOKIM CAKIB | ECOOKIM CAKIB | 4.647175 | -6.932056 | SAN-PEDRO | CI-2.3.1_1 | CI-020301 | 5918 | MARS | NaN | [MARS] | POINT (-6.93206 4.64718) |
27428 rows × 14 columns
for i,n in enumerate(gdf.buyers.unique()):
print(i,n)
0 None
1 OLAM
2 FERRERO
3 CARGILL, OLAM
4 BLOMMER, NESTLE, OLAM
5 BARRY CALLEBAUT
6 BARRY CALLEBAUT, HERSHEY
7 BARRY CALLEBAUT, BLOMMER, NESTLE, OLAM, TOUTON
8 BLOMMER, OLAM, TOUTON
9 BARRY CALLEBAUT, FERRERO
10 CARGILL
11 CARGILL, NESTLE
12 CARGILL, CEMOI
13 CEMOI
14 ECOM, NESTLE
15 NESTLE, OLAM
16 MONDELEZ
17 BARRY CALLEBAUT, MONDELEZ
18 MARS
19 BARRY CALLEBAUT, MARS
20 ETC GROUP
21 CEMOI, OLAM
22 FERRERO, NESTLE, SUCDEN
23 ETC GROUP, NESTLE
24 UNILEVER
25 HERSHEY
26 FERRERO, TOUTON
27 ECOM, LINDT SPRUNGLI
28 ECOM, HERSHEY
29 ECOM
30 BARRY CALLEBAUT, ETC GROUP, NESTLE, TOUTON
31 BLOMMER, HERSHEY, OLAM
32 SUCDEN
33 BLOMMER, HERSHEY
34 CARGILL, MARS
35 NESTLE, SUCDEN
36 CARGILL, MONDELEZ
37 MARS, OLAM
38 CARGILL, FERRERO
39 BARRY CALLEBAUT, NESTLE
40 HERSHEY, OLAM
41 MONDELEZ, OLAM
42 BLOMMER, OLAM
43 NESTLE
44 BARRY CALLEBAUT, OLAM
45 ECOM, OLAM
46 BARRY CALLEBAUT, CARGILL, MARS
47 FERRERO, OLAM
48 ECOM, PURATOS
49 BARRY CALLEBAUT, MARS, OLAM
50 ECOM, HERSHEY, MARS, SUCDEN
51 MARS, SUCDEN
52 TOUTON
53 BLOMMER
54 CARGILL, HERSHEY
55 VALRHONA
56 ECOOKIM
57 CEMOI, MONDELEZ, OLAM
58 BLOMMER, MARS, OLAM
59 HERSHEY, OLAM, SUCDEN
60 FILDISI, MARS
61 ECOM, FERRERO
62 BARRY CALLEBAUT, BLOMMER, OLAM
63 BLOMMER, MARS
64 BARRY CALLEBAUT, CARGILL
65 BARRY CALLEBAUT, CARGILL, NESTLE
66 BLOMMER, MARS, TOUTON
67 CARGILL, MARS, OLAM
68 BARRY CALLEBAUT, CARGILL, MARS, MONDELEZ, OLAM
69 BARRY CALLEBAUT, CARGILL, MARS, OLAM
70 BARRY CALLEBAUT, HERSHEY, MONDELEZ
71 BARRY CALLEBAUT, CARGILL, HERSHEY
72 BLOMMER, CEMOI, HERSHEY
73 GUAN CHONG COCOA, NESTLE
74 BARRY CALLEBAUT, MARS, MONDELEZ, OLAM
75 CARGILL, CEMOI, HERSHEY
76 CEMOI, NESTLE, OLAM, SUCDEN
77 BARRY CALLEBAUT, FERRERO, HERSHEY
78 BARRY CALLEBAUT, ECOM, HERSHEY, OLAM
79 ECOM, MONDELEZ
80 BLOMMER, CEMOI, HERSHEY, OLAM
81 ECOOKIM, MARS
82 BLOMMER, CEMOI
83 BARRY CALLEBAUT, CEMOI, FERRERO, MONDELEZ, OLAM, SUCDEN
84 ECOM, FERRERO, LINDT SPRUNGLI
85 NESTLE, TOUTON
86 BLOMMER, ECOM
87 MARS, NESTLE, SUCDEN
88 BARRY CALLEBAUT, MARS, MONDELEZ
89 BARRY CALLEBAUT, HERSHEY, OLAM
90 CEMOI, MARS, OLAM
91 ALFRED RITTER
92 ALFRED RITTER, CARGILL
93 ALFRED RITTER, BARRY CALLEBAUT, CARGILL, FERRERO
94 ALFRED RITTER, BARRY CALLEBAUT, CARGILL, CEMOI, FERRERO
95 CARGILL, ECOM, MARS, NESTLE
96 ECOM, FERRERO, MONDELEZ
97 COCOASOURCE, ECOM
98 COCOASOURCE
99 CEMOI, FERRERO, TOUTON
100 BARRY CALLEBAUT, FERRERO, TOUTON
101 BARRY CALLEBAUT, NESTLE, OLAM
102 BLOMMER, NESTLE
103 BLOMMER, FERRERO, OLAM, SUCDEN
104 PURATOS
105 FERRERO, MARS
106 COCOASOURCE, FERRERO
107 BLOMMER, ECOM, MARS
108 BLOMMER, NESTLE, SUCDEN
109 BARRY CALLEBAUT, ECOM, NESTLE
110 BLOMMER, ECOM, HERSHEY
111 ALTER ECO
112 MARS, TOUTON
113 BARRY CALLEBAUT, ETC GROUP
114 ALTER ECO, CARGILL, ECOM, MONDELEZ
115 ECOM, FERRERO, HERSHEY
116 BARRY CALLEBAUT, HERSHEY, MARS, OLAM
117 ECOM, MARS
118 BARRY CALLEBAUT, SUCDEN
119 BLOMMER, CARGILL, NESTLE, TOUTON
120 CARGILL, ECOM, MONDELEZ
121 CARGILL, OLAM, PURATOS
122 CARGILL, MARS, NESTLE, OLAM
123 NESTLE, SO B GREEN
124 CEMOI, MARS, SUCDEN
125 NESTLE, OLAM, SUCDEN
126 CEMOI, ETC GROUP
127 ETHIQUABLE
128 BARRY CALLEBAUT, ETC GROUP, MONDELEZ
129 ECOM, FILDISI
130 ECOM, FILDISI, SUCDEN
131 CARGILL, KRUGER, SUSCOM
132 BLOMMER, ETC GROUP
133 ECOM, LINDT SPRUNGLI, MARS
134 BARRY CALLEBAUT, BLOMMER, FERRERO, TOUTON
135 BARRY CALLEBAUT, BLOMMER, TOUTON
136 BARRY CALLEBAUT, TOUTON
137 BARRY CALLEBAUT, FERRERO, MARS
138 BARRY CALLEBAUT, BLOMMER, MARS
139 FERRERO, SUCDEN
140 CARGILL, GUAN CHONG COCOA, MARS, NESTLE, OLAM
141 BLOMMER, CEMOI, HERSHEY, OLAM, SUCDEN
142 ECOM, TOUTON
143 BARONIE, BARRY CALLEBAUT, CEMOI, COCOASOURCE, ECOOKIM, TONYS CHOCOLONELY
144 FERRERO, OLAM, SUCDEN
145 BARRY CALLEBAUT, ECOM
146 CEMOI, FERRERO
147 ALBERT HEIJN, FERRERO
148 OLAM, SUCDEN
149 CEMOI, SUCDEN
150 BLOMMER, MARS, OLAM, TOUTON
151 BARRY CALLEBAUT, BLOMMER, MARS, TOUTON
152 ETC GROUP, FERRERO, NESTLE
153 CARGILL, HERSHEY, NESTLE
154 MARS, NESTLE, OLAM, SUCDEN
155 MARS, MONDELEZ, OLAM
156 CEMOI, OLAM, SUCDEN
157 ECOM, HERSHEY, OLAM
158 CEMOI, MONDELEZ, OLAM, SUCDEN
159 BLOMMER, OLAM, SUCDEN
160 MARS, OLAM, SUCDEN
161 BLOMMER, SUCDEN
162 BLOMMER, TOUTON
163 ETC GROUP, TOUTON
164 CEMOI, HERSHEY
165 HERSHEY, MONDELEZ
166 BLOMMER, CEMOI, OLAM
167 FARMSTRONG, NESTLE
168 NESTLE, OLAM, UNILEVER
169 CARGILL, NESTLE, OLAM
170 FERRERO, NESTLE, OLAM
171 ETC GROUP, OLAM
172 OLAM, UNILEVER
173 SACC
174 ECOM, LINDT SPRUNGLI, NESTLE
175 BARRY CALLEBAUT, HERSHEY, TOUTON
176 BARONIE, BARRY CALLEBAUT, CEMOI, COCOASOURCE, ECOOKIM, FERRERO, MARS, TONYS CHOCOLONELY
177 ETC GROUP, FERRERO, MARS
178 CEMOI, NESTLE, OLAM
179 BLOMMER, ETC GROUP, NESTLE
180 NESTLE, TOUTON, UNILEVER
181 BARRY CALLEBAUT, ECOM, FERRERO, LINDT SPRUNGLI
182 ETC GROUP, UNILEVER
183 BARRY CALLEBAUT, BLOMMER, OLAM, UNILEVER
184 BLOMMER, FERRERO, OLAM
185 BARRY CALLEBAUT, FERRERO, NESTLE
186 BARRY CALLEBAUT, NESTLE, OLAM, UNILEVER
187 ECOM, LINDT SPRUNGLI, PURATOS
188 CARGILL, OLAM, UNILEVER
189 ECOM, LINDT SPRUNGLI, PURATOS, SUCDEN
190 CARGILL, UNILEVER
191 BLOMMER, ECOM, ECOOKIM
192 BARRY CALLEBAUT, FERRERO, NESTLE, OLAM, UNILEVER
193 BARONIE, BARRY CALLEBAUT, CARGILL, CEMOI, COCOASOURCE, ECOOKIM, NESTLE, TONYS CHOCOLONELY, UNILEVER
194 BARRY CALLEBAUT, UNILEVER
195 FERRERO, NESTLE, OLAM, SUCDEN
196 BARONIE, BARRY CALLEBAUT, CARGILL, CEMOI, COCOASOURCE, ECOOKIM, MARS, OLAM, TONYS CHOCOLONELY, UNILEVER
197 BLOMMER, ECOM, ETC GROUP, FILDISI
198 CEMOI, MARS
199 ETC GROUP, FERRERO
200 BARRY CALLEBAUT, FERRERO, NESTLE, SUCDEN
201 BARRY CALLEBAUT, ECOM, FERRERO, LINDT SPRUNGLI, OLAM
202 CARGILL, ECOM, LINDT SPRUNGLI
203 BARONIE, BARRY CALLEBAUT, CEMOI, COCOASOURCE, ECOOKIM, ETC GROUP, NESTLE, OLAM, TONYS CHOCOLONELY, UNILEVER
204 BARRY CALLEBAUT, MARS, UNILEVER
205 FERRERO, MARS, OLAM
206 BARONIE, BARRY CALLEBAUT, CARGILL, CEMOI, COCOASOURCE, ECOM, ECOOKIM, LINDT SPRUNGLI, TONYS CHOCOLONELY
207 BARRY CALLEBAUT, CARGILL, FERRERO
208 BARRY CALLEBAUT, CARGILL, CEMOI, FERRERO, GODIVA, PURATOS
209 CARGILL, MARS, NESTLE
210 BARONIE, BARRY CALLEBAUT, CARGILL, CEMOI, COCOASOURCE, ECOOKIM, MARS, TONYS CHOCOLONELY, UNILEVER
211 BARONIE, BARRY CALLEBAUT, CEMOI, COCOASOURCE, ECOOKIM, FERRERO, TONYS CHOCOLONELY, UNILEVER
212 FERRERO, PURATOS
213 ETC GROUP, MARS
214 BARRY CALLEBAUT, CARGILL, ETC GROUP
215 ETC GROUP, OLAM, PURATOS
216 BARRY CALLEBAUT, ECOM, LINDT SPRUNGLI, PURATOS
217 ECOM, FERRERO, OLAM
218 BARRY CALLEBAUT, MARS, NESTLE
219 BARRY CALLEBAUT, ECOM, LINDT SPRUNGLI
220 BARRY CALLEBAUT, ETC GROUP, NESTLE
221 BARRY CALLEBAUT, ETC GROUP, FERRERO
222 NESTLE, OLAM, TOUTON
223 CARGILL, ECOM, LINDT SPRUNGLI, PURATOS
224 ECOM, ETC GROUP, FILDISI, LINDT SPRUNGLI, SUCDEN
225 ETC GROUP, NESTLE, OLAM
226 BLOMMER, ECOM, LINDT SPRUNGLI, SUCDEN
227 CARGILL, KRUGER, OLAM, PURATOS, SUSCOM
228 BARONIE, BARRY CALLEBAUT, CEMOI, COCOASOURCE, ECOOKIM, ETC GROUP, MARS, TONYS CHOCOLONELY
229 BARRY CALLEBAUT, BLOMMER, MARS, OLAM
230 BARONIE, BARRY CALLEBAUT, CARGILL, CEMOI, COCOASOURCE, ECOOKIM, TONYS CHOCOLONELY
231 BARRY CALLEBAUT, BLOMMER, NESTLE, SO B GREEN, SUCDEN
232 BLOMMER, CEMOI, MARS, OLAM, SUCDEN
233 OLAM, PURATOS
234 BARRY CALLEBAUT, CARGILL, GUAN CHONG COCOA, NESTLE
235 BARONIE, BARRY CALLEBAUT, CEMOI, COCOASOURCE, ECOOKIM, TONYS CHOCOLONELY, UNILEVER
236 BARRY CALLEBAUT, GUAN CHONG COCOA, NESTLE, OLAM
237 BARRY CALLEBAUT, BLOMMER, MARS, OLAM, TOUTON
238 ECOM, LINDT SPRUNGLI, NESTLE, OLAM