Clean 2024 csv column names
View or edit on GitHub
This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/clean_2024_csv_column_names.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
import pandas as pd
import numpy as np
df = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/messy_2024_combined.csv")
df = df.copy()
print(df.shape)
# copy over value from DISCL_COUNTRY_NAME where DISCL_COUNTRY is NaN and vice versa
mask1 = df['DISCL_COUNTRY'].isna() & df['DISCL_COUNTRY_NAME'].notna()
df.loc[mask1, 'DISCL_COUNTRY'] = df.loc[mask1, 'DISCL_COUNTRY_NAME']
mask2 = df['DISCL_COUNTRY_NAME'].isna() & df['DISCL_COUNTRY'].notna()
df.loc[mask2, 'DISCL_COUNTRY_NAME'] = df.loc[mask2, 'DISCL_COUNTRY']
mask = (df['DISCL_COUNTRY'].isin(["Côte d'Ivoire", " Ivory Coast", "Cote d'Ivoire", np.nan])) | \
(df['DISCL_COUNTRY_NAME'].isin(["IVORY COAST", "CIV", np.nan])) | \
(df['COUNTRY'].isin([np.nan, "Cote d'Ivoire", 'Ivory Coast', "Côte d'Ivoire",'CÔTE D’IVOIRE', 'CIV'])) | \
(df['COUNTRY_NAME'].isin([np.nan, "IVORY COAST"]))
df = df[mask].copy() # avoid SettingWithCopyWarning
print(df.shape)
import pandas as pd
import numpy as np
import unicodedata
# prefer keeping df as a copy when mutating
df = df.copy()
# columns involved (adjust names if you have slightly different names/casing)
cols = ['COUNTRY', 'DISCL_COUNTRY', 'DISCL_COUNTRY_NAME', 'COUNTRY_NAME']
# 1) Normalise empty strings -> NA across these columns
df[cols] = df[cols].replace(r'^\s*$', pd.NA, regex=True)
# 2) Build COUNTRY by taking the first non-null value in the preferred order:
# (this is vectorised and fast)
df['COUNTRY'] = df[cols].bfill(axis=1).iloc[:, 0]
# 3) Optional normalisation: strip, remove accents, uppercase
def normalize_country(val):
if pd.isna(val):
return val
s = str(val).strip()
# remove accents (Côte -> Cote) and force ASCII
s = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore').decode('ASCII')
return s.upper()
df['COUNTRY'] = df['COUNTRY'].apply(normalize_country)
# 4) Drop the other country columns (keep only df['COUNTRY'])
df.drop(columns=['DISCL_COUNTRY', 'DISCL_COUNTRY_NAME', 'COUNTRY_NAME'], inplace=True)
# Quick checks
print("shape:", df.shape)
print("COUNTRY value counts (top):")
print(df['COUNTRY'].value_counts(dropna=False).head(10))
shape: (2250, 39)
COUNTRY value counts (top):
IVORY COAST 1004
COTE D'IVOIRE 664
CIV 198
COTE DIVOIRE 100
GHANA 83
CAMEROON 61
NIGERIA 34
ECUADOR 21
GHA 17
BRAZIL 15
Name: COUNTRY, dtype: int64
df = df[df.COUNTRY.isin(["IVORY COAST", "COTE D'IVOIRE", "CIV", "COTE DIVOIRE", np.nan])].copy()
df.COUNTRY.unique()
array(["COTE D'IVOIRE", 'IVORY COAST', nan, 'COTE DIVOIRE', 'CIV'],
dtype=object)
df['COUNTRY'] = "IVORY_COAST"
import pandas as pd
# copy if you want to keep original
df = df.copy()
# Uppercase every cell only when it's a str (preserves NaN, numbers, lists, dicts, ...)
df = df.applymap(lambda v: v.upper() if isinstance(v, str) else v)
# Uppercase column names (still vectorised)
df.columns = df.columns.map(lambda c: c.upper() if isinstance(c, str) else c)
# df.to_csv("just_CIV_messy_2024.csv", index=False)
# Standardise names in text
# Decide which values should be under which columns
# Impute missing values using 2023 dataset
df_2023 = pd.read_csv('/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/world_cocoa_logistics/out/consolidated_disclosures.csv', sep=';')
df_2023[df_2023.DISCL_YEAR == 2023].shape
(770, 23)
df_2023[df_2023.DISCL_YEAR == 2022].shape
(3363, 23)
df_2023[df_2023.DISCL_YEAR == 2021].shape
(926, 23)
df_2023[df_2023.DISCL_YEAR == 2020].shape
(765, 23)
df_2023.COMPANY.unique()
array(['BARRY_CALLEBAUT', 'CARGILL', 'HERSHEY', 'MARS', 'ALFRED RITTER',
'ALTER ECO', 'CEMOI', 'COLRUYT', 'ECOM', 'ECOOKIM', 'ETG',
'FERRERO', 'HALBA', 'MONDELEZ', 'NESTLE', 'OLAM', 'ONEM',
'PURATOS', 'SUCDEN', 'TONYS_CHOCOLONELY', 'UNCOMMON COCOA',
'VALRHONA', 'WHITTAKER', nan, 'BARRY CALLEBAUT', 'COCOANECT',
'TOUTON', 'BLOMMER', 'COCOASOURCE', "TONY'S CHOCOLONELY",
'ETHIQUABLE', 'ALBERT HEIJN', 'RAINFOREST ALLIANCE'], dtype=object)
df.COMPANY.unique()
array([nan, 'NESTLE', 'PURATOS', 'TONYS CHOCOLONELY', 'ONEM', 'FERRERO',
'CARGILL', 'BLOMMER CHOCOLATE COMPANY', 'OFI', 'UNILEVER',
'VALRHONA', 'GODIVA', 'WHITTAKERS', 'ALTER ECO', 'ETG', 'MARS',
'TOUTON', 'ECOM'], dtype=object)
df.loc[2246,:]
TRADER_NAME NaN
DISCL_TRADER_NAME NaN
DISCL_SUPPLIER_ABRVNAME CAFUGO COOP CA
DISCL_AREA_NAME NaN
DISCL_CERTIFICATION_NAME NaN
DISCL_NUMBER_FARMERS NaN
DISCL_LONGITUDE NaN
DISCL_LATITUDE NaN
COUNTRY IVORY_COAST
TYPE NaN
DISCL_YEAR 2024.0
DISCL_SUPPLIER_TYPE COOPERATIVE
COMPANY ECOM
DISCL_SUPPLIER_FULLNAME COOPÉRATIVE AGRICOLE DES FRÈRES UNIS DE GOBROKO
LOCALITY_NAME NaN
INDEX NaN
DISCL_SUPPLIER_NAME NaN
DISCL_VOLUMES NaN
DISCL_VOLUMES_UNIT NaN
DISCL_CERTIFICATION_PCT NaN
DISTRICT_NAME SASSANDRA
DISTRICT_GEOCODE NaN
CAM_BUYERS NaN
IS_CAM_V3 NaN
IS_JRC NaN
JRC_BUYER_ID NaN
YEAR NaN
DISCL_SUPPLIER_TYPE NaN
DISCL_YEAR NaN
TRADER NaN
SUPPLIER_ABRVNAME NaN
SUPPLIER_FULLNAME NaN
LONGITUDE NaN
LATITUDE NaN
NUMBER_FARMERS NaN
VOLUMES NaN
VOLUMES_UNIT NaN
CERTIFICATION_NAME NaN
CERTIFICATION_PCT NaN
Name: 2246, dtype: object
# How did Valentin handle traders vs companies?
df_2023.columns
Index(['COMPANY', 'DISCL_COUNTRY_NAME', 'DISCL_TRADER_NAME', 'DISCL_AREA_NAME',
'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
'DISCL_YEAR', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME', 'DISTRICT_GEOCODE',
'LOCALITY_NAME', 'TRADER_NAME', 'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC',
'JRC_BUYER_ID'],
dtype='object')
print(df_2023.COMPANY.unique())
['BARRY_CALLEBAUT' 'CARGILL' 'HERSHEY' 'MARS' 'ALFRED RITTER' 'ALTER ECO'
'CEMOI' 'COLRUYT' 'ECOM' 'ECOOKIM' 'ETG' 'FERRERO' 'HALBA' 'MONDELEZ'
'NESTLE' 'OLAM' 'ONEM' 'PURATOS' 'SUCDEN' 'TONYS_CHOCOLONELY'
'UNCOMMON COCOA' 'VALRHONA' 'WHITTAKER' nan 'BARRY CALLEBAUT' 'COCOANECT'
'TOUTON' 'BLOMMER' 'COCOASOURCE' "TONY'S CHOCOLONELY" 'ETHIQUABLE'
'ALBERT HEIJN' 'RAINFOREST ALLIANCE']
print(df_2023.DISCL_TRADER_NAME.unique())
[nan 'CEMOI COTE D‘IVOIRE' 'Fuchs & Hoffmann Kakaoprodukte GmbH'
'SUSCOM-CI' 'CARGILL' 'Ascot Amsterdam B.V.' 'Olam International Limited'
'Ritter Sport Nicaragua S.A' 'Minka' 'CEMOI' 'ECOM' 'ECOOKIM' 'ETG'
'DAARNHOUWER & CO.' 'Walter Matter Sa' 'BARRY CALLEBAUT Cocoa AG'
'Cargill B.V.' 'Ecom Agrotrade Ltd.' 'Olam International Ltd.'
'Touton SA' 'Barry Callebaut Cocoa AG' 'Barry Callebaut' 'Blommer/Olam'
'Cargill' 'Cocoanect' 'Ecom' 'Sucden' 'Touton' 'COCOANECT' 'OLAM'
'Fildisi Cocoa Industry' 'SUCDEN' 'COCOASOURCE' 'UNCOMMON COCOA']
df_2023[df_2023.COMPANY == "TONY'S CHOCOLONELY"]
| COMPANY | DISCL_COUNTRY_NAME | DISCL_TRADER_NAME | DISCL_AREA_NAME | DISCL_SUPPLIER_ABRVNAME | DISCL_SUPPLIER_FULLNAME | DISCL_SUPPLIER_TYPE | DISCL_LONGITUDE | DISCL_LATITUDE | DISCL_YEAR | ... | DISCL_CERTIFICATION_NAME | DISCL_CERTIFICATION_PCT | DISTRICT_NAME | DISTRICT_GEOCODE | LOCALITY_NAME | TRADER_NAME | CAM_BUYERS | IS_CAM_V3 | IS_JRC | JRC_BUYER_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8382 | TONY'S CHOCOLONELY | IVORY_COAST | NaN | NaN | ECAM SCOOPS | ENTREPRISE COOPERATIVE DES AGRICULTEURS MODERNES | COOPERATIVE | -7.344635 | 6.750040 | 2019 | ... | FAIRTRADE | NaN | NaN | CI-8.2.2_1 | NaN | NaN | 2000 (TONY'S CHOCOLONELY) | True | NaN | NaN |
| 8535 | TONY'S CHOCOLONELY | IVORY_COAST | NaN | NaN | EKOOKIM ECOJAD | ENTREPRISE COOPERATIVE DES JEUNES AGRICULTEURS... | COOPERATIVE | -6.457590 | 6.875014 | 2019 | ... | FAIRTRADE; UTZ/RA | NaN | NaN | CI-9.1.1_1 | NaN | NaN | 1751 (TONY'S CHOCOLONELY) | True | NaN | NaN |
| 8537 | TONY'S CHOCOLONELY | IVORY_COAST | NaN | NaN | EKOOKIM KAPATCHIVA | SOCIETE COOPERATIVE KAPATCHIVA | COOPERATIVE | -5.739872 | 6.988296 | 2019 | ... | FAIRTRADE | NaN | NaN | CI-9.2.1_1 | NaN | NaN | 1190 (TONY'S CHOCOLONELY) | True | NaN | NaN |
| 10834 | TONY'S CHOCOLONELY | IVORY_COAST | NaN | NaN | SOCOOPACDI | SOCIETE COOPERATIVE AGRICOLE DE CAFE ET DE CAC... | COOPERATIVE | -5.362552 | 5.841540 | 2019 | ... | FAIRTRADE; UTZ | NaN | NaN | CI-5.2.1_1 | NaN | NaN | 1536 (TONY'S CHOCOLONELY) | True | NaN | NaN |
4 rows × 23 columns
df_2023[df_2023.DISCL_TRADER_NAME == "BARRY CALLEBAUT Cocoa AG"]
| COMPANY | DISCL_COUNTRY_NAME | DISCL_TRADER_NAME | DISCL_AREA_NAME | DISCL_SUPPLIER_ABRVNAME | DISCL_SUPPLIER_FULLNAME | DISCL_SUPPLIER_TYPE | DISCL_LONGITUDE | DISCL_LATITUDE | DISCL_YEAR | ... | DISCL_CERTIFICATION_NAME | DISCL_CERTIFICATION_PCT | DISTRICT_NAME | DISTRICT_GEOCODE | LOCALITY_NAME | TRADER_NAME | CAM_BUYERS | IS_CAM_V3 | IS_JRC | JRC_BUYER_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4331 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | S.CA-CA | Ste coop Anouanzé de N'Denou | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4332 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | CAAG COOP-CA | Ste Coop Agri Allakabo de Gly | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4333 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCOOPAAD_DATTA | Ste Coop Agri Akayara de Datta | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4334 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | COOP CA PM | Ste Coop des Producteurs de M'Bokro | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4335 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | EWOUNBO TIASSALE | Ste Coop S EWOUNBO de Tiassalé | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4336 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SOGAD | Ste Gle d'Achat et de Distribution | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4337 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCAPEN | Ste Coop Agri N'Zassa de Tiassalé | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4338 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | ECOGA_COOP-CA | Entr coop de GABIADJI | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4339 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | ECADA | Ent Coop Agri de Dagadji | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4340 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | COPACOL | Ste Coop agri cot lauzoua carrefour | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4341 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCOOPS-YEYASSO | Ste Coop Yeyasso de Man | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4342 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCOAVA | Ste Coop Agricole de Vavoua | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4343 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | ECOAN | Ste coop av CA Ecoagricole de pont nero | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4344 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | CABAK SCOOPS | Ste coop Simp bada de ketro bassam | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4345 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | COOPADYA | coop des prod agri dept yakasse | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4346 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCOES SCOOPS | Ste Coop Espoir Simplifiée | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4347 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | LAFILA BOUMBOU | Ste coop LAFI LA BOUMBOU | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4348 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | COOPINA COOP-CA | Ste coop ivoir de nouveaux agriculteur | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4349 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | COOJAB COOP-CA | Ste coop des jeunes agricult de Borobo | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4350 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | ECAB_BLOLEQUIN | Ste Coop Elite coop agri Blolequin | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4351 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | CASIB COOP-CA | Coopérative Agricole Siakadougou de Bida | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4352 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | COOPAD_DUEKOUE | Ste Coop des Prod Agri de Duekoue | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4353 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCAN COOP-CA | Ste Coop Agri Nesmon | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4354 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | ALLAH-KABO SCOOPS | Ste Coop Simpl Allah Kabo de Duekoue | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4355 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SOCOCCI SCOOPS | Ste Coop de cafe-cacao Ivoirien | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4356 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | PHENIPRAD | Ste Coop Phénix de Product Agri Durable | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4357 | MONDELEZ | IVORY_COAST | BARRY CALLEBAUT Cocoa AG | NaN | SCOPAF | Ste Coop des Prod Agri du Fromager | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
27 rows × 23 columns
df_2023[df_2023.DISCL_TRADER_NAME == "Barry Callebaut Cocoa AG"]
| COMPANY | DISCL_COUNTRY_NAME | DISCL_TRADER_NAME | DISCL_AREA_NAME | DISCL_SUPPLIER_ABRVNAME | DISCL_SUPPLIER_FULLNAME | DISCL_SUPPLIER_TYPE | DISCL_LONGITUDE | DISCL_LATITUDE | DISCL_YEAR | ... | DISCL_CERTIFICATION_NAME | DISCL_CERTIFICATION_PCT | DISTRICT_NAME | DISTRICT_GEOCODE | LOCALITY_NAME | TRADER_NAME | CAM_BUYERS | IS_CAM_V3 | IS_JRC | JRC_BUYER_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4404 | MONDELEZ | GHANA | Barry Callebaut Cocoa AG | ANHWIASO A DISTRICT | NaN | ANHWIASO A DISTRICT | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4405 | MONDELEZ | GHANA | Barry Callebaut Cocoa AG | SEFWI BEKWAI A DISTRICT | NaN | SEFWI BEKWAI A DISTRICT | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4406 | MONDELEZ | GHANA | Barry Callebaut Cocoa AG | SEFWI BEKWAI B DISTRICT | NaN | SEFWI BEKWAI B DISTRICT | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4407 | MONDELEZ | GHANA | Barry Callebaut Cocoa AG | SEFWI WIAWSO A DISTRICT | NaN | SEFWI WIAWSO A DISTRICT | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4408 | MONDELEZ | GHANA | Barry Callebaut Cocoa AG | SEFWI WIAWSO B DISTRICT | NaN | SEFWI WIAWSO B DISTRICT | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4409 | MONDELEZ | GHANA | Barry Callebaut Cocoa AG | SEFWI WIAWSO C DISTRICT | NaN | SEFWI WIAWSO C DISTRICT | NaN | NaN | NaN | 2021 | ... | COCOA LIFE | 100.0 | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
6 rows × 23 columns
df_2023[df_2023.DISCL_TRADER_NAME == "Barry Callebaut"]
| COMPANY | DISCL_COUNTRY_NAME | DISCL_TRADER_NAME | DISCL_AREA_NAME | DISCL_SUPPLIER_ABRVNAME | DISCL_SUPPLIER_FULLNAME | DISCL_SUPPLIER_TYPE | DISCL_LONGITUDE | DISCL_LATITUDE | DISCL_YEAR | ... | DISCL_CERTIFICATION_NAME | DISCL_CERTIFICATION_PCT | DISTRICT_NAME | DISTRICT_GEOCODE | LOCALITY_NAME | TRADER_NAME | CAM_BUYERS | IS_CAM_V3 | IS_JRC | JRC_BUYER_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4410 | NESTLE | BRAZIL | Barry Callebaut | NaN | NaN | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4411 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | DUEKOUE NORD-BIO | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4412 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | CABZOU COOP-CA | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4413 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | CEXPAG COOP-CA | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4414 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | COOTIEN-OULY | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4415 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | COOP-CA ABO | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4416 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | SCOOPADOB | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4417 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | SOCOOPAK COOP-CA | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4418 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | CABID | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4419 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | SCJPAB COOP-CA | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
| 4420 | NESTLE | IVORY_COAST | Barry Callebaut | NaN | SCOABIA COOP-CA (BC) | NaN | NaN | NaN | NaN | 2020 | ... | COCOA PLAN; RAINFOREST ALLIANCE | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN |
11 rows × 23 columns
import pandas as pd
df = pd.read_csv('/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_messy_2024.csv')
df.columns
Index(['TRADER_NAME', 'DISCL_TRADER_NAME', 'DISCL_SUPPLIER_ABRVNAME',
'DISCL_AREA_NAME', 'DISCL_CERTIFICATION_NAME', 'DISCL_NUMBER_FARMERS',
'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'COUNTRY', 'TYPE', 'DISCL_YEAR',
'DISCL_SUPPLIER_TYPE', 'COMPANY', 'DISCL_SUPPLIER_FULLNAME',
'LOCALITY_NAME', 'INDEX', 'DISCL_SUPPLIER_NAME', 'DISCL_VOLUMES',
'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME',
'DISTRICT_GEOCODE', 'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID',
'YEAR', ' DISCL_SUPPLIER_TYPE', ' DISCL_YEAR', 'TRADER',
'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LONGITUDE', 'LATITUDE',
'NUMBER_FARMERS', 'VOLUMES', 'VOLUMES_UNIT', 'CERTIFICATION_NAME',
'CERTIFICATION_PCT'],
dtype='object')
df_2024_cols_ordered = ['TRADER_NAME', 'TRADER', 'DISCL_TRADER_NAME', # TODO: Reduce to 'DISCL_TRADER_NAME'
'COMPANY',
'YEAR', ' DISCL_YEAR', 'DISCL_YEAR', # TODO: Reduce to 'DISCL_YEAR'
'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'LATITUDE', 'LONGITUDE', # TODO: Reduce to 'DISCL_LATITUDE' and 'DISCL_LONGITUDE'
'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_NAME', 'DISCL_SUPPLIER_FULLNAME', # TODO: Reduce to 'DISCL_SUPPLIER_ABRVNAME' and 'DISCL_SUPPLIER_FULLNAME'
'DISCL_AREA_NAME', 'LOCALITY_NAME', 'DISTRICT_GEOCODE', 'DISTRICT_NAME', # TODO: Reduce to 'DISCL_AREA_NAME' and 'DISTRICT_GEOCODE'
'TYPE', 'DISCL_SUPPLIER_TYPE', ' DISCL_SUPPLIER_TYPE', # TODO: Reduce to 'DISCL_SUPPLIER_TYPE'
'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT',
'DISCL_NUMBER_FARMERS',
'COUNTRY', # TODO: Call it DISCL_COUNTRY_NAME
'INDEX', # TODO: Remove
'DISCL_VOLUMES','DISCL_VOLUMES_UNIT',
'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID']
df_2023_columns_ordered = ['DISCL_TRADER_NAME', 'TRADER_NAME',
'COMPANY',
'DISCL_YEAR',
'DISCL_LONGITUDE', 'DISCL_LATITUDE',
'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
'DISCL_AREA_NAME','LOCALITY_NAME', 'DISTRICT_GEOCODE', 'DISTRICT_NAME',
'DISCL_SUPPLIER_TYPE',
'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT',
'DISCL_NUMBER_FARMERS',
'DISCL_COUNTRY_NAME',
'DISCL_VOLUMES','DISCL_VOLUMES_UNIT',
'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC','JRC_BUYER_ID']
Discern difference between similar columns
print(df_2023['TRADER_NAME'].unique())
print(df_2023['DISCL_TRADER_NAME'].unique())
[nan]
[nan 'CEMOI COTE D‘IVOIRE' 'Fuchs & Hoffmann Kakaoprodukte GmbH'
'SUSCOM-CI' 'CARGILL' 'Ascot Amsterdam B.V.' 'Olam International Limited'
'Ritter Sport Nicaragua S.A' 'Minka' 'CEMOI' 'ECOM' 'ECOOKIM' 'ETG'
'DAARNHOUWER & CO.' 'Walter Matter Sa' 'BARRY CALLEBAUT Cocoa AG'
'Cargill B.V.' 'Ecom Agrotrade Ltd.' 'Olam International Ltd.'
'Touton SA' 'Barry Callebaut Cocoa AG' 'Barry Callebaut' 'Blommer/Olam'
'Cargill' 'Cocoanect' 'Ecom' 'Sucden' 'Touton' 'COCOANECT' 'OLAM'
'Fildisi Cocoa Industry' 'SUCDEN' 'COCOASOURCE' 'UNCOMMON COCOA']
print(df_2023['DISCL_AREA_NAME'].unique())
print(df_2023['LOCALITY_NAME'].unique())
print(df_2023['DISTRICT_NAME'].unique())
print(df_2023['DISTRICT_GEOCODE'].unique())
[nan 'ANTOAKROM' 'ASSIN FOSU A & B' 'WASSA AKROPONG' 'MANSO AMENFI'
'DUNKWA' 'TWIFO PRASO' 'SAMREBOI' 'AWASO' 'NEW EDUBIASE' 'BOGOSO'
'ASANKRANGWA' 'Agroecom Ghana Limited, Antoakrom A'
'Agroecom Ghana Limited, Diaso' 'Agroecom Ghana Limited, Agona B'
'Agroecom Ghana Limited, Sankore B' 'Awaso' 'Bafia'
'Climate Cocoa Cooperative Union' 'Cocoa Merchants' 'Cocoa Abrabopa'
'Fedco' 'Goaso' 'Kasapin' 'Kwapong' 'Kokoo Pa'
'Olam Ghana Climate Cocoa-Asawinso' 'Olam Ghana Climate Cocoa-Kakum'
'Sankore' 'Sunyani' 'ABOISSO' 'DUEKOUE' 'GNIPI' 'AFFERY' 'DIANGOBO'
'AGBOVILLE' 'GRAND MORIE' 'LAKOTA' 'GAGNOA' 'FRESCO' 'DIVO' 'TIASSALE'
'BEREBY' 'TOUIH' 'ABENGOUROU' 'GUITRY' 'PAULY' 'AKOUPE' 'ISSIA'
'GUIBEROUA' 'GRAND LAHOU' 'GUIGLO' 'TAI' 'SAN-PEDRO' 'BLOLEQUIN' 'SINFRA'
'SASSANDRA' 'BAYOTA' 'GUIDRY' 'SAN PEDRO' 'SOUBRE' 'OUME' 'GRABO'
'DIEGONEFLA' 'TABOU' 'YAKASSE ATTOBROU' 'CECHI' 'ZAGNE' 'SEGUELLA'
'UCAYALI' 'SAO TOME' 'GUAYAS' 'SANTO DOMINGO' 'TAABO' 'HERMANGONO'
'ADZOPE' 'Abengourou' 'Blolequin' 'Daloa' 'Meagui' 'Guiglo' 'Danané'
'MENEE CENTRE' 'Sooubre' 'MEAGUI' 'Vavoua' 'San Pedro' 'OUPOUYO' 'SOFOCI'
'Gagnoa' 'Grobonou Dan' 'Soubré' 'Zoukougbeu' 'Bonon' 'Abenase'
'Adabokrom' 'Agona' 'Agona Amenfi' 'Agona B (Effiduase)' 'Akim Oda'
'Akontombra' 'Antoakrom' 'Asafo' 'Asamankese' 'Asankragwa' 'Asawinso'
'Asempanaye' 'Assin Breku' 'Assin Fosu' 'Asumura' 'Bekwai' 'Bepoh'
'Bibiani' 'Boako' 'Bogoso' 'Bonsu' 'Brakwa' 'Breman Asikuma' 'Dadieso'
'Debiso' 'Diaso' 'Dunkwa' 'Elluokrom' 'Enchi' 'Essam' 'Juaboso' 'Kibi'
'Kukuom' 'Mankranso' 'Manso Amanfi' 'New Edubiase' 'Nkawie' 'Nkawkaw'
'Nyinahin' 'Nyinase' 'Obuasi' 'Offinso' 'Ofoase' 'Papase' 'Proso'
'Samreboi' 'Sefwi Bekwai' 'Sefwi Wiawso' 'Suhum' 'Tarkwa' 'Tepa'
'Twifo Praso' 'Wassa Akropong' 'Cross River State' 'Kajola, Ondo state'
'Odigbo, Ondo State' 'Alade,Ondo State' 'Owo, Ondo State'
'Ondo, Ondo State' 'Ilesa, Osun State' 'Modakeke, Osun State'
'Ife, Osun State' 'Ijebu, Ogun State' 'Tarapoto' 'San Alejandro'
'Tocache' 'Quillabamba' 'Sisa' 'Tingo Maria' 'Ciudad Constitución'
'Guayas' 'Los Rios' 'Manabi' 'El Oro' 'Esmeraldas' 'BONON' 'Bodi'
'Lakota' 'Sassandra' 'Dagadji' 'Sanegourifla' 'Sinfra' 'Ayame'
'Ebilassoukro' 'Issia' 'Taaboo' 'Goudí' 'Bouaflé' 'Tiassalé' 'Divo'
'Aboisso' 'Fresco' 'Hermankono-Diès' 'Dibobly' 'Tinhou' 'Okrouyo'
'Guehiebly' 'Gazibouo' 'Gadouan' 'Biankouma' 'Toumbokro' 'Duekoue'
'Zagné' 'Abidjan' 'Guessiguie' 'Taabo' 'Socofi' 'Danane' 'San-Pedro'
'Yakassé Attobrou' 'Bayota' 'Tabou' 'Grand-Zattry' 'Soubre' 'Apprompron'
'Man' 'Kouibly' 'Robert Porte' 'Grand-Lahou' 'KUMASI' 'AGONA DUAKWA'
'AKONTOMBRA' 'ANHWIASO' 'ANYINAM' 'ASANKRAGWA' 'ASAWINSO' 'ASEMPANEYE'
'ASHANTI BEKWAI' 'BIBIANI' 'BONSU NKWANTA' 'DADIESO' 'ELLUOKROM' 'ENCHI'
'JUABOSO' 'JUASO' 'KOFORIDUA' 'KONONGO' 'NKAWIE' 'NSOKOTE' 'OBUASI'
'PROSO' 'SUHUM' 'TEPA' 'WASA AKROPONG' 'Ibadan' 'Okada / Auchi' 'Ife'
'Ijebu Ode' 'Ikom' 'Ijebu-Ife' 'Akure / Owo' 'Ife / Ilesha'
'Odigbo / Ofosu' 'Obgbere / Omifunfun' 'Akonwanjo' 'OBALA'
'Paraiso La 14' 'Guayaquil' 'Norcasia' 'Victoria' 'Tello' 'Falan' 'Maceo'
'Tarqui' 'Montenegro' 'Hacarí' 'Duekue' 'Yamoussoukro' 'Betesso' 'Kotobi'
'Kumasi' 'Agona Swedru / DUAKWA' 'Agona Juaben' 'Suhum - Kibi'
'Asempaneye' 'Ashanti Bekwai' 'Bonsu Nkwanta' 'Juaso' 'Koforidua'
'Konongo-Odumase' 'Nsokote' 'Okada, Auchi' 'Ogbere / Omifunfun'
'Ijebu / Ode' 'Obala' 'Meyomessi' 'Triunfo' 'Quevedo' 'Caracoli' 'Neiva'
'Caicedonia' 'Fresno' 'Garzón' 'Quimbaya' 'San Pedro de Uraba' 'Anserma'
'Ocaña' 'San Juan de Mariquita' 'Bangolo' 'Moussadougou' 'Gnipi 2'
'Menée centre/Gabiadji' 'Biankouman' 'ADABOKROM' 'AGONA JUABEN' 'DIASO'
'EFFIJAASE' 'FOSUKROM' 'JAMASI' 'MANKRANSO' 'NKASEIM' 'NYINAHIN'
'OFFONSO ABOFORO' 'SANKORE' 'SEFWI BEKWAI' 'TARKWA' 'Ado Ekiti'
'Ondo/Akure' 'Cerecita' 'Vinces' 'La Concordia' 'Quinindé' 'La Troncal'
'Virgen del Fatima' 'Chigorodó' 'Gigante' 'Bucaramanga'
'San Juan de\nMariquita' 'ANHWIASO A DISTRICT' 'SEFWI BEKWAI A DISTRICT'
'SEFWI BEKWAI B DISTRICT' 'SEFWI WIAWSO A DISTRICT'
'SEFWI WIAWSO B DISTRICT' 'SEFWI WIAWSO C DISTRICT' 'Ilhéus - Bahia'
'Ipiaú - Bahia' 'Itubera - Bahia' 'Gandu - Bahia' 'Itabuna - Bahia'
'Mutuipe - Bahia' 'São Miguel das Matas - Bahia' 'Anapu - Pará'
'Altamira - Pará' 'Novo Repartimento - Pará' 'São Félix do Xingu - Pará'
'Tucumã - Pará' 'Uruará - Medicilândia - Pará' 'Medicilândia - Pará'
'Makenene' 'Ngoro' 'Voundou' 'Mbangassina' 'Ntui' 'Nguila' 'Ngomedzap'
'Ayos' 'Abongoua' 'Adzopé' 'Bahe-Sebon' 'Bâcon' 'Guépahouo'
'Hermakono Garo' 'Dogbo' 'Bloléquin' 'Adebem' 'Téapleu' 'Duékoué'
'Sahoua' 'Opouyo' 'Bonoufla' 'Mahapleu' 'Kahen' 'Yaou/Aboisso' 'Arrah'
'Toumodi' 'Ananguié' 'Hermankono-Dies' 'Djoroplo' 'Sasa-manois' 'Gagny'
'Nando' 'Kokolilie (Lakota)' 'Mene Centre' 'Oumé' 'Gbagbam' 'Monokozohi'
'Gnato' 'Gnago 1' 'Lobogrou/ Lakota' 'Guitry' 'Diby' 'Aniassué'
'Bongouanou' 'Afféry' 'Aboulayekro' 'Walebo' 'Patricekro' 'Grabo'
'Petit Grabo' 'Grand Lahou' 'Dassioko' 'Agboville' 'Balokuya' 'Bianouan'
'Dabouyo' 'Akoupé' 'Djimon/ Lakota' 'Aboisso Comoé' 'Yorodougou' 'vavoua'
'Niapidou' 'Watté' "N'Douci" 'Ketesso' 'gnato' 'Hire' 'Neka' 'Djapadji'
'TrawinninkroV8' 'Diangobo' 'Dokpodon' 'Douaville' 'Karie' 'Mahino'
'Sikensi' 'Sago' 'Santo Domingo' 'EL ORO' 'Elloukrom' 'Wiawso' 'Nsawora'
'Anhwiaso' 'Kakum' 'Lima Puluh Kota' 'Padang' 'Kolaka Utara'
'Kolaka-Kolaka Timur' 'Kolaka' 'Kolaka Timur' 'Banggai' 'Parimo'
'Kota Poso' 'Luwu Utara' 'Luwu Timur' 'Bantaeng' 'Mamuju'
'Polewali Mandar' 'Pringsewu' 'Tanggamus' 'Pesawaran & Pringsewu'
'Pesawaran' 'Way Ratai' 'Lampung Tengah' 'Lampung Timur' 'Lampung'
'Simalungun - Tanah Karo' 'Cross River' 'Ondo' 'Owo' 'Owo\nOsun' 'Osun'
'Idanre' 'Ogun' 'UMI-ATZERO RURAL' 'AHI RURAL' 'ONGA-WAFFA RURAL'
'LAE URBAN' 'TRANSGOGOL RURAL' 'SALAKI' 'Gabensis' 'INLAND BAINING RURAL'
'ALMAMI RURAL' 'LABUTA RURAL' 'Tuvi' 'mewa' 'WAMPAR RURAL' 'Oronga'
'Numa Blk5' 'WEWAK URBAN' 'BANA RURAL' 'ARAWA RURAL' 'BAIMURU'
'UMI-ATZERO RURAL\nMainoki' 'Ebenan' 'Numa Blk4' 'Avaipa' 'Siuema' 'Yall'
'Kurai' 'Ralubang' 'BOEMSARA' 'Numa Blk1' 'KASMIN TWO'
'CENTRAL GAZELLE RURAL' 'Parawen' 'DIVISION 3' 'Kadaulung' 'BALBAL AREA'
'Tavidua' 'BITAPAKA RURAL' 'Tadup' 'HAGAMA' 'Mainoki' 'NUMIESOLI'
'Sundumbu' 'WANTOAT-LERON RURAL' 'Jobtou\nKurai' 'NIMBIH' 'WAMAYANG'
'Amaimon' 'Mare' 'YEKIMBOLYE' 'North Nasioi' 'Kuruvina' 'Kerei West'
'KASMIN ONE' 'KAUKAMBAR' 'BUKIENDUON' 'NIMIESOLI' 'ahi rural' 'Alemo'
'AMBENOB RURAL' 'Mafanazo' 'Sauruan' 'HARUA' 'Wampan' 'Siriwarang'
'NIAKANDOGUM' 'Iwap' 'GAVIEN DIVISION' 'Kagi'
'WAMPAR RURAL\nTRANSGOGOL RURAL' 'Sehan' 'Derin' 'Chiatz' 'Malarina'
'Kuriguma' 'Ngaruwain' 'SAUSENDUON' 'PARINA' 'SIMBOMIE' 'Jobtou'
'HERENGEI' 'USINO RURAL' 'AMBIKANJA' 'KUMUN' 'BUBUAMU' 'YAREBOEM'
'MARINGE' 'Kopani 3' 'BUKSAK' 'KUFAR' 'Bai' 'HANGRERAK' 'SENGRI'
'KWARAPRI' 'SOLI (1)' 'TOMA-VUNADIDIR RURAL' 'GAVIEN DIV 2'
'MANDIEN/SOLI' 'KOKOPO-VUNAMAMI URBA' 'MADANG URBAN' 'Marangints'
'Panguna' 'Gonoa' 'TUONUMBO' 'Remboini- Aropa' 'SUMGILBAR RURAL' 'BALBAL'
'Kopei' 'GAVIEN SCHEEM' 'KOTTE RURAL' 'SAINA' 'YAWAR RURAL' 'MARAMBANJA'
'HOWI' 'Apusipa' 'Itaupa' 'Mandress' 'JAPARAKA (1)'
'LASSUL BAINING RURAL' 'PAPAROM' 'SIRIAR' 'Miike' 'Atoma'
'SUMGILBAR RURAL\nALMAMI RURAL' 'Sirovai' 'SAINA\nRumba' 'Domakung'
'Karapia' 'Balama' 'Tagitagi' 'LIVUAN-REIMBER RURAL' 'South Nasioi'
'NANGUMARAM' 'IGURUWE' 'Kokoda' 'KWAHUIE' 'ARIAPAN' 'KIARIVU'
'UMI-ATZERO RURAL\nHIMBRU/BUKINARE' 'Binimamp' 'Yalu' 'SAUSENUMBOHU'
'JAPARAKA (3)\nUMI-ATZERO RURAL' 'UMI-ATZERO RURAL\nINLAND BAINING RURAL'
'Ragitamut' 'Biviaro' 'ASTROLABE BAY RURAL' 'RABAUL URBAN' 'Numa Blk2'
'SALAKI-KUNAI HEIGHTS' 'Kerei East' 'NABAK RURAL' 'WAMAIAN' 'Warus'
'KUSAUN' 'Tsili Tsili' 'WANINGIAGUM' 'Parawen\nBALBAL AREA' 'NAMBARI'
'Atoh' 'ALMAMI RURAL\nUMI-ATZERO RURAL' 'HOLIK HOWI' 'Bubukwanga' 'Humya'
'Mataisa' 'Bundinyama' 'Buganikere' 'Busaru' 'Hakitara' 'Kaghema' 'Kuka'
'Mirambi' 'Kirindi' 'Busoru' 'Bugombwa' 'Bundimulinga' 'Busunga' 'Lamia'
'Masule 2' 'Lembe-Yezoum' 'Simanyai' 'SUHUM MUNICIPAL' 'Cape coast'
'Tontokrom' 'Nkrumakrom' 'Nipankyeremia' 'Manukrom' 'Manso Bonsaaso'
'Kobiriso' 'Keniago' 'Hiamankwa' 'Fahiakobo' 'Dewusaso' 'Datano'
'Bayerebon No 6' 'Ayiem' 'Assamang' 'Apenimade' 'Akyerekyerekrom'
'Abuoso' 'Aboi Nkwanta' 'Amoakokrom' 'Amoamang A' 'Amoamang B'
'Asuogya A' 'Asuogya B' 'Asuosu' 'Attobrakrom' 'Bepokokoo' 'Bremang'
'Brodziekrom' 'Domeabra' 'Gonukrom' 'Gyaman' 'Gyaman A' 'Koduakrom'
'Meteameba' 'Nkrankrom' 'Nsabrekwa A' 'Nsabrekwa B' 'Nsabrekwa Nkubeim'
'Nyame Nnae' 'Oda Kotoamso' 'Pantoso' 'Shed' 'Sika Ye Akono' 'Tena Adjei'
'Wassa Dunkwa A' 'Wassa Dunkwa B' 'Wassa Dunkwa C' 'Wassa Saa' 'Yirase'
'Yirase B' 'Yirase C' 'ASSIN FOSU A' 'Abakrampah' 'Abin' 'Abura Dunkwa'
'Adiembra ' 'Akrofuom' 'Beyeden' 'Bosomadwe ' 'Darmang' 'Jakai' 'Kruwa '
'Kwafo' 'Ningo ' 'Nsenne ' 'Nsuakyi' 'ASSIN FOSU B' 'Achiase'
'Amanbete A' 'Amanbete B' 'Anyinabrim' 'Asorefie' 'Ayigo' 'Besease'
'Bunsu' 'Dawomako' 'Dwendama' 'Homaho' 'Katakyiase' 'Keteaba' 'Mankata'
'Ngresi' 'Nyamebekyere A' 'Nyamebekyere B' 'Sibinso' 'Tabil' 'Taylorkrom'
'Tehente' 'Aboabo' 'Aboanidua' 'Adobewura 2' 'Adobewura No. 1A '
'Adobewura No.1B' 'Alata' 'Ampenkrom' 'Ankamateng' 'Ashiam A' 'Ashiam B '
'Chiraa' 'Chirano' 'Humjebre' 'Humjebre B' 'Kofikrom A' 'Kofikrom B'
'Kofikrom C' 'Kronua' 'Kyeikrom ' 'Manse' 'Muoho' 'Muoho B' 'Paboase'
'Sanfifre' 'Sefwi-Bekwai A' 'Sefwi-Bekwai B' 'Sorano ' 'Subiri'
'Abekakrom' 'Aboaboso' 'Adadekrom' 'Agyirekrom' 'Akaatiso' 'Ayensukrom'
'Brakwa Line' 'Camp' 'Dorcas' 'Draw' 'Gyampre C' 'Gyampre D' 'Gyampre E'
'Gyampre F' 'Kutukrom' 'Larbikrom' 'Manseso' 'New Aboi' 'Next Time'
'Nkrumahkrom' 'Nyamebekyere' 'Obengkrom' 'Opon Valley A' 'Opon Valley B'
'Tabakrom' 'Tettehkrom' 'Yawkrom' 'DUNKWA ' 'Addokrom' 'Amissah'
'Ayanfuri' 'Buabeng Camp' 'Dunkwa Bebianiha' 'Kwameprakrom' 'Marfokrom'
'Mfanteman' 'Nkyirifi' 'Meretweso' 'Pokukrom' 'Powerline' 'Teacherkrom'
'Denkyira Bebianiha' 'Kwaameprakrom B' 'Mbradan' 'Imbraim' 'Fawomanyo'
'Anwiawia' 'Atobease' 'Mfoum' 'Kyekyewere' 'Kontokrom' 'Sobroso'
'Pokukrom 2' 'Tegyamoso' 'Kwagyansa' 'Oponso' 'Agyakwabena Krom'
'Anakum A' 'Amuni ' 'Akyekyere B' 'Broninkwanta' 'Kramokrom' 'Asasetre '
'Manso Amenfi' 'Manso Camp' 'Anakum B' 'Merewamerewa ' 'Obeng '
'State Farm' 'Balowankwanta' 'Afukey A' 'Afukey B' 'Afukey C'
'Akyekyere A' 'Bonuama' 'Gyakontabuo' 'Kwawbadukrom' 'Antobam'
'Wassa Bekwai' 'Agyedum' 'Bantama' 'Kojo- Yaw' 'Fantikrom' 'Nipahiamoah'
'Bronikrom' 'Offeikrom' 'Fumso A' 'Kojo Ntumi' 'Tweapease' 'Kwame Adu'
'Kojo Apreh' 'Obuobi' 'Akutreso' 'Menan' 'Subriso' 'T.Camp 1' 'Fumso B'
'Nsuta' 'Oparekrom' 'Ankaase' 'Atweneboana' 'Kenyakrom' 'Alhajikrom'
'Apagya' 'Krobokrom' 'Oseikrom' 'Adansi Swedru' 'T.Camp 11' 'Abenyia'
'Aboifie' 'Ahuntem' 'Akokora Krom' 'Amoaku' 'Aniantetem' 'Aniehu'
'Ayensu Nkwanta' 'Bisaaso 1' 'Bisaso 2' 'Fobikrom' 'Forest'
'Kwame Buokrom' 'Kyemuso' 'Patase Asoredan Ho' 'Tanaa' 'Tanokrom'
'Wansema C' 'Woman No Good A' 'Takyikrom ' 'Mampoma' 'Tsimtsimhwe '
'Baakondzidzi' 'Atwereboanda' 'Ganukrom ' 'Nyamebekyere ' 'Dawomase'
'Kotokye' 'Kwamoano ' 'Akwakrom ' 'Somnyamekodru ' 'Bepobeng '
'Twasokoda' 'Atwereboanda B' 'Ahenkro' 'Ahenkro A' 'Mampoma B' 'Ankaako'
'Agyinkrom' 'Morkwa' 'Esaase' 'Abrokyirekrobo' 'Acheamponkrom' 'Adansi A'
'Adansi B' 'Afransie' 'Ankwagya' 'Buraso' 'Amanikrom' 'Dikoto Junction'
'Dompoase' 'Grumisah A' 'Gyampawuah A' 'Gyampawuah B' 'Gyapa' 'Gyedua'
'Jukwa' 'Krofrom' 'Nyarkokrom' 'Teyekrom' 'Ohianhyeda' 'Hemang' 'Aduman'
'Adesu' 'Nananko' 'Wuratrem A ' 'Wuratrem B' 'Wuratrem C' 'Wuratrem D'
'Grumisah B' 'Grumisah C' 'Tarkwa Breman' 'Eshereso' 'Adonnoi']
[nan]
[nan]
[nan 'CI-2.3.1_1' 'CI-2.2.2_1' 'CI-2.2.5_1' 'CI-8.1.1_1' 'CI-8.2.1_1'
'CI-8.2.2_1' 'CI-9.1.4_1' 'CI-9.1.1_1' 'CI-9.1.3_1' 'CI-9.2.2_1'
'CI-9.2.1_1' 'CI-5.2.1_1' 'CI-3.2.1_1' 'CI-2.1.1_1' 'CI-14.1.3_1'
'CI-14.2.1_1' 'CI-4.2.3_1' 'CI-1.1.1_1' 'CI-3.1.1_1' 'CI-7.1.2_1'
'CI-2.1.2_1' 'CI-3.1.3_1' 'CI-8.3.1_1' 'CI-7.2.2_1' 'CI-6.4.1_1'
'CI-5.1.1_1' 'CI-13.1.2_1' 'CI-7.1.1_1' 'CI-8.3.3_1' 'CI-7.1.4_1'
'CI-8.1.2_1' 'CI-14.2.5_1' 'CI-6.3.1_1' 'CI-14.2.2_1' 'CI-2.2.3_1'
'CI-7.2.1_1' 'CI-8.1.3_1' 'CI-9.1.2_1' 'CI-7.3.3_1' 'CI-2.3.2_1'
'CI-5.1.2_1' 'CI-5.2.2_1' 'CI-8.2.4_1' 'CI-2.2.1_1' 'CI-7.3.2_1'
'CI-5.2.3_1' 'CI-12.3.2_1' 'CI-14.2.4_1' 'CI-8.3.2_1' 'CI-3.1.2_1'
'CI-6.1.2_1' 'CI-9.2.3_1' 'CI-7.3.4_1' 'CI-7.3.1_1' 'CI-6.3.2_1'
'CI-6.1.1_1' 'CI-11.1.1_1' 'CI-8.3.5_1' 'CI-3.2.7_1' 'CI-6.1.4_1'
'CI-7.1.3_1' 'CI-6.2.3_1' 'CI-6.4.2_1' 'CI-8.2.3_1' 'CI-6.3.3_1'
'CI-12.3.1_1' 'CI-6.1.3_1' 'CI-3.2.5_1' 'CI-12.1.2_1' 'CI-12.1.3_1'
'CI-11.1.3_1' 'CI-13.1.1_1' 'CI-10.2.2_1' 'CI-8.3.4_1' 'CI-6.2.1_1'
'CI-8.1.4_1' 'CI-12.2.2_1' 'CI-11.1.5_1' 'CI-3.2.2_1' 'CI-12.1.1_1']
Make 2024 columns match 2023 columns
df.rename(columns={'COUNTRY': 'DISCL_COUNTRY_NAME'}, inplace=True)
# TODO: Reduce to 'DISCL_TRADER_NAME'
# Where 'DISCL_TRADER_NAME' is NaN, copy over 'TRADER_NAME'
mask = df['DISCL_TRADER_NAME'].isna() & df['TRADER_NAME'].notna()
df.loc[mask, 'DISCL_TRADER_NAME'] = df.loc[mask, 'TRADER_NAME']
# Where 'DISCL_TRADER_NAME' is NaN, copy over 'TRADER'
mask = df['DISCL_TRADER_NAME'].isna() & df['TRADER'].notna()
df.loc[mask, 'DISCL_TRADER_NAME'] = df.loc[mask, 'TRADER']
# Where 'COMPANY' is NaN, copy over 'TRADER_NAME'
mask = df.COMPANY.isna()&df.TRADER_NAME.notna()
df.loc[mask, 'COMPANY'] = df.loc[mask, 'TRADER_NAME']
df.drop(columns=['TRADER_NAME', 'TRADER'], inplace=True)
df.drop(columns=['INDEX'], inplace=True)
# TODO: Reduce to 'DISCL_YEAR'
# Where 'DISCL_YEAR is NaN, copy over ' DISCL_YEAR'
mask = df['DISCL_YEAR'].isna() & df[' DISCL_YEAR'].notna()
df.loc[mask, 'DISCL_YEAR'] = df.loc[mask, ' DISCL_YEAR']
# Where 'DISCL_YEAR' is NaN, copy over 'YEAR'
mask = df['DISCL_YEAR'].isna() & df['YEAR'].notna()
df.loc[mask, 'DISCL_YEAR'] = df.loc[mask, 'YEAR']
print(df[df.DISCL_YEAR.isna() & df[' DISCL_YEAR'].notna()])
print(df[df.DISCL_YEAR.isna() & df['YEAR'].notna()])
df.drop(columns=[' DISCL_YEAR', 'YEAR'], inplace=True)
Empty DataFrame
Columns: [DISCL_TRADER_NAME, DISCL_SUPPLIER_ABRVNAME, DISCL_AREA_NAME, DISCL_CERTIFICATION_NAME, DISCL_NUMBER_FARMERS, DISCL_LONGITUDE, DISCL_LATITUDE, DISCL_COUNTRY_NAME, TYPE, DISCL_YEAR, DISCL_SUPPLIER_TYPE, COMPANY, DISCL_SUPPLIER_FULLNAME, LOCALITY_NAME, DISCL_SUPPLIER_NAME, DISCL_VOLUMES, DISCL_VOLUMES_UNIT, DISCL_CERTIFICATION_PCT, DISTRICT_NAME, DISTRICT_GEOCODE, CAM_BUYERS, IS_CAM_V3, IS_JRC, JRC_BUYER_ID, YEAR, DISCL_SUPPLIER_TYPE, DISCL_YEAR, SUPPLIER_ABRVNAME, SUPPLIER_FULLNAME, LONGITUDE, LATITUDE, NUMBER_FARMERS, VOLUMES, VOLUMES_UNIT, CERTIFICATION_NAME, CERTIFICATION_PCT]
Index: []
[0 rows x 36 columns]
Empty DataFrame
Columns: [DISCL_TRADER_NAME, DISCL_SUPPLIER_ABRVNAME, DISCL_AREA_NAME, DISCL_CERTIFICATION_NAME, DISCL_NUMBER_FARMERS, DISCL_LONGITUDE, DISCL_LATITUDE, DISCL_COUNTRY_NAME, TYPE, DISCL_YEAR, DISCL_SUPPLIER_TYPE, COMPANY, DISCL_SUPPLIER_FULLNAME, LOCALITY_NAME, DISCL_SUPPLIER_NAME, DISCL_VOLUMES, DISCL_VOLUMES_UNIT, DISCL_CERTIFICATION_PCT, DISTRICT_NAME, DISTRICT_GEOCODE, CAM_BUYERS, IS_CAM_V3, IS_JRC, JRC_BUYER_ID, YEAR, DISCL_SUPPLIER_TYPE, DISCL_YEAR, SUPPLIER_ABRVNAME, SUPPLIER_FULLNAME, LONGITUDE, LATITUDE, NUMBER_FARMERS, VOLUMES, VOLUMES_UNIT, CERTIFICATION_NAME, CERTIFICATION_PCT]
Index: []
[0 rows x 36 columns]
# TODO: Reduce to 'DISCL_LATITUDE' and 'DISCL_LONGITUDE'
mask = df['DISCL_LATITUDE'].isna() & df['LATITUDE'].notna()
df.loc[mask, 'DISCL_LATITUDE'] = df.loc[mask, 'LATITUDE']
mask = df['DISCL_LONGITUDE'].isna() & df['LONGITUDE'].notna()
df.loc[mask, 'DISCL_LONGITUDE'] = df.loc[mask, 'LONGITUDE']
df.drop(columns=['LATITUDE', 'LONGITUDE'], inplace=True)
# TODO: Reduce to 'DISCL_SUPPLIER_ABRVNAME' and 'DISCL_SUPPLIER_FULLNAME'
# Fill DISCL_SUPPLIER_ABRVNAME and DISCL_SUPPLIER_FULLNAME from other supplier columns,
# then drop the redundant columns.
# make a copy to be safe if you want to compare before/after
df = df.copy()
# Normalize empty strings -> NA on the relevant columns
supplier_cols = ['SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_ABRVNAME',
'DISCL_SUPPLIER_NAME', 'DISCL_SUPPLIER_FULLNAME']
df[supplier_cols] = df[supplier_cols].replace(r'^\s*$', pd.NA, regex=True)
# Build DISCL_SUPPLIER_ABRVNAME: choose the first non-null value from preferred sources
df['DISCL_SUPPLIER_ABRVNAME'] = df[
['DISCL_SUPPLIER_ABRVNAME', 'SUPPLIER_ABRVNAME']
].bfill(axis=1).iloc[:, 0]
# Build DISCL_SUPPLIER_FULLNAME: choose the first non-null value from preferred sources
df['DISCL_SUPPLIER_FULLNAME'] = df[
['DISCL_SUPPLIER_FULLNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_NAME']
].bfill(axis=1).iloc[:, 0]
# Optional: tidy whitespace on the new/filled columns
for c in ['DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME']:
df[c] = df[c].astype('string').str.strip().replace({'': pd.NA})
# Drop the now-redundant columns
df.drop(columns=['SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_NAME'], inplace=True)
# Quick verification prints (delete/adjust in notebook as you like)
print("Filled DISCL_SUPPLIER_ABRVNAME: non-null count =", df['DISCL_SUPPLIER_ABRVNAME'].notna().sum())
print("Filled DISCL_SUPPLIER_FULLNAME: non-null count =", df['DISCL_SUPPLIER_FULLNAME'].notna().sum())
# Example: show rows where a DISCL_ field was filled from another field
# (run this before/after if you want to compare; here we show current rows where values exist)
print(df[['DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME']].head(8))
Filled DISCL_SUPPLIER_ABRVNAME: non-null count = 1533
Filled DISCL_SUPPLIER_FULLNAME: non-null count = 341
DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME
0 SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRAT... <NA>
1 SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRAT... <NA>
2 SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH A... <NA>
3 SOCIETE COOPERATIVE AGRICOLE GUELANIN AVEC CON... <NA>
4 SOCIETE COOPERATIVE AGRICOLE IMAAH, SOCIETE CO... <NA>
5 SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRAT... <NA>
6 SOCIETE COOPERATIVE SIMPLIFIÉE RASSO <NA>
7 SOCIETE COOPERATIVE AGRICOLE NOUVELLE DE DJOUR... <NA>
df.columns
Index(['DISCL_TRADER_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_AREA_NAME',
'DISCL_CERTIFICATION_NAME', 'DISCL_NUMBER_FARMERS', 'DISCL_LONGITUDE',
'DISCL_LATITUDE', 'DISCL_COUNTRY_NAME', 'TYPE', 'DISCL_YEAR',
'DISCL_SUPPLIER_TYPE', 'COMPANY', 'DISCL_SUPPLIER_FULLNAME',
'LOCALITY_NAME', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT',
'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME', 'DISTRICT_GEOCODE',
'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID',
' DISCL_SUPPLIER_TYPE', 'NUMBER_FARMERS', 'VOLUMES', 'VOLUMES_UNIT',
'CERTIFICATION_NAME', 'CERTIFICATION_PCT'],
dtype='object')
# TODO: Reduce to 'DISCL_AREA_NAME' and 'DISTRICT_GEOCODE'
# Reduce to DISCL_AREA_NAME and DISTRICT_GEOCODE using LOCALITY_NAME and DISTRICT_NAME as fallbacks.
df = df.copy()
area_cols = ['DISCL_AREA_NAME', 'LOCALITY_NAME', 'DISTRICT_GEOCODE', 'DISTRICT_NAME']
# normalise empty strings -> NA
df[area_cols] = df[area_cols].replace(r'^\s*$', pd.NA, regex=True)
# Fill DISCL_AREA_NAME: prefer explicit DISCL_AREA_NAME, then DISTRICT_NAME, then LOCALITY_NAME
df['DISCL_AREA_NAME'] = df[
['DISCL_AREA_NAME', 'DISTRICT_NAME', 'LOCALITY_NAME']
].bfill(axis=1).iloc[:, 0]
# Fill DISTRICT_GEOCODE: prefer existing DISTRICT_GEOCODE, then fall back to DISTRICT_NAME
# (This keeps codes where present; if you prefer to leave codes blank when only names exist, remove 'DISTRICT_NAME' from the list.)
df['DISTRICT_GEOCODE'] = df[
['DISTRICT_GEOCODE']
].bfill(axis=1).iloc[:, 0]
# Trim whitespace and normalise empty strings to NA
for c in ['DISCL_AREA_NAME', 'DISTRICT_GEOCODE']:
df[c] = df[c].astype('string').str.strip().replace({'': pd.NA})
# Drop now-redundant columns
df.drop(columns=['LOCALITY_NAME', 'DISTRICT_NAME'], inplace=True)
# Quick checks
print("Non-null DISCL_AREA_NAME:", df['DISCL_AREA_NAME'].notna().sum())
print("Non-null DISTRICT_GEOCODE:", df['DISTRICT_GEOCODE'].notna().sum())
print(df[['DISCL_AREA_NAME', 'DISTRICT_GEOCODE']].head(10))
# TODO: Reduce to 'DISCL_SUPPLIER_TYPE'
# Reduce to DISCL_SUPPLIER_TYPE using TYPE and any whitespace-variant columns as fallbacks.
df = df.copy()
# Find relevant original columns:
orig_type_cols = [c for c in df.columns if c.strip() in {'DISCL_SUPPLIER_TYPE', 'TYPE'}]
# Example: this will include 'DISCL_SUPPLIER_TYPE', ' DISCL_SUPPLIER_TYPE', and 'TYPE' if present.
# If none of those columns exist, create the canonical column (all NA) and exit early.
if not orig_type_cols:
df['DISCL_SUPPLIER_TYPE'] = pd.NA
print("No source columns found for supplier type. Created empty 'DISCL_SUPPLIER_TYPE'.")
else:
# Normalise empty strings -> NA on those original columns
df[orig_type_cols] = df[orig_type_cols].replace(r'^\s*$', pd.NA, regex=True)
# Build an ordered list of fallback columns:
# prefer exact 'DISCL_SUPPLIER_TYPE' first, then any whitespace variants, then 'TYPE'
ordered = []
if 'DISCL_SUPPLIER_TYPE' in df.columns:
ordered.append('DISCL_SUPPLIER_TYPE')
# add whitespace variants (names that strip to DISCL_SUPPLIER_TYPE but aren't the exact name)
ws_variants = [c for c in df.columns if c.strip() == 'DISCL_SUPPLIER_TYPE' and c != 'DISCL_SUPPLIER_TYPE']
ordered.extend(ws_variants)
if 'TYPE' in df.columns:
ordered.append('TYPE')
# Safeguard: if ordered is empty (shouldn't be), create empty canonical column
if not ordered:
df['DISCL_SUPPLIER_TYPE'] = pd.NA
else:
# Merge using first non-null value across ordered fallbacks
df['DISCL_SUPPLIER_TYPE__MERGE'] = df[ordered].bfill(axis=1).iloc[:, 0]
# Tidy whitespace and normalise empty -> NA
df['DISCL_SUPPLIER_TYPE__MERGE'] = (
df['DISCL_SUPPLIER_TYPE__MERGE']
.astype('string')
.str.strip()
.replace({'': pd.NA})
)
# Drop the original/raw columns we used (we will keep only canonical name)
df.drop(columns=[c for c in ordered if c in df.columns], inplace=True)
# Rename the merge column to canonical name
df.rename(columns={'DISCL_SUPPLIER_TYPE__MERGE': 'DISCL_SUPPLIER_TYPE'}, inplace=True)
# Quick checks
print("Columns now include:", [c for c in df.columns if 'SUPPLIER_TYPE' in c or c == 'TYPE'][:10])
print("Non-null DISCL_SUPPLIER_TYPE:", df['DISCL_SUPPLIER_TYPE'].notna().sum())
print(df['DISCL_SUPPLIER_TYPE'].value_counts(dropna=False).head(10))
Columns now include: ['DISCL_SUPPLIER_TYPE']
Non-null DISCL_SUPPLIER_TYPE: 1382
COOPERATIVE 1382
<NA> 594
Name: DISCL_SUPPLIER_TYPE, dtype: Int64
df.rename(columns={'CERTIFICATION_NAME':'DISCL_CERTIFICATION_NAME',
'CERTIFICATION_PCT':'DISCL_CERTIFICATION_PCT',
'NUMBER_FARMERS':'DISCL_NUMBER_FARMERS',
'VOLUMES':'DISCL_VOLUMES',
'VOLUMES_UNIT':'DISCL_VOLUMES_UNIT'}, inplace=True)
set(df_2023.columns).difference(set(df.columns))
{'DISTRICT_NAME', 'LOCALITY_NAME', 'TRADER_NAME'}
df_2023[(df_2023[['DISCL_AREA_NAME', 'DISTRICT_NAME', 'LOCALITY_NAME']].notna().any(axis=1))&(df.DISCL_COUNTRY_NAME == "IVORY COAST")]
| COMPANY | DISCL_COUNTRY_NAME | DISCL_TRADER_NAME | DISCL_AREA_NAME | DISCL_SUPPLIER_ABRVNAME | DISCL_SUPPLIER_FULLNAME | DISCL_SUPPLIER_TYPE | DISCL_LONGITUDE | DISCL_LATITUDE | DISCL_YEAR | ... | DISCL_CERTIFICATION_NAME | DISCL_CERTIFICATION_PCT | DISTRICT_NAME | DISTRICT_GEOCODE | LOCALITY_NAME | TRADER_NAME | CAM_BUYERS | IS_CAM_V3 | IS_JRC | JRC_BUYER_ID |
|---|
0 rows × 23 columns
set(df.columns).difference(set(df_2023.columns))
{'CERTIFICATION_NAME',
'CERTIFICATION_PCT',
'NUMBER_FARMERS',
'VOLUMES',
'VOLUMES_UNIT'}
df_2023.columns
Index(['COMPANY', 'DISCL_COUNTRY_NAME', 'DISCL_TRADER_NAME', 'DISCL_AREA_NAME',
'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
'DISCL_YEAR', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME', 'DISTRICT_GEOCODE',
'LOCALITY_NAME', 'TRADER_NAME', 'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC',
'JRC_BUYER_ID'],
dtype='object')
df.columns
Index(['DISCL_TRADER_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_AREA_NAME',
'DISCL_CERTIFICATION_NAME', 'DISCL_NUMBER_FARMERS', 'DISCL_LONGITUDE',
'DISCL_LATITUDE', 'DISCL_COUNTRY_NAME', 'DISCL_YEAR', 'COMPANY',
'DISCL_SUPPLIER_FULLNAME', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT',
'DISCL_CERTIFICATION_PCT', 'DISTRICT_GEOCODE', 'CAM_BUYERS',
'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID', 'NUMBER_FARMERS', 'VOLUMES',
'VOLUMES_UNIT', 'CERTIFICATION_NAME', 'CERTIFICATION_PCT',
'DISCL_SUPPLIER_TYPE'],
dtype='object')
Figure out what to do about NA traders - leave them (that's how df_2023 is)
df.to_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_2024_columns_fixed.csv", index=False)
df = pd.read_csv('/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_2024_columns_fixed.csv')
df[['DISCL_NUMBER_FARMERS.1',
'DISCL_VOLUMES.1', 'DISCL_VOLUMES_UNIT.1', 'DISCL_CERTIFICATION_NAME.1',
'DISCL_CERTIFICATION_PCT.1']].notna().sum()
DISCL_NUMBER_FARMERS.1 7
DISCL_VOLUMES.1 0
DISCL_VOLUMES_UNIT.1 0
DISCL_CERTIFICATION_NAME.1 7
DISCL_CERTIFICATION_PCT.1 0
dtype: int64
df[['DISCL_NUMBER_FARMERS',
'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT']].notna().sum()
DISCL_NUMBER_FARMERS 358
DISCL_VOLUMES 12
DISCL_VOLUMES_UNIT 3
DISCL_CERTIFICATION_NAME 462
DISCL_CERTIFICATION_PCT 147
dtype: int64
# Copy over the 7 number_farmers and certification values from the '.1' columns to the main columns where missing
for col in ['DISCL_NUMBER_FARMERS',
'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT']:
col_1 = col + '.1'
mask = df[col].isna() & df[col_1].notna()
df.loc[mask, col] = df.loc[mask, col_1]
df[['DISCL_NUMBER_FARMERS',
'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT']].notna().sum()
DISCL_NUMBER_FARMERS 365
DISCL_VOLUMES 12
DISCL_VOLUMES_UNIT 3
DISCL_CERTIFICATION_NAME 469
DISCL_CERTIFICATION_PCT 147
dtype: int64
df.drop(columns=['DISCL_NUMBER_FARMERS.1',
'DISCL_VOLUMES.1', 'DISCL_VOLUMES_UNIT.1', 'DISCL_CERTIFICATION_NAME.1',
'DISCL_CERTIFICATION_PCT.1'], inplace=True)
df.to_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_2024_columns_fixed.csv", index=False)