Clean 2024 csv column names

View or edit on GitHub

This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/clean_2024_csv_column_names.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import pandas as pd
import numpy as np
df = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/messy_2024_combined.csv")

df = df.copy()

print(df.shape)

# copy over value from DISCL_COUNTRY_NAME where DISCL_COUNTRY is NaN and vice versa
mask1 = df['DISCL_COUNTRY'].isna() & df['DISCL_COUNTRY_NAME'].notna()
df.loc[mask1, 'DISCL_COUNTRY'] = df.loc[mask1, 'DISCL_COUNTRY_NAME']

mask2 = df['DISCL_COUNTRY_NAME'].isna() & df['DISCL_COUNTRY'].notna()
df.loc[mask2, 'DISCL_COUNTRY_NAME'] = df.loc[mask2, 'DISCL_COUNTRY']


mask = (df['DISCL_COUNTRY'].isin(["Côte d'Ivoire", " Ivory Coast", "Cote d'Ivoire", np.nan])) | \
       (df['DISCL_COUNTRY_NAME'].isin(["IVORY COAST", "CIV", np.nan])) | \
              (df['COUNTRY'].isin([np.nan, "Cote d'Ivoire", 'Ivory Coast', "Côte d'Ivoire",'CÔTE D’IVOIRE', 'CIV'])) | \
              (df['COUNTRY_NAME'].isin([np.nan, "IVORY COAST"]))
df = df[mask].copy()                  # avoid SettingWithCopyWarning

print(df.shape)
import pandas as pd
import numpy as np
import unicodedata

# prefer keeping df as a copy when mutating
df = df.copy()

# columns involved (adjust names if you have slightly different names/casing)
cols = ['COUNTRY', 'DISCL_COUNTRY', 'DISCL_COUNTRY_NAME', 'COUNTRY_NAME']

# 1) Normalise empty strings -> NA across these columns
df[cols] = df[cols].replace(r'^\s*$', pd.NA, regex=True)

# 2) Build COUNTRY by taking the first non-null value in the preferred order:
#    (this is vectorised and fast)
df['COUNTRY'] = df[cols].bfill(axis=1).iloc[:, 0]

# 3) Optional normalisation: strip, remove accents, uppercase
def normalize_country(val):
    if pd.isna(val):
        return val
    s = str(val).strip()
    # remove accents (Côte -> Cote) and force ASCII
    s = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore').decode('ASCII')
    return s.upper()

df['COUNTRY'] = df['COUNTRY'].apply(normalize_country)

# 4) Drop the other country columns (keep only df['COUNTRY'])
df.drop(columns=['DISCL_COUNTRY', 'DISCL_COUNTRY_NAME', 'COUNTRY_NAME'], inplace=True)

# Quick checks
print("shape:", df.shape)
print("COUNTRY value counts (top):")
print(df['COUNTRY'].value_counts(dropna=False).head(10))
shape: (2250, 39)
COUNTRY value counts (top):
IVORY COAST      1004
COTE D'IVOIRE     664
CIV               198
COTE DIVOIRE      100
GHANA              83
CAMEROON           61
NIGERIA            34
ECUADOR            21
GHA                17
BRAZIL             15
Name: COUNTRY, dtype: int64
df = df[df.COUNTRY.isin(["IVORY COAST", "COTE D'IVOIRE", "CIV", "COTE DIVOIRE", np.nan])].copy()
df.COUNTRY.unique()
array(["COTE D'IVOIRE", 'IVORY COAST', nan, 'COTE DIVOIRE', 'CIV'],
      dtype=object)
df['COUNTRY'] = "IVORY_COAST"
import pandas as pd

# copy if you want to keep original
df = df.copy()

# Uppercase every cell only when it's a str (preserves NaN, numbers, lists, dicts, ...)
df = df.applymap(lambda v: v.upper() if isinstance(v, str) else v)

# Uppercase column names (still vectorised)
df.columns = df.columns.map(lambda c: c.upper() if isinstance(c, str) else c)
# df.to_csv("just_CIV_messy_2024.csv", index=False)
# Standardise names in text

# Decide which values should be under which columns

# Impute missing values using 2023 dataset 
df_2023 = pd.read_csv('/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/world_cocoa_logistics/out/consolidated_disclosures.csv', sep=';')
df_2023[df_2023.DISCL_YEAR == 2023].shape
(770, 23)
df_2023[df_2023.DISCL_YEAR == 2022].shape
(3363, 23)
df_2023[df_2023.DISCL_YEAR == 2021].shape
(926, 23)
df_2023[df_2023.DISCL_YEAR == 2020].shape
(765, 23)
df_2023.COMPANY.unique()
array(['BARRY_CALLEBAUT', 'CARGILL', 'HERSHEY', 'MARS', 'ALFRED RITTER',
       'ALTER ECO', 'CEMOI', 'COLRUYT', 'ECOM', 'ECOOKIM', 'ETG',
       'FERRERO', 'HALBA', 'MONDELEZ', 'NESTLE', 'OLAM', 'ONEM',
       'PURATOS', 'SUCDEN', 'TONYS_CHOCOLONELY', 'UNCOMMON COCOA',
       'VALRHONA', 'WHITTAKER', nan, 'BARRY CALLEBAUT', 'COCOANECT',
       'TOUTON', 'BLOMMER', 'COCOASOURCE', "TONY'S CHOCOLONELY",
       'ETHIQUABLE', 'ALBERT HEIJN', 'RAINFOREST ALLIANCE'], dtype=object)
df.COMPANY.unique()
array([nan, 'NESTLE', 'PURATOS', 'TONYS CHOCOLONELY', 'ONEM', 'FERRERO',
       'CARGILL', 'BLOMMER CHOCOLATE COMPANY', 'OFI', 'UNILEVER',
       'VALRHONA', 'GODIVA', 'WHITTAKERS', 'ALTER ECO', 'ETG', 'MARS',
       'TOUTON', 'ECOM'], dtype=object)
df.loc[2246,:]
TRADER_NAME                                                             NaN
DISCL_TRADER_NAME                                                       NaN
DISCL_SUPPLIER_ABRVNAME                                      CAFUGO COOP CA
DISCL_AREA_NAME                                                         NaN
DISCL_CERTIFICATION_NAME                                                NaN
DISCL_NUMBER_FARMERS                                                    NaN
DISCL_LONGITUDE                                                         NaN
DISCL_LATITUDE                                                          NaN
COUNTRY                                                         IVORY_COAST
 TYPE                                                                   NaN
DISCL_YEAR                                                           2024.0
DISCL_SUPPLIER_TYPE                                             COOPERATIVE
COMPANY                                                                ECOM
DISCL_SUPPLIER_FULLNAME     COOPÉRATIVE AGRICOLE DES FRÈRES UNIS DE GOBROKO
LOCALITY_NAME                                                           NaN
INDEX                                                                   NaN
DISCL_SUPPLIER_NAME                                                     NaN
DISCL_VOLUMES                                                           NaN
DISCL_VOLUMES_UNIT                                                      NaN
DISCL_CERTIFICATION_PCT                                                 NaN
DISTRICT_NAME                                                     SASSANDRA
DISTRICT_GEOCODE                                                        NaN
CAM_BUYERS                                                              NaN
IS_CAM_V3                                                               NaN
IS_JRC                                                                  NaN
JRC_BUYER_ID                                                            NaN
YEAR                                                                    NaN
 DISCL_SUPPLIER_TYPE                                                    NaN
 DISCL_YEAR                                                             NaN
TRADER                                                                  NaN
SUPPLIER_ABRVNAME                                                       NaN
SUPPLIER_FULLNAME                                                       NaN
LONGITUDE                                                               NaN
LATITUDE                                                                NaN
NUMBER_FARMERS                                                          NaN
VOLUMES                                                                 NaN
VOLUMES_UNIT                                                            NaN
CERTIFICATION_NAME                                                      NaN
CERTIFICATION_PCT                                                       NaN
Name: 2246, dtype: object
# How did Valentin handle traders vs companies? 
df_2023.columns
Index(['COMPANY', 'DISCL_COUNTRY_NAME', 'DISCL_TRADER_NAME', 'DISCL_AREA_NAME',
       'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
       'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
       'DISCL_YEAR', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
       'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME', 'DISTRICT_GEOCODE',
       'LOCALITY_NAME', 'TRADER_NAME', 'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC',
       'JRC_BUYER_ID'],
      dtype='object')
print(df_2023.COMPANY.unique())
['BARRY_CALLEBAUT' 'CARGILL' 'HERSHEY' 'MARS' 'ALFRED RITTER' 'ALTER ECO'
 'CEMOI' 'COLRUYT' 'ECOM' 'ECOOKIM' 'ETG' 'FERRERO' 'HALBA' 'MONDELEZ'
 'NESTLE' 'OLAM' 'ONEM' 'PURATOS' 'SUCDEN' 'TONYS_CHOCOLONELY'
 'UNCOMMON COCOA' 'VALRHONA' 'WHITTAKER' nan 'BARRY CALLEBAUT' 'COCOANECT'
 'TOUTON' 'BLOMMER' 'COCOASOURCE' "TONY'S CHOCOLONELY" 'ETHIQUABLE'
 'ALBERT HEIJN' 'RAINFOREST ALLIANCE']
print(df_2023.DISCL_TRADER_NAME.unique())
[nan 'CEMOI COTE D‘IVOIRE' 'Fuchs & Hoffmann Kakaoprodukte GmbH'
 'SUSCOM-CI' 'CARGILL' 'Ascot Amsterdam B.V.' 'Olam International Limited'
 'Ritter Sport Nicaragua S.A' 'Minka' 'CEMOI' 'ECOM' 'ECOOKIM' 'ETG'
 'DAARNHOUWER & CO.' 'Walter Matter Sa' 'BARRY CALLEBAUT Cocoa AG'
 'Cargill B.V.' 'Ecom Agrotrade Ltd.' 'Olam International Ltd.'
 'Touton SA' 'Barry Callebaut Cocoa AG' 'Barry Callebaut' 'Blommer/Olam'
 'Cargill' 'Cocoanect' 'Ecom' 'Sucden' 'Touton' 'COCOANECT' 'OLAM'
 'Fildisi Cocoa Industry' 'SUCDEN' 'COCOASOURCE' 'UNCOMMON COCOA']
df_2023[df_2023.COMPANY == "TONY'S CHOCOLONELY"]
COMPANY DISCL_COUNTRY_NAME DISCL_TRADER_NAME DISCL_AREA_NAME DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME DISCL_SUPPLIER_TYPE DISCL_LONGITUDE DISCL_LATITUDE DISCL_YEAR ... DISCL_CERTIFICATION_NAME DISCL_CERTIFICATION_PCT DISTRICT_NAME DISTRICT_GEOCODE LOCALITY_NAME TRADER_NAME CAM_BUYERS IS_CAM_V3 IS_JRC JRC_BUYER_ID
8382 TONY'S CHOCOLONELY IVORY_COAST NaN NaN ECAM SCOOPS ENTREPRISE COOPERATIVE DES AGRICULTEURS MODERNES COOPERATIVE -7.344635 6.750040 2019 ... FAIRTRADE NaN NaN CI-8.2.2_1 NaN NaN 2000 (TONY'S CHOCOLONELY) True NaN NaN
8535 TONY'S CHOCOLONELY IVORY_COAST NaN NaN EKOOKIM ECOJAD ENTREPRISE COOPERATIVE DES JEUNES AGRICULTEURS... COOPERATIVE -6.457590 6.875014 2019 ... FAIRTRADE; UTZ/RA NaN NaN CI-9.1.1_1 NaN NaN 1751 (TONY'S CHOCOLONELY) True NaN NaN
8537 TONY'S CHOCOLONELY IVORY_COAST NaN NaN EKOOKIM KAPATCHIVA SOCIETE COOPERATIVE KAPATCHIVA COOPERATIVE -5.739872 6.988296 2019 ... FAIRTRADE NaN NaN CI-9.2.1_1 NaN NaN 1190 (TONY'S CHOCOLONELY) True NaN NaN
10834 TONY'S CHOCOLONELY IVORY_COAST NaN NaN SOCOOPACDI SOCIETE COOPERATIVE AGRICOLE DE CAFE ET DE CAC... COOPERATIVE -5.362552 5.841540 2019 ... FAIRTRADE; UTZ NaN NaN CI-5.2.1_1 NaN NaN 1536 (TONY'S CHOCOLONELY) True NaN NaN

4 rows × 23 columns

df_2023[df_2023.DISCL_TRADER_NAME == "BARRY CALLEBAUT Cocoa AG"]
COMPANY DISCL_COUNTRY_NAME DISCL_TRADER_NAME DISCL_AREA_NAME DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME DISCL_SUPPLIER_TYPE DISCL_LONGITUDE DISCL_LATITUDE DISCL_YEAR ... DISCL_CERTIFICATION_NAME DISCL_CERTIFICATION_PCT DISTRICT_NAME DISTRICT_GEOCODE LOCALITY_NAME TRADER_NAME CAM_BUYERS IS_CAM_V3 IS_JRC JRC_BUYER_ID
4331 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN S.CA-CA Ste coop Anouanzé de N'Denou NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4332 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN CAAG COOP-CA Ste Coop Agri Allakabo de Gly NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4333 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCOOPAAD_DATTA Ste Coop Agri Akayara de Datta NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4334 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN COOP CA PM Ste Coop des Producteurs de M'Bokro NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4335 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN EWOUNBO TIASSALE Ste Coop S EWOUNBO de Tiassalé NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4336 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SOGAD Ste Gle d'Achat et de Distribution NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4337 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCAPEN Ste Coop Agri N'Zassa de Tiassalé NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4338 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN ECOGA_COOP-CA Entr coop de GABIADJI NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4339 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN ECADA Ent Coop Agri de Dagadji NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4340 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN COPACOL Ste Coop agri cot lauzoua carrefour NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4341 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCOOPS-YEYASSO Ste Coop Yeyasso de Man NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4342 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCOAVA Ste Coop Agricole de Vavoua NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4343 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN ECOAN Ste coop av CA Ecoagricole de pont nero NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4344 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN CABAK SCOOPS Ste coop Simp bada de ketro bassam NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4345 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN COOPADYA coop des prod agri dept yakasse NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4346 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCOES SCOOPS Ste Coop Espoir Simplifiée NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4347 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN LAFILA BOUMBOU Ste coop LAFI LA BOUMBOU NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4348 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN COOPINA COOP-CA Ste coop ivoir de nouveaux agriculteur NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4349 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN COOJAB COOP-CA Ste coop des jeunes agricult de Borobo NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4350 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN ECAB_BLOLEQUIN Ste Coop Elite coop agri Blolequin NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4351 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN CASIB COOP-CA Coopérative Agricole Siakadougou de Bida NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4352 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN COOPAD_DUEKOUE Ste Coop des Prod Agri de Duekoue NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4353 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCAN COOP-CA Ste Coop Agri Nesmon NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4354 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN ALLAH-KABO SCOOPS Ste Coop Simpl Allah Kabo de Duekoue NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4355 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SOCOCCI SCOOPS Ste Coop de cafe-cacao Ivoirien NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4356 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN PHENIPRAD Ste Coop Phénix de Product Agri Durable NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4357 MONDELEZ IVORY_COAST BARRY CALLEBAUT Cocoa AG NaN SCOPAF Ste Coop des Prod Agri du Fromager NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN

27 rows × 23 columns

df_2023[df_2023.DISCL_TRADER_NAME == "Barry Callebaut Cocoa AG"]
COMPANY DISCL_COUNTRY_NAME DISCL_TRADER_NAME DISCL_AREA_NAME DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME DISCL_SUPPLIER_TYPE DISCL_LONGITUDE DISCL_LATITUDE DISCL_YEAR ... DISCL_CERTIFICATION_NAME DISCL_CERTIFICATION_PCT DISTRICT_NAME DISTRICT_GEOCODE LOCALITY_NAME TRADER_NAME CAM_BUYERS IS_CAM_V3 IS_JRC JRC_BUYER_ID
4404 MONDELEZ GHANA Barry Callebaut Cocoa AG ANHWIASO A DISTRICT NaN ANHWIASO A DISTRICT NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4405 MONDELEZ GHANA Barry Callebaut Cocoa AG SEFWI BEKWAI A DISTRICT NaN SEFWI BEKWAI A DISTRICT NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4406 MONDELEZ GHANA Barry Callebaut Cocoa AG SEFWI BEKWAI B DISTRICT NaN SEFWI BEKWAI B DISTRICT NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4407 MONDELEZ GHANA Barry Callebaut Cocoa AG SEFWI WIAWSO A DISTRICT NaN SEFWI WIAWSO A DISTRICT NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4408 MONDELEZ GHANA Barry Callebaut Cocoa AG SEFWI WIAWSO B DISTRICT NaN SEFWI WIAWSO B DISTRICT NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN
4409 MONDELEZ GHANA Barry Callebaut Cocoa AG SEFWI WIAWSO C DISTRICT NaN SEFWI WIAWSO C DISTRICT NaN NaN NaN 2021 ... COCOA LIFE 100.0 NaN NaN NaN NaN NaN False NaN NaN

6 rows × 23 columns

df_2023[df_2023.DISCL_TRADER_NAME == "Barry Callebaut"]
COMPANY DISCL_COUNTRY_NAME DISCL_TRADER_NAME DISCL_AREA_NAME DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME DISCL_SUPPLIER_TYPE DISCL_LONGITUDE DISCL_LATITUDE DISCL_YEAR ... DISCL_CERTIFICATION_NAME DISCL_CERTIFICATION_PCT DISTRICT_NAME DISTRICT_GEOCODE LOCALITY_NAME TRADER_NAME CAM_BUYERS IS_CAM_V3 IS_JRC JRC_BUYER_ID
4410 NESTLE BRAZIL Barry Callebaut NaN NaN NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4411 NESTLE IVORY_COAST Barry Callebaut NaN DUEKOUE NORD-BIO NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4412 NESTLE IVORY_COAST Barry Callebaut NaN CABZOU COOP-CA NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4413 NESTLE IVORY_COAST Barry Callebaut NaN CEXPAG COOP-CA NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4414 NESTLE IVORY_COAST Barry Callebaut NaN COOTIEN-OULY NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4415 NESTLE IVORY_COAST Barry Callebaut NaN COOP-CA ABO NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4416 NESTLE IVORY_COAST Barry Callebaut NaN SCOOPADOB NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4417 NESTLE IVORY_COAST Barry Callebaut NaN SOCOOPAK COOP-CA NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4418 NESTLE IVORY_COAST Barry Callebaut NaN CABID NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4419 NESTLE IVORY_COAST Barry Callebaut NaN SCJPAB COOP-CA NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN
4420 NESTLE IVORY_COAST Barry Callebaut NaN SCOABIA COOP-CA (BC) NaN NaN NaN NaN 2020 ... COCOA PLAN; RAINFOREST ALLIANCE NaN NaN NaN NaN NaN NaN False NaN NaN

11 rows × 23 columns

import pandas as pd

df = pd.read_csv('/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_messy_2024.csv')
df.columns
Index(['TRADER_NAME', 'DISCL_TRADER_NAME', 'DISCL_SUPPLIER_ABRVNAME',
       'DISCL_AREA_NAME', 'DISCL_CERTIFICATION_NAME', 'DISCL_NUMBER_FARMERS',
       'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'COUNTRY', 'TYPE', 'DISCL_YEAR',
       'DISCL_SUPPLIER_TYPE', 'COMPANY', 'DISCL_SUPPLIER_FULLNAME',
       'LOCALITY_NAME', 'INDEX', 'DISCL_SUPPLIER_NAME', 'DISCL_VOLUMES',
       'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME',
       'DISTRICT_GEOCODE', 'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID',
       'YEAR', ' DISCL_SUPPLIER_TYPE', ' DISCL_YEAR', 'TRADER',
       'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LONGITUDE', 'LATITUDE',
       'NUMBER_FARMERS', 'VOLUMES', 'VOLUMES_UNIT', 'CERTIFICATION_NAME',
       'CERTIFICATION_PCT'],
      dtype='object')
df_2024_cols_ordered = ['TRADER_NAME', 'TRADER', 'DISCL_TRADER_NAME', # TODO: Reduce to 'DISCL_TRADER_NAME'
 'COMPANY', 
        'YEAR', ' DISCL_YEAR', 'DISCL_YEAR', # TODO: Reduce to 'DISCL_YEAR'
        'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'LATITUDE', 'LONGITUDE', # TODO: Reduce to 'DISCL_LATITUDE' and 'DISCL_LONGITUDE'
       'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_NAME', 'DISCL_SUPPLIER_FULLNAME', # TODO: Reduce to 'DISCL_SUPPLIER_ABRVNAME' and 'DISCL_SUPPLIER_FULLNAME'
       'DISCL_AREA_NAME', 'LOCALITY_NAME', 'DISTRICT_GEOCODE', 'DISTRICT_NAME', # TODO: Reduce to 'DISCL_AREA_NAME' and 'DISTRICT_GEOCODE'
               'TYPE', 'DISCL_SUPPLIER_TYPE', ' DISCL_SUPPLIER_TYPE', # TODO: Reduce to 'DISCL_SUPPLIER_TYPE'
       'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT',
       'DISCL_NUMBER_FARMERS',
        'COUNTRY', # TODO: Call it DISCL_COUNTRY_NAME
       'INDEX', # TODO: Remove
       'DISCL_VOLUMES','DISCL_VOLUMES_UNIT',  
       'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID']
df_2023_columns_ordered = ['DISCL_TRADER_NAME', 'TRADER_NAME', 
                           'COMPANY', 
                        'DISCL_YEAR', 
                        'DISCL_LONGITUDE', 'DISCL_LATITUDE',
                        'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
                           'DISCL_AREA_NAME','LOCALITY_NAME', 'DISTRICT_GEOCODE', 'DISTRICT_NAME',
       'DISCL_SUPPLIER_TYPE', 
       'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT',
       'DISCL_NUMBER_FARMERS', 
       'DISCL_COUNTRY_NAME', 

       'DISCL_VOLUMES','DISCL_VOLUMES_UNIT', 
        'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC','JRC_BUYER_ID']

Discern difference between similar columns

print(df_2023['TRADER_NAME'].unique())
print(df_2023['DISCL_TRADER_NAME'].unique())
[nan]
[nan 'CEMOI COTE D‘IVOIRE' 'Fuchs & Hoffmann Kakaoprodukte GmbH'
 'SUSCOM-CI' 'CARGILL' 'Ascot Amsterdam B.V.' 'Olam International Limited'
 'Ritter Sport Nicaragua S.A' 'Minka' 'CEMOI' 'ECOM' 'ECOOKIM' 'ETG'
 'DAARNHOUWER & CO.' 'Walter Matter Sa' 'BARRY CALLEBAUT Cocoa AG'
 'Cargill B.V.' 'Ecom Agrotrade Ltd.' 'Olam International Ltd.'
 'Touton SA' 'Barry Callebaut Cocoa AG' 'Barry Callebaut' 'Blommer/Olam'
 'Cargill' 'Cocoanect' 'Ecom' 'Sucden' 'Touton' 'COCOANECT' 'OLAM'
 'Fildisi Cocoa Industry' 'SUCDEN' 'COCOASOURCE' 'UNCOMMON COCOA']
print(df_2023['DISCL_AREA_NAME'].unique())
print(df_2023['LOCALITY_NAME'].unique())
print(df_2023['DISTRICT_NAME'].unique())
print(df_2023['DISTRICT_GEOCODE'].unique())
[nan 'ANTOAKROM' 'ASSIN FOSU A & B' 'WASSA AKROPONG' 'MANSO AMENFI'
 'DUNKWA' 'TWIFO PRASO' 'SAMREBOI' 'AWASO' 'NEW EDUBIASE' 'BOGOSO'
 'ASANKRANGWA' 'Agroecom Ghana Limited, Antoakrom A'
 'Agroecom Ghana Limited, Diaso' 'Agroecom Ghana Limited, Agona B'
 'Agroecom Ghana Limited, Sankore B' 'Awaso' 'Bafia'
 'Climate Cocoa Cooperative Union' 'Cocoa Merchants' 'Cocoa Abrabopa'
 'Fedco' 'Goaso' 'Kasapin' 'Kwapong' 'Kokoo Pa'
 'Olam Ghana Climate Cocoa-Asawinso' 'Olam Ghana Climate Cocoa-Kakum'
 'Sankore' 'Sunyani' 'ABOISSO' 'DUEKOUE' 'GNIPI' 'AFFERY' 'DIANGOBO'
 'AGBOVILLE' 'GRAND MORIE' 'LAKOTA' 'GAGNOA' 'FRESCO' 'DIVO' 'TIASSALE'
 'BEREBY' 'TOUIH' 'ABENGOUROU' 'GUITRY' 'PAULY' 'AKOUPE' 'ISSIA'
 'GUIBEROUA' 'GRAND LAHOU' 'GUIGLO' 'TAI' 'SAN-PEDRO' 'BLOLEQUIN' 'SINFRA'
 'SASSANDRA' 'BAYOTA' 'GUIDRY' 'SAN PEDRO' 'SOUBRE' 'OUME' 'GRABO'
 'DIEGONEFLA' 'TABOU' 'YAKASSE ATTOBROU' 'CECHI' 'ZAGNE' 'SEGUELLA'
 'UCAYALI' 'SAO TOME' 'GUAYAS' 'SANTO DOMINGO' 'TAABO' 'HERMANGONO'
 'ADZOPE' 'Abengourou' 'Blolequin' 'Daloa' 'Meagui' 'Guiglo' 'Danané'
 'MENEE CENTRE' 'Sooubre' 'MEAGUI' 'Vavoua' 'San Pedro' 'OUPOUYO' 'SOFOCI'
 'Gagnoa' 'Grobonou Dan' 'Soubré' 'Zoukougbeu' 'Bonon' 'Abenase'
 'Adabokrom' 'Agona' 'Agona Amenfi' 'Agona B (Effiduase)' 'Akim Oda'
 'Akontombra' 'Antoakrom' 'Asafo' 'Asamankese' 'Asankragwa' 'Asawinso'
 'Asempanaye' 'Assin Breku' 'Assin Fosu' 'Asumura' 'Bekwai' 'Bepoh'
 'Bibiani' 'Boako' 'Bogoso' 'Bonsu' 'Brakwa' 'Breman Asikuma' 'Dadieso'
 'Debiso' 'Diaso' 'Dunkwa' 'Elluokrom' 'Enchi' 'Essam' 'Juaboso' 'Kibi'
 'Kukuom' 'Mankranso' 'Manso Amanfi' 'New Edubiase' 'Nkawie' 'Nkawkaw'
 'Nyinahin' 'Nyinase' 'Obuasi' 'Offinso' 'Ofoase' 'Papase' 'Proso'
 'Samreboi' 'Sefwi Bekwai' 'Sefwi Wiawso' 'Suhum' 'Tarkwa' 'Tepa'
 'Twifo Praso' 'Wassa Akropong' 'Cross River State' 'Kajola, Ondo state'
 'Odigbo, Ondo State' 'Alade,Ondo State' 'Owo, Ondo State'
 'Ondo, Ondo State' 'Ilesa, Osun State' 'Modakeke, Osun State'
 'Ife, Osun State' 'Ijebu, Ogun State' 'Tarapoto' 'San Alejandro'
 'Tocache' 'Quillabamba' 'Sisa' 'Tingo Maria' 'Ciudad Constitución'
 'Guayas' 'Los Rios' 'Manabi' 'El Oro' 'Esmeraldas' 'BONON' 'Bodi'
 'Lakota' 'Sassandra' 'Dagadji' 'Sanegourifla' 'Sinfra' 'Ayame'
 'Ebilassoukro' 'Issia' 'Taaboo' 'Goudí' 'Bouaflé' 'Tiassalé' 'Divo'
 'Aboisso' 'Fresco' 'Hermankono-Diès' 'Dibobly' 'Tinhou' 'Okrouyo'
 'Guehiebly' 'Gazibouo' 'Gadouan' 'Biankouma' 'Toumbokro' 'Duekoue'
 'Zagné' 'Abidjan' 'Guessiguie' 'Taabo' 'Socofi' 'Danane' 'San-Pedro'
 'Yakassé Attobrou' 'Bayota' 'Tabou' 'Grand-Zattry' 'Soubre' 'Apprompron'
 'Man' 'Kouibly' 'Robert Porte' 'Grand-Lahou' 'KUMASI' 'AGONA DUAKWA'
 'AKONTOMBRA' 'ANHWIASO' 'ANYINAM' 'ASANKRAGWA' 'ASAWINSO' 'ASEMPANEYE'
 'ASHANTI BEKWAI' 'BIBIANI' 'BONSU NKWANTA' 'DADIESO' 'ELLUOKROM' 'ENCHI'
 'JUABOSO' 'JUASO' 'KOFORIDUA' 'KONONGO' 'NKAWIE' 'NSOKOTE' 'OBUASI'
 'PROSO' 'SUHUM' 'TEPA' 'WASA AKROPONG' 'Ibadan' 'Okada / Auchi' 'Ife'
 'Ijebu Ode' 'Ikom' 'Ijebu-Ife' 'Akure / Owo' 'Ife / Ilesha'
 'Odigbo / Ofosu' 'Obgbere / Omifunfun' 'Akonwanjo' 'OBALA'
 'Paraiso La 14' 'Guayaquil' 'Norcasia' 'Victoria' 'Tello' 'Falan' 'Maceo'
 'Tarqui' 'Montenegro' 'Hacarí' 'Duekue' 'Yamoussoukro' 'Betesso' 'Kotobi'
 'Kumasi' 'Agona Swedru / DUAKWA' 'Agona Juaben' 'Suhum - Kibi'
 'Asempaneye' 'Ashanti Bekwai' 'Bonsu Nkwanta' 'Juaso' 'Koforidua'
 'Konongo-Odumase' 'Nsokote' 'Okada, Auchi' 'Ogbere / Omifunfun'
 'Ijebu / Ode' 'Obala' 'Meyomessi' 'Triunfo' 'Quevedo' 'Caracoli' 'Neiva'
 'Caicedonia' 'Fresno' 'Garzón' 'Quimbaya' 'San Pedro de Uraba' 'Anserma'
 'Ocaña' 'San Juan de Mariquita' 'Bangolo' 'Moussadougou' 'Gnipi 2'
 'Menée centre/Gabiadji' 'Biankouman' 'ADABOKROM' 'AGONA JUABEN' 'DIASO'
 'EFFIJAASE' 'FOSUKROM' 'JAMASI' 'MANKRANSO' 'NKASEIM' 'NYINAHIN'
 'OFFONSO ABOFORO' 'SANKORE' 'SEFWI BEKWAI' 'TARKWA' 'Ado Ekiti'
 'Ondo/Akure' 'Cerecita' 'Vinces' 'La Concordia' 'Quinindé' 'La Troncal'
 'Virgen del Fatima' 'Chigorodó' 'Gigante' 'Bucaramanga'
 'San Juan de\nMariquita' 'ANHWIASO A DISTRICT' 'SEFWI BEKWAI A DISTRICT'
 'SEFWI BEKWAI B DISTRICT' 'SEFWI WIAWSO A DISTRICT'
 'SEFWI WIAWSO B DISTRICT' 'SEFWI WIAWSO C DISTRICT' 'Ilhéus - Bahia'
 'Ipiaú - Bahia' 'Itubera - Bahia' 'Gandu - Bahia' 'Itabuna - Bahia'
 'Mutuipe - Bahia' 'São Miguel das Matas  - Bahia' 'Anapu - Pará'
 'Altamira - Pará' 'Novo Repartimento - Pará' 'São Félix do Xingu - Pará'
 'Tucumã - Pará' 'Uruará - Medicilândia - Pará' 'Medicilândia - Pará'
 'Makenene' 'Ngoro' 'Voundou' 'Mbangassina' 'Ntui' 'Nguila' 'Ngomedzap'
 'Ayos' 'Abongoua' 'Adzopé' 'Bahe-Sebon' 'Bâcon' 'Guépahouo'
 'Hermakono Garo' 'Dogbo' 'Bloléquin' 'Adebem' 'Téapleu' 'Duékoué'
 'Sahoua' 'Opouyo' 'Bonoufla' 'Mahapleu' 'Kahen' 'Yaou/Aboisso' 'Arrah'
 'Toumodi' 'Ananguié' 'Hermankono-Dies' 'Djoroplo' 'Sasa-manois' 'Gagny'
 'Nando' 'Kokolilie (Lakota)' 'Mene Centre' 'Oumé' 'Gbagbam' 'Monokozohi'
 'Gnato' 'Gnago 1' 'Lobogrou/ Lakota' 'Guitry' 'Diby' 'Aniassué'
 'Bongouanou' 'Afféry' 'Aboulayekro' 'Walebo' 'Patricekro' 'Grabo'
 'Petit Grabo' 'Grand Lahou' 'Dassioko' 'Agboville' 'Balokuya' 'Bianouan'
 'Dabouyo' 'Akoupé' 'Djimon/ Lakota' 'Aboisso Comoé' 'Yorodougou' 'vavoua'
 'Niapidou' 'Watté' "N'Douci" 'Ketesso' 'gnato' 'Hire' 'Neka' 'Djapadji'
 'TrawinninkroV8' 'Diangobo' 'Dokpodon' 'Douaville' 'Karie' 'Mahino'
 'Sikensi' 'Sago' 'Santo Domingo' 'EL ORO' 'Elloukrom' 'Wiawso' 'Nsawora'
 'Anhwiaso' 'Kakum' 'Lima Puluh Kota' 'Padang' 'Kolaka Utara'
 'Kolaka-Kolaka Timur' 'Kolaka' 'Kolaka Timur' 'Banggai' 'Parimo'
 'Kota Poso' 'Luwu Utara' 'Luwu Timur' 'Bantaeng' 'Mamuju'
 'Polewali Mandar' 'Pringsewu' 'Tanggamus' 'Pesawaran & Pringsewu'
 'Pesawaran' 'Way Ratai' 'Lampung Tengah' 'Lampung Timur' 'Lampung'
 'Simalungun - Tanah Karo' 'Cross River' 'Ondo' 'Owo' 'Owo\nOsun' 'Osun'
 'Idanre' 'Ogun' 'UMI-ATZERO RURAL' 'AHI RURAL' 'ONGA-WAFFA RURAL'
 'LAE URBAN' 'TRANSGOGOL RURAL' 'SALAKI' 'Gabensis' 'INLAND BAINING RURAL'
 'ALMAMI RURAL' 'LABUTA RURAL' 'Tuvi' 'mewa' 'WAMPAR RURAL' 'Oronga'
 'Numa Blk5' 'WEWAK URBAN' 'BANA RURAL' 'ARAWA RURAL' 'BAIMURU'
 'UMI-ATZERO RURAL\nMainoki' 'Ebenan' 'Numa Blk4' 'Avaipa' 'Siuema' 'Yall'
 'Kurai' 'Ralubang' 'BOEMSARA' 'Numa Blk1' 'KASMIN TWO'
 'CENTRAL GAZELLE RURAL' 'Parawen' 'DIVISION 3' 'Kadaulung' 'BALBAL AREA'
 'Tavidua' 'BITAPAKA RURAL' 'Tadup' 'HAGAMA' 'Mainoki' 'NUMIESOLI'
 'Sundumbu' 'WANTOAT-LERON RURAL' 'Jobtou\nKurai' 'NIMBIH' 'WAMAYANG'
 'Amaimon' 'Mare' 'YEKIMBOLYE' 'North Nasioi' 'Kuruvina' 'Kerei West'
 'KASMIN ONE' 'KAUKAMBAR' 'BUKIENDUON' 'NIMIESOLI' 'ahi rural' 'Alemo'
 'AMBENOB RURAL' 'Mafanazo' 'Sauruan' 'HARUA' 'Wampan' 'Siriwarang'
 'NIAKANDOGUM' 'Iwap' 'GAVIEN DIVISION' 'Kagi'
 'WAMPAR RURAL\nTRANSGOGOL RURAL' 'Sehan' 'Derin' 'Chiatz' 'Malarina'
 'Kuriguma' 'Ngaruwain' 'SAUSENDUON' 'PARINA' 'SIMBOMIE' 'Jobtou'
 'HERENGEI' 'USINO RURAL' 'AMBIKANJA' 'KUMUN' 'BUBUAMU' 'YAREBOEM'
 'MARINGE' 'Kopani 3' 'BUKSAK' 'KUFAR' 'Bai' 'HANGRERAK' 'SENGRI'
 'KWARAPRI' 'SOLI (1)' 'TOMA-VUNADIDIR RURAL' 'GAVIEN DIV 2'
 'MANDIEN/SOLI' 'KOKOPO-VUNAMAMI URBA' 'MADANG URBAN' 'Marangints'
 'Panguna' 'Gonoa' 'TUONUMBO' 'Remboini- Aropa' 'SUMGILBAR RURAL' 'BALBAL'
 'Kopei' 'GAVIEN SCHEEM' 'KOTTE RURAL' 'SAINA' 'YAWAR RURAL' 'MARAMBANJA'
 'HOWI' 'Apusipa' 'Itaupa' 'Mandress' 'JAPARAKA (1)'
 'LASSUL BAINING RURAL' 'PAPAROM' 'SIRIAR' 'Miike' 'Atoma'
 'SUMGILBAR RURAL\nALMAMI RURAL' 'Sirovai' 'SAINA\nRumba' 'Domakung'
 'Karapia' 'Balama' 'Tagitagi' 'LIVUAN-REIMBER RURAL' 'South Nasioi'
 'NANGUMARAM' 'IGURUWE' 'Kokoda' 'KWAHUIE' 'ARIAPAN' 'KIARIVU'
 'UMI-ATZERO RURAL\nHIMBRU/BUKINARE' 'Binimamp' 'Yalu' 'SAUSENUMBOHU'
 'JAPARAKA (3)\nUMI-ATZERO RURAL' 'UMI-ATZERO RURAL\nINLAND BAINING RURAL'
 'Ragitamut' 'Biviaro' 'ASTROLABE BAY RURAL' 'RABAUL URBAN' 'Numa Blk2'
 'SALAKI-KUNAI HEIGHTS' 'Kerei East' 'NABAK RURAL' 'WAMAIAN' 'Warus'
 'KUSAUN' 'Tsili Tsili' 'WANINGIAGUM' 'Parawen\nBALBAL AREA' 'NAMBARI'
 'Atoh' 'ALMAMI RURAL\nUMI-ATZERO RURAL' 'HOLIK HOWI' 'Bubukwanga' 'Humya'
 'Mataisa' 'Bundinyama' 'Buganikere' 'Busaru' 'Hakitara' 'Kaghema' 'Kuka'
 'Mirambi' 'Kirindi' 'Busoru' 'Bugombwa' 'Bundimulinga' 'Busunga' 'Lamia'
 'Masule 2' 'Lembe-Yezoum' 'Simanyai' 'SUHUM MUNICIPAL' 'Cape coast'
 'Tontokrom' 'Nkrumakrom' 'Nipankyeremia' 'Manukrom' 'Manso Bonsaaso'
 'Kobiriso' 'Keniago' 'Hiamankwa' 'Fahiakobo' 'Dewusaso' 'Datano'
 'Bayerebon No 6' 'Ayiem' 'Assamang' 'Apenimade' 'Akyerekyerekrom'
 'Abuoso' 'Aboi Nkwanta' 'Amoakokrom' 'Amoamang A' 'Amoamang B'
 'Asuogya A' 'Asuogya B' 'Asuosu' 'Attobrakrom' 'Bepokokoo' 'Bremang'
 'Brodziekrom' 'Domeabra' 'Gonukrom' 'Gyaman' 'Gyaman A' 'Koduakrom'
 'Meteameba' 'Nkrankrom' 'Nsabrekwa A' 'Nsabrekwa B' 'Nsabrekwa Nkubeim'
 'Nyame Nnae' 'Oda Kotoamso' 'Pantoso' 'Shed' 'Sika Ye Akono' 'Tena Adjei'
 'Wassa Dunkwa A' 'Wassa Dunkwa B' 'Wassa Dunkwa C' 'Wassa Saa' 'Yirase'
 'Yirase B' 'Yirase C' 'ASSIN FOSU A' 'Abakrampah' 'Abin' 'Abura Dunkwa'
 'Adiembra ' 'Akrofuom' 'Beyeden' 'Bosomadwe ' 'Darmang' 'Jakai' 'Kruwa '
 'Kwafo' 'Ningo ' 'Nsenne ' 'Nsuakyi' 'ASSIN FOSU B' 'Achiase'
 'Amanbete A' 'Amanbete B' 'Anyinabrim' 'Asorefie' 'Ayigo' 'Besease'
 'Bunsu' 'Dawomako' 'Dwendama' 'Homaho' 'Katakyiase' 'Keteaba' 'Mankata'
 'Ngresi' 'Nyamebekyere A' 'Nyamebekyere B' 'Sibinso' 'Tabil' 'Taylorkrom'
 'Tehente' 'Aboabo' 'Aboanidua' 'Adobewura 2' 'Adobewura No. 1A '
 'Adobewura No.1B' 'Alata' 'Ampenkrom' 'Ankamateng' 'Ashiam A' 'Ashiam B '
 'Chiraa' 'Chirano' 'Humjebre' 'Humjebre B' 'Kofikrom A' 'Kofikrom B'
 'Kofikrom C' 'Kronua' 'Kyeikrom ' 'Manse' 'Muoho' 'Muoho B' 'Paboase'
 'Sanfifre' 'Sefwi-Bekwai A' 'Sefwi-Bekwai B' 'Sorano ' 'Subiri'
 'Abekakrom' 'Aboaboso' 'Adadekrom' 'Agyirekrom' 'Akaatiso' 'Ayensukrom'
 'Brakwa Line' 'Camp' 'Dorcas' 'Draw' 'Gyampre C' 'Gyampre D' 'Gyampre E'
 'Gyampre F' 'Kutukrom' 'Larbikrom' 'Manseso' 'New Aboi' 'Next Time'
 'Nkrumahkrom' 'Nyamebekyere' 'Obengkrom' 'Opon Valley A' 'Opon Valley B'
 'Tabakrom' 'Tettehkrom' 'Yawkrom' 'DUNKWA ' 'Addokrom' 'Amissah'
 'Ayanfuri' 'Buabeng Camp' 'Dunkwa Bebianiha' 'Kwameprakrom' 'Marfokrom'
 'Mfanteman' 'Nkyirifi' 'Meretweso' 'Pokukrom' 'Powerline' 'Teacherkrom'
 'Denkyira Bebianiha' 'Kwaameprakrom B' 'Mbradan' 'Imbraim' 'Fawomanyo'
 'Anwiawia' 'Atobease' 'Mfoum' 'Kyekyewere' 'Kontokrom' 'Sobroso'
 'Pokukrom 2' 'Tegyamoso' 'Kwagyansa' 'Oponso' 'Agyakwabena Krom'
 'Anakum A' 'Amuni ' 'Akyekyere B' 'Broninkwanta' 'Kramokrom' 'Asasetre '
 'Manso Amenfi' 'Manso Camp' 'Anakum B' 'Merewamerewa ' 'Obeng '
 'State Farm' 'Balowankwanta' 'Afukey A' 'Afukey B' 'Afukey C'
 'Akyekyere A' 'Bonuama' 'Gyakontabuo' 'Kwawbadukrom' 'Antobam'
 'Wassa Bekwai' 'Agyedum' 'Bantama' 'Kojo- Yaw' 'Fantikrom' 'Nipahiamoah'
 'Bronikrom' 'Offeikrom' 'Fumso A' 'Kojo Ntumi' 'Tweapease' 'Kwame Adu'
 'Kojo Apreh' 'Obuobi' 'Akutreso' 'Menan' 'Subriso' 'T.Camp 1' 'Fumso B'
 'Nsuta' 'Oparekrom' 'Ankaase' 'Atweneboana' 'Kenyakrom' 'Alhajikrom'
 'Apagya' 'Krobokrom' 'Oseikrom' 'Adansi Swedru' 'T.Camp 11' 'Abenyia'
 'Aboifie' 'Ahuntem' 'Akokora Krom' 'Amoaku' 'Aniantetem' 'Aniehu'
 'Ayensu Nkwanta' 'Bisaaso 1' 'Bisaso 2' 'Fobikrom' 'Forest'
 'Kwame Buokrom' 'Kyemuso' 'Patase Asoredan Ho' 'Tanaa' 'Tanokrom'
 'Wansema C' 'Woman No Good A' 'Takyikrom ' 'Mampoma' 'Tsimtsimhwe '
 'Baakondzidzi' 'Atwereboanda' 'Ganukrom ' 'Nyamebekyere ' 'Dawomase'
 'Kotokye' 'Kwamoano ' 'Akwakrom ' 'Somnyamekodru ' 'Bepobeng '
 'Twasokoda' 'Atwereboanda B' 'Ahenkro' 'Ahenkro A' 'Mampoma B' 'Ankaako'
 'Agyinkrom' 'Morkwa' 'Esaase' 'Abrokyirekrobo' 'Acheamponkrom' 'Adansi A'
 'Adansi B' 'Afransie' 'Ankwagya' 'Buraso' 'Amanikrom' 'Dikoto Junction'
 'Dompoase' 'Grumisah A' 'Gyampawuah A' 'Gyampawuah B' 'Gyapa' 'Gyedua'
 'Jukwa' 'Krofrom' 'Nyarkokrom' 'Teyekrom' 'Ohianhyeda' 'Hemang' 'Aduman'
 'Adesu' 'Nananko' 'Wuratrem A ' 'Wuratrem B' 'Wuratrem C' 'Wuratrem D'
 'Grumisah B' 'Grumisah C' 'Tarkwa Breman' 'Eshereso' 'Adonnoi']
[nan]
[nan]
[nan 'CI-2.3.1_1' 'CI-2.2.2_1' 'CI-2.2.5_1' 'CI-8.1.1_1' 'CI-8.2.1_1'
 'CI-8.2.2_1' 'CI-9.1.4_1' 'CI-9.1.1_1' 'CI-9.1.3_1' 'CI-9.2.2_1'
 'CI-9.2.1_1' 'CI-5.2.1_1' 'CI-3.2.1_1' 'CI-2.1.1_1' 'CI-14.1.3_1'
 'CI-14.2.1_1' 'CI-4.2.3_1' 'CI-1.1.1_1' 'CI-3.1.1_1' 'CI-7.1.2_1'
 'CI-2.1.2_1' 'CI-3.1.3_1' 'CI-8.3.1_1' 'CI-7.2.2_1' 'CI-6.4.1_1'
 'CI-5.1.1_1' 'CI-13.1.2_1' 'CI-7.1.1_1' 'CI-8.3.3_1' 'CI-7.1.4_1'
 'CI-8.1.2_1' 'CI-14.2.5_1' 'CI-6.3.1_1' 'CI-14.2.2_1' 'CI-2.2.3_1'
 'CI-7.2.1_1' 'CI-8.1.3_1' 'CI-9.1.2_1' 'CI-7.3.3_1' 'CI-2.3.2_1'
 'CI-5.1.2_1' 'CI-5.2.2_1' 'CI-8.2.4_1' 'CI-2.2.1_1' 'CI-7.3.2_1'
 'CI-5.2.3_1' 'CI-12.3.2_1' 'CI-14.2.4_1' 'CI-8.3.2_1' 'CI-3.1.2_1'
 'CI-6.1.2_1' 'CI-9.2.3_1' 'CI-7.3.4_1' 'CI-7.3.1_1' 'CI-6.3.2_1'
 'CI-6.1.1_1' 'CI-11.1.1_1' 'CI-8.3.5_1' 'CI-3.2.7_1' 'CI-6.1.4_1'
 'CI-7.1.3_1' 'CI-6.2.3_1' 'CI-6.4.2_1' 'CI-8.2.3_1' 'CI-6.3.3_1'
 'CI-12.3.1_1' 'CI-6.1.3_1' 'CI-3.2.5_1' 'CI-12.1.2_1' 'CI-12.1.3_1'
 'CI-11.1.3_1' 'CI-13.1.1_1' 'CI-10.2.2_1' 'CI-8.3.4_1' 'CI-6.2.1_1'
 'CI-8.1.4_1' 'CI-12.2.2_1' 'CI-11.1.5_1' 'CI-3.2.2_1' 'CI-12.1.1_1']

Make 2024 columns match 2023 columns

df.rename(columns={'COUNTRY': 'DISCL_COUNTRY_NAME'}, inplace=True)
# TODO: Reduce to 'DISCL_TRADER_NAME'

# Where 'DISCL_TRADER_NAME' is NaN, copy over 'TRADER_NAME'
mask = df['DISCL_TRADER_NAME'].isna() & df['TRADER_NAME'].notna()
df.loc[mask, 'DISCL_TRADER_NAME'] = df.loc[mask, 'TRADER_NAME']

# Where 'DISCL_TRADER_NAME' is NaN, copy over 'TRADER'
mask = df['DISCL_TRADER_NAME'].isna() & df['TRADER'].notna()
df.loc[mask, 'DISCL_TRADER_NAME'] = df.loc[mask, 'TRADER']

# Where 'COMPANY' is NaN, copy over 'TRADER_NAME'
mask = df.COMPANY.isna()&df.TRADER_NAME.notna()
df.loc[mask, 'COMPANY'] = df.loc[mask, 'TRADER_NAME']

df.drop(columns=['TRADER_NAME', 'TRADER'], inplace=True)
df.drop(columns=['INDEX'], inplace=True)
# TODO: Reduce to 'DISCL_YEAR'

# Where 'DISCL_YEAR is NaN, copy over ' DISCL_YEAR'
mask = df['DISCL_YEAR'].isna() & df[' DISCL_YEAR'].notna()
df.loc[mask, 'DISCL_YEAR'] = df.loc[mask, ' DISCL_YEAR']

# Where 'DISCL_YEAR' is NaN, copy over 'YEAR'
mask = df['DISCL_YEAR'].isna() & df['YEAR'].notna()
df.loc[mask, 'DISCL_YEAR'] = df.loc[mask, 'YEAR']

print(df[df.DISCL_YEAR.isna() & df[' DISCL_YEAR'].notna()])
print(df[df.DISCL_YEAR.isna() & df['YEAR'].notna()])

df.drop(columns=[' DISCL_YEAR', 'YEAR'], inplace=True)
Empty DataFrame
Columns: [DISCL_TRADER_NAME, DISCL_SUPPLIER_ABRVNAME, DISCL_AREA_NAME, DISCL_CERTIFICATION_NAME, DISCL_NUMBER_FARMERS, DISCL_LONGITUDE, DISCL_LATITUDE, DISCL_COUNTRY_NAME, TYPE, DISCL_YEAR, DISCL_SUPPLIER_TYPE, COMPANY, DISCL_SUPPLIER_FULLNAME, LOCALITY_NAME, DISCL_SUPPLIER_NAME, DISCL_VOLUMES, DISCL_VOLUMES_UNIT, DISCL_CERTIFICATION_PCT, DISTRICT_NAME, DISTRICT_GEOCODE, CAM_BUYERS, IS_CAM_V3, IS_JRC, JRC_BUYER_ID, YEAR,  DISCL_SUPPLIER_TYPE,  DISCL_YEAR, SUPPLIER_ABRVNAME, SUPPLIER_FULLNAME, LONGITUDE, LATITUDE, NUMBER_FARMERS, VOLUMES, VOLUMES_UNIT, CERTIFICATION_NAME, CERTIFICATION_PCT]
Index: []

[0 rows x 36 columns]
Empty DataFrame
Columns: [DISCL_TRADER_NAME, DISCL_SUPPLIER_ABRVNAME, DISCL_AREA_NAME, DISCL_CERTIFICATION_NAME, DISCL_NUMBER_FARMERS, DISCL_LONGITUDE, DISCL_LATITUDE, DISCL_COUNTRY_NAME, TYPE, DISCL_YEAR, DISCL_SUPPLIER_TYPE, COMPANY, DISCL_SUPPLIER_FULLNAME, LOCALITY_NAME, DISCL_SUPPLIER_NAME, DISCL_VOLUMES, DISCL_VOLUMES_UNIT, DISCL_CERTIFICATION_PCT, DISTRICT_NAME, DISTRICT_GEOCODE, CAM_BUYERS, IS_CAM_V3, IS_JRC, JRC_BUYER_ID, YEAR,  DISCL_SUPPLIER_TYPE,  DISCL_YEAR, SUPPLIER_ABRVNAME, SUPPLIER_FULLNAME, LONGITUDE, LATITUDE, NUMBER_FARMERS, VOLUMES, VOLUMES_UNIT, CERTIFICATION_NAME, CERTIFICATION_PCT]
Index: []

[0 rows x 36 columns]
# TODO: Reduce to 'DISCL_LATITUDE' and 'DISCL_LONGITUDE'

mask = df['DISCL_LATITUDE'].isna() & df['LATITUDE'].notna()
df.loc[mask, 'DISCL_LATITUDE'] = df.loc[mask, 'LATITUDE']

mask = df['DISCL_LONGITUDE'].isna() & df['LONGITUDE'].notna()
df.loc[mask, 'DISCL_LONGITUDE'] = df.loc[mask, 'LONGITUDE']

df.drop(columns=['LATITUDE', 'LONGITUDE'], inplace=True)
# TODO: Reduce to 'DISCL_SUPPLIER_ABRVNAME' and 'DISCL_SUPPLIER_FULLNAME'


# Fill DISCL_SUPPLIER_ABRVNAME and DISCL_SUPPLIER_FULLNAME from other supplier columns,
# then drop the redundant columns.

# make a copy to be safe if you want to compare before/after
df = df.copy()

# Normalize empty strings -> NA on the relevant columns
supplier_cols = ['SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_ABRVNAME',
                 'DISCL_SUPPLIER_NAME', 'DISCL_SUPPLIER_FULLNAME']
df[supplier_cols] = df[supplier_cols].replace(r'^\s*$', pd.NA, regex=True)

# Build DISCL_SUPPLIER_ABRVNAME: choose the first non-null value from preferred sources
df['DISCL_SUPPLIER_ABRVNAME'] = df[
    ['DISCL_SUPPLIER_ABRVNAME', 'SUPPLIER_ABRVNAME']
].bfill(axis=1).iloc[:, 0]

# Build DISCL_SUPPLIER_FULLNAME: choose the first non-null value from preferred sources
df['DISCL_SUPPLIER_FULLNAME'] = df[
    ['DISCL_SUPPLIER_FULLNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_NAME']
].bfill(axis=1).iloc[:, 0]

# Optional: tidy whitespace on the new/filled columns
for c in ['DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME']:
    df[c] = df[c].astype('string').str.strip().replace({'': pd.NA})

# Drop the now-redundant columns
df.drop(columns=['SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_NAME'], inplace=True)

# Quick verification prints (delete/adjust in notebook as you like)
print("Filled DISCL_SUPPLIER_ABRVNAME: non-null count =", df['DISCL_SUPPLIER_ABRVNAME'].notna().sum())
print("Filled DISCL_SUPPLIER_FULLNAME: non-null count =", df['DISCL_SUPPLIER_FULLNAME'].notna().sum())

# Example: show rows where a DISCL_ field was filled from another field
# (run this before/after if you want to compare; here we show current rows where values exist)
print(df[['DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME']].head(8))
Filled DISCL_SUPPLIER_ABRVNAME: non-null count = 1533
Filled DISCL_SUPPLIER_FULLNAME: non-null count = 341
                             DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME
0  SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRAT...                    <NA>
1  SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRAT...                    <NA>
2  SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH A...                    <NA>
3  SOCIETE COOPERATIVE AGRICOLE GUELANIN AVEC CON...                    <NA>
4  SOCIETE COOPERATIVE AGRICOLE IMAAH, SOCIETE CO...                    <NA>
5  SOCIETE COOPERATIVE AVEC CONSEIL D'ADMINISTRAT...                    <NA>
6               SOCIETE COOPERATIVE SIMPLIFIÉE RASSO                    <NA>
7  SOCIETE COOPERATIVE AGRICOLE NOUVELLE DE DJOUR...                    <NA>
df.columns
Index(['DISCL_TRADER_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_AREA_NAME',
       'DISCL_CERTIFICATION_NAME', 'DISCL_NUMBER_FARMERS', 'DISCL_LONGITUDE',
       'DISCL_LATITUDE', 'DISCL_COUNTRY_NAME', 'TYPE', 'DISCL_YEAR',
       'DISCL_SUPPLIER_TYPE', 'COMPANY', 'DISCL_SUPPLIER_FULLNAME',
       'LOCALITY_NAME', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT',
       'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME', 'DISTRICT_GEOCODE',
       'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID',
       ' DISCL_SUPPLIER_TYPE', 'NUMBER_FARMERS', 'VOLUMES', 'VOLUMES_UNIT',
       'CERTIFICATION_NAME', 'CERTIFICATION_PCT'],
      dtype='object')
# TODO: Reduce to 'DISCL_AREA_NAME' and 'DISTRICT_GEOCODE'

# Reduce to DISCL_AREA_NAME and DISTRICT_GEOCODE using LOCALITY_NAME and DISTRICT_NAME as fallbacks.

df = df.copy()

area_cols = ['DISCL_AREA_NAME', 'LOCALITY_NAME', 'DISTRICT_GEOCODE', 'DISTRICT_NAME']
# normalise empty strings -> NA
df[area_cols] = df[area_cols].replace(r'^\s*$', pd.NA, regex=True)

# Fill DISCL_AREA_NAME: prefer explicit DISCL_AREA_NAME, then DISTRICT_NAME, then LOCALITY_NAME
df['DISCL_AREA_NAME'] = df[
    ['DISCL_AREA_NAME', 'DISTRICT_NAME', 'LOCALITY_NAME']
].bfill(axis=1).iloc[:, 0]

# Fill DISTRICT_GEOCODE: prefer existing DISTRICT_GEOCODE, then fall back to DISTRICT_NAME
# (This keeps codes where present; if you prefer to leave codes blank when only names exist, remove 'DISTRICT_NAME' from the list.)
df['DISTRICT_GEOCODE'] = df[
    ['DISTRICT_GEOCODE']
].bfill(axis=1).iloc[:, 0]

# Trim whitespace and normalise empty strings to NA
for c in ['DISCL_AREA_NAME', 'DISTRICT_GEOCODE']:
    df[c] = df[c].astype('string').str.strip().replace({'': pd.NA})

# Drop now-redundant columns
df.drop(columns=['LOCALITY_NAME', 'DISTRICT_NAME'], inplace=True)

# Quick checks
print("Non-null DISCL_AREA_NAME:", df['DISCL_AREA_NAME'].notna().sum())
print("Non-null DISTRICT_GEOCODE:", df['DISTRICT_GEOCODE'].notna().sum())
print(df[['DISCL_AREA_NAME', 'DISTRICT_GEOCODE']].head(10))
# TODO: Reduce to 'DISCL_SUPPLIER_TYPE'

# Reduce to DISCL_SUPPLIER_TYPE using TYPE and any whitespace-variant columns as fallbacks.

df = df.copy()

# Find relevant original columns:
orig_type_cols = [c for c in df.columns if c.strip() in {'DISCL_SUPPLIER_TYPE', 'TYPE'}]
# Example: this will include 'DISCL_SUPPLIER_TYPE', ' DISCL_SUPPLIER_TYPE', and 'TYPE' if present.

# If none of those columns exist, create the canonical column (all NA) and exit early.
if not orig_type_cols:
    df['DISCL_SUPPLIER_TYPE'] = pd.NA
    print("No source columns found for supplier type. Created empty 'DISCL_SUPPLIER_TYPE'.")
else:
    # Normalise empty strings -> NA on those original columns
    df[orig_type_cols] = df[orig_type_cols].replace(r'^\s*$', pd.NA, regex=True)

    # Build an ordered list of fallback columns:
    # prefer exact 'DISCL_SUPPLIER_TYPE' first, then any whitespace variants, then 'TYPE'
    ordered = []
    if 'DISCL_SUPPLIER_TYPE' in df.columns:
        ordered.append('DISCL_SUPPLIER_TYPE')
    # add whitespace variants (names that strip to DISCL_SUPPLIER_TYPE but aren't the exact name)
    ws_variants = [c for c in df.columns if c.strip() == 'DISCL_SUPPLIER_TYPE' and c != 'DISCL_SUPPLIER_TYPE']
    ordered.extend(ws_variants)
    if 'TYPE' in df.columns:
        ordered.append('TYPE')

    # Safeguard: if ordered is empty (shouldn't be), create empty canonical column
    if not ordered:
        df['DISCL_SUPPLIER_TYPE'] = pd.NA
    else:
        # Merge using first non-null value across ordered fallbacks
        df['DISCL_SUPPLIER_TYPE__MERGE'] = df[ordered].bfill(axis=1).iloc[:, 0]

        # Tidy whitespace and normalise empty -> NA
        df['DISCL_SUPPLIER_TYPE__MERGE'] = (
            df['DISCL_SUPPLIER_TYPE__MERGE']
            .astype('string')
            .str.strip()
            .replace({'': pd.NA})
        )

        # Drop the original/raw columns we used (we will keep only canonical name)
        df.drop(columns=[c for c in ordered if c in df.columns], inplace=True)

        # Rename the merge column to canonical name
        df.rename(columns={'DISCL_SUPPLIER_TYPE__MERGE': 'DISCL_SUPPLIER_TYPE'}, inplace=True)

# Quick checks
print("Columns now include:", [c for c in df.columns if 'SUPPLIER_TYPE' in c or c == 'TYPE'][:10])
print("Non-null DISCL_SUPPLIER_TYPE:", df['DISCL_SUPPLIER_TYPE'].notna().sum())
print(df['DISCL_SUPPLIER_TYPE'].value_counts(dropna=False).head(10))
Columns now include: ['DISCL_SUPPLIER_TYPE']
Non-null DISCL_SUPPLIER_TYPE: 1382
COOPERATIVE    1382
<NA>            594
Name: DISCL_SUPPLIER_TYPE, dtype: Int64
df.rename(columns={'CERTIFICATION_NAME':'DISCL_CERTIFICATION_NAME',
 'CERTIFICATION_PCT':'DISCL_CERTIFICATION_PCT',
 'NUMBER_FARMERS':'DISCL_NUMBER_FARMERS',
 'VOLUMES':'DISCL_VOLUMES',
 'VOLUMES_UNIT':'DISCL_VOLUMES_UNIT'}, inplace=True)
set(df_2023.columns).difference(set(df.columns))
{'DISTRICT_NAME', 'LOCALITY_NAME', 'TRADER_NAME'}
df_2023[(df_2023[['DISCL_AREA_NAME', 'DISTRICT_NAME', 'LOCALITY_NAME']].notna().any(axis=1))&(df.DISCL_COUNTRY_NAME == "IVORY COAST")]
COMPANY DISCL_COUNTRY_NAME DISCL_TRADER_NAME DISCL_AREA_NAME DISCL_SUPPLIER_ABRVNAME DISCL_SUPPLIER_FULLNAME DISCL_SUPPLIER_TYPE DISCL_LONGITUDE DISCL_LATITUDE DISCL_YEAR ... DISCL_CERTIFICATION_NAME DISCL_CERTIFICATION_PCT DISTRICT_NAME DISTRICT_GEOCODE LOCALITY_NAME TRADER_NAME CAM_BUYERS IS_CAM_V3 IS_JRC JRC_BUYER_ID

0 rows × 23 columns

set(df.columns).difference(set(df_2023.columns))
{'CERTIFICATION_NAME',
 'CERTIFICATION_PCT',
 'NUMBER_FARMERS',
 'VOLUMES',
 'VOLUMES_UNIT'}
df_2023.columns
Index(['COMPANY', 'DISCL_COUNTRY_NAME', 'DISCL_TRADER_NAME', 'DISCL_AREA_NAME',
       'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
       'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
       'DISCL_YEAR', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
       'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT', 'DISTRICT_NAME', 'DISTRICT_GEOCODE',
       'LOCALITY_NAME', 'TRADER_NAME', 'CAM_BUYERS', 'IS_CAM_V3', 'IS_JRC',
       'JRC_BUYER_ID'],
      dtype='object')
df.columns
Index(['DISCL_TRADER_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_AREA_NAME',
       'DISCL_CERTIFICATION_NAME', 'DISCL_NUMBER_FARMERS', 'DISCL_LONGITUDE',
       'DISCL_LATITUDE', 'DISCL_COUNTRY_NAME', 'DISCL_YEAR', 'COMPANY',
       'DISCL_SUPPLIER_FULLNAME', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT',
       'DISCL_CERTIFICATION_PCT', 'DISTRICT_GEOCODE', 'CAM_BUYERS',
       'IS_CAM_V3', 'IS_JRC', 'JRC_BUYER_ID', 'NUMBER_FARMERS', 'VOLUMES',
       'VOLUMES_UNIT', 'CERTIFICATION_NAME', 'CERTIFICATION_PCT',
       'DISCL_SUPPLIER_TYPE'],
      dtype='object')

Figure out what to do about NA traders - leave them (that's how df_2023 is)

df.to_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_2024_columns_fixed.csv", index=False)
df = pd.read_csv('/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_2024_columns_fixed.csv')
df[['DISCL_NUMBER_FARMERS.1',
       'DISCL_VOLUMES.1', 'DISCL_VOLUMES_UNIT.1', 'DISCL_CERTIFICATION_NAME.1',
       'DISCL_CERTIFICATION_PCT.1']].notna().sum()
DISCL_NUMBER_FARMERS.1        7
DISCL_VOLUMES.1               0
DISCL_VOLUMES_UNIT.1          0
DISCL_CERTIFICATION_NAME.1    7
DISCL_CERTIFICATION_PCT.1     0
dtype: int64
df[['DISCL_NUMBER_FARMERS',
       'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT']].notna().sum()
DISCL_NUMBER_FARMERS        358
DISCL_VOLUMES                12
DISCL_VOLUMES_UNIT            3
DISCL_CERTIFICATION_NAME    462
DISCL_CERTIFICATION_PCT     147
dtype: int64
# Copy over the 7 number_farmers and certification values from the '.1' columns to the main columns where missing
for col in ['DISCL_NUMBER_FARMERS',
       'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT']:
    col_1 = col + '.1'
    mask = df[col].isna() & df[col_1].notna()
    df.loc[mask, col] = df.loc[mask, col_1]         
df[['DISCL_NUMBER_FARMERS',
       'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT']].notna().sum()
DISCL_NUMBER_FARMERS        365
DISCL_VOLUMES                12
DISCL_VOLUMES_UNIT            3
DISCL_CERTIFICATION_NAME    469
DISCL_CERTIFICATION_PCT     147
dtype: int64
df.drop(columns=['DISCL_NUMBER_FARMERS.1',
       'DISCL_VOLUMES.1', 'DISCL_VOLUMES_UNIT.1', 'DISCL_CERTIFICATION_NAME.1',
       'DISCL_CERTIFICATION_PCT.1'], inplace=True)
df.to_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/civ_cooperatives_update/combined_data/just_CIV_2024_columns_fixed.csv", index=False)