Compare 2024 to 2023 coops

View or edit on GitHub

This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/compare_2024_to_2023_coops.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import pandas as pd
import geopandas as gpd
import os

os.chdir('/Users/niamhfrench/repos/TRASE/trase/')
print(os.getcwd())

df_2024 = pd.read_csv("data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';')

df_2024_no_duplicates = df_2024.drop_duplicates(subset="SUPPLIER_ABRVNAME")

# Filter the dataframes
df_2023 = df_prev[df_prev.YEAR == 2023]
df_2022 = df_prev[df_prev.YEAR == 2022]
df_2021 = df_prev[df_prev.YEAR == 2021]
/Users/niamhfrench/repos/TRASE/trase



---------------------------------------------------------------------------


FileNotFoundError                         Traceback (most recent call last)


Cell In[9], line 8


      5 os.chdir('/Users/niamhfrench/repos/TRASE/trase/')


      6 print(os.getcwd())


----> 8 df_2024 = pd.read_csv("data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')


      9 df_prev = pd.read_csv("data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';')


     11 df_2024_no_duplicates = df_2024.drop_duplicates(subset="SUPPLIER_ABRVNAME")





File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)


   1013 kwds_defaults = _refine_defaults_read(


   1014     dialect,


   1015     delimiter,


   (...)


   1022     dtype_backend=dtype_backend,


   1023 )


   1024 kwds.update(kwds_defaults)


-> 1026 return _read(filepath_or_buffer, kwds)





File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)


    617 _validate_names(kwds.get("names", None))


    619 # Create the parser.


--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)


    622 if chunksize or iterator:


    623     return parser





File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)


   1617     self.options["has_index_names"] = kwds["has_index_names"]


   1619 self.handles: IOHandles | None = None


-> 1620 self._engine = self._make_engine(f, self.engine)





File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)


   1878     if "b" not in mode:


   1879         mode += "b"


-> 1880 self.handles = get_handle(


   1881     f,


   1882     mode,


   1883     encoding=self.options.get("encoding", None),


   1884     compression=self.options.get("compression", None),


   1885     memory_map=self.options.get("memory_map", False),


   1886     is_text=is_text,


   1887     errors=self.options.get("encoding_errors", "strict"),


   1888     storage_options=self.options.get("storage_options", None),


   1889 )


   1890 assert self.handles is not None


   1891 f = self.handles.handle





File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)


    868 elif isinstance(handle, str):


    869     # Check whether the filename is to be opened in binary mode.


    870     # Binary mode does not support 'encoding' and 'newline'.


    871     if ioargs.encoding and "b" not in ioargs.mode:


    872         # Encoding


--> 873         handle = open(


    874             handle,


    875             ioargs.mode,


    876             encoding=ioargs.encoding,


    877             errors=errors,


    878             newline="",


    879         )


    880     else:


    881         # Binary mode


    882         handle = open(handle, ioargs.mode)





FileNotFoundError: [Errno 2] No such file or directory: 'data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv'
df_2023_no_duplicates = df_2023.drop_duplicates(subset="SUPPLIER_ABRVNAME")
df_2023_no_duplicates[df_2023_no_duplicates.REPEATED_FROM_PAST_YEAR==False]
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
52 2023-1 45 2023 ABO ABO 5.632 -6.631 MEAGUI CI-2.2.3_1 BARRY CALLEBAUT ... False False True 653.0 False True False False MEAGUI False
66 2023-2 52 2023 AB AB NaN NaN SASSANDRA CI-2.1.2_1 CEMOI ... False False True NaN False True False False SASSANDRA False
79 2023-3 60 2023 ADA COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DES ... 5.200 -6.832 SAN-PEDRO CI-2.3.1_1 BARRY CALLEBAUT ... False False True 223.0 False True False False SAN-PEDRO False
98 2023-6 64 2023 ADEDO COOPERATIVE AGRICOLE POUR LE DEVELOPPEMENT ET ... 6.385 -5.411 OUME CI-5.1.2_1 CARGILL ... False True True 864.0 False True False False OUME False
105 2023-7 67 2023 ADI COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DE D... 6.658 -3.273 ABENGOUROU CI-3.1.1_1 CEMOI ... False True True NaN False True False False ABENGOUROU False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9185 2023-563 5713 2023 WENDE WAOGA NaN NaN NaN NaN NaN SUCDEN ... False False False NaN False True False False NaN False
9197 2023-564 5722 2023 WONTA WONTA NaN NaN LAKOTA CI-5.2.3_1 CEMOI ... False False True NaN False True False False LAKOTA False
9206 2023-565 5728 2023 YAT NaN NaN NaN NaN NaN SUCDEN ... False False False NaN False True False False NaN False
9221 2023-566 5731 2023 YAWOUBE D'ANIASSUE YAWOUBE D'ANIASSUE 6.649 -3.680 ABENGOUROU CI-3.1.1_1 CARGILL ... False False True 1782.0 False True False False ABENGOUROU False
9258 2023-567 5751 2023 YEYOBIE INDENIE YEYOBIE INDENIE 6.855 -3.487 ABENGOUROU CI-3.1.1_1 CARGILL ... False True False 831.0 False True False False ABENGOUROU False

493 rows × 43 columns

df_2022_no_duplicates = df_2022.drop_duplicates(subset="SUPPLIER_ABRVNAME")
df_2022_no_duplicates[df_2022_no_duplicates.REPEATED_FROM_PAST_YEAR==False]
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
11 2022-1 12 2022 3A 3A NaN NaN NaN NaN NaN ... False False True NaN False False True True NaN False
12 2022-2 13 2022 3KSCAED 3K COOPERATIVE AGRICOLE ETRAEYAWLIN DE DIEGONEFLA NaN NaN NaN NaN NaN ... False False True NaN False False True True NaN False
18 2022-3 19 2022 A GOH COOPERATIVE AVEC CONSENTEMENT DES AGRICULTEURS... NaN NaN GAGNOA CI-5.1.1_1 OLAM ... False False False NaN False True False False GAGNOA False
22 2022-4 23 2022 A N'ZUE A N'ZUE NaN NaN NaN NaN NaN ... False False True NaN False False True True NaN False
23 2022-5 24 2022 A SAM A SAM NaN NaN NaN NaN NaN ... False False True NaN False False True True NaN False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9226 2022-1897 5735 2022 YEBOYOKON COOPERATIVE AGRICOLE YEBOYOKON DE DAIRO DIDIZO NaN NaN NaN NaN NaN ... False False True NaN False False True True NaN False
9242 2022-1898 5748 2022 YEYASSO COOPERATIVE AVEC CONSEIL D'ADMINISTRATION YEYA... NaN NaN MAN CI-8.3.3_1 FERRERO ... False False False NaN True False False False MAN False
9257 2022-1900 5751 2022 YEYOBIE INDENIE YEYOBIE INDENIE 6.855 -3.487 ABENGOUROU CI-3.1.1_1 CARGILL ... False True False 831.0 False True False False ABENGOUROU False
9263 2022-1901 5756 2022 YOMIDEFE YOMIDEFE NaN NaN TAABO CI-7.1.3_1 CEMOI ... False False True NaN False True False False TAABO False
9273 2022-1902 5765 2022 ZRANSSA COOPERATIVE AVEC CONSEIL D'ADMINISTRATION ZRANSSA NaN NaN NaN NaN NaN ... False False True NaN False False True True NaN False

1225 rows × 43 columns

df_2021_no_duplicates = df_2021.drop_duplicates(subset="SUPPLIER_ABRVNAME")
df_2021_no_duplicates[df_2021_no_duplicates.REPEATED_FROM_PAST_YEAR==False]
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
49 2021-1 45 2021 ABO ABO 5.632 -6.631 MEAGUI CI-2.2.3_1 BARRY CALLEBAUT ... False False True 828.0 False True False False MEAGUI False
75 2021-3 60 2021 ADA COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DES ... 5.200 -6.832 SAN-PEDRO CI-2.3.1_1 BARRY CALLEBAUT ... False False True 575.0 False True False False SAN-PEDRO False
96 2021-5 64 2021 ADEDO COOPERATIVE AGRICOLE POUR LE DEVELOPPEMENT ET ... 6.385 -5.411 OUME CI-5.1.2_1 CARGILL ... False True True 864.0 False True False False OUME False
127 2021-6 79 2021 AGBS COOP AGRICOLE GNATI-BIA DE SIKENSI 5.671 -4.574 SIKENSI CI-7.1.2_1 CARGILL ... False True True 910.0 False True False False SIKENSI False
147 2021-7 88 2021 AGREL COOPERATIVE POUR LE DEVELOPPEMENT RURAL 5.847 -5.686 LAKOTA CI-5.2.3_1 CARGILL ... False False True 535.0 False True False False LAKOTA False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9161 2021-633 5692 2021 VIMA VIMA 6.621 -6.045 SINFRA CI-9.2.2_1 BARRY CALLEBAUT ... False False True 563.0 False True False False SINFRA False
9194 2021-634 5721 2021 WINPAS COOPERATIVE WIND-PANGA DE SEGUELA 7.975 -6.675 SEGUELA CI-12.3.2_1 NaN ... False True False NaN False False True True SEGUELA False
9219 2021-635 5731 2021 YAWOUBE D'ANIASSUE YAWOUBE D'ANIASSUE 6.649 -3.680 ABENGOUROU CI-3.1.1_1 CARGILL ... False False True 1782.0 False True False False ABENGOUROU False
9241 2021-636 5747 2021 YEYASSO COOPERATIVE YEYASSO DE MAN 7.407 -7.552 MAN CI-8.3.3_1 MONDELEZ ... False True True NaN True False False False MAN False
9256 2021-637 5751 2021 YEYOBIE INDENIE YEYOBIE INDENIE 6.855 -3.487 ABENGOUROU CI-3.1.1_1 CARGILL ... False True False 831.0 False True False False ABENGOUROU False

492 rows × 43 columns

print(len(df_2023_no_duplicates), len(df_2024_no_duplicates))
4038 1116
df_2024.columns
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
       'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
       'COMPANY', 'TRADER_NAMES', 'CERTIFICATIONS', 'NUM_FARMERS',
       'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
       'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
       'DISCLOSURE_SOURCES', 'CERTIFIED', 'DISCL_COUNTRY_NAME',
       'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
       'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
       'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
       'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'NB_FARMERS_COMPANY_YEAR', '!NOT_RFA',
       'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR',
       'IS_TRADER'],
      dtype='object')
# Drop rows where BOTH 'BUYER' and 'TRADER_NAME' are NaN
df_2021_clean = df_2021_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAME'], how='all')
df_2022_clean = df_2022_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAME'], how='all')
df_2023_clean = df_2023_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAME'], how='all')
df_2024_clean = df_2024_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAMES'], how='all')

# Print the lengths of the cleaned dataframes
print(f"2021 Count: {len(df_2021_clean)}")
print(f"2022 Count: {len(df_2022_clean)}")
print(f"2023 Count: {len(df_2023_clean)}")
print(f"2024 Count: {len(df_2024_clean)}")
2021 Count: 691
2022 Count: 727
2023 Count: 798
2024 Count: 1038
# Create a filter where BUYER is NaN AND TRADER_NAMES is NaN
rows_missing_both = df_2024_no_duplicates[
    df_2024_no_duplicates['BUYER'].isna() & 
    df_2024_no_duplicates['TRADER_NAMES'].isna()
]

# View the dataframe
display(len(rows_missing_both))
78
print(f"2021 Count: {len(df_2021_no_duplicates)}")
print(f"2022 Count: {len(df_2022_no_duplicates)}")
print(f"2023 Count: {len(df_2023_no_duplicates)}")
print(f"2024 Count: {len(df_2024_no_duplicates)}")
2021 Count: 3536
2022 Count: 4021
2023 Count: 4038
2024 Count: 1116
df_2023_no_duplicates
/opt/homebrew/Caskroom/mambaforge/base/envs/trase-env/lib/python3.10/site-packages/IPython/lib/pretty.py:794: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.
  output = repr(obj)
/opt/homebrew/Caskroom/mambaforge/base/envs/trase-env/lib/python3.10/site-packages/IPython/core/formatters.py:406: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.
  return method()
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
52 2023-1 45 2023 ABO ABO 5.632 -6.631 MEAGUI CI-2.2.3_1 BARRY CALLEBAUT ... False False True 653.0 False True False False MEAGUI False
66 2023-2 52 2023 AB AB NaN NaN SASSANDRA CI-2.1.2_1 CEMOI ... False False True NaN False True False False SASSANDRA False
79 2023-3 60 2023 ADA COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DES ... 5.200 -6.832 SAN-PEDRO CI-2.3.1_1 BARRY CALLEBAUT ... False False True 223.0 False True False False SAN-PEDRO False
98 2023-6 64 2023 ADEDO COOPERATIVE AGRICOLE POUR LE DEVELOPPEMENT ET ... 6.385 -5.411 OUME CI-5.1.2_1 CARGILL ... False True True 864.0 False True False False OUME False
105 2023-7 67 2023 ADI COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DE D... 6.658 -3.273 ABENGOUROU CI-3.1.1_1 CEMOI ... False True True NaN False True False False ABENGOUROU False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27656 2023-5983 5761 2023 ZOUAN KOUALOU COOPERATIVE AGRICOLE DE ZOUAN-HOUNIEN 6.915 -8.211 NaN CI-8.3.5_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
27657 2023-5984 5762 2023 ZOUEUDO COOPERATIVE AGRICOLE ZOUEUDO DE MAHAPLEU 7.256 -8.156 NaN CI-8.3.2_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
27658 2023-5985 5763 2023 ZOUTOKEGUI COOPERATIVE AGRICOLE ZOUTOKEGUI DE GOURANE 7.734 -7.618 NaN CI-8.3.1_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
27659 2023-5986 5764 2023 ZRANLEU COOPERATIVE ZRANLEU DE MAN 7.408 -7.549 NaN CI-8.3.3_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
27660 2023-5987 5766 2023 ZSB ZSB 7.404 -7.554 NaN CI-8.3.3_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True

4038 rows × 43 columns

df_2023_no_duplicates.loc[27659]
FLOW_ID                                              2023-5986
COOP_ID                                                   5764
YEAR                                                      2023
SUPPLIER_ABRVNAME                                      ZRANLEU
SUPPLIER_FULLNAME                   COOPERATIVE ZRANLEU DE MAN
LATITUDE                                                 7.408
LONGITUDE                                               -7.549
DISTRICT_NAME                                              NaN
DISTRICT_GEOCODE                                    CI-8.3.3_1
BUYER                                                      NaN
DISCLOSURE_SOURCE                                          NaN
TRADER_NAME                                                NaN
CERTIFICATIONS                                             NaN
NUM_FARMERS                                                NaN
NUM_FARMERS_EXTRAPOLATED                                   NaN
TOTAL_FARMERS_NONTRADER                                    NaN
TOTAL_FARMERS_TRADER                                       NaN
TOTAL_FARMERS_RFA                                          NaN
TOTAL_FARMERS                                              NaN
DISCL_COUNTRY_NAME                                 IVORY_COAST
DISCL_AREA_NAME                                            NaN
DISCL_SUPPLIER_ABRVNAME                         SCOOPS ZRANLEU
DISCL_SUPPLIER_FULLNAME     SOCIETE COOPERATIVE ZRANLEU DE MAN
DISCL_SUPPLIER_TYPE                                        NaN
DISCL_LONGITUDE                                      -7.548712
DISCL_LATITUDE                                         7.40843
DISCL_NUMBER_FARMERS                                       NaN
DISCL_VOLUMES                                              NaN
DISCL_VOLUMES_UNIT                                         NaN
DISCL_CERTIFICATION_NAME                                   NaN
DISCL_CERTIFICATION_PCT                                    NaN
SIMPLIF_ABRVNAME                                       ZRANLEU
CCTN_COOP_ID                                               NaN
IS_ALL_CAM_V3                                             True
IS_ANY_CAM_V3                                             True
CERTIFIED                                                  NaN
NB_FARMERS_COMPANY_YEAR                                    NaN
NON_TRADER                                                 NaN
IS_TRADER                                                  NaN
!NOT_RFA                                                   NaN
unique_rfa_link                                            NaN
LVL_4_NAME                                                 NaN
REPEATED_FROM_PAST_YEAR                                   True
Name: 27659, dtype: object
df_2023_no_duplicates.DISCL_CERTIFICATION_NAME.unique()
array(['FAIRTRADE, RAINFOREST ALLIANCE', 'TRANSPARENCE CACAO',
       'COCOA HORIZONS, RAINFOREST ALLIANCE', 'CARGILL COCOA PROMISE; RA',
       'CARGILL COCOA PROMISE', nan, 'COCOA LIFE', 'RAINFOREST ALLIANCE',
       'COCOA HORIZONS, FAIRTRADE, RAINFOREST ALLIANCE',
       'COCOA HORIZONS, FERMICOA, RAINFOREST ALLIANCE',
       'COCOA HORIZONS, FERMICOA', 'COCOA HORIZONS',
       'COCOA LIFE, FERMICOA', 'ra, utz, ft', 'ft',
       'FAIRTRADE AND RAINFOREST ALLIANCE', 'COCOA LIFE, FAIRTRADE',
       'COCOA HORIZONS, FAIRTRADE', 'utz, ft', 'NOT CERTIFIED',
       'FAIRTRADE', 'ra, ft', 'RAINFOREST ALLIANCE OR FAIRTRADE', 'RFA',
       'FAIR FOR LIFE', 'RAINFOREST ALLIANCE, UTZ',
       'COCOA LIFE, FERMICOA, RAINFOREST ALLIANCE', 'ra, utz', ' ',
       'COCOA LIFE, RAINFOREST ALLIANCE',
       'COCOA HORIZONS, FAIRTRADE, RAINFOREST ALLIANCE, UTZ',
       'OLAM PROGRAMMES', 'UTZ; RESPONSIBLY SOURCED COCOA',
       'NA; RESPONSIBLY SOURCED COCOA',
       'OLD-RA; RESPONSIBLY SOURCED COCOA', 'COCOA FOR GOOD; Fair Trade',
       'COCOA FOR GOOD; Supplier Standard', 'COCOA FOR GOOD; UTZ',
       'COCOA PLAN; RAINFOREST ALLIANCE',
       'UTZ_CO1000007528; SUSTAINABLE ORIGINS', 'SUSTAINABLE ORIGINS',
       'UTZ_CO1000008807; FAIRTRADE', 'RA', 'UTZ_CO1000009238',
       'UTZ_CO1000006898', 'UTZ_CO1000007865', 'UTZ',
       'UTZ_CO1000006209; RAINFOREST ALLIANCE'], dtype=object)
df_2024_no_duplicates
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... SIMPLIF_ABRVNAME CCTN_COOP_ID IS_ALL_CAM_V3 IS_ANY_CAM_V3 NB_FARMERS_COMPANY_YEAR !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR IS_TRADER
0 2024-1 0 2024 SOCIETE COOPERATIVE BENKADI DE SIAKAKRO SOCIETE COOPERATIVE BENKADI DE SIAKAKRO 7.135274 -3.204332 AGNIBILEKRO CI-3.1.2_1 BARRY CALLEBAUT ... NaN NaN False False NaN NaN NaN CI-3.1.2_1 NaN False
1 2024-2 1 2024 SOCIETE COOPERATIVE ANOUANZE DES PRODUCTEURS D... SOCIETE COOPERATIVE ANOUANZE DES PRODUCTEURS D... 6.720262 -3.494930 ABENGOUROU CI-3.1.1_1 BARRY CALLEBAUT ... NaN NaN False False NaN NaN NaN CI-3.1.1_1 NaN False
2 2024-3 2 2024 SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH NaN NaN BONGOUANOU CI-6.3.2_1 BARRY CALLEBAUT ... NaN NaN False False NaN NaN NaN CI-6.3.2_1 NaN False
3 2024-4 3 2024 TELIN COOPERATIVE AGRICOLE TELIN 5.876620 -7.455261 BANGOLO CI-8.2.1_1 BARRY CALLEBAUT ... TELIN NaN False False NaN NaN NaN CI-8.2.1_1 NaN False
4 2024-5 4 2024 COOPERATIVE AGRICOLE IMAAH, COOPERATIVE SIMPLI... COOPERATIVE AGRICOLE IMAAH, COOPERATIVE SIMPLI... NaN NaN DIVO CI-5.2.1_1 BARRY CALLEBAUT ... NaN NaN False False NaN NaN NaN CI-5.2.1_1 NaN False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1523 2024-1524 1523 2024 SCOOPUF-COOP-CA COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... 6.785423 -7.393198 DUEKOUE CI-8.2.2_1 ECOM ... NaN NaN NaN NaN NaN NaN NaN CI-8.2.2_1 NaN False
1527 2024-1528 1527 2024 CAFUGO COOP CA COOPÉRATIVE AGRICOLE DES FRÈRES UNIS DE GOBROKO 4.957198 -6.086585 SASSANDRA CI-2.1.2_1 ECOM ... NaN NaN NaN NaN NaN NaN NaN CI-2.1.2_1 NaN False
1528 2024-1529 1528 2024 COOP-CCAPUO SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747380 -7.362460 DUEKOUE CI-8.2.2_1 ECOM ... NaN NaN NaN NaN NaN NaN NaN CI-8.2.2_1 NaN False
1529 2024-1530 1529 2024 COOP_CA_UDAN SOCIETE COOPERATIVE AVEC CONSEIL D’ADMINISTRAT... 7.252397 -8.150575 DANANE CI-8.3.2_1 ECOM ... NaN NaN NaN NaN NaN NaN NaN CI-8.3.2_1 NaN False
1530 2024-1531 1530 2024 YEBOYOKON SCOOPS SOCIETE COOPERATIVE AGRICOLE YEBOYOKON DE DAIR... NaN NaN DIVO CI-5.2.1_1 ECOM ... NaN NaN NaN NaN NaN NaN NaN CI-5.2.1_1 NaN False

1116 rows × 42 columns

import pandas as pd
import numpy as np
import re

# Load data (added low_memory=False to handle the mixed types warning in your previous log)
df_2024 = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';', low_memory=False)

# --- 1. Round lat and lon to 3 decimal places ---
# Check if columns exist before rounding to avoid errors
cols_to_round = ['LATITUDE', 'LONGITUDE']
for col in cols_to_round:
    if col in df_2024.columns:
        df_2024[col] = df_2024[col].round(3)

# --- 2. Standardise spellings to 'COOPERATIVE' ---
# Normalizing variations like "STE COOP", "SOCIETE COOPERATIVE" etc.
# Note: We apply this to SUPPLIER_ABRVNAME as requested, assuming that is the target column
pattern_standardise = r'(?i)\b(SOCIETE\s+COOPERATIVE|SOCIETÉ\s+COOPÉRATIVE|STE\s+COOP|STE\s+CPE|SOC\s+COOP)\b'
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_standardise, 'COOPERATIVE', regex=True)

# --- 3. Simplify names by removing COOP patterns ---
# Removing 'COOP', 'SCOOPS', 'COOP-CA' etc to leave the unique name
# \s* handles potential extra spaces
pattern_simplify = r'(?i)(\bSCOOPS?\b|COOP_CA_|\bCOOP\s*[-_]?\s*CA\b|\bCOOP\b|\bCOOPERATIVE\b|\s*[-–]\s*CA\b)'

# Apply replacement
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_simplify, '', regex=True).str.strip()

# Extra Cleanup: Remove any leftover leading/trailing punctuation (underscores, dashes) 
# that might remain after cutting the prefix/suffix.
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.strip(' _-–')

# Clean up any resulting double spaces
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(r'\s+', ' ', regex=True)

# --- 4. Match to stable COOP_IDs from previous year ---

# Drop the current/temporary IDs from the 2024 file
df_2024 = df_2024.drop(columns=['COOP_ID', 'FLOW_ID'], errors='ignore')

# Create a reference table from the previous year (Name -> ID)
# We drop duplicates to ensure one-to-one mapping
ref_ids = df_prev[['SUPPLIER_ABRVNAME', 'COOP_ID']].drop_duplicates(subset=['SUPPLIER_ABRVNAME'])

# Merge 2024 data with the reference IDs
df_2024 = df_2024.merge(ref_ids, on='SUPPLIER_ABRVNAME', how='left')

# --- Handle New Cooperatives (NaN COOP_IDs) ---
# Find the highest existing ID in the previous data to start new IDs after it
max_prev_id = pd.to_numeric(df_prev['COOP_ID'], errors='coerce').max()
if np.isnan(max_prev_id):
    max_prev_id = 0

# Identify rows that didn't find a match
mask_new = df_2024['COOP_ID'].isna()
count_new = mask_new.sum()

if count_new > 0:
    # Create new IDs: range starting from max_prev_id + 1
    new_ids = range(int(max_prev_id) + 1, int(max_prev_id) + 1 + count_new)
    df_2024.loc[mask_new, 'COOP_ID'] = new_ids

# Ensure COOP_ID is integer
df_2024['COOP_ID'] = df_2024['COOP_ID'].astype(int)

# Re-create FLOW_ID (Assuming format Year-Index)
df_2024['FLOW_ID'] = df_2024['YEAR'].astype(str) + '-' + (df_2024.index + 1).astype(str)

# Final Inspection
print("Cleaning Complete.")
print(df_2024[['FLOW_ID', 'COOP_ID', 'SUPPLIER_ABRVNAME', 'BUYER']])

#In the cooperative panel dataset, we record the names of all known buyers from
#a cooperative in a given year, separated by ” + ”, in the variable BUYERS.
#Companies recorded in this variable can be buyers at different vertical stages
#of the same cocoa flow. In the link dataset, we register buyer names in variable LINK BUYER. In addition, we provide the dummy variable IS TRADER,
#which is TRUE if we consider, based on our own research and customs declaration data, that the buyer is a trading company that can potentially purchase
#directly from cooperatives.
Cleaning Complete.
        FLOW_ID  COOP_ID                      SUPPLIER_ABRVNAME  \
0        2024-1     5767                    BENKADI DE SIAKAKRO   
1        2024-2     5768  ANOUANZE DES PRODUCTEURS DE L'INDENIE   
2        2024-3     5769               AGRICOLE AHUANOU D'ARRAH   
3        2024-4     5602                                  TELIN   
4        2024-5     5770             AGRICOLE IMAAH, SIMPLIFIÉE   
...         ...      ...                                    ...   
1526  2024-1527     4419                              SCOOPANAB   
1527  2024-1528      736                                 CAFUGO   
1528  2024-1529     6101                                 CCAPUO   
1529  2024-1530     5635                                   UDAN   
1530  2024-1531     5735                              YEBOYOKON

                BUYER  
0     BARRY CALLEBAUT  
1     BARRY CALLEBAUT  
2     BARRY CALLEBAUT  
3     BARRY CALLEBAUT  
4     BARRY CALLEBAUT  
...               ...  
1526             ECOM  
1527             ECOM  
1528             ECOM  
1529             ECOM  
1530             ECOM

[1531 rows x 4 columns]
df_2024.BUYER.unique()
array(['BARRY CALLEBAUT', 'BLOMMER', 'ECOM', 'OFI', 'ETG', 'CARGILL',
       'FARMSTRONG', 'GCB', 'JB COCOA', 'SO B GREEN', 'SUCDEN', 'TOUTON',
       'KRUGER + SUSCOM + CARGILL', 'OLAM', nan,
       'BARRY CALLEBAUT + CARGILL + CEMOI', 'COCOASOURCE', 'FILDISI',
       'CEMOI', 'SUSCOM', 'FERRERO', 'BARRY CALLEBAUT + CARGILL',
       'SUCDEN + OLAM', 'HERSHEY', 'HERSHEY + BLOMMER', 'ECOOKIM',
       'BARRY CALLEBAUT + TOUTON', 'BARRY CALLEBAUT + ETC GROUP',
       'BARRY CALLEBAUT + OLAM', 'BARRY CALLEBAUT + CARGILL + OLAM',
       'ALTER ECO', 'MARS', 'ETC GROUP'], dtype=object)
new_ids
range(5767, 6102)
# We extrapolate links and cooperative existence from one year to the next one.
# This process starts from the baseline year 2019, by company. If a company did
# not disclose its supply chain in a given year, we extrapolate its links with coops
# this year from its last disclosure.
# We also extrapolate the existence of cooperatives into the future. We assume
# that the absence of a cooperative from the set of links disclosed for a given
# year does not reflect that this cooperative ceased to exist. Any cooperative
# disclosed once is thus featured in the cooperative panel dataset in subsequent
# years (the number of cooperatives is non-decreasing over time, and the panel is
# not balanced).
# In both the cooperative panel dataset and the link dataset, this extrapolation
# is flagged with a dummy variable called REPEATED FROM PAST YEAR.
df_2024 = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';', low_memory=False)

print(df_2024.columns, df_prev.columns)
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
       'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
       'COMPANY', 'TRADER_NAMES', 'CERTIFICATIONS', 'NUM_FARMERS',
       'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
       'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
       'DISCLOSURE_SOURCES', 'CERTIFIED', 'DISCL_COUNTRY_NAME',
       'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
       'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
       'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
       'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'NB_FARMERS_COMPANY_YEAR', '!NOT_RFA',
       'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR',
       'IS_TRADER'],
      dtype='object') Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
       'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
       'DISCLOSURE_SOURCE', 'TRADER_NAME', 'CERTIFICATIONS', 'NUM_FARMERS',
       'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
       'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
       'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME',
       'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE',
       'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
       'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
       'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED',
       'NB_FARMERS_COMPANY_YEAR', 'NON_TRADER', 'IS_TRADER', '!NOT_RFA',
       'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR'],
      dtype='object')
df_2024[['DISCLOSURE_SOURCES', 'BUYER', 'COMPANY']]
DISCLOSURE_SOURCES BUYER COMPANY
0 NaN BARRY CALLEBAUT NaN
1 NaN BARRY CALLEBAUT NaN
2 NaN BARRY CALLEBAUT NaN
3 NaN BARRY CALLEBAUT NaN
4 NaN BARRY CALLEBAUT NaN
... ... ... ...
1526 NaN ECOM NaN
1527 NaN ECOM NaN
1528 NaN ECOM NaN
1529 NaN ECOM NaN
1530 NaN ECOM NaN

1531 rows × 3 columns

df_prev.BUYER.unique()
array([nan, 'FERRERO', 'OLAM', 'CARGILL', 'BLOMMER', 'NESTLE',
       'BARRY CALLEBAUT', 'HERSHEY', 'TOUTON', 'CEMOI', 'KRUGER',
       'SUSCOM', 'ALFRED RITTER', 'ECOM', 'SUCDEN', 'MONDELEZ', 'MARS',
       'ETC GROUP', 'COCOASOURCE', 'PURATOS', 'ECOOKIM', 'VALRHONA',
       'FILDISI', 'COLRUYT', 'ALTER ECO', 'ETHIQUABLE', 'ALBERT HEIJN'],
      dtype=object)
df_prev.DISCLOSURE_SOURCE.unique()
array([nan, 'RAINFOREST ALLIANCE', 'FERRERO', 'OLAM', 'CARGILL',
       'BLOMMER', 'NESTLE', 'BARRY CALLEBAUT', 'HERSHEY', 'TOUTON',
       'CEMOI', 'ALFRED RITTER', 'ECOM', 'SUCDEN', 'MONDELEZ', 'MARS',
       'ETC GROUP', "TONY'S CHOCOLONELY", 'PURATOS', 'ECOOKIM',
       'VALRHONA', 'FILDISI', 'COLRUYT', 'ALTER ECO', 'ETHIQUABLE',
       'ALBERT HEIJN'], dtype=object)
df_2024.COMPANY.unique()
array([nan, 'OLAM', 'BARRY CALLEBAUT', 'CEMOI',
       'BARRY CALLEBAUT + HERSHEY', 'SUCDEN', 'CARGILL',
       'OLAM + MONDELEZ', 'BARRY CALLEBAUT + SUCDEN',
       'BARRY CALLEBAUT + MONDELEZ', 'MARS',
       'BARRY CALLEBAUT + BLOMMER + TOUTON', 'BARRY CALLEBAUT + OLAM',
       'BARRY CALLEBAUT + MARS + BLOMMER + TOUTON', 'CARGILL + OLAM',
       'NESTLE', 'SUCDEN + NESTLE',
       'BARRY CALLEBAUT + CARGILL + OLAM + MARS',
       'BARRY CALLEBAUT + CARGILL + MARS',
       'BARRY CALLEBAUT + MARS + BLOMMER', 'BARRY CALLEBAUT + FERRERO',
       'BARRY CALLEBAUT + OLAM + HERSHEY',
       'BARRY CALLEBAUT + FERRERO + TOUTON',
       'BARRY CALLEBAUT + ETC GROUP', 'SUCDEN + ECOM + FILDISI',
       'BARRY CALLEBAUT + MARS + MONDELEZ',
       'BARRY CALLEBAUT + ETC GROUP + MONDELEZ', 'BARRY CALLEBAUT + MARS',
       'FERRERO', 'BARRY CALLEBAUT + ECOM', 'SUCDEN + OLAM',
       'BARRY CALLEBAUT + TOUTON', 'ECOM', 'CARGILL + NESTLE',
       'CARGILL + HERSHEY', 'ETC GROUP', 'ECOM + NESTLE', 'SUCDEN + ECOM',
       'OLAM + ECOM', 'CARGILL + MARS', 'OLAM + NESTLE',
       'ALFRED RITTER + CARGILL', 'OLAM + PURATOS', 'ECOM + PURATOS',
       'ALFRED RITTER + BARRY CALLEBAUT + CARGILL + CEMOI + FERRERO',
       'ECOOKIM', "TONY'S CHOCOLONELY", 'ALFRED RITTER + CEMOI',
       'OLAM + HERSHEY', 'OLAM + FERRERO', 'SUCDEN + OLAM + HERSHEY',
       'HERSHEY', 'HERSHEY + BLOMMER', 'FERRERO + TOUTON',
       'OLAM + BLOMMER', 'BLOMMER', 'CEMOI + BLOMMER',
       'CEMOI + SUCDEN + OLAM + HERSHEY + BLOMMER',
       'OLAM + HERSHEY + BLOMMER', 'MARS + BLOMMER + TOUTON',
       'OLAM + MARS', 'CARGILL + OLAM + MARS',
       'BARRY CALLEBAUT + FERRERO + HERSHEY', 'CEMOI + OLAM',
       'CEMOI + OLAM + MARS', 'ALTER ECO', 'CEMOI + ETC GROUP',
       'MARS + TOUTON', 'ECOM + MONDELEZ', 'ECOM + FILDISI'], dtype=object)
df_2024.BUYER.unique()
array(['BARRY CALLEBAUT', 'BLOMMER', 'ECOM', 'OFI', 'ETG', 'CARGILL',
       'FARMSTRONG', 'GCB', 'JB COCOA', 'SO B GREEN', 'SUCDEN', 'TOUTON',
       'KRUGER + SUSCOM + CARGILL', 'OLAM', nan,
       'BARRY CALLEBAUT + CARGILL + CEMOI', 'COCOASOURCE', 'FILDISI',
       'CEMOI', 'SUSCOM', 'FERRERO', 'BARRY CALLEBAUT + CARGILL',
       'SUCDEN + OLAM', 'HERSHEY', 'HERSHEY + BLOMMER', 'ECOOKIM',
       'BARRY CALLEBAUT + TOUTON', 'BARRY CALLEBAUT + ETC GROUP',
       'BARRY CALLEBAUT + OLAM', 'BARRY CALLEBAUT + CARGILL + OLAM',
       'ALTER ECO', 'MARS', 'ETC GROUP'], dtype=object)
df_2024.DISCLOSURE_SOURCES.unique()
array([nan, 'OLAM', 'BARRY CALLEBAUT', 'CEMOI',
       'BARRY CALLEBAUT + HERSHEY', 'SUCDEN', 'CARGILL',
       'OLAM + MONDELEZ', 'BARRY CALLEBAUT + SUCDEN',
       'BARRY CALLEBAUT + MONDELEZ', 'MARS',
       'BARRY CALLEBAUT + BLOMMER + TOUTON', 'BARRY CALLEBAUT + OLAM',
       'BARRY CALLEBAUT + MARS + BLOMMER + TOUTON', 'CARGILL + OLAM',
       'NESTLE', 'SUCDEN + NESTLE',
       'BARRY CALLEBAUT + CARGILL + OLAM + MARS',
       'BARRY CALLEBAUT + CARGILL + MARS',
       'BARRY CALLEBAUT + MARS + BLOMMER', 'BARRY CALLEBAUT + FERRERO',
       'BARRY CALLEBAUT + OLAM + HERSHEY',
       'BARRY CALLEBAUT + FERRERO + TOUTON',
       'BARRY CALLEBAUT + ETC GROUP', 'SUCDEN + ECOM + FILDISI',
       'BARRY CALLEBAUT + MARS + MONDELEZ',
       'BARRY CALLEBAUT + ETC GROUP + MONDELEZ', 'BARRY CALLEBAUT + MARS',
       'FERRERO', 'BARRY CALLEBAUT + ECOM', 'SUCDEN + OLAM',
       'BARRY CALLEBAUT + TOUTON', 'ECOM', 'CARGILL + NESTLE',
       'CARGILL + HERSHEY', 'ETC GROUP', 'ECOM + NESTLE', 'SUCDEN + ECOM',
       'OLAM + ECOM', 'CARGILL + MARS', 'OLAM + NESTLE',
       'ALFRED RITTER + CARGILL', 'OLAM + PURATOS', 'ECOM + PURATOS',
       'ALFRED RITTER + BARRY CALLEBAUT + CARGILL + CEMOI + FERRERO',
       'ECOOKIM', "TONY'S CHOCOLONELY", 'ALFRED RITTER + CEMOI',
       'OLAM + HERSHEY', 'OLAM + FERRERO', 'SUCDEN + OLAM + HERSHEY',
       'HERSHEY', 'HERSHEY + BLOMMER', 'FERRERO + TOUTON',
       'OLAM + BLOMMER', 'BLOMMER', 'CEMOI + BLOMMER',
       'CEMOI + SUCDEN + OLAM + HERSHEY + BLOMMER',
       'OLAM + HERSHEY + BLOMMER', 'MARS + BLOMMER + TOUTON',
       'OLAM + MARS', 'CARGILL + OLAM + MARS',
       'BARRY CALLEBAUT + FERRERO + HERSHEY', 'CEMOI + OLAM',
       'CEMOI + OLAM + MARS', 'ALTER ECO', 'CEMOI + ETC GROUP',
       'MARS + TOUTON', 'ECOM + MONDELEZ', 'ECOM + FILDISI'], dtype=object)
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher

# Load data
df_2024 = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';', low_memory=False)

# Ensure YEAR column is correct in df_2024
if 'YEAR' not in df_2024.columns:
    df_2024['YEAR'] = 2024
else:
    df_2024['YEAR'] = df_2024['YEAR'].fillna(2024).astype(int)

# --- 0. SMART COLUMN ALIGNMENT ---

# 1. Strip whitespace from all headers first
df_2024.columns = df_2024.columns.str.strip()
df_prev.columns = df_prev.columns.str.strip()

print(df_2024.DISCLOSURE_SOURCES.isna().sum())
print(df_2024.BUYER.isna().sum())

df_2024.DISCLOSURE_SOURCES.fillna(df_2024.COMPANY, inplace=True)
df_2024.DISCLOSURE_SOURCES.fillna(df_2024.BUYER, inplace=True)
df_2024.BUYER.fillna(df_2024.DISCLOSURE_SOURCES, inplace=True)

print(df_2024.DISCLOSURE_SOURCES.isna().sum())
print(df_2024.BUYER.isna().sum())

# 2. Manual Overrides (Add any pairs that fuzzy match might miss or get wrong)
rename_map = {
    # 'Bad Name': 'Good Name',
}

# 3. Auto-generate Fuzzy Matches
# We look for columns in 2024 that are NOT in prev, but look very similar
target_cols = list(df_prev.columns)
source_cols = list(df_2024.columns)

print("--- Column Matching Report ---")
for col in source_cols:
    # If the column is already perfect, skip
    if col in target_cols:
        continue

    # Find the best match in the previous year's columns
    # ratio() returns a score between 0 and 1
    best_match = max(target_cols, key=lambda x: SequenceMatcher(None, col, x).ratio())
    score = SequenceMatcher(None, col, best_match).ratio()

    # Threshold: 0.85 is usually safe for things like 'SOURCE' vs 'SOURCES'
    if score > 0.85:
        print(f"Auto-renaming: '{col}' -> '{best_match}' (Score: {score:.2f})")
        rename_map[col] = best_match
    else:
        # If it's a new column that doesn't match anything, we leave it alone (it will be dropped by reindex)
        print(f"Dropping new column: '{col}' (No close match in history)")

# 4. Apply Renaming
df_2024 = df_2024.rename(columns=rename_map)

# 5. Enforce Exact Structure (Reindex)
# - Drops columns not in df_prev
# - Adds columns missing in df_2024 (fills with NaN)
df_2024 = df_2024.reindex(columns=df_prev.columns)

print("------------------------------")

# --- 1. Round lat and lon to 3 decimal places ---
cols_to_round = ['LATITUDE', 'LONGITUDE']
for col in cols_to_round:
    if col in df_2024.columns:
        df_2024[col] = df_2024[col].round(3)

# --- 2. Standardise spellings to 'COOPERATIVE' ---
pattern_standardise = r'(?i)\b(SOCIETE\s+COOPERATIVE|SOCIETÉ\s+COOPÉRATIVE|STE\s+COOP|STE\s+CPE|SOC\s+COOP)\b'
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_standardise, 'COOPERATIVE', regex=True)

# --- 3. Simplify names by removing COOP patterns ---
pattern_simplify = r'(?i)(\bSCOOPS?\b|COOP_CA_|\bCOOP\s*[-_]?\s*CA\b|COOP|COOPERATIVE|\s*[-–]\s*CA\b)'
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_simplify, '', regex=True).str.strip()
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.strip(' _-–')
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(r'\s+', ' ', regex=True)

# --- 4. Match to stable COOP_IDs from previous year ---
df_2024 = df_2024.drop(columns=['COOP_ID', 'FLOW_ID'], errors='ignore')

# Create reference table from df_prev (Name -> ID)
ref_ids = df_prev[['SUPPLIER_ABRVNAME', 'COOP_ID']].dropna(subset=['COOP_ID']).drop_duplicates(subset=['SUPPLIER_ABRVNAME'])

# Merge 2024 data with the reference IDs
df_2024 = df_2024.merge(ref_ids, on='SUPPLIER_ABRVNAME', how='left')

# Handle New Cooperatives (Assign new IDs)
max_prev_id = pd.to_numeric(df_prev['COOP_ID'], errors='coerce').max()
if np.isnan(max_prev_id):
    max_prev_id = 0

mask_new = df_2024['COOP_ID'].isna()
count_new = mask_new.sum()

if count_new > 0:
    new_ids = range(int(max_prev_id) + 1, int(max_prev_id) + 1 + count_new)
    df_2024.loc[mask_new, 'COOP_ID'] = new_ids

df_2024['COOP_ID'] = df_2024['COOP_ID'].astype(int)

# --- Initialize Flag for Real 2024 Data ---
df_2024['REPEATED_FROM_PAST_YEAR'] = False

# =============================================================================
# NEW LOGIC: EXTRAPOLATION
# =============================================================================

# --- 5. Extrapolate Links (Missing Buyers) ---
# Logic: If a Buyer is in df_prev but NOT in df_2024, copy their most recent year's rows to 2024.

buyers_prev = set(df_prev['BUYER'].unique())
buyers_curr = set(df_2024['BUYER'].unique())
missing_buyers = buyers_prev - buyers_curr

rows_to_add_buyers = []

if missing_buyers:
    print(f"Extrapolating links for {len(missing_buyers)} missing buyers...")
    # Filter df_prev for only these buyers
    df_missing_buyers = df_prev[df_prev['BUYER'].isin(missing_buyers)]

    # For each buyer, find their max year and take those rows
    for buyer, group in df_missing_buyers.groupby('BUYER'):
        last_year = group['YEAR'].max()
        rows = group[group['YEAR'] == last_year].copy()

        # Update for 2024
        rows['YEAR'] = 2024
        rows['REPEATED_FROM_PAST_YEAR'] = True
        rows_to_add_buyers.append(rows)

if rows_to_add_buyers:
    df_buyer_extrap = pd.concat(rows_to_add_buyers)
    df_2024 = pd.concat([df_2024, df_buyer_extrap], ignore_index=True)

# --- 6. Extrapolate Existence (Missing Cooperatives) ---
# Logic: Ensure every coop in df_prev exists in df_2024. 
# If a coop is still missing (even after step 5), carry forward its last known state.

# Get set of IDs currently in 2024 (Original + Extrapolated Buyers)
current_coop_ids = set(df_2024['COOP_ID'].unique())
prev_coop_ids = set(df_prev['COOP_ID'].dropna().unique())

missing_coop_ids = prev_coop_ids - current_coop_ids

if missing_coop_ids:
    print(f"Extrapolating existence for {len(missing_coop_ids)} missing cooperatives...")

    # Filter df_prev for these specific missing IDs
    df_missing_coops = df_prev[df_prev['COOP_ID'].isin(missing_coop_ids)]

    # We want the MOST RECENT row for each coop to carry forward
    # Sort by Year desc, then drop duplicates on COOP_ID to keep the latest
    coops_to_add = df_missing_coops.sort_values('YEAR', ascending=False).drop_duplicates(subset=['COOP_ID']).copy()

    # Update for 2024
    coops_to_add['YEAR'] = 2024
    coops_to_add['REPEATED_FROM_PAST_YEAR'] = True

    # Append to 2024 dataset
    df_2024 = pd.concat([df_2024, coops_to_add], ignore_index=True)

# --- 7. Final Cleanup & Merge ---

# Re-create FLOW_ID for the entire 2024 dataset (Raw + Extrapolated) to ensure uniqueness
# Reset index first to ensure clean sequence
df_2024 = df_2024.reset_index(drop=True)
df_2024['FLOW_ID'] = df_2024['YEAR'].astype(str) + '-' + (df_2024.index + 1).astype(str)

# Prepare df_prev for final merge (ensure it has the flag column)
if 'REPEATED_FROM_PAST_YEAR' not in df_prev.columns:
    df_prev['REPEATED_FROM_PAST_YEAR'] = False # Historical data is considered "real" for its year

# Combine History (2021-2023) + Completed 2024
df_final = pd.concat([df_prev, df_2024], ignore_index=True)

print("Processing Complete.")
print(f"Total rows: {len(df_final)}")
print(f"2024 rows (Real): {len(df_2024[df_2024['REPEATED_FROM_PAST_YEAR'] == False])}")
print(f"2024 rows (Extrapolated): {len(df_2024[df_2024['REPEATED_FROM_PAST_YEAR'] == True])}")

# Optional: Save
df_final.to_csv("2024_civ_coops_like_cam_long_fixes_26jan.csv", sep=';', index=False)
220
88
88
88
--- Column Matching Report ---
Dropping new column: 'COMPANY' (No close match in history)
Auto-renaming: 'TRADER_NAMES' -> 'TRADER_NAME' (Score: 0.96)
Auto-renaming: 'DISCLOSURE_SOURCES' -> 'DISCLOSURE_SOURCE' (Score: 0.97)
------------------------------
Extrapolating links for 9 missing buyers...
Extrapolating existence for 5119 missing cooperatives...
Processing Complete.
Total rows: 34452
2024 rows (Real): 1531
2024 rows (Extrapolated): 5260


/var/folders/3r/r9b5rj5s3pb069rsg5zh_1640000gn/T/ipykernel_65341/3134469334.py:143: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.
  df_2024 = pd.concat([df_2024, df_buyer_extrap], ignore_index=True)
df_2024[df_2024['REPEATED_FROM_PAST_YEAR'] == False]
YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER DISCLOSURE_SOURCE TRADER_NAME ... CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR COOP_ID FLOW_ID
0 2024 ERATIVE BENKADI DE SIAKAKRO SOCIETE COOPERATIVE BENKADI DE SIAKAKRO 7.135 -3.204 AGNIBILEKRO CI-3.1.2_1 BARRY CALLEBAUT BARRY CALLEBAUT BARRY CALLEBAUT ... NaN NaN NaN False NaN NaN CI-3.1.2_1 False 5767 2024-1
1 2024 ERATIVE ANOUANZE DES PRODUCTEURS DE L'INDENIE SOCIETE COOPERATIVE ANOUANZE DES PRODUCTEURS D... 6.720 -3.495 ABENGOUROU CI-3.1.1_1 BARRY CALLEBAUT BARRY CALLEBAUT BARRY CALLEBAUT ... NaN NaN NaN False NaN NaN CI-3.1.1_1 False 5768 2024-2
2 2024 ERATIVE AGRICOLE AHUANOU D'ARRAH SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH NaN NaN BONGOUANOU CI-6.3.2_1 BARRY CALLEBAUT BARRY CALLEBAUT BARRY CALLEBAUT ... NaN NaN NaN False NaN NaN CI-6.3.2_1 False 5769 2024-3
3 2024 TELIN COOPERATIVE AGRICOLE TELIN 5.877 -7.455 BANGOLO CI-8.2.1_1 BARRY CALLEBAUT BARRY CALLEBAUT BARRY CALLEBAUT ... NaN NaN NaN False NaN NaN CI-8.2.1_1 False 5602 2024-4
4 2024 ERATIVE AGRICOLE IMAAH, ERATIVE SIMPLIFIÉE COOPERATIVE AGRICOLE IMAAH, COOPERATIVE SIMPLI... NaN NaN DIVO CI-5.2.1_1 BARRY CALLEBAUT BARRY CALLEBAUT BARRY CALLEBAUT ... NaN NaN NaN False NaN NaN CI-5.2.1_1 False 5770 2024-5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1526 2024 SANAB SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... 4.957 -6.085 BANGOLO CI-8.2.1_1 ECOM ECOM ECOM ... NaN NaN NaN False NaN NaN CI-8.2.1_1 False 6338 2024-1527
1527 2024 CAFUGO COOPÉRATIVE AGRICOLE DES FRÈRES UNIS DE GOBROKO 4.957 -6.087 SASSANDRA CI-2.1.2_1 ECOM ECOM ECOM ... NaN NaN NaN False NaN NaN CI-2.1.2_1 False 736 2024-1528
1528 2024 CCAPUO SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747 -7.362 DUEKOUE CI-8.2.2_1 ECOM ECOM ECOM ... NaN NaN NaN False NaN NaN CI-8.2.2_1 False 6339 2024-1529
1529 2024 UDAN SOCIETE COOPERATIVE AVEC CONSEIL D’ADMINISTRAT... 7.252 -8.151 DANANE CI-8.3.2_1 ECOM ECOM ECOM ... NaN NaN NaN False NaN NaN CI-8.3.2_1 False 5635 2024-1530
1530 2024 YEBOYOKON SOCIETE COOPERATIVE AGRICOLE YEBOYOKON DE DAIR... NaN NaN DIVO CI-5.2.1_1 ECOM ECOM ECOM ... NaN NaN NaN False NaN NaN CI-5.2.1_1 False 5735 2024-1531

1531 rows × 43 columns

# WHERE DO I CHANGE MY CODE - TRADE DATA CLEANING?

# Choose between Importer and Exporter by whichever is a bigger company with more disclosure and/or commitments
# Keep exporting cooperatives
# If both are similar, choose the Exporter