Compare 2024 to 2023 coops
View or edit on GitHub
This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/compare_2024_to_2023_coops.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
import pandas as pd
import geopandas as gpd
import os
os.chdir('/Users/niamhfrench/repos/TRASE/trase/')
print(os.getcwd())
df_2024 = pd.read_csv("data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';')
df_2024_no_duplicates = df_2024.drop_duplicates(subset="SUPPLIER_ABRVNAME")
# Filter the dataframes
df_2023 = df_prev[df_prev.YEAR == 2023]
df_2022 = df_prev[df_prev.YEAR == 2022]
df_2021 = df_prev[df_prev.YEAR == 2021]
/Users/niamhfrench/repos/TRASE/trase
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[9], line 8
5 os.chdir('/Users/niamhfrench/repos/TRASE/trase/')
6 print(os.getcwd())
----> 8 df_2024 = pd.read_csv("data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
9 df_prev = pd.read_csv("data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';')
11 df_2024_no_duplicates = df_2024.drop_duplicates(subset="SUPPLIER_ABRVNAME")
File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/Library/Caches/pypoetry/virtualenvs/trase-N5EnYiQe-py3.10/lib/python3.10/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv'
df_2023_no_duplicates = df_2023.drop_duplicates(subset="SUPPLIER_ABRVNAME")
df_2023_no_duplicates[df_2023_no_duplicates.REPEATED_FROM_PAST_YEAR==False]
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 52 | 2023-1 | 45 | 2023 | ABO | ABO | 5.632 | -6.631 | MEAGUI | CI-2.2.3_1 | BARRY CALLEBAUT | ... | False | False | True | 653.0 | False | True | False | False | MEAGUI | False |
| 66 | 2023-2 | 52 | 2023 | AB | AB | NaN | NaN | SASSANDRA | CI-2.1.2_1 | CEMOI | ... | False | False | True | NaN | False | True | False | False | SASSANDRA | False |
| 79 | 2023-3 | 60 | 2023 | ADA | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DES ... | 5.200 | -6.832 | SAN-PEDRO | CI-2.3.1_1 | BARRY CALLEBAUT | ... | False | False | True | 223.0 | False | True | False | False | SAN-PEDRO | False |
| 98 | 2023-6 | 64 | 2023 | ADEDO | COOPERATIVE AGRICOLE POUR LE DEVELOPPEMENT ET ... | 6.385 | -5.411 | OUME | CI-5.1.2_1 | CARGILL | ... | False | True | True | 864.0 | False | True | False | False | OUME | False |
| 105 | 2023-7 | 67 | 2023 | ADI | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DE D... | 6.658 | -3.273 | ABENGOUROU | CI-3.1.1_1 | CEMOI | ... | False | True | True | NaN | False | True | False | False | ABENGOUROU | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9185 | 2023-563 | 5713 | 2023 | WENDE WAOGA | NaN | NaN | NaN | NaN | NaN | SUCDEN | ... | False | False | False | NaN | False | True | False | False | NaN | False |
| 9197 | 2023-564 | 5722 | 2023 | WONTA | WONTA | NaN | NaN | LAKOTA | CI-5.2.3_1 | CEMOI | ... | False | False | True | NaN | False | True | False | False | LAKOTA | False |
| 9206 | 2023-565 | 5728 | 2023 | YAT | NaN | NaN | NaN | NaN | NaN | SUCDEN | ... | False | False | False | NaN | False | True | False | False | NaN | False |
| 9221 | 2023-566 | 5731 | 2023 | YAWOUBE D'ANIASSUE | YAWOUBE D'ANIASSUE | 6.649 | -3.680 | ABENGOUROU | CI-3.1.1_1 | CARGILL | ... | False | False | True | 1782.0 | False | True | False | False | ABENGOUROU | False |
| 9258 | 2023-567 | 5751 | 2023 | YEYOBIE INDENIE | YEYOBIE INDENIE | 6.855 | -3.487 | ABENGOUROU | CI-3.1.1_1 | CARGILL | ... | False | True | False | 831.0 | False | True | False | False | ABENGOUROU | False |
493 rows × 43 columns
df_2022_no_duplicates = df_2022.drop_duplicates(subset="SUPPLIER_ABRVNAME")
df_2022_no_duplicates[df_2022_no_duplicates.REPEATED_FROM_PAST_YEAR==False]
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 2022-1 | 12 | 2022 | 3A | 3A | NaN | NaN | NaN | NaN | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
| 12 | 2022-2 | 13 | 2022 | 3KSCAED | 3K COOPERATIVE AGRICOLE ETRAEYAWLIN DE DIEGONEFLA | NaN | NaN | NaN | NaN | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
| 18 | 2022-3 | 19 | 2022 | A GOH | COOPERATIVE AVEC CONSENTEMENT DES AGRICULTEURS... | NaN | NaN | GAGNOA | CI-5.1.1_1 | OLAM | ... | False | False | False | NaN | False | True | False | False | GAGNOA | False |
| 22 | 2022-4 | 23 | 2022 | A N'ZUE | A N'ZUE | NaN | NaN | NaN | NaN | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
| 23 | 2022-5 | 24 | 2022 | A SAM | A SAM | NaN | NaN | NaN | NaN | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9226 | 2022-1897 | 5735 | 2022 | YEBOYOKON | COOPERATIVE AGRICOLE YEBOYOKON DE DAIRO DIDIZO | NaN | NaN | NaN | NaN | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
| 9242 | 2022-1898 | 5748 | 2022 | YEYASSO | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION YEYA... | NaN | NaN | MAN | CI-8.3.3_1 | FERRERO | ... | False | False | False | NaN | True | False | False | False | MAN | False |
| 9257 | 2022-1900 | 5751 | 2022 | YEYOBIE INDENIE | YEYOBIE INDENIE | 6.855 | -3.487 | ABENGOUROU | CI-3.1.1_1 | CARGILL | ... | False | True | False | 831.0 | False | True | False | False | ABENGOUROU | False |
| 9263 | 2022-1901 | 5756 | 2022 | YOMIDEFE | YOMIDEFE | NaN | NaN | TAABO | CI-7.1.3_1 | CEMOI | ... | False | False | True | NaN | False | True | False | False | TAABO | False |
| 9273 | 2022-1902 | 5765 | 2022 | ZRANSSA | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION ZRANSSA | NaN | NaN | NaN | NaN | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
1225 rows × 43 columns
df_2021_no_duplicates = df_2021.drop_duplicates(subset="SUPPLIER_ABRVNAME")
df_2021_no_duplicates[df_2021_no_duplicates.REPEATED_FROM_PAST_YEAR==False]
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 49 | 2021-1 | 45 | 2021 | ABO | ABO | 5.632 | -6.631 | MEAGUI | CI-2.2.3_1 | BARRY CALLEBAUT | ... | False | False | True | 828.0 | False | True | False | False | MEAGUI | False |
| 75 | 2021-3 | 60 | 2021 | ADA | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DES ... | 5.200 | -6.832 | SAN-PEDRO | CI-2.3.1_1 | BARRY CALLEBAUT | ... | False | False | True | 575.0 | False | True | False | False | SAN-PEDRO | False |
| 96 | 2021-5 | 64 | 2021 | ADEDO | COOPERATIVE AGRICOLE POUR LE DEVELOPPEMENT ET ... | 6.385 | -5.411 | OUME | CI-5.1.2_1 | CARGILL | ... | False | True | True | 864.0 | False | True | False | False | OUME | False |
| 127 | 2021-6 | 79 | 2021 | AGBS | COOP AGRICOLE GNATI-BIA DE SIKENSI | 5.671 | -4.574 | SIKENSI | CI-7.1.2_1 | CARGILL | ... | False | True | True | 910.0 | False | True | False | False | SIKENSI | False |
| 147 | 2021-7 | 88 | 2021 | AGREL | COOPERATIVE POUR LE DEVELOPPEMENT RURAL | 5.847 | -5.686 | LAKOTA | CI-5.2.3_1 | CARGILL | ... | False | False | True | 535.0 | False | True | False | False | LAKOTA | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9161 | 2021-633 | 5692 | 2021 | VIMA | VIMA | 6.621 | -6.045 | SINFRA | CI-9.2.2_1 | BARRY CALLEBAUT | ... | False | False | True | 563.0 | False | True | False | False | SINFRA | False |
| 9194 | 2021-634 | 5721 | 2021 | WINPAS | COOPERATIVE WIND-PANGA DE SEGUELA | 7.975 | -6.675 | SEGUELA | CI-12.3.2_1 | NaN | ... | False | True | False | NaN | False | False | True | True | SEGUELA | False |
| 9219 | 2021-635 | 5731 | 2021 | YAWOUBE D'ANIASSUE | YAWOUBE D'ANIASSUE | 6.649 | -3.680 | ABENGOUROU | CI-3.1.1_1 | CARGILL | ... | False | False | True | 1782.0 | False | True | False | False | ABENGOUROU | False |
| 9241 | 2021-636 | 5747 | 2021 | YEYASSO | COOPERATIVE YEYASSO DE MAN | 7.407 | -7.552 | MAN | CI-8.3.3_1 | MONDELEZ | ... | False | True | True | NaN | True | False | False | False | MAN | False |
| 9256 | 2021-637 | 5751 | 2021 | YEYOBIE INDENIE | YEYOBIE INDENIE | 6.855 | -3.487 | ABENGOUROU | CI-3.1.1_1 | CARGILL | ... | False | True | False | 831.0 | False | True | False | False | ABENGOUROU | False |
492 rows × 43 columns
print(len(df_2023_no_duplicates), len(df_2024_no_duplicates))
4038 1116
df_2024.columns
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
'COMPANY', 'TRADER_NAMES', 'CERTIFICATIONS', 'NUM_FARMERS',
'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
'DISCLOSURE_SOURCES', 'CERTIFIED', 'DISCL_COUNTRY_NAME',
'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'NB_FARMERS_COMPANY_YEAR', '!NOT_RFA',
'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR',
'IS_TRADER'],
dtype='object')
# Drop rows where BOTH 'BUYER' and 'TRADER_NAME' are NaN
df_2021_clean = df_2021_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAME'], how='all')
df_2022_clean = df_2022_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAME'], how='all')
df_2023_clean = df_2023_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAME'], how='all')
df_2024_clean = df_2024_no_duplicates.dropna(subset=['BUYER', 'TRADER_NAMES'], how='all')
# Print the lengths of the cleaned dataframes
print(f"2021 Count: {len(df_2021_clean)}")
print(f"2022 Count: {len(df_2022_clean)}")
print(f"2023 Count: {len(df_2023_clean)}")
print(f"2024 Count: {len(df_2024_clean)}")
2021 Count: 691
2022 Count: 727
2023 Count: 798
2024 Count: 1038
# Create a filter where BUYER is NaN AND TRADER_NAMES is NaN
rows_missing_both = df_2024_no_duplicates[
df_2024_no_duplicates['BUYER'].isna() &
df_2024_no_duplicates['TRADER_NAMES'].isna()
]
# View the dataframe
display(len(rows_missing_both))
78
print(f"2021 Count: {len(df_2021_no_duplicates)}")
print(f"2022 Count: {len(df_2022_no_duplicates)}")
print(f"2023 Count: {len(df_2023_no_duplicates)}")
print(f"2024 Count: {len(df_2024_no_duplicates)}")
2021 Count: 3536
2022 Count: 4021
2023 Count: 4038
2024 Count: 1116
df_2023_no_duplicates
/opt/homebrew/Caskroom/mambaforge/base/envs/trase-env/lib/python3.10/site-packages/IPython/lib/pretty.py:794: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.
output = repr(obj)
/opt/homebrew/Caskroom/mambaforge/base/envs/trase-env/lib/python3.10/site-packages/IPython/core/formatters.py:406: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.
return method()
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 52 | 2023-1 | 45 | 2023 | ABO | ABO | 5.632 | -6.631 | MEAGUI | CI-2.2.3_1 | BARRY CALLEBAUT | ... | False | False | True | 653.0 | False | True | False | False | MEAGUI | False |
| 66 | 2023-2 | 52 | 2023 | AB | AB | NaN | NaN | SASSANDRA | CI-2.1.2_1 | CEMOI | ... | False | False | True | NaN | False | True | False | False | SASSANDRA | False |
| 79 | 2023-3 | 60 | 2023 | ADA | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DES ... | 5.200 | -6.832 | SAN-PEDRO | CI-2.3.1_1 | BARRY CALLEBAUT | ... | False | False | True | 223.0 | False | True | False | False | SAN-PEDRO | False |
| 98 | 2023-6 | 64 | 2023 | ADEDO | COOPERATIVE AGRICOLE POUR LE DEVELOPPEMENT ET ... | 6.385 | -5.411 | OUME | CI-5.1.2_1 | CARGILL | ... | False | True | True | 864.0 | False | True | False | False | OUME | False |
| 105 | 2023-7 | 67 | 2023 | ADI | COOPERATIVE AVEC CONSEIL D'ADMINISTRATION DE D... | 6.658 | -3.273 | ABENGOUROU | CI-3.1.1_1 | CEMOI | ... | False | True | True | NaN | False | True | False | False | ABENGOUROU | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27656 | 2023-5983 | 5761 | 2023 | ZOUAN KOUALOU | COOPERATIVE AGRICOLE DE ZOUAN-HOUNIEN | 6.915 | -8.211 | NaN | CI-8.3.5_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 27657 | 2023-5984 | 5762 | 2023 | ZOUEUDO | COOPERATIVE AGRICOLE ZOUEUDO DE MAHAPLEU | 7.256 | -8.156 | NaN | CI-8.3.2_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 27658 | 2023-5985 | 5763 | 2023 | ZOUTOKEGUI | COOPERATIVE AGRICOLE ZOUTOKEGUI DE GOURANE | 7.734 | -7.618 | NaN | CI-8.3.1_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 27659 | 2023-5986 | 5764 | 2023 | ZRANLEU | COOPERATIVE ZRANLEU DE MAN | 7.408 | -7.549 | NaN | CI-8.3.3_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 27660 | 2023-5987 | 5766 | 2023 | ZSB | ZSB | 7.404 | -7.554 | NaN | CI-8.3.3_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
4038 rows × 43 columns
df_2023_no_duplicates.loc[27659]
FLOW_ID 2023-5986
COOP_ID 5764
YEAR 2023
SUPPLIER_ABRVNAME ZRANLEU
SUPPLIER_FULLNAME COOPERATIVE ZRANLEU DE MAN
LATITUDE 7.408
LONGITUDE -7.549
DISTRICT_NAME NaN
DISTRICT_GEOCODE CI-8.3.3_1
BUYER NaN
DISCLOSURE_SOURCE NaN
TRADER_NAME NaN
CERTIFICATIONS NaN
NUM_FARMERS NaN
NUM_FARMERS_EXTRAPOLATED NaN
TOTAL_FARMERS_NONTRADER NaN
TOTAL_FARMERS_TRADER NaN
TOTAL_FARMERS_RFA NaN
TOTAL_FARMERS NaN
DISCL_COUNTRY_NAME IVORY_COAST
DISCL_AREA_NAME NaN
DISCL_SUPPLIER_ABRVNAME SCOOPS ZRANLEU
DISCL_SUPPLIER_FULLNAME SOCIETE COOPERATIVE ZRANLEU DE MAN
DISCL_SUPPLIER_TYPE NaN
DISCL_LONGITUDE -7.548712
DISCL_LATITUDE 7.40843
DISCL_NUMBER_FARMERS NaN
DISCL_VOLUMES NaN
DISCL_VOLUMES_UNIT NaN
DISCL_CERTIFICATION_NAME NaN
DISCL_CERTIFICATION_PCT NaN
SIMPLIF_ABRVNAME ZRANLEU
CCTN_COOP_ID NaN
IS_ALL_CAM_V3 True
IS_ANY_CAM_V3 True
CERTIFIED NaN
NB_FARMERS_COMPANY_YEAR NaN
NON_TRADER NaN
IS_TRADER NaN
!NOT_RFA NaN
unique_rfa_link NaN
LVL_4_NAME NaN
REPEATED_FROM_PAST_YEAR True
Name: 27659, dtype: object
df_2023_no_duplicates.DISCL_CERTIFICATION_NAME.unique()
array(['FAIRTRADE, RAINFOREST ALLIANCE', 'TRANSPARENCE CACAO',
'COCOA HORIZONS, RAINFOREST ALLIANCE', 'CARGILL COCOA PROMISE; RA',
'CARGILL COCOA PROMISE', nan, 'COCOA LIFE', 'RAINFOREST ALLIANCE',
'COCOA HORIZONS, FAIRTRADE, RAINFOREST ALLIANCE',
'COCOA HORIZONS, FERMICOA, RAINFOREST ALLIANCE',
'COCOA HORIZONS, FERMICOA', 'COCOA HORIZONS',
'COCOA LIFE, FERMICOA', 'ra, utz, ft', 'ft',
'FAIRTRADE AND RAINFOREST ALLIANCE', 'COCOA LIFE, FAIRTRADE',
'COCOA HORIZONS, FAIRTRADE', 'utz, ft', 'NOT CERTIFIED',
'FAIRTRADE', 'ra, ft', 'RAINFOREST ALLIANCE OR FAIRTRADE', 'RFA',
'FAIR FOR LIFE', 'RAINFOREST ALLIANCE, UTZ',
'COCOA LIFE, FERMICOA, RAINFOREST ALLIANCE', 'ra, utz', ' ',
'COCOA LIFE, RAINFOREST ALLIANCE',
'COCOA HORIZONS, FAIRTRADE, RAINFOREST ALLIANCE, UTZ',
'OLAM PROGRAMMES', 'UTZ; RESPONSIBLY SOURCED COCOA',
'NA; RESPONSIBLY SOURCED COCOA',
'OLD-RA; RESPONSIBLY SOURCED COCOA', 'COCOA FOR GOOD; Fair Trade',
'COCOA FOR GOOD; Supplier Standard', 'COCOA FOR GOOD; UTZ',
'COCOA PLAN; RAINFOREST ALLIANCE',
'UTZ_CO1000007528; SUSTAINABLE ORIGINS', 'SUSTAINABLE ORIGINS',
'UTZ_CO1000008807; FAIRTRADE', 'RA', 'UTZ_CO1000009238',
'UTZ_CO1000006898', 'UTZ_CO1000007865', 'UTZ',
'UTZ_CO1000006209; RAINFOREST ALLIANCE'], dtype=object)
df_2024_no_duplicates
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | SIMPLIF_ABRVNAME | CCTN_COOP_ID | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | NB_FARMERS_COMPANY_YEAR | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | IS_TRADER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-1 | 0 | 2024 | SOCIETE COOPERATIVE BENKADI DE SIAKAKRO | SOCIETE COOPERATIVE BENKADI DE SIAKAKRO | 7.135274 | -3.204332 | AGNIBILEKRO | CI-3.1.2_1 | BARRY CALLEBAUT | ... | NaN | NaN | False | False | NaN | NaN | NaN | CI-3.1.2_1 | NaN | False |
| 1 | 2024-2 | 1 | 2024 | SOCIETE COOPERATIVE ANOUANZE DES PRODUCTEURS D... | SOCIETE COOPERATIVE ANOUANZE DES PRODUCTEURS D... | 6.720262 | -3.494930 | ABENGOUROU | CI-3.1.1_1 | BARRY CALLEBAUT | ... | NaN | NaN | False | False | NaN | NaN | NaN | CI-3.1.1_1 | NaN | False |
| 2 | 2024-3 | 2 | 2024 | SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH | SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH | NaN | NaN | BONGOUANOU | CI-6.3.2_1 | BARRY CALLEBAUT | ... | NaN | NaN | False | False | NaN | NaN | NaN | CI-6.3.2_1 | NaN | False |
| 3 | 2024-4 | 3 | 2024 | TELIN | COOPERATIVE AGRICOLE TELIN | 5.876620 | -7.455261 | BANGOLO | CI-8.2.1_1 | BARRY CALLEBAUT | ... | TELIN | NaN | False | False | NaN | NaN | NaN | CI-8.2.1_1 | NaN | False |
| 4 | 2024-5 | 4 | 2024 | COOPERATIVE AGRICOLE IMAAH, COOPERATIVE SIMPLI... | COOPERATIVE AGRICOLE IMAAH, COOPERATIVE SIMPLI... | NaN | NaN | DIVO | CI-5.2.1_1 | BARRY CALLEBAUT | ... | NaN | NaN | False | False | NaN | NaN | NaN | CI-5.2.1_1 | NaN | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1523 | 2024-1524 | 1523 | 2024 | SCOOPUF-COOP-CA | COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... | 6.785423 | -7.393198 | DUEKOUE | CI-8.2.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-8.2.2_1 | NaN | False |
| 1527 | 2024-1528 | 1527 | 2024 | CAFUGO COOP CA | COOPÉRATIVE AGRICOLE DES FRÈRES UNIS DE GOBROKO | 4.957198 | -6.086585 | SASSANDRA | CI-2.1.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-2.1.2_1 | NaN | False |
| 1528 | 2024-1529 | 1528 | 2024 | COOP-CCAPUO | SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... | 6.747380 | -7.362460 | DUEKOUE | CI-8.2.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-8.2.2_1 | NaN | False |
| 1529 | 2024-1530 | 1529 | 2024 | COOP_CA_UDAN | SOCIETE COOPERATIVE AVEC CONSEIL D’ADMINISTRAT... | 7.252397 | -8.150575 | DANANE | CI-8.3.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-8.3.2_1 | NaN | False |
| 1530 | 2024-1531 | 1530 | 2024 | YEBOYOKON SCOOPS | SOCIETE COOPERATIVE AGRICOLE YEBOYOKON DE DAIR... | NaN | NaN | DIVO | CI-5.2.1_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-5.2.1_1 | NaN | False |
1116 rows × 42 columns
import pandas as pd
import numpy as np
import re
# Load data (added low_memory=False to handle the mixed types warning in your previous log)
df_2024 = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';', low_memory=False)
# --- 1. Round lat and lon to 3 decimal places ---
# Check if columns exist before rounding to avoid errors
cols_to_round = ['LATITUDE', 'LONGITUDE']
for col in cols_to_round:
if col in df_2024.columns:
df_2024[col] = df_2024[col].round(3)
# --- 2. Standardise spellings to 'COOPERATIVE' ---
# Normalizing variations like "STE COOP", "SOCIETE COOPERATIVE" etc.
# Note: We apply this to SUPPLIER_ABRVNAME as requested, assuming that is the target column
pattern_standardise = r'(?i)\b(SOCIETE\s+COOPERATIVE|SOCIETÉ\s+COOPÉRATIVE|STE\s+COOP|STE\s+CPE|SOC\s+COOP)\b'
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_standardise, 'COOPERATIVE', regex=True)
# --- 3. Simplify names by removing COOP patterns ---
# Removing 'COOP', 'SCOOPS', 'COOP-CA' etc to leave the unique name
# \s* handles potential extra spaces
pattern_simplify = r'(?i)(\bSCOOPS?\b|COOP_CA_|\bCOOP\s*[-_]?\s*CA\b|\bCOOP\b|\bCOOPERATIVE\b|\s*[-–]\s*CA\b)'
# Apply replacement
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_simplify, '', regex=True).str.strip()
# Extra Cleanup: Remove any leftover leading/trailing punctuation (underscores, dashes)
# that might remain after cutting the prefix/suffix.
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.strip(' _-–')
# Clean up any resulting double spaces
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(r'\s+', ' ', regex=True)
# --- 4. Match to stable COOP_IDs from previous year ---
# Drop the current/temporary IDs from the 2024 file
df_2024 = df_2024.drop(columns=['COOP_ID', 'FLOW_ID'], errors='ignore')
# Create a reference table from the previous year (Name -> ID)
# We drop duplicates to ensure one-to-one mapping
ref_ids = df_prev[['SUPPLIER_ABRVNAME', 'COOP_ID']].drop_duplicates(subset=['SUPPLIER_ABRVNAME'])
# Merge 2024 data with the reference IDs
df_2024 = df_2024.merge(ref_ids, on='SUPPLIER_ABRVNAME', how='left')
# --- Handle New Cooperatives (NaN COOP_IDs) ---
# Find the highest existing ID in the previous data to start new IDs after it
max_prev_id = pd.to_numeric(df_prev['COOP_ID'], errors='coerce').max()
if np.isnan(max_prev_id):
max_prev_id = 0
# Identify rows that didn't find a match
mask_new = df_2024['COOP_ID'].isna()
count_new = mask_new.sum()
if count_new > 0:
# Create new IDs: range starting from max_prev_id + 1
new_ids = range(int(max_prev_id) + 1, int(max_prev_id) + 1 + count_new)
df_2024.loc[mask_new, 'COOP_ID'] = new_ids
# Ensure COOP_ID is integer
df_2024['COOP_ID'] = df_2024['COOP_ID'].astype(int)
# Re-create FLOW_ID (Assuming format Year-Index)
df_2024['FLOW_ID'] = df_2024['YEAR'].astype(str) + '-' + (df_2024.index + 1).astype(str)
# Final Inspection
print("Cleaning Complete.")
print(df_2024[['FLOW_ID', 'COOP_ID', 'SUPPLIER_ABRVNAME', 'BUYER']])
#In the cooperative panel dataset, we record the names of all known buyers from
#a cooperative in a given year, separated by ” + ”, in the variable BUYERS.
#Companies recorded in this variable can be buyers at different vertical stages
#of the same cocoa flow. In the link dataset, we register buyer names in variable LINK BUYER. In addition, we provide the dummy variable IS TRADER,
#which is TRUE if we consider, based on our own research and customs declaration data, that the buyer is a trading company that can potentially purchase
#directly from cooperatives.
Cleaning Complete.
FLOW_ID COOP_ID SUPPLIER_ABRVNAME \
0 2024-1 5767 BENKADI DE SIAKAKRO
1 2024-2 5768 ANOUANZE DES PRODUCTEURS DE L'INDENIE
2 2024-3 5769 AGRICOLE AHUANOU D'ARRAH
3 2024-4 5602 TELIN
4 2024-5 5770 AGRICOLE IMAAH, SIMPLIFIÉE
... ... ... ...
1526 2024-1527 4419 SCOOPANAB
1527 2024-1528 736 CAFUGO
1528 2024-1529 6101 CCAPUO
1529 2024-1530 5635 UDAN
1530 2024-1531 5735 YEBOYOKON
BUYER
0 BARRY CALLEBAUT
1 BARRY CALLEBAUT
2 BARRY CALLEBAUT
3 BARRY CALLEBAUT
4 BARRY CALLEBAUT
... ...
1526 ECOM
1527 ECOM
1528 ECOM
1529 ECOM
1530 ECOM
[1531 rows x 4 columns]
df_2024.BUYER.unique()
array(['BARRY CALLEBAUT', 'BLOMMER', 'ECOM', 'OFI', 'ETG', 'CARGILL',
'FARMSTRONG', 'GCB', 'JB COCOA', 'SO B GREEN', 'SUCDEN', 'TOUTON',
'KRUGER + SUSCOM + CARGILL', 'OLAM', nan,
'BARRY CALLEBAUT + CARGILL + CEMOI', 'COCOASOURCE', 'FILDISI',
'CEMOI', 'SUSCOM', 'FERRERO', 'BARRY CALLEBAUT + CARGILL',
'SUCDEN + OLAM', 'HERSHEY', 'HERSHEY + BLOMMER', 'ECOOKIM',
'BARRY CALLEBAUT + TOUTON', 'BARRY CALLEBAUT + ETC GROUP',
'BARRY CALLEBAUT + OLAM', 'BARRY CALLEBAUT + CARGILL + OLAM',
'ALTER ECO', 'MARS', 'ETC GROUP'], dtype=object)
new_ids
range(5767, 6102)
# We extrapolate links and cooperative existence from one year to the next one.
# This process starts from the baseline year 2019, by company. If a company did
# not disclose its supply chain in a given year, we extrapolate its links with coops
# this year from its last disclosure.
# We also extrapolate the existence of cooperatives into the future. We assume
# that the absence of a cooperative from the set of links disclosed for a given
# year does not reflect that this cooperative ceased to exist. Any cooperative
# disclosed once is thus featured in the cooperative panel dataset in subsequent
# years (the number of cooperatives is non-decreasing over time, and the panel is
# not balanced).
# In both the cooperative panel dataset and the link dataset, this extrapolation
# is flagged with a dummy variable called REPEATED FROM PAST YEAR.
df_2024 = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';', low_memory=False)
print(df_2024.columns, df_prev.columns)
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
'COMPANY', 'TRADER_NAMES', 'CERTIFICATIONS', 'NUM_FARMERS',
'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
'DISCLOSURE_SOURCES', 'CERTIFIED', 'DISCL_COUNTRY_NAME',
'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME',
'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE',
'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'NB_FARMERS_COMPANY_YEAR', '!NOT_RFA',
'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR',
'IS_TRADER'],
dtype='object') Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
'DISCLOSURE_SOURCE', 'TRADER_NAME', 'CERTIFICATIONS', 'NUM_FARMERS',
'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME',
'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE',
'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED',
'NB_FARMERS_COMPANY_YEAR', 'NON_TRADER', 'IS_TRADER', '!NOT_RFA',
'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR'],
dtype='object')
df_2024[['DISCLOSURE_SOURCES', 'BUYER', 'COMPANY']]
| DISCLOSURE_SOURCES | BUYER | COMPANY | |
|---|---|---|---|
| 0 | NaN | BARRY CALLEBAUT | NaN |
| 1 | NaN | BARRY CALLEBAUT | NaN |
| 2 | NaN | BARRY CALLEBAUT | NaN |
| 3 | NaN | BARRY CALLEBAUT | NaN |
| 4 | NaN | BARRY CALLEBAUT | NaN |
| ... | ... | ... | ... |
| 1526 | NaN | ECOM | NaN |
| 1527 | NaN | ECOM | NaN |
| 1528 | NaN | ECOM | NaN |
| 1529 | NaN | ECOM | NaN |
| 1530 | NaN | ECOM | NaN |
1531 rows × 3 columns
df_prev.BUYER.unique()
array([nan, 'FERRERO', 'OLAM', 'CARGILL', 'BLOMMER', 'NESTLE',
'BARRY CALLEBAUT', 'HERSHEY', 'TOUTON', 'CEMOI', 'KRUGER',
'SUSCOM', 'ALFRED RITTER', 'ECOM', 'SUCDEN', 'MONDELEZ', 'MARS',
'ETC GROUP', 'COCOASOURCE', 'PURATOS', 'ECOOKIM', 'VALRHONA',
'FILDISI', 'COLRUYT', 'ALTER ECO', 'ETHIQUABLE', 'ALBERT HEIJN'],
dtype=object)
df_prev.DISCLOSURE_SOURCE.unique()
array([nan, 'RAINFOREST ALLIANCE', 'FERRERO', 'OLAM', 'CARGILL',
'BLOMMER', 'NESTLE', 'BARRY CALLEBAUT', 'HERSHEY', 'TOUTON',
'CEMOI', 'ALFRED RITTER', 'ECOM', 'SUCDEN', 'MONDELEZ', 'MARS',
'ETC GROUP', "TONY'S CHOCOLONELY", 'PURATOS', 'ECOOKIM',
'VALRHONA', 'FILDISI', 'COLRUYT', 'ALTER ECO', 'ETHIQUABLE',
'ALBERT HEIJN'], dtype=object)
df_2024.COMPANY.unique()
array([nan, 'OLAM', 'BARRY CALLEBAUT', 'CEMOI',
'BARRY CALLEBAUT + HERSHEY', 'SUCDEN', 'CARGILL',
'OLAM + MONDELEZ', 'BARRY CALLEBAUT + SUCDEN',
'BARRY CALLEBAUT + MONDELEZ', 'MARS',
'BARRY CALLEBAUT + BLOMMER + TOUTON', 'BARRY CALLEBAUT + OLAM',
'BARRY CALLEBAUT + MARS + BLOMMER + TOUTON', 'CARGILL + OLAM',
'NESTLE', 'SUCDEN + NESTLE',
'BARRY CALLEBAUT + CARGILL + OLAM + MARS',
'BARRY CALLEBAUT + CARGILL + MARS',
'BARRY CALLEBAUT + MARS + BLOMMER', 'BARRY CALLEBAUT + FERRERO',
'BARRY CALLEBAUT + OLAM + HERSHEY',
'BARRY CALLEBAUT + FERRERO + TOUTON',
'BARRY CALLEBAUT + ETC GROUP', 'SUCDEN + ECOM + FILDISI',
'BARRY CALLEBAUT + MARS + MONDELEZ',
'BARRY CALLEBAUT + ETC GROUP + MONDELEZ', 'BARRY CALLEBAUT + MARS',
'FERRERO', 'BARRY CALLEBAUT + ECOM', 'SUCDEN + OLAM',
'BARRY CALLEBAUT + TOUTON', 'ECOM', 'CARGILL + NESTLE',
'CARGILL + HERSHEY', 'ETC GROUP', 'ECOM + NESTLE', 'SUCDEN + ECOM',
'OLAM + ECOM', 'CARGILL + MARS', 'OLAM + NESTLE',
'ALFRED RITTER + CARGILL', 'OLAM + PURATOS', 'ECOM + PURATOS',
'ALFRED RITTER + BARRY CALLEBAUT + CARGILL + CEMOI + FERRERO',
'ECOOKIM', "TONY'S CHOCOLONELY", 'ALFRED RITTER + CEMOI',
'OLAM + HERSHEY', 'OLAM + FERRERO', 'SUCDEN + OLAM + HERSHEY',
'HERSHEY', 'HERSHEY + BLOMMER', 'FERRERO + TOUTON',
'OLAM + BLOMMER', 'BLOMMER', 'CEMOI + BLOMMER',
'CEMOI + SUCDEN + OLAM + HERSHEY + BLOMMER',
'OLAM + HERSHEY + BLOMMER', 'MARS + BLOMMER + TOUTON',
'OLAM + MARS', 'CARGILL + OLAM + MARS',
'BARRY CALLEBAUT + FERRERO + HERSHEY', 'CEMOI + OLAM',
'CEMOI + OLAM + MARS', 'ALTER ECO', 'CEMOI + ETC GROUP',
'MARS + TOUTON', 'ECOM + MONDELEZ', 'ECOM + FILDISI'], dtype=object)
df_2024.BUYER.unique()
array(['BARRY CALLEBAUT', 'BLOMMER', 'ECOM', 'OFI', 'ETG', 'CARGILL',
'FARMSTRONG', 'GCB', 'JB COCOA', 'SO B GREEN', 'SUCDEN', 'TOUTON',
'KRUGER + SUSCOM + CARGILL', 'OLAM', nan,
'BARRY CALLEBAUT + CARGILL + CEMOI', 'COCOASOURCE', 'FILDISI',
'CEMOI', 'SUSCOM', 'FERRERO', 'BARRY CALLEBAUT + CARGILL',
'SUCDEN + OLAM', 'HERSHEY', 'HERSHEY + BLOMMER', 'ECOOKIM',
'BARRY CALLEBAUT + TOUTON', 'BARRY CALLEBAUT + ETC GROUP',
'BARRY CALLEBAUT + OLAM', 'BARRY CALLEBAUT + CARGILL + OLAM',
'ALTER ECO', 'MARS', 'ETC GROUP'], dtype=object)
df_2024.DISCLOSURE_SOURCES.unique()
array([nan, 'OLAM', 'BARRY CALLEBAUT', 'CEMOI',
'BARRY CALLEBAUT + HERSHEY', 'SUCDEN', 'CARGILL',
'OLAM + MONDELEZ', 'BARRY CALLEBAUT + SUCDEN',
'BARRY CALLEBAUT + MONDELEZ', 'MARS',
'BARRY CALLEBAUT + BLOMMER + TOUTON', 'BARRY CALLEBAUT + OLAM',
'BARRY CALLEBAUT + MARS + BLOMMER + TOUTON', 'CARGILL + OLAM',
'NESTLE', 'SUCDEN + NESTLE',
'BARRY CALLEBAUT + CARGILL + OLAM + MARS',
'BARRY CALLEBAUT + CARGILL + MARS',
'BARRY CALLEBAUT + MARS + BLOMMER', 'BARRY CALLEBAUT + FERRERO',
'BARRY CALLEBAUT + OLAM + HERSHEY',
'BARRY CALLEBAUT + FERRERO + TOUTON',
'BARRY CALLEBAUT + ETC GROUP', 'SUCDEN + ECOM + FILDISI',
'BARRY CALLEBAUT + MARS + MONDELEZ',
'BARRY CALLEBAUT + ETC GROUP + MONDELEZ', 'BARRY CALLEBAUT + MARS',
'FERRERO', 'BARRY CALLEBAUT + ECOM', 'SUCDEN + OLAM',
'BARRY CALLEBAUT + TOUTON', 'ECOM', 'CARGILL + NESTLE',
'CARGILL + HERSHEY', 'ETC GROUP', 'ECOM + NESTLE', 'SUCDEN + ECOM',
'OLAM + ECOM', 'CARGILL + MARS', 'OLAM + NESTLE',
'ALFRED RITTER + CARGILL', 'OLAM + PURATOS', 'ECOM + PURATOS',
'ALFRED RITTER + BARRY CALLEBAUT + CARGILL + CEMOI + FERRERO',
'ECOOKIM', "TONY'S CHOCOLONELY", 'ALFRED RITTER + CEMOI',
'OLAM + HERSHEY', 'OLAM + FERRERO', 'SUCDEN + OLAM + HERSHEY',
'HERSHEY', 'HERSHEY + BLOMMER', 'FERRERO + TOUTON',
'OLAM + BLOMMER', 'BLOMMER', 'CEMOI + BLOMMER',
'CEMOI + SUCDEN + OLAM + HERSHEY + BLOMMER',
'OLAM + HERSHEY + BLOMMER', 'MARS + BLOMMER + TOUTON',
'OLAM + MARS', 'CARGILL + OLAM + MARS',
'BARRY CALLEBAUT + FERRERO + HERSHEY', 'CEMOI + OLAM',
'CEMOI + OLAM + MARS', 'ALTER ECO', 'CEMOI + ETC GROUP',
'MARS + TOUTON', 'ECOM + MONDELEZ', 'ECOM + FILDISI'], dtype=object)
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher
# Load data
df_2024 = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long.csv", sep=';')
df_prev = pd.read_csv("/Users/niamhfrench/Library/CloudStorage/OneDrive-SEI/Documents/TRASE/trase/data/cote_divoire/cocoa/logistics/out/CAM_seipcs_2023.csv", sep=';', low_memory=False)
# Ensure YEAR column is correct in df_2024
if 'YEAR' not in df_2024.columns:
df_2024['YEAR'] = 2024
else:
df_2024['YEAR'] = df_2024['YEAR'].fillna(2024).astype(int)
# --- 0. SMART COLUMN ALIGNMENT ---
# 1. Strip whitespace from all headers first
df_2024.columns = df_2024.columns.str.strip()
df_prev.columns = df_prev.columns.str.strip()
print(df_2024.DISCLOSURE_SOURCES.isna().sum())
print(df_2024.BUYER.isna().sum())
df_2024.DISCLOSURE_SOURCES.fillna(df_2024.COMPANY, inplace=True)
df_2024.DISCLOSURE_SOURCES.fillna(df_2024.BUYER, inplace=True)
df_2024.BUYER.fillna(df_2024.DISCLOSURE_SOURCES, inplace=True)
print(df_2024.DISCLOSURE_SOURCES.isna().sum())
print(df_2024.BUYER.isna().sum())
# 2. Manual Overrides (Add any pairs that fuzzy match might miss or get wrong)
rename_map = {
# 'Bad Name': 'Good Name',
}
# 3. Auto-generate Fuzzy Matches
# We look for columns in 2024 that are NOT in prev, but look very similar
target_cols = list(df_prev.columns)
source_cols = list(df_2024.columns)
print("--- Column Matching Report ---")
for col in source_cols:
# If the column is already perfect, skip
if col in target_cols:
continue
# Find the best match in the previous year's columns
# ratio() returns a score between 0 and 1
best_match = max(target_cols, key=lambda x: SequenceMatcher(None, col, x).ratio())
score = SequenceMatcher(None, col, best_match).ratio()
# Threshold: 0.85 is usually safe for things like 'SOURCE' vs 'SOURCES'
if score > 0.85:
print(f"Auto-renaming: '{col}' -> '{best_match}' (Score: {score:.2f})")
rename_map[col] = best_match
else:
# If it's a new column that doesn't match anything, we leave it alone (it will be dropped by reindex)
print(f"Dropping new column: '{col}' (No close match in history)")
# 4. Apply Renaming
df_2024 = df_2024.rename(columns=rename_map)
# 5. Enforce Exact Structure (Reindex)
# - Drops columns not in df_prev
# - Adds columns missing in df_2024 (fills with NaN)
df_2024 = df_2024.reindex(columns=df_prev.columns)
print("------------------------------")
# --- 1. Round lat and lon to 3 decimal places ---
cols_to_round = ['LATITUDE', 'LONGITUDE']
for col in cols_to_round:
if col in df_2024.columns:
df_2024[col] = df_2024[col].round(3)
# --- 2. Standardise spellings to 'COOPERATIVE' ---
pattern_standardise = r'(?i)\b(SOCIETE\s+COOPERATIVE|SOCIETÉ\s+COOPÉRATIVE|STE\s+COOP|STE\s+CPE|SOC\s+COOP)\b'
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_standardise, 'COOPERATIVE', regex=True)
# --- 3. Simplify names by removing COOP patterns ---
pattern_simplify = r'(?i)(\bSCOOPS?\b|COOP_CA_|\bCOOP\s*[-_]?\s*CA\b|COOP|COOPERATIVE|\s*[-–]\s*CA\b)'
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(pattern_simplify, '', regex=True).str.strip()
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.strip(' _-–')
df_2024['SUPPLIER_ABRVNAME'] = df_2024['SUPPLIER_ABRVNAME'].str.replace(r'\s+', ' ', regex=True)
# --- 4. Match to stable COOP_IDs from previous year ---
df_2024 = df_2024.drop(columns=['COOP_ID', 'FLOW_ID'], errors='ignore')
# Create reference table from df_prev (Name -> ID)
ref_ids = df_prev[['SUPPLIER_ABRVNAME', 'COOP_ID']].dropna(subset=['COOP_ID']).drop_duplicates(subset=['SUPPLIER_ABRVNAME'])
# Merge 2024 data with the reference IDs
df_2024 = df_2024.merge(ref_ids, on='SUPPLIER_ABRVNAME', how='left')
# Handle New Cooperatives (Assign new IDs)
max_prev_id = pd.to_numeric(df_prev['COOP_ID'], errors='coerce').max()
if np.isnan(max_prev_id):
max_prev_id = 0
mask_new = df_2024['COOP_ID'].isna()
count_new = mask_new.sum()
if count_new > 0:
new_ids = range(int(max_prev_id) + 1, int(max_prev_id) + 1 + count_new)
df_2024.loc[mask_new, 'COOP_ID'] = new_ids
df_2024['COOP_ID'] = df_2024['COOP_ID'].astype(int)
# --- Initialize Flag for Real 2024 Data ---
df_2024['REPEATED_FROM_PAST_YEAR'] = False
# =============================================================================
# NEW LOGIC: EXTRAPOLATION
# =============================================================================
# --- 5. Extrapolate Links (Missing Buyers) ---
# Logic: If a Buyer is in df_prev but NOT in df_2024, copy their most recent year's rows to 2024.
buyers_prev = set(df_prev['BUYER'].unique())
buyers_curr = set(df_2024['BUYER'].unique())
missing_buyers = buyers_prev - buyers_curr
rows_to_add_buyers = []
if missing_buyers:
print(f"Extrapolating links for {len(missing_buyers)} missing buyers...")
# Filter df_prev for only these buyers
df_missing_buyers = df_prev[df_prev['BUYER'].isin(missing_buyers)]
# For each buyer, find their max year and take those rows
for buyer, group in df_missing_buyers.groupby('BUYER'):
last_year = group['YEAR'].max()
rows = group[group['YEAR'] == last_year].copy()
# Update for 2024
rows['YEAR'] = 2024
rows['REPEATED_FROM_PAST_YEAR'] = True
rows_to_add_buyers.append(rows)
if rows_to_add_buyers:
df_buyer_extrap = pd.concat(rows_to_add_buyers)
df_2024 = pd.concat([df_2024, df_buyer_extrap], ignore_index=True)
# --- 6. Extrapolate Existence (Missing Cooperatives) ---
# Logic: Ensure every coop in df_prev exists in df_2024.
# If a coop is still missing (even after step 5), carry forward its last known state.
# Get set of IDs currently in 2024 (Original + Extrapolated Buyers)
current_coop_ids = set(df_2024['COOP_ID'].unique())
prev_coop_ids = set(df_prev['COOP_ID'].dropna().unique())
missing_coop_ids = prev_coop_ids - current_coop_ids
if missing_coop_ids:
print(f"Extrapolating existence for {len(missing_coop_ids)} missing cooperatives...")
# Filter df_prev for these specific missing IDs
df_missing_coops = df_prev[df_prev['COOP_ID'].isin(missing_coop_ids)]
# We want the MOST RECENT row for each coop to carry forward
# Sort by Year desc, then drop duplicates on COOP_ID to keep the latest
coops_to_add = df_missing_coops.sort_values('YEAR', ascending=False).drop_duplicates(subset=['COOP_ID']).copy()
# Update for 2024
coops_to_add['YEAR'] = 2024
coops_to_add['REPEATED_FROM_PAST_YEAR'] = True
# Append to 2024 dataset
df_2024 = pd.concat([df_2024, coops_to_add], ignore_index=True)
# --- 7. Final Cleanup & Merge ---
# Re-create FLOW_ID for the entire 2024 dataset (Raw + Extrapolated) to ensure uniqueness
# Reset index first to ensure clean sequence
df_2024 = df_2024.reset_index(drop=True)
df_2024['FLOW_ID'] = df_2024['YEAR'].astype(str) + '-' + (df_2024.index + 1).astype(str)
# Prepare df_prev for final merge (ensure it has the flag column)
if 'REPEATED_FROM_PAST_YEAR' not in df_prev.columns:
df_prev['REPEATED_FROM_PAST_YEAR'] = False # Historical data is considered "real" for its year
# Combine History (2021-2023) + Completed 2024
df_final = pd.concat([df_prev, df_2024], ignore_index=True)
print("Processing Complete.")
print(f"Total rows: {len(df_final)}")
print(f"2024 rows (Real): {len(df_2024[df_2024['REPEATED_FROM_PAST_YEAR'] == False])}")
print(f"2024 rows (Extrapolated): {len(df_2024[df_2024['REPEATED_FROM_PAST_YEAR'] == True])}")
# Optional: Save
df_final.to_csv("2024_civ_coops_like_cam_long_fixes_26jan.csv", sep=';', index=False)
220
88
88
88
--- Column Matching Report ---
Dropping new column: 'COMPANY' (No close match in history)
Auto-renaming: 'TRADER_NAMES' -> 'TRADER_NAME' (Score: 0.96)
Auto-renaming: 'DISCLOSURE_SOURCES' -> 'DISCLOSURE_SOURCE' (Score: 0.97)
------------------------------
Extrapolating links for 9 missing buyers...
Extrapolating existence for 5119 missing cooperatives...
Processing Complete.
Total rows: 34452
2024 rows (Real): 1531
2024 rows (Extrapolated): 5260
/var/folders/3r/r9b5rj5s3pb069rsg5zh_1640000gn/T/ipykernel_65341/3134469334.py:143: FutureWarning: In a future version, object-dtype columns with all-bool values will not be included in reductions with bool_only=True. Explicitly cast to bool dtype instead.
df_2024 = pd.concat([df_2024, df_buyer_extrap], ignore_index=True)
df_2024[df_2024['REPEATED_FROM_PAST_YEAR'] == False]
| YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | DISCLOSURE_SOURCE | TRADER_NAME | ... | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | COOP_ID | FLOW_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024 | ERATIVE BENKADI DE SIAKAKRO | SOCIETE COOPERATIVE BENKADI DE SIAKAKRO | 7.135 | -3.204 | AGNIBILEKRO | CI-3.1.2_1 | BARRY CALLEBAUT | BARRY CALLEBAUT | BARRY CALLEBAUT | ... | NaN | NaN | NaN | False | NaN | NaN | CI-3.1.2_1 | False | 5767 | 2024-1 |
| 1 | 2024 | ERATIVE ANOUANZE DES PRODUCTEURS DE L'INDENIE | SOCIETE COOPERATIVE ANOUANZE DES PRODUCTEURS D... | 6.720 | -3.495 | ABENGOUROU | CI-3.1.1_1 | BARRY CALLEBAUT | BARRY CALLEBAUT | BARRY CALLEBAUT | ... | NaN | NaN | NaN | False | NaN | NaN | CI-3.1.1_1 | False | 5768 | 2024-2 |
| 2 | 2024 | ERATIVE AGRICOLE AHUANOU D'ARRAH | SOCIETE COOPERATIVE AGRICOLE AHUANOU D'ARRAH | NaN | NaN | BONGOUANOU | CI-6.3.2_1 | BARRY CALLEBAUT | BARRY CALLEBAUT | BARRY CALLEBAUT | ... | NaN | NaN | NaN | False | NaN | NaN | CI-6.3.2_1 | False | 5769 | 2024-3 |
| 3 | 2024 | TELIN | COOPERATIVE AGRICOLE TELIN | 5.877 | -7.455 | BANGOLO | CI-8.2.1_1 | BARRY CALLEBAUT | BARRY CALLEBAUT | BARRY CALLEBAUT | ... | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.1_1 | False | 5602 | 2024-4 |
| 4 | 2024 | ERATIVE AGRICOLE IMAAH, ERATIVE SIMPLIFIÉE | COOPERATIVE AGRICOLE IMAAH, COOPERATIVE SIMPLI... | NaN | NaN | DIVO | CI-5.2.1_1 | BARRY CALLEBAUT | BARRY CALLEBAUT | BARRY CALLEBAUT | ... | NaN | NaN | NaN | False | NaN | NaN | CI-5.2.1_1 | False | 5770 | 2024-5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1526 | 2024 | SANAB | SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... | 4.957 | -6.085 | BANGOLO | CI-8.2.1_1 | ECOM | ECOM | ECOM | ... | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.1_1 | False | 6338 | 2024-1527 |
| 1527 | 2024 | CAFUGO | COOPÉRATIVE AGRICOLE DES FRÈRES UNIS DE GOBROKO | 4.957 | -6.087 | SASSANDRA | CI-2.1.2_1 | ECOM | ECOM | ECOM | ... | NaN | NaN | NaN | False | NaN | NaN | CI-2.1.2_1 | False | 736 | 2024-1528 |
| 1528 | 2024 | CCAPUO | SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... | 6.747 | -7.362 | DUEKOUE | CI-8.2.2_1 | ECOM | ECOM | ECOM | ... | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.2_1 | False | 6339 | 2024-1529 |
| 1529 | 2024 | UDAN | SOCIETE COOPERATIVE AVEC CONSEIL D’ADMINISTRAT... | 7.252 | -8.151 | DANANE | CI-8.3.2_1 | ECOM | ECOM | ECOM | ... | NaN | NaN | NaN | False | NaN | NaN | CI-8.3.2_1 | False | 5635 | 2024-1530 |
| 1530 | 2024 | YEBOYOKON | SOCIETE COOPERATIVE AGRICOLE YEBOYOKON DE DAIR... | NaN | NaN | DIVO | CI-5.2.1_1 | ECOM | ECOM | ECOM | ... | NaN | NaN | NaN | False | NaN | NaN | CI-5.2.1_1 | False | 5735 | 2024-1531 |
1531 rows × 43 columns
# WHERE DO I CHANGE MY CODE - TRADE DATA CLEANING?
# Choose between Importer and Exporter by whichever is a bigger company with more disclosure and/or commitments
# Keep exporting cooperatives
# If both are similar, choose the Exporter