Split flows
View or edit on GitHub
This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/split_flows.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
import pandas as pd
import geopandas as gpd
import os
os.chdir('/Users/niamhfrench/repos/TRASE/trase/')
print(os.getcwd())
df = pd.read_csv('data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long_fixes_26jan.csv', sep=';', low_memory=False)
/Users/niamhfrench/repos/TRASE/trase
# Create a mask for rows where any of the specified columns contain '+'
cols_to_check = ['BUYER', 'DISCLOSURE_SOURCE', 'TRADER_NAME']
mask = df[cols_to_check].stack().str.contains('\+', regex=True).unstack().any(axis=1)
# Or the more readable, standard approach:
mask = df[cols_to_check].apply(lambda x: x.str.contains('\+', na=False)).any(axis=1)
# Display the filtered dataframe
df_with_plus = df[mask]
df_with_plus[['BUYER', 'DISCLOSURE_SOURCE', 'TRADER_NAME', 'YEAR']]
| BUYER | DISCLOSURE_SOURCE | TRADER_NAME | YEAR | |
|---|---|---|---|---|
| 27710 | BARRY CALLEBAUT | BARRY CALLEBAUT + HERSHEY | BARRY CALLEBAUT | 2024 |
| 27713 | BARRY CALLEBAUT | BARRY CALLEBAUT + HERSHEY | BARRY CALLEBAUT | 2024 |
| 27715 | BARRY CALLEBAUT | OLAM + MONDELEZ | BARRY CALLEBAUT | 2024 |
| 27718 | BARRY CALLEBAUT | BARRY CALLEBAUT + SUCDEN | BARRY CALLEBAUT | 2024 |
| 27721 | BARRY CALLEBAUT | BARRY CALLEBAUT + MONDELEZ | BARRY CALLEBAUT | 2024 |
| ... | ... | ... | ... | ... |
| 29164 | ECOM | ECOM + PURATOS | ECOM | 2024 |
| 29167 | ECOM | BARRY CALLEBAUT + FERRERO + HERSHEY | ECOM | 2024 |
| 29170 | ECOM | ECOM + MONDELEZ | ECOM | 2024 |
| 29174 | ECOM | CARGILL + MARS | ECOM | 2024 |
| 29176 | ECOM | ECOM + FILDISI | ECOM | 2024 |
211 rows × 4 columns
import pandas as pd
import numpy as np
import unicodedata
def clean_text(s):
"""Standardizes text: removes accents, uppercases, and trims whitespace."""
if pd.isna(s): return s
s = unicodedata.normalize('NFKD', str(s)).encode('ASCII', 'ignore').decode('ASCII')
return s.strip().upper()
def unroll_connections(df):
"""
Expands rows where DISCLOSURE_SOURCE, BUYER, or TRADER_NAME contain multiple
entities separated by '+'. Creates a unique row for each distinct entity.
"""
mask_2024 = df['YEAR'] == 2024
df_pre = df[~mask_2024].copy()
df_2024 = df[mask_2024].copy()
unrolled_rows = []
for _, row in df_2024.iterrows():
# Split entities by '+' and clean them
sources = [x.strip() for x in str(row['DISCLOSURE_SOURCE']).split('+')] if pd.notna(row['DISCLOSURE_SOURCE']) else []
buyers = [x.strip() for x in str(row['BUYER']).split('+')] if pd.notna(row['BUYER']) else []
traders = [x.strip() for x in str(row['TRADER_NAME']).split('+')] if pd.notna(row['TRADER_NAME']) else []
# Take the union of all mentioned entities to find all connections
entities = set()
for e in sources + buyers + traders:
if e and e.upper() not in ['NAN', 'NONE', '']:
entities.add(e)
if len(entities) > 1:
for entity in sorted(list(entities)):
new_row = row.copy()
new_row['DISCLOSURE_SOURCE'] = entity
new_row['BUYER'] = entity
new_row['TRADER_NAME'] = entity
unrolled_rows.append(new_row)
else:
# For single-entity rows, ensure they are also cleaned
if entities:
ent = list(entities)[0]
row['DISCLOSURE_SOURCE'], row['BUYER'], row['TRADER_NAME'] = ent, ent, ent
unrolled_rows.append(row)
df_2024_unrolled = pd.DataFrame(unrolled_rows)
return pd.concat([df_pre, df_2024_unrolled], ignore_index=True)
# 1. Unroll aggregated connections (Specific to 2024 request)
df = unroll_connections(df)
# 2. Standardization & Formatting
text_cols = ['SUPPLIER_FULLNAME', 'SUPPLIER_ABRVNAME', 'DISTRICT_NAME', 'BUYER', 'TRADER_NAME', 'DISCLOSURE_SOURCE']
for col in text_cols:
df[col] = df[col].apply(clean_text)
# 3. Validation: Recalculate Farmer Totals
df['TOTAL_FARMERS_NONTRADER'] = pd.to_numeric(df['TOTAL_FARMERS_NONTRADER'], errors='coerce').fillna(np.nan)
df['TOTAL_FARMERS_TRADER'] = pd.to_numeric(df['TOTAL_FARMERS_TRADER'], errors='coerce').fillna(np.nan)
calc_total = df['TOTAL_FARMERS_NONTRADER'] + df['TOTAL_FARMERS_TRADER']
mask = (df['TOTAL_FARMERS'].isna()) | (df['TOTAL_FARMERS'] == 0) | (abs(df['TOTAL_FARMERS'] - calc_total) > 1)
df.loc[mask, 'TOTAL_FARMERS'] = calc_total[mask]
# 4. Deduplication (Permitting multiple buyers per coop per year)
# Rank by data completeness, but keep distinct coop-to-buyer connections
df['completeness_score'] = (
df['LATITUDE'].notna().astype(int) +
df['NUM_FARMERS'].notna().astype(int) +
df['TOTAL_FARMERS'].notna().astype(int) +
df['DISTRICT_NAME'].notna().astype(int)
)
df = df.sort_values(['COOP_ID', 'YEAR', 'BUYER', 'completeness_score'], ascending=[True, True, True, False])
df = df.drop_duplicates(subset=['COOP_ID', 'YEAR', 'BUYER', 'TRADER_NAME'], keep='first')
# 5. Geodata Propagation: Fill missing coords/districts using historical data for the same coop
def fill_missing_geo(group):
group = group.sort_values('YEAR', ascending=False)
if group['LATITUDE'].isna().any():
valid = group.dropna(subset=['LATITUDE'])
if not valid.empty:
group['LATITUDE'] = group['LATITUDE'].fillna(valid.iloc[0]['LATITUDE'])
group['LONGITUDE'] = group['LONGITUDE'].fillna(valid.iloc[0]['LONGITUDE'])
if group['DISTRICT_NAME'].isna().any():
valid = group.dropna(subset=['DISTRICT_NAME'])
if not valid.empty:
group['DISTRICT_NAME'] = group['DISTRICT_NAME'].fillna(valid.iloc[0]['DISTRICT_NAME'])
return group
df = df.groupby('COOP_ID', group_keys=False).apply(fill_missing_geo)
# 6. Final FLOW_ID Assignment (Guaranteeing uniqueness)
df = df.sort_values(['YEAR', 'COOP_ID', 'BUYER'])
df['FLOW_ID'] = df['YEAR'].astype(str) + '-' + df.groupby('YEAR').cumcount().add(1).astype(str)
# Save result
df.drop(columns=['completeness_score']).to_csv('cleaned_coop_dataset_final.csv', index=False, sep=';')
<positron-console-cell-24>:90: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
import pandas as pd
import numpy as np
# Load the cleaned dataset
df = pd.read_csv('cleaned_coop_dataset_final.csv', sep=';')
# Define all columns containing farmer population/count data
farmer_cols = [
'NUM_FARMERS',
'NUM_FARMERS_EXTRAPOLATED',
'TOTAL_FARMERS_NONTRADER',
'TOTAL_FARMERS_TRADER',
'TOTAL_FARMERS_RFA',
'TOTAL_FARMERS',
'DISCL_NUMBER_FARMERS',
'NB_FARMERS_COMPANY_YEAR'
]
# 1. Remove farmer values for the 2024 data
# We set them to NaN so we can fill them using historical data
df.loc[df['YEAR'] == 2024, farmer_cols] = np.nan
# 2. Sort the data by COOP_ID and YEAR
# This ensures that for every coop, records are in chronological order
df = df.sort_values(['COOP_ID', 'YEAR'])
# 3. Propagate historical values forward
# Within each cooperative group, we carry the last known farmer values
# forward into the empty 2024 slots.
df[farmer_cols] = df.groupby('COOP_ID')[farmer_cols].ffill()
# 4. Final Sorting and Export
# Return to the standard display order
df = df.sort_values(['YEAR', 'COOP_ID', 'BUYER'])
# Save the updated dataset
df.to_csv('cleaned_coop_dataset_with_historical_farmers.csv', index=False, sep=';')
print("Process Complete.")
print(f"Filled 2024 rows: {df[df['YEAR'] == 2024]['TOTAL_FARMERS'].notna().sum()} / {len(df[df['YEAR'] == 2024])}")
<positron-console-cell-1>:5: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
Process Complete.
Filled 2024 rows: 6429 / 7425
df.drop_duplicates(inplace=True)
df.columns
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
'DISCLOSURE_SOURCE', 'TRADER_NAME', 'CERTIFICATIONS', 'NUM_FARMERS',
'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME',
'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE',
'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED',
'NB_FARMERS_COMPANY_YEAR', 'NON_TRADER', 'IS_TRADER', '!NOT_RFA',
'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR'],
dtype='object')
import pandas as pd
import numpy as np
def qa_supply_chain_dataset(df):
results = {}
# --- 1. Unique Supply Chain Flow Check ---
# Requirement: Each row should show a unique supply chain between coop, trader, and buyer in a given year.
# Note: I'm using 'SUPPLIER_FULLNAME' and 'BUYER' as proxies for the chain actors.
flow_cols = ['YEAR', 'SUPPLIER_FULLNAME', 'TRADER_NAME','BUYER']
duplicates = df.duplicated(subset=flow_cols, keep=False)
results['duplicate_flows_count'] = duplicates.sum()
results['duplicate_flows_data'] = df[duplicates].sort_values(by=flow_cols)
# --- 2. Geographic Consistency (Cooperative Metadata) ---
# Requirement: Same name should have same coordinates and district geocodes.
geo_cols = ['LATITUDE', 'LONGITUDE', 'DISTRICT_GEOCODE']
# We group by Supplier Name and check how many unique coordinate/geocode sets exist for each
geo_consistency = df.groupby('SUPPLIER_FULLNAME')[geo_cols].nunique()
inconsistent_geos = geo_consistency[(geo_consistency > 1).any(axis=1)]
results['inconsistent_geos_count'] = len(inconsistent_geos)
results['inconsistent_geos_data'] = inconsistent_geos
# --- 3. Coordinate Sanity Check ---
# Checks if Lat/Long are within realistic bounds (assuming West Africa/Ivory Coast based on names)
# Adjust bounds if your data covers a different region.
lat_bounds = (4, 11) # Example for Ivory Coast
lon_bounds = (-9, -2)
invalid_coords = df[
(df['LATITUDE'].notna()) &
((df['LATITUDE'] < lat_bounds[0]) | (df['LATITUDE'] > lat_bounds[1]) |
(df['LONGITUDE'] < lon_bounds[0]) | (df['LONGITUDE'] > lon_bounds[1]))
]
results['invalid_coords_count'] = len(invalid_coords)
# --- 4. Logic Check: Trader Status ---
# Check if IS_TRADER and NON_TRADER columns contradict each other
logic_conflict = df[(df['IS_TRADER'] == True) & (df['NON_TRADER'] == True)]
results['trader_logic_conflicts'] = len(logic_conflict)
# --- 5. Missing Data (Critical Columns) ---
critical_cols = ['YEAR', 'SUPPLIER_FULLNAME', 'BUYER', 'DISTRICT_GEOCODE']
results['missing_values'] = df[critical_cols].isnull().sum()
# --- 6. Farmer Count vs. Boolean Flags ---
# If there are farmers assigned, the coop should likely be marked as IS_TRADER or similar
anomaly_farmers = df[(df['NB_FARMERS_COMPANY_YEAR'] > 0) & (df['SUPPLIER_FULLNAME'].isna())]
results['farmers_no_supplier_count'] = len(anomaly_farmers)
return results
# Usage:
# qa_report = qa_supply_chain_dataset(your_dataframe)
# Quick Summary Printout
# print(f"Duplicate Flows: {qa_report['duplicate_flows_count']}")
# print(f"Inconsistent Geographics for Same Coop: {qa_report['inconsistent_geos_count']}")
qa_supply_chain_dataset(df)
{'duplicate_flows_count': 5389,
'duplicate_flows_data': FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME \
76 2012-25 1598 2012 COOPAAPROMAN
80 2012-29 1835 2012 COOPAMA
158 2013-31 1598 2013 COOPAAPROMAN
163 2013-36 1835 2013 COOPAMA
255 2014-31 1598 2014 COOPAAPROMAN
... ... ... ... ...
33751 2024-6091 5463 2024 SOCOPLA
33829 2024-6169 5533 2024 SONGTAABA
33905 2024-6245 5601 2024 TEEGA WENDE
33948 2024-6288 5638 2024 UDAN
34071 2024-6411 5749 2024 YEYASSO
SUPPLIER_FULLNAME LATITUDE LONGITUDE \
76 COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ... 6.843 -3.291
80 COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ... 6.843 -3.284
158 COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ... 6.843 -3.291
163 COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ... 6.843 -3.284
255 COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ... 6.843 -3.291
... ... ... ...
33751 NaN NaN NaN
33829 NaN NaN NaN
33905 NaN NaN NaN
33948 NaN NaN NaN
34071 NaN NaN NaN
DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 \
76 AGNIBILEKRO CI-3.1.2_1 CARGILL ... False
80 AGNIBILEKRO CI-3.1.2_1 CARGILL ... False
158 AGNIBILEKRO CI-3.1.2_1 CARGILL ... False
163 AGNIBILEKRO CI-3.1.2_1 CARGILL ... False
255 AGNIBILEKRO CI-3.1.2_1 CARGILL ... False
... ... ... ... ... ...
33751 NaN NaN NaN ... False
33829 NaN NaN NaN ... False
33905 NaN NaN NaN ... False
33948 NaN NaN NaN ... False
34071 NaN NaN NaN ... False
IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER \
76 True True 612.0 False True
80 True True 467.0 False True
158 True True 612.0 False True
163 True True 467.0 False True
255 True True 612.0 False True
... ... ... ... ... ...
33751 False NaN NaN NaN NaN
33829 False NaN NaN NaN NaN
33905 False NaN NaN NaN NaN
33948 False NaN NaN NaN NaN
34071 False NaN NaN NaN NaN
!NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
76 False False AGNIBILEKRO False
80 False False AGNIBILEKRO False
158 False False AGNIBILEKRO False
163 False False AGNIBILEKRO False
255 False False AGNIBILEKRO False
... ... ... ... ...
33751 NaN NaN NaN True
33829 NaN NaN NaN True
33905 NaN NaN NaN True
33948 NaN NaN NaN True
34071 NaN NaN NaN True
[5389 rows x 43 columns],
'inconsistent_geos_count': 539,
'inconsistent_geos_data': LATITUDE LONGITUDE \
SUPPLIER_FULLNAME
AHONDJO 2 2
ANO 2 2
ANOUANZE 2 2
BECIDA 1 1
BENKADI 5 5
... ... ...
UNION DES PRODUCTEURS AGRICOLES DE DJAKOUAKOUKRO 2 2
UNION DES SOCIETES COOPERATIVE DE LA SOUS-PREFE... 2 2
UNION DES SOCIETES COOPERATIVES DU DEPARTMENT D... 2 2
UNITE COOPERATIVE AGRICOLE DE DANANE 2 2
UPAS 1 1
DISTRICT_GEOCODE
SUPPLIER_FULLNAME
AHONDJO 1
ANO 2
ANOUANZE 2
BECIDA 2
BENKADI 4
... ...
UNION DES PRODUCTEURS AGRICOLES DE DJAKOUAKOUKRO 2
UNION DES SOCIETES COOPERATIVE DE LA SOUS-PREFE... 1
UNION DES SOCIETES COOPERATIVES DU DEPARTMENT D... 1
UNITE COOPERATIVE AGRICOLE DE DANANE 1
UPAS 2
[539 rows x 3 columns],
'invalid_coords_count': 0,
'trader_logic_conflicts': 0,
'missing_values': YEAR 0
SUPPLIER_FULLNAME 388
BUYER 25205
DISTRICT_GEOCODE 3931
dtype: int64,
'farmers_no_supplier_count': 40}
import numpy as np
# 1. If TRADER_NAME is a manufacturer, make it NA
# Note: Ensure the column name matches your dataframe (usually 'TRADER' or 'TRADER_NAME')
manufacturers = [
'FILDISI', 'NESTLE', 'FERRERO', 'ALFRED RITTER',
'MONDELEZ', 'MARS', "TONY'S CHOCOLONELY", 'HERSHEY',
'RAINFOREST ALLIANCE', 'ALTER ECO', 'ALBERT HEIJN'
]
# Using .loc for safe assignment
df.loc[df['TRADER_NAME'].isin(manufacturers), 'TRADER_NAME'] = np.nan
# 2. If disclosure source is TONY'S CHOCOLONELY, Buyer and Trader is COCOASOURCE
# Assuming 'DISCLOSURE_SOURCE' is the column name for the source
tonys_mask = df['DISCLOSURE_SOURCE'] == "TONY'S CHOCOLONELY"
df.loc[tonys_mask, 'BUYER'] = 'COCOASOURCE'
df.loc[tonys_mask, 'TRADER_NAME'] = 'COCOASOURCE'
# --- FINAL QA: RE-AGGREGATION ---
# Because you changed Traders/Buyers, you might have created "duplicate" rows
# for what is now the same supply chain. We should group them.
# Define the columns that define a unique flow
group_cols = ['YEAR', 'SUPPLIER_FULLNAME', 'TRADER_NAME', 'BUYER', 'DISTRICT_GEOCODE']
# Sum numeric values (like farmer counts) and keep the first instance of metadata
df_final = df.groupby(group_cols, as_index=False).agg({
'NB_FARMERS_COMPANY_YEAR': 'sum',
'LATITUDE': 'first',
'LONGITUDE': 'first',
'CERTIFIED': 'max', # If any row was True, the merged row is True
'IS_TRADER': 'max'
})
print(f"Cleaned dataset contains {len(df_final)} unique supply chain flows.")
Cleaned dataset contains 6273 unique supply chain flows.
df.to_csv("2024_coops_like_camlong_febfixes.csv", index=False)
df = pd.read_csv("2024_coops_like_camlong_febfixes.csv")
<positron-console-cell-37>:1: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
coopyear = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/CAM_coopyear.csv", sep=';')
<positron-console-cell-38>:1: DtypeWarning: Columns (6,8,9,10,11,19,23,24,27,31,32,34) have mixed types. Specify dtype option on import or set low_memory=False.
len(coopyear.columns)
len(df.columns)
43
len(df.columns)
43
import pandas as pd
# 1. Drop columns from 'df' that are NOT in 'coopyear'
# We use .columns.intersection to ensure we only keep columns that actually exist in both
cols_to_keep = df.columns.intersection(coopyear.columns)
df = df[cols_to_keep]
# 2. Remove duplicates based on 'SUPPLIER_ABRVNAME'
# 'keep=first' is the default, ensuring you keep the first occurrence
df = df.drop_duplicates(subset=['SUPPLIER_ABRVNAME','YEAR'])
# Verify the results
print(f"New column count: {len(df.columns)}")
print(f"New row count: {len(df)}")
New column count: 34
New row count: 23770
df = pd.read_csv("2024_coops_like_camlong_febfixes.csv")
len(df.COOP_ID.unique())
<positron-console-cell-83>:1: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
6339
df.columns
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER', 'DISCLOSURE_SOURCE', 'TRADER_NAME', 'CERTIFICATIONS', 'NUM_FARMERS', 'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS', 'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID', 'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED', 'NB_FARMERS_COMPANY_YEAR', 'NON_TRADER', 'IS_TRADER', '!NOT_RFA', 'unique_rfa_link', 'LVL_4_NAME',
'REPEATED_FROM_PAST_YEAR'],
dtype='object')
# - COOPERATIVE SIMPLIFIEE TIPHERET DE TOULEPLEU is in Toulépleu, Cavally
# Union Inter Régional des coopératives du Grand Ouest avec conseil d`Administration (UJRECGO-COOP-CA), location is correct but not long name, and J needs replaced with I
# UDEKO is in Sinfra
# UPRAD is in SASSANDRA
# - make cam coopyear
# - rerun SEI-PCS
import pandas as pd
import numpy as np
import re
def standardize_coop_data(df):
"""
Cleans cooperative names, standardizes districts, and resolves COOP_IDs
to the earliest assigned ID, maintaining original dataframe structure.
"""
# Create a copy to avoid modifying the original dataframe in place unexpectedly
df_clean = df.copy()
# Save the exact original columns and their order
original_columns = df_clean.columns.tolist()
# --- HELPER FUNCTIONS ---
def get_longest_name(series):
# Filter out NaNs and literal 'NULL' strings
valid_names = series.dropna()[series.dropna().astype(str).str.strip().str.upper() != 'NULL']
if valid_names.empty:
return np.nan
return max(valid_names.astype(str), key=len)
def extract_core_name(fullname):
if pd.isna(fullname) or str(fullname).strip().upper() == 'NULL':
return ""
name = str(fullname).upper()
legal_terms = [
r"COOPERATIVE AVEC CONSEIL D'ADMINISTRATION",
r"SOCIETE COOPERATIVES AGRICOLE DE",
r"SOCIETE COOPERATIVE SIMPLIFIEE",
r"SOCIETE COOPERATIVE",
r"COOP-CA",
r"COOP CA",
r"SCOOPS",
r"COOPERATIVE",
r"COOP\."
]
for term in legal_terms:
name = re.sub(term, "", name)
name = re.sub(r"[^\w\s\d']", " ", name)
return " ".join(name.split())
def get_most_frequent_district(series):
valid_districts = series.dropna()[series.dropna().astype(str).str.upper() != 'NULL']
if valid_districts.empty:
return 'UNKNOWN_DISTRICT'
return valid_districts.mode().iloc[0]
# --- PIPELINE EXECUTION ---
# 1. Clean short name for grouping (temporary column)
df_clean['temp_short_name'] = df_clean['SUPPLIER_ABRVNAME'].astype(str).str.strip().str.upper()
df_clean['temp_short_name'] = df_clean['temp_short_name'].replace({'NULL': np.nan, 'NAN': np.nan})
# 2. Fix Long Names: Broadcast the longest full name based on the short name group
fixed_fullnames = df_clean.groupby('temp_short_name')['SUPPLIER_FULLNAME'].transform(get_longest_name)
# Overwrite original column, keeping original data if the fix was NaN
df_clean['SUPPLIER_FULLNAME'] = fixed_fullnames.fillna(df_clean['SUPPLIER_FULLNAME'])
# 3. Extract Core Name from the newly fixed full names (temporary column)
df_clean['temp_core_name'] = df_clean['SUPPLIER_FULLNAME'].apply(extract_core_name)
# Fill remaining NaNs in short name with empty strings so groupby doesn't drop them
df_clean['temp_short_name'] = df_clean['temp_short_name'].fillna('')
# 4. Standardize Districts: Broadcast the most frequent district per cooperative
best_districts = df_clean.groupby(['temp_short_name', 'temp_core_name'])['DISTRICT_NAME'].transform(get_most_frequent_district)
# Overwrite original column
df_clean['DISTRICT_NAME'] = best_districts.fillna(df_clean['DISTRICT_NAME'])
# 5. Assign Earliest COOP_ID
# Sort logically to ensure the first row of any group is the earliest year
df_sorted = df_clean.sort_values(by=['temp_short_name', 'temp_core_name', 'DISTRICT_NAME', 'YEAR'])
earliest_ids = df_sorted.groupby(['temp_short_name', 'temp_core_name', 'DISTRICT_NAME'])['COOP_ID'].first().reset_index()
earliest_ids.rename(columns={'COOP_ID': 'temp_STD_COOP_ID'}, inplace=True)
# Merge the earliest IDs back into the main working dataframe
df_clean = df_clean.merge(earliest_ids, on=['temp_short_name', 'temp_core_name', 'DISTRICT_NAME'], how='left')
# Overwrite original column
df_clean['COOP_ID'] = df_clean['temp_STD_COOP_ID']
# --- CLEANUP ---
# Strip away all 'temp_' columns and enforce the exact original schema
df_clean = df_clean[original_columns]
return df_clean
# Run the pipeline on your dataframe
df_final = standardize_coop_data(df)
len(df_final.COOP_ID.unique())
4444
df_final
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2009-1 | 448 | 2009 | CAAP | COOPERATIVE ANOUANZE DE PETIT BOUAKE | 5.890 | -6.376 | SOUBRE | CI-2.2.5_1 | CARGILL | ... | False | True | False | 277.0 | False | True | False | False | SOUBRE | False |
| 1 | 2009-2 | 726 | 2009 | CAFHS | COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA | 6.891 | -6.461 | DALOA | CI-9.1.1_1 | CARGILL | ... | False | True | True | 838.0 | False | True | False | False | DALOA | False |
| 2 | 2009-3 | 1734 | 2009 | COOPAGA | COOPERATIVE AGRICOLE GENERALE DE GAGNOA | 5.022 | -6.550 | SAN-PEDRO | CI-2.3.1_1 | CARGILL | ... | False | True | True | 1190.0 | False | True | False | False | SAN-PEDRO | False |
| 3 | 2009-4 | 2906 | 2009 | ECASO | COOPERATIVE DES AGRICULTEURS DU SUD-OUEST | 5.785 | -6.597 | SOUBRE | CI-2.2.5_1 | CARGILL | ... | False | True | True | 735.0 | False | True | False | False | SOUBRE | False |
| 4 | 2009-5 | 4783 | 2009 | SOCAB | COOPERATIVE DES AGRICULTEURS DE BOUAFLE | 6.263 | -6.999 | BUYO | CI-2.2.1_1 | CARGILL | ... | False | False | False | 237.0 | False | True | False | False | BUYO | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35081 | 2024-7421 | 5792 | 2024 | ECAM SASSANDRA | ENTREPRISE COOPERATIVE DES AGRICULTEURS DE MEA... | NaN | NaN | SASSANDRA | CI-2.1.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-2.1.2_1 | False |
| 35082 | 2024-7422 | 6000 | 2024 | SUF | COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... | 6.785 | -7.393 | DUEKOUE | CI-8.2.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.2_1 | False |
| 35083 | 2024-7423 | 6185 | 2024 | TOULEPLEU | SOCIETE COOPERATIVE AVEC CONSEIL DADMINISTRATI... | NaN | NaN | TOULEUPLEU | CI-8.1.4_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.1.4_1 | False |
| 35084 | 2024-7424 | 5789 | 2024 | SANAB | SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... | 4.957 | -6.085 | BANGOLO | CI-8.2.1_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.1_1 | False |
| 35085 | 2024-7425 | 6339 | 2024 | CCAPUO | SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... | 6.747 | -7.362 | DUEKOUE | CI-8.2.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.2_1 | False |
35086 rows × 43 columns
# The na=False prevents errors if there are empty/NaN rows
df_final[df_final['SUPPLIER_ABRVNAME'].str.contains('YAT', na=False)]
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 151 | 2013-24 | 1198 | 2013 | CAYAT | COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... | 6.099 | -3.852 | ADZOPE | CI-7.3.1_1 | CARGILL | ... | False | True | True | 1408.0 | False | True | False | False | ADZOPE | False |
| 248 | 2014-24 | 1198 | 2014 | CAYAT | COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... | 6.099 | -3.852 | ADZOPE | CI-7.3.1_1 | CARGILL | ... | False | True | True | 1408.0 | False | True | False | False | ADZOPE | False |
| 345 | 2015-24 | 1198 | 2015 | CAYAT | COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... | 6.099 | -3.852 | ADZOPE | CI-7.3.1_1 | CARGILL | ... | False | True | True | 1408.0 | False | True | False | False | ADZOPE | False |
| 448 | 2016-27 | 1198 | 2016 | CAYAT | COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... | 6.099 | -3.852 | ADZOPE | CI-7.3.1_1 | CARGILL | ... | False | True | True | 1408.0 | False | True | False | False | ADZOPE | False |
| 566 | 2017-29 | 1198 | 2017 | CAYAT | COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... | 6.099 | -3.852 | ADZOPE | CI-7.3.1_1 | CARGILL | ... | False | True | True | 1408.0 | False | True | False | False | ADZOPE | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 34047 | 2024-6387 | 5727 | 2024 | YAT | SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU | 6.220 | -3.575 | YAKASSE-ATTOBROU | CI-7.1.4_1 | SUCDEN | ... | False | False | False | 1297.0 | NaN | False | False | False | CI-7.1.4_1 | False |
| 34048 | 2024-6388 | 5727 | 2024 | YAT | SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU | NaN | NaN | YAKASSE-ATTOBROU | NaN | SUCDEN | ... | False | False | False | NaN | False | True | False | False | NaN | True |
| 34049 | 2024-6389 | 5727 | 2024 | YAT | SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU | NaN | NaN | YAKASSE-ATTOBROU | NaN | OLAM | ... | False | False | False | NaN | False | True | False | False | NaN | True |
| 34991 | 2024-7331 | 6274 | 2024 | SSYAT | SCOOPSYAT | 6.186 | -3.660 | YAKASSE-ATTOBROU | CI-7.3.4_1 | CARGILL | ... | False | False | True | NaN | NaN | False | False | False | CI-7.3.4_1 | False |
| 34992 | 2024-7332 | 6274 | 2024 | SSYAT | SCOOPSYAT | 6.186 | -3.660 | YAKASSE-ATTOBROU | CI-7.3.4_1 | MARS | ... | False | False | True | NaN | NaN | False | False | False | CI-7.3.4_1 | False |
68 rows × 43 columns
import numpy as np
# List all the columns you want to be identical for the same COOP_ID
columns_to_sync = [
'SUPPLIER_ABRVNAME',
'SUPPLIER_FULLNAME',
'LATITUDE',
'LONGITUDE',
'DISTRICT_NAME',
'DISTRICT_GEOCODE'
]
def get_best_value(series):
"""Finds the most frequent non-null value in a series."""
# Filter out actual NaNs and literal 'NULL' strings
valid_data = series.dropna()[series.dropna().astype(str).str.strip().str.upper() != 'NULL']
if valid_data.empty:
return np.nan
# Return the mode (most frequent value). If there's a tie, it picks the first one.
return valid_data.mode().iloc[0]
# Apply the synchronization
for col in columns_to_sync:
df_final[col] = df_final.groupby('COOP_ID')[col].transform(get_best_value)
# Visualize your fix
columns_to_view = ['COOP_ID', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME']
print(df_final[df_final['COOP_ID'] == 5727][columns_to_view].head())
COOP_ID SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME
5663 5727 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU 6.22 -3.575 YAKASSE-ATTOBROU
5664 5727 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU 6.22 -3.575 YAKASSE-ATTOBROU
5665 5727 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU 6.22 -3.575 YAKASSE-ATTOBROU
5666 5727 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU 6.22 -3.575 YAKASSE-ATTOBROU
10542 5727 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU 6.22 -3.575 YAKASSE-ATTOBROU
# The na=False prevents errors if there are empty/NaN rows
df_final[df_final['SUPPLIER_ABRVNAME']=='WEBE']
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5646 | 2019-4844 | 5701 | 2019 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | True | True | True | NaN | False | False | NaN | NaN | ABIDJAN | False |
| 10525 | 2020-4825 | 5701 | 2020 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 15502 | 2021-4923 | 5701 | 2021 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 21611 | 2022-6055 | 5701 | 2022 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 21612 | 2022-6056 | 5701 | 2022 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | False | False | True | NaN | False | False | True | True | NaN | False |
| 27595 | 2023-5922 | 5701 | 2023 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 27596 | 2023-5923 | 5701 | 2023 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 34018 | 2024-6358 | 5701 | 2024 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | OFI | ... | False | False | False | NaN | NaN | False | False | False | CI-2.2.5_1 | False |
| 34019 | 2024-6359 | 5701 | 2024 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | SUCDEN | ... | False | False | False | NaN | NaN | False | False | False | CI-2.2.5_1 | False |
| 34020 | 2024-6360 | 5701 | 2024 | WEBE | SCOOPS WEBE | 5.381 | -3.986 | SOUBRE | CI-1.1.1_1 | NaN | ... | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
10 rows × 43 columns
import pandas as pd
import numpy as np
import re
# 1. Create a mapping dictionary from your geodataframe
# This maps the location name (e.g., "OUANINOU") to its code (e.g., "CI-12.1.2_1")
location_map = dict(zip(gdf['LVL_4_NAME'].str.upper(), gdf['LVL_4_CODE']))
# Sort the location names by length, longest first.
sorted_locations = sorted([loc for loc in location_map.keys() if pd.notna(loc)], key=len, reverse=True)
def extract_geodata_from_name(longname):
"""Searches a string for known Ivorian district names and returns the name and code."""
if pd.isna(longname) or str(longname).strip().upper() == 'NULL':
return np.nan, np.nan
name_upper = str(longname).upper()
for loc in sorted_locations:
# Use regex word boundaries (\b) for exact word matches
pattern = rf"\b{re.escape(str(loc))}\b"
if re.search(pattern, name_upper):
return loc, location_map[loc]
# Return NaNs if no district name is found in the long name
return np.nan, np.nan
# 2. Apply the extraction function to ALL rows
print(f"Scanning all {len(df_final)} rows for geographic data in the long names...")
# Create a temporary DataFrame to hold the extracted results
extracted_df = df_final['SUPPLIER_FULLNAME'].apply(
lambda x: pd.Series(extract_geodata_from_name(x), index=['EXTRACTED_NAME', 'EXTRACTED_CODE'])
)
# 3. Find exactly which rows had a successful match
found_mask = extracted_df['EXTRACTED_NAME'].notna()
# 4. Overwrite the existing columns ONLY where a match was found
# If found_mask is False for a row, pandas leaves the original value untouched
df_final.loc[found_mask, 'DISTRICT_NAME'] = extracted_df.loc[found_mask, 'EXTRACTED_NAME']
df_final.loc[found_mask, 'DISTRICT_GEOCODE'] = extracted_df.loc[found_mask, 'EXTRACTED_CODE']
print(f"Successfully updated geodata for {found_mask.sum()} rows!")
Scanning all 35086 rows for geographic data in the long names...
Successfully updated geodata for 10775 rows!
df_final.loc[fixed_indices]
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1339 | 2019-537 | 603 | 2019 | CADB ISSIA | CADB ISSIA | NaN | NaN | ISSIA | CI-9.1.2_1 | NaN | ... | False | True | True | NaN | False | False | NaN | NaN | NaN | False |
| 1433 | 2019-631 | 710 | 2019 | CAETD | COOPERATIVE AGRICOLE ELIE TIKPA DE DALOA | NaN | NaN | DALOA | CI-9.1.1_1 | NaN | ... | True | True | True | NaN | False | False | NaN | NaN | NaN | False |
| 1535 | 2019-733 | 823 | 2019 | CAKS | COOPERATIVE AGRICOLE KOUADO DE SIPILOU | NaN | NaN | SIPILOU | CI-8.3.4_1 | NaN | ... | True | True | True | NaN | False | False | NaN | NaN | NaN | False |
| 1618 | 2019-816 | 915 | 2019 | CANOVA | COOPERATIVE AGRICOLE NON-TABA DE VAVOUA | NaN | NaN | VAVOUA | CI-9.1.3_1 | NaN | ... | False | True | True | NaN | False | False | NaN | NaN | NaN | False |
| 1661 | 2019-859 | 969 | 2019 | CAPEN DIVO | COOPERATIVE AGRICOLE N'ZASSA DE TIASSALE | NaN | NaN | TIASSALE | CI-7.1.4_1 | NaN | ... | True | True | True | NaN | False | False | NaN | NaN | NaN | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 33963 | 2024-6303 | 5653 | 2024 | UNION DES PRODUCTEURS CAFE ET CACAO DE GAGNOA | UNION DES PRODUCTEURS CAFE ET CACAO DE GAGNOA | NaN | NaN | GAGNOA | CI-5.1.1_1 | NaN | ... | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 33967 | 2024-6307 | 5657 | 2024 | UPAD | UNION DES PRODUCTEURS AGRICOLES DE DANANE | NaN | NaN | DANANE | CI-8.3.2_1 | NaN | ... | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 33973 | 2024-6313 | 5661 | 2024 | UPB BANGOLO | UPB BANGOLO | NaN | NaN | BANGOLO | CI-8.2.1_1 | NaN | ... | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 34035 | 2024-6375 | 5716 | 2024 | WENDSONGDO | COOPERATIVE SIMPLIFIEE WENDSONGDO DE TAI | NaN | NaN | TAI | CI-8.1.3_1 | NaN | ... | False | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
| 35021 | 2024-7361 | 6293 | 2024 | CABG /MANKONO | CABG COOP-CA/MANKONO | NaN | NaN | MANKONO | CI-12.2.3_1 | SUCDEN | ... | False | False | False | NaN | NaN | True | False | False | NaN | False |
500 rows × 43 columns
df_final.to_csv('/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_camlong_marchfixes.csv', index=False)
df_long = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_camlong_marchfixes.csv")
df_coopyear = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_cam_coopyear_febfixes.csv")
<positron-console-cell-99>:1: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
<positron-console-cell-99>:2: DtypeWarning: Columns (26,27) have mixed types. Specify dtype option on import or set low_memory=False.
df_coopyear
| COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | CERTIFICATIONS | TOTAL_FARMERS_NONTRADER | ... | SIMPLIF_ABRVNAME | CCTN_COOP_ID | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 448 | 2009 | CAAP | CAAP | 5.890 | -6.376 | SOUBRE | CI-2.2.5_1 | NaN | 0.0 | ... | CAAP | CAAP_-6.4_5.9_CAAP | False | True | False | 277.0 | False | False | SOUBRE | False |
| 1 | 726 | 2009 | CAFHS | COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA | 6.891 | -6.461 | DALOA | CI-9.1.1_1 | CARGILL COCOA PROMISE + RAINFOREST ALLIANCE | 0.0 | ... | CAFHS | CAFHS_-6.5_6.9_COOPERATIVE AGRICOLE FIEDIFOUE ... | False | True | True | 838.0 | False | False | DALOA | False |
| 2 | 1734 | 2009 | COOPAGA | COOPERATIVE AGRICOLE DE GABIADJI | 5.022 | -6.550 | SAN-PEDRO | CI-2.3.1_1 | CARGILL COCOA PROMISE + RAINFOREST ALLIANCE | 0.0 | ... | COOPAGA | COOPAGA_-6.6_5_COOPERATIVE AGRICOLE DE GABIADJI | False | True | True | 1190.0 | False | False | SAN-PEDRO | False |
| 3 | 2906 | 2009 | ECASO | COOPERATIVE DES AGRICULTEURS DU SUD-OUEST | 5.785 | -6.597 | SOUBRE | CI-2.2.5_1 | CARGILL COCOA PROMISE + RAINFOREST ALLIANCE | 0.0 | ... | ECASO | ECASO_-6.6_5.8_COOPERATIVE DES AGRICULTEURS DU... | False | True | True | 735.0 | False | False | SOUBRE | False |
| 4 | 4783 | 2009 | SOCAB | COOPERATIVE DE LA BIA | 6.263 | -6.999 | BUYO | CI-2.2.1_1 | NaN | 0.0 | ... | SOCAB | SOCAB_-7_6.3_COOPERATIVE DE LA BIA | False | False | False | 237.0 | False | False | BUYO | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23765 | 6329 | 2024 | ANOU | COOPERATIVE AGRICOLE DE COOPANOU | NaN | NaN | TABOU | CI-2.3.2_1 | NaN | NaN | ... | COOPANOU | COOPANOU_NA_NA_COOPERATIVE AGRICOLE DE COOPANOU | False | True | False | NaN | False | False | CI-2.3.2_1 | False |
| 23766 | 6330 | 2024 | SCAETHS | SOCIETE COOPERATIVE AGRICOLE ELIE TIKPA DE DALOA | 6.874 | -6.458 | DALOA | CI-9.1.1_1 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-9.1.1_1 | False |
| 23767 | 6331 | 2024 | SS_DAKUA | SCOOP-DAKUA | NaN | NaN | GUIGLO | CI-8.1.2_1 | NaN | NaN | ... | DAKUA | DAKUA_NA_NA_SCOOP-DAKUA | False | False | False | NaN | False | False | CI-8.1.2_1 | False |
| 23768 | 6332 | 2024 | SORANHS | SOCIETE COOPERATIVE DES PRODUCTEURS AGRICOLES ... | 6.880 | -6.448 | DALOA | CI-9.1.1_1 | NaN | NaN | ... | SOCOOPRANHS | SOCOOPRANHS_NA_NA_COOPERATIVE DES PRODUCTEURS ... | False | False | False | NaN | False | False | CI-9.1.1_1 | False |
| 23769 | 6339 | 2024 | CCAPUO | SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... | 6.747 | -7.362 | DUEKOUE | CI-8.2.2_1 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CI-8.2.2_1 | False |
23770 rows × 34 columns
df_long
| FLOW_ID | COOP_ID | YEAR | SUPPLIER_ABRVNAME | SUPPLIER_FULLNAME | LATITUDE | LONGITUDE | DISTRICT_NAME | DISTRICT_GEOCODE | BUYER | ... | IS_ALL_CAM_V3 | IS_ANY_CAM_V3 | CERTIFIED | NB_FARMERS_COMPANY_YEAR | NON_TRADER | IS_TRADER | !NOT_RFA | unique_rfa_link | LVL_4_NAME | REPEATED_FROM_PAST_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2009-1 | 448 | 2009 | CAAP | COOPERATIVE ANOUANZE DE PETIT BOUAKE | 5.890 | -6.376 | BOUAKE | CI-11.1.3_1 | CARGILL | ... | False | True | False | 277.0 | False | True | False | False | SOUBRE | False |
| 1 | 2009-2 | 726 | 2009 | CAFHS | COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA | 6.891 | -6.461 | SASSANDRA | CI-2.1.2_1 | CARGILL | ... | False | True | True | 838.0 | False | True | False | False | DALOA | False |
| 2 | 2009-3 | 1734 | 2009 | COOPAGA | COOPERATIVE AGRICOLE GENERALE DE GAGNOA | 5.022 | -6.550 | GAGNOA | CI-5.1.1_1 | CARGILL | ... | False | True | True | 1190.0 | False | True | False | False | SAN-PEDRO | False |
| 3 | 2009-4 | 2906 | 2009 | ECASO | COOPERATIVE DES AGRICULTEURS DU SUD-OUEST | 5.785 | -6.597 | SOUBRE | CI-2.2.5_1 | CARGILL | ... | False | True | True | 735.0 | False | True | False | False | SOUBRE | False |
| 4 | 2009-5 | 4783 | 2009 | SOCAB | COOPERATIVE DES AGRICULTEURS DE BOUAFLE | 6.263 | -6.999 | BOUAFLE | CI-9.2.1_1 | CARGILL | ... | False | False | False | 237.0 | False | True | False | False | BUYO | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35081 | 2024-7421 | 5792 | 2024 | ECAM SASSANDRA | ENTREPRISE COOPERATIVE DES AGRICULTEURS DE MEA... | NaN | NaN | MEAGUI | CI-2.2.3_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-2.1.2_1 | False |
| 35082 | 2024-7422 | 6000 | 2024 | SUF | COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... | 6.739 | -7.353 | DUEKOUE | CI-8.2.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.2_1 | False |
| 35083 | 2024-7423 | 6185 | 2024 | TOULEPLEU | SOCIETE COOPERATIVE AVEC CONSEIL DADMINISTRATI... | 6.492 | -6.585 | TOULEUPLEU | CI-8.1.4_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.1.4_1 | False |
| 35084 | 2024-7424 | 5789 | 2024 | SANAB | SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... | 4.957 | -6.085 | BANGOLO | CI-8.2.1_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.1_1 | False |
| 35085 | 2024-7425 | 6339 | 2024 | CCAPUO | SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... | 6.747 | -7.362 | DUEKOUE | CI-8.2.2_1 | ECOM | ... | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | CI-8.2.2_1 | False |
35086 rows × 43 columns
# 1. Define the exact list of columns you requested
target_columns = [
'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LATITUDE',
'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'CERTIFICATIONS',
'TOTAL_FARMERS_NONTRADER', 'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA',
'TOTAL_FARMERS', 'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME',
'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE',
'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT',
'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME',
'CCTN_COOP_ID', 'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED',
'NB_FARMERS_COMPANY_YEAR', '!NOT_RFA', 'unique_rfa_link', 'LVL_4_NAME',
'REPEATED_FROM_PAST_YEAR'
]
# 2. Create a new DataFrame with only those columns
# We use .copy() to ensure it's a completely independent dataframe, preventing warnings later
df_yearly = df_final[target_columns].copy()
# 3. Keep only the unique combinations of COOP_ID and YEAR
# 'keep="first"' tells pandas to retain the first occurrence it finds and drop the rest
df_yearly = df_yearly.drop_duplicates(subset=['COOP_ID', 'YEAR'], keep='first')
# 4. Sort it so it's clean and easy to read
df_yearly = df_yearly.sort_values(by=['COOP_ID', 'YEAR']).reset_index(drop=True)
# Verify the shape to see how many unique coop/year combinations you have
print(f"Created df_yearly with {df_yearly.shape[0]} rows and {df_yearly.shape[1]} columns.")
Created df_yearly with 23819 rows and 34 columns.
df_yearly.to_csv("/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_camcoopyear_marchfixes.csv")