Split flows

View or edit on GitHub

This page is synchronized from trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/split_flows.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import pandas as pd
import geopandas as gpd
import os

os.chdir('/Users/niamhfrench/repos/TRASE/trase/')
print(os.getcwd())

df = pd.read_csv('data/cote_divoire/cocoa/logistics/q4_2025/clean/2024_civ_coops_like_cam_long_fixes_26jan.csv', sep=';', low_memory=False)
/Users/niamhfrench/repos/TRASE/trase
# Create a mask for rows where any of the specified columns contain '+'
cols_to_check = ['BUYER', 'DISCLOSURE_SOURCE', 'TRADER_NAME']
mask = df[cols_to_check].stack().str.contains('\+', regex=True).unstack().any(axis=1)

# Or the more readable, standard approach:
mask = df[cols_to_check].apply(lambda x: x.str.contains('\+', na=False)).any(axis=1)

# Display the filtered dataframe
df_with_plus = df[mask]
df_with_plus[['BUYER', 'DISCLOSURE_SOURCE', 'TRADER_NAME', 'YEAR']]
BUYER DISCLOSURE_SOURCE TRADER_NAME YEAR
27710 BARRY CALLEBAUT BARRY CALLEBAUT + HERSHEY BARRY CALLEBAUT 2024
27713 BARRY CALLEBAUT BARRY CALLEBAUT + HERSHEY BARRY CALLEBAUT 2024
27715 BARRY CALLEBAUT OLAM + MONDELEZ BARRY CALLEBAUT 2024
27718 BARRY CALLEBAUT BARRY CALLEBAUT + SUCDEN BARRY CALLEBAUT 2024
27721 BARRY CALLEBAUT BARRY CALLEBAUT + MONDELEZ BARRY CALLEBAUT 2024
... ... ... ... ...
29164 ECOM ECOM + PURATOS ECOM 2024
29167 ECOM BARRY CALLEBAUT + FERRERO + HERSHEY ECOM 2024
29170 ECOM ECOM + MONDELEZ ECOM 2024
29174 ECOM CARGILL + MARS ECOM 2024
29176 ECOM ECOM + FILDISI ECOM 2024

211 rows × 4 columns

import pandas as pd
import numpy as np
import unicodedata

def clean_text(s):
    """Standardizes text: removes accents, uppercases, and trims whitespace."""
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKD', str(s)).encode('ASCII', 'ignore').decode('ASCII')
    return s.strip().upper()

def unroll_connections(df):
    """
    Expands rows where DISCLOSURE_SOURCE, BUYER, or TRADER_NAME contain multiple 
    entities separated by '+'. Creates a unique row for each distinct entity.
    """
    mask_2024 = df['YEAR'] == 2024
    df_pre = df[~mask_2024].copy()
    df_2024 = df[mask_2024].copy()

    unrolled_rows = []
    for _, row in df_2024.iterrows():
        # Split entities by '+' and clean them
        sources = [x.strip() for x in str(row['DISCLOSURE_SOURCE']).split('+')] if pd.notna(row['DISCLOSURE_SOURCE']) else []
        buyers = [x.strip() for x in str(row['BUYER']).split('+')] if pd.notna(row['BUYER']) else []
        traders = [x.strip() for x in str(row['TRADER_NAME']).split('+')] if pd.notna(row['TRADER_NAME']) else []

        # Take the union of all mentioned entities to find all connections
        entities = set()
        for e in sources + buyers + traders:
            if e and e.upper() not in ['NAN', 'NONE', '']:
                entities.add(e)

        if len(entities) > 1:
            for entity in sorted(list(entities)):
                new_row = row.copy()
                new_row['DISCLOSURE_SOURCE'] = entity
                new_row['BUYER'] = entity
                new_row['TRADER_NAME'] = entity
                unrolled_rows.append(new_row)
        else:
            # For single-entity rows, ensure they are also cleaned
            if entities:
                ent = list(entities)[0]
                row['DISCLOSURE_SOURCE'], row['BUYER'], row['TRADER_NAME'] = ent, ent, ent
            unrolled_rows.append(row)

    df_2024_unrolled = pd.DataFrame(unrolled_rows)
    return pd.concat([df_pre, df_2024_unrolled], ignore_index=True)

# 1. Unroll aggregated connections (Specific to 2024 request)
df = unroll_connections(df)

# 2. Standardization & Formatting
text_cols = ['SUPPLIER_FULLNAME', 'SUPPLIER_ABRVNAME', 'DISTRICT_NAME', 'BUYER', 'TRADER_NAME', 'DISCLOSURE_SOURCE']
for col in text_cols:
    df[col] = df[col].apply(clean_text)

# 3. Validation: Recalculate Farmer Totals
df['TOTAL_FARMERS_NONTRADER'] = pd.to_numeric(df['TOTAL_FARMERS_NONTRADER'], errors='coerce').fillna(np.nan)
df['TOTAL_FARMERS_TRADER'] = pd.to_numeric(df['TOTAL_FARMERS_TRADER'], errors='coerce').fillna(np.nan)
calc_total = df['TOTAL_FARMERS_NONTRADER'] + df['TOTAL_FARMERS_TRADER']
mask = (df['TOTAL_FARMERS'].isna()) | (df['TOTAL_FARMERS'] == 0) | (abs(df['TOTAL_FARMERS'] - calc_total) > 1)
df.loc[mask, 'TOTAL_FARMERS'] = calc_total[mask]

# 4. Deduplication (Permitting multiple buyers per coop per year)
# Rank by data completeness, but keep distinct coop-to-buyer connections
df['completeness_score'] = (
    df['LATITUDE'].notna().astype(int) + 
    df['NUM_FARMERS'].notna().astype(int) + 
    df['TOTAL_FARMERS'].notna().astype(int) +
    df['DISTRICT_NAME'].notna().astype(int)
)
df = df.sort_values(['COOP_ID', 'YEAR', 'BUYER', 'completeness_score'], ascending=[True, True, True, False])
df = df.drop_duplicates(subset=['COOP_ID', 'YEAR', 'BUYER', 'TRADER_NAME'], keep='first')

# 5. Geodata Propagation: Fill missing coords/districts using historical data for the same coop
def fill_missing_geo(group):
    group = group.sort_values('YEAR', ascending=False)
    if group['LATITUDE'].isna().any():
        valid = group.dropna(subset=['LATITUDE'])
        if not valid.empty:
            group['LATITUDE'] = group['LATITUDE'].fillna(valid.iloc[0]['LATITUDE'])
            group['LONGITUDE'] = group['LONGITUDE'].fillna(valid.iloc[0]['LONGITUDE'])
    if group['DISTRICT_NAME'].isna().any():
        valid = group.dropna(subset=['DISTRICT_NAME'])
        if not valid.empty:
            group['DISTRICT_NAME'] = group['DISTRICT_NAME'].fillna(valid.iloc[0]['DISTRICT_NAME'])
    return group

df = df.groupby('COOP_ID', group_keys=False).apply(fill_missing_geo)

# 6. Final FLOW_ID Assignment (Guaranteeing uniqueness)
df = df.sort_values(['YEAR', 'COOP_ID', 'BUYER'])
df['FLOW_ID'] = df['YEAR'].astype(str) + '-' + df.groupby('YEAR').cumcount().add(1).astype(str)

# Save result
df.drop(columns=['completeness_score']).to_csv('cleaned_coop_dataset_final.csv', index=False, sep=';')
<positron-console-cell-24>:90: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
import pandas as pd
import numpy as np

# Load the cleaned dataset
df = pd.read_csv('cleaned_coop_dataset_final.csv', sep=';')

# Define all columns containing farmer population/count data
farmer_cols = [
    'NUM_FARMERS', 
    'NUM_FARMERS_EXTRAPOLATED', 
    'TOTAL_FARMERS_NONTRADER', 
    'TOTAL_FARMERS_TRADER', 
    'TOTAL_FARMERS_RFA', 
    'TOTAL_FARMERS', 
    'DISCL_NUMBER_FARMERS', 
    'NB_FARMERS_COMPANY_YEAR'
]

# 1. Remove farmer values for the 2024 data
# We set them to NaN so we can fill them using historical data
df.loc[df['YEAR'] == 2024, farmer_cols] = np.nan

# 2. Sort the data by COOP_ID and YEAR
# This ensures that for every coop, records are in chronological order
df = df.sort_values(['COOP_ID', 'YEAR'])

# 3. Propagate historical values forward
# Within each cooperative group, we carry the last known farmer values 
# forward into the empty 2024 slots.
df[farmer_cols] = df.groupby('COOP_ID')[farmer_cols].ffill()

# 4. Final Sorting and Export
# Return to the standard display order
df = df.sort_values(['YEAR', 'COOP_ID', 'BUYER'])

# Save the updated dataset
df.to_csv('cleaned_coop_dataset_with_historical_farmers.csv', index=False, sep=';')

print("Process Complete.")
print(f"Filled 2024 rows: {df[df['YEAR'] == 2024]['TOTAL_FARMERS'].notna().sum()} / {len(df[df['YEAR'] == 2024])}")
<positron-console-cell-1>:5: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.


Process Complete.
Filled 2024 rows: 6429 / 7425
df.drop_duplicates(inplace=True)
df.columns
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME',
       'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER',
       'DISCLOSURE_SOURCE', 'TRADER_NAME', 'CERTIFICATIONS', 'NUM_FARMERS',
       'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
       'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS',
       'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME',
       'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE',
       'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
       'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME',
       'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID',
       'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED',
       'NB_FARMERS_COMPANY_YEAR', 'NON_TRADER', 'IS_TRADER', '!NOT_RFA',
       'unique_rfa_link', 'LVL_4_NAME', 'REPEATED_FROM_PAST_YEAR'],
      dtype='object')
import pandas as pd
import numpy as np

def qa_supply_chain_dataset(df):
    results = {}

    # --- 1. Unique Supply Chain Flow Check ---
    # Requirement: Each row should show a unique supply chain between coop, trader, and buyer in a given year.
    # Note: I'm using 'SUPPLIER_FULLNAME' and 'BUYER' as proxies for the chain actors.
    flow_cols = ['YEAR', 'SUPPLIER_FULLNAME', 'TRADER_NAME','BUYER']
    duplicates = df.duplicated(subset=flow_cols, keep=False)
    results['duplicate_flows_count'] = duplicates.sum()
    results['duplicate_flows_data'] = df[duplicates].sort_values(by=flow_cols)

    # --- 2. Geographic Consistency (Cooperative Metadata) ---
    # Requirement: Same name should have same coordinates and district geocodes.
    geo_cols = ['LATITUDE', 'LONGITUDE', 'DISTRICT_GEOCODE']

    # We group by Supplier Name and check how many unique coordinate/geocode sets exist for each
    geo_consistency = df.groupby('SUPPLIER_FULLNAME')[geo_cols].nunique()
    inconsistent_geos = geo_consistency[(geo_consistency > 1).any(axis=1)]
    results['inconsistent_geos_count'] = len(inconsistent_geos)
    results['inconsistent_geos_data'] = inconsistent_geos

    # --- 3. Coordinate Sanity Check ---
    # Checks if Lat/Long are within realistic bounds (assuming West Africa/Ivory Coast based on names)
    # Adjust bounds if your data covers a different region.
    lat_bounds = (4, 11)   # Example for Ivory Coast
    lon_bounds = (-9, -2)

    invalid_coords = df[
        (df['LATITUDE'].notna()) & 
        ((df['LATITUDE'] < lat_bounds[0]) | (df['LATITUDE'] > lat_bounds[1]) |
         (df['LONGITUDE'] < lon_bounds[0]) | (df['LONGITUDE'] > lon_bounds[1]))
    ]
    results['invalid_coords_count'] = len(invalid_coords)

    # --- 4. Logic Check: Trader Status ---
    # Check if IS_TRADER and NON_TRADER columns contradict each other
    logic_conflict = df[(df['IS_TRADER'] == True) & (df['NON_TRADER'] == True)]
    results['trader_logic_conflicts'] = len(logic_conflict)

    # --- 5. Missing Data (Critical Columns) ---
    critical_cols = ['YEAR', 'SUPPLIER_FULLNAME', 'BUYER', 'DISTRICT_GEOCODE']
    results['missing_values'] = df[critical_cols].isnull().sum()

    # --- 6. Farmer Count vs. Boolean Flags ---
    # If there are farmers assigned, the coop should likely be marked as IS_TRADER or similar
    anomaly_farmers = df[(df['NB_FARMERS_COMPANY_YEAR'] > 0) & (df['SUPPLIER_FULLNAME'].isna())]
    results['farmers_no_supplier_count'] = len(anomaly_farmers)

    return results

# Usage:
# qa_report = qa_supply_chain_dataset(your_dataframe)

# Quick Summary Printout
# print(f"Duplicate Flows: {qa_report['duplicate_flows_count']}")
# print(f"Inconsistent Geographics for Same Coop: {qa_report['inconsistent_geos_count']}")
qa_supply_chain_dataset(df)
{'duplicate_flows_count': 5389,
 'duplicate_flows_data':          FLOW_ID  COOP_ID  YEAR SUPPLIER_ABRVNAME  \
 76       2012-25     1598  2012      COOPAAPROMAN   
 80       2012-29     1835  2012           COOPAMA   
 158      2013-31     1598  2013      COOPAAPROMAN   
 163      2013-36     1835  2013           COOPAMA   
 255      2014-31     1598  2014      COOPAAPROMAN   
 ...          ...      ...   ...               ...   
 33751  2024-6091     5463  2024           SOCOPLA   
 33829  2024-6169     5533  2024         SONGTAABA   
 33905  2024-6245     5601  2024       TEEGA WENDE   
 33948  2024-6288     5638  2024              UDAN   
 34071  2024-6411     5749  2024           YEYASSO

                                        SUPPLIER_FULLNAME  LATITUDE  LONGITUDE  \
 76     COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ...     6.843     -3.291   
 80     COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ...     6.843     -3.284   
 158    COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ...     6.843     -3.291   
 163    COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ...     6.843     -3.284   
 255    COOPERATIVE AGRICOLE ANOUANZE DES PRODUCTEURS ...     6.843     -3.291   
 ...                                                  ...       ...        ...   
 33751                                                NaN       NaN        NaN   
 33829                                                NaN       NaN        NaN   
 33905                                                NaN       NaN        NaN   
 33948                                                NaN       NaN        NaN   
 34071                                                NaN       NaN        NaN

       DISTRICT_NAME DISTRICT_GEOCODE    BUYER  ... IS_ALL_CAM_V3  \
 76      AGNIBILEKRO       CI-3.1.2_1  CARGILL  ...         False   
 80      AGNIBILEKRO       CI-3.1.2_1  CARGILL  ...         False   
 158     AGNIBILEKRO       CI-3.1.2_1  CARGILL  ...         False   
 163     AGNIBILEKRO       CI-3.1.2_1  CARGILL  ...         False   
 255     AGNIBILEKRO       CI-3.1.2_1  CARGILL  ...         False   
 ...             ...              ...      ...  ...           ...   
 33751           NaN              NaN      NaN  ...         False   
 33829           NaN              NaN      NaN  ...         False   
 33905           NaN              NaN      NaN  ...         False   
 33948           NaN              NaN      NaN  ...         False   
 34071           NaN              NaN      NaN  ...         False

       IS_ANY_CAM_V3 CERTIFIED  NB_FARMERS_COMPANY_YEAR  NON_TRADER  IS_TRADER  \
 76             True      True                    612.0       False       True   
 80             True      True                    467.0       False       True   
 158            True      True                    612.0       False       True   
 163            True      True                    467.0       False       True   
 255            True      True                    612.0       False       True   
 ...             ...       ...                      ...         ...        ...   
 33751         False       NaN                      NaN         NaN        NaN   
 33829         False       NaN                      NaN         NaN        NaN   
 33905         False       NaN                      NaN         NaN        NaN   
 33948         False       NaN                      NaN         NaN        NaN   
 34071         False       NaN                      NaN         NaN        NaN

        !NOT_RFA  unique_rfa_link   LVL_4_NAME REPEATED_FROM_PAST_YEAR  
 76        False            False  AGNIBILEKRO                   False  
 80        False            False  AGNIBILEKRO                   False  
 158       False            False  AGNIBILEKRO                   False  
 163       False            False  AGNIBILEKRO                   False  
 255       False            False  AGNIBILEKRO                   False  
 ...         ...              ...          ...                     ...  
 33751       NaN              NaN          NaN                    True  
 33829       NaN              NaN          NaN                    True  
 33905       NaN              NaN          NaN                    True  
 33948       NaN              NaN          NaN                    True  
 34071       NaN              NaN          NaN                    True

 [5389 rows x 43 columns],
 'inconsistent_geos_count': 539,
 'inconsistent_geos_data':                                                     LATITUDE  LONGITUDE  \
 SUPPLIER_FULLNAME                                                         
 AHONDJO                                                    2          2   
 ANO                                                        2          2   
 ANOUANZE                                                   2          2   
 BECIDA                                                     1          1   
 BENKADI                                                    5          5   
 ...                                                      ...        ...   
 UNION DES PRODUCTEURS AGRICOLES DE DJAKOUAKOUKRO           2          2   
 UNION DES SOCIETES COOPERATIVE DE LA SOUS-PREFE...         2          2   
 UNION DES SOCIETES COOPERATIVES DU DEPARTMENT D...         2          2   
 UNITE COOPERATIVE AGRICOLE DE DANANE                       2          2   
 UPAS                                                       1          1

                                                     DISTRICT_GEOCODE  
 SUPPLIER_FULLNAME                                                     
 AHONDJO                                                            1  
 ANO                                                                2  
 ANOUANZE                                                           2  
 BECIDA                                                             2  
 BENKADI                                                            4  
 ...                                                              ...  
 UNION DES PRODUCTEURS AGRICOLES DE DJAKOUAKOUKRO                   2  
 UNION DES SOCIETES COOPERATIVE DE LA SOUS-PREFE...                 1  
 UNION DES SOCIETES COOPERATIVES DU DEPARTMENT D...                 1  
 UNITE COOPERATIVE AGRICOLE DE DANANE                               1  
 UPAS                                                               2

 [539 rows x 3 columns],
 'invalid_coords_count': 0,
 'trader_logic_conflicts': 0,
 'missing_values': YEAR                     0
 SUPPLIER_FULLNAME      388
 BUYER                25205
 DISTRICT_GEOCODE      3931
 dtype: int64,
 'farmers_no_supplier_count': 40}
import numpy as np

# 1. If TRADER_NAME is a manufacturer, make it NA
# Note: Ensure the column name matches your dataframe (usually 'TRADER' or 'TRADER_NAME')
manufacturers = [
    'FILDISI', 'NESTLE', 'FERRERO', 'ALFRED RITTER',
    'MONDELEZ', 'MARS', "TONY'S CHOCOLONELY", 'HERSHEY',
    'RAINFOREST ALLIANCE', 'ALTER ECO', 'ALBERT HEIJN'
]

# Using .loc for safe assignment
df.loc[df['TRADER_NAME'].isin(manufacturers), 'TRADER_NAME'] = np.nan


# 2. If disclosure source is TONY'S CHOCOLONELY, Buyer and Trader is COCOASOURCE
# Assuming 'DISCLOSURE_SOURCE' is the column name for the source
tonys_mask = df['DISCLOSURE_SOURCE'] == "TONY'S CHOCOLONELY"

df.loc[tonys_mask, 'BUYER'] = 'COCOASOURCE'
df.loc[tonys_mask, 'TRADER_NAME'] = 'COCOASOURCE'


# --- FINAL QA: RE-AGGREGATION ---
# Because you changed Traders/Buyers, you might have created "duplicate" rows 
# for what is now the same supply chain. We should group them.

# Define the columns that define a unique flow
group_cols = ['YEAR', 'SUPPLIER_FULLNAME', 'TRADER_NAME', 'BUYER', 'DISTRICT_GEOCODE']

# Sum numeric values (like farmer counts) and keep the first instance of metadata
df_final = df.groupby(group_cols, as_index=False).agg({
    'NB_FARMERS_COMPANY_YEAR': 'sum',
    'LATITUDE': 'first',
    'LONGITUDE': 'first',
    'CERTIFIED': 'max', # If any row was True, the merged row is True
    'IS_TRADER': 'max'
})

print(f"Cleaned dataset contains {len(df_final)} unique supply chain flows.")
Cleaned dataset contains 6273 unique supply chain flows.
df.to_csv("2024_coops_like_camlong_febfixes.csv", index=False)
df = pd.read_csv("2024_coops_like_camlong_febfixes.csv")
<positron-console-cell-37>:1: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
coopyear = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/logistics/q4_2025/clean/CAM_coopyear.csv", sep=';')
<positron-console-cell-38>:1: DtypeWarning: Columns (6,8,9,10,11,19,23,24,27,31,32,34) have mixed types. Specify dtype option on import or set low_memory=False.
len(coopyear.columns)
len(df.columns)
43
len(df.columns)
43
import pandas as pd

# 1. Drop columns from 'df' that are NOT in 'coopyear'
# We use .columns.intersection to ensure we only keep columns that actually exist in both
cols_to_keep = df.columns.intersection(coopyear.columns)
df = df[cols_to_keep]

# 2. Remove duplicates based on 'SUPPLIER_ABRVNAME'
# 'keep=first' is the default, ensuring you keep the first occurrence
df = df.drop_duplicates(subset=['SUPPLIER_ABRVNAME','YEAR'])

# Verify the results
print(f"New column count: {len(df.columns)}")
print(f"New row count: {len(df)}")
New column count: 34
New row count: 23770
df = pd.read_csv("2024_coops_like_camlong_febfixes.csv")
len(df.COOP_ID.unique())
<positron-console-cell-83>:1: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.





6339
df.columns
Index(['FLOW_ID', 'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'BUYER', 'DISCLOSURE_SOURCE', 'TRADER_NAME', 'CERTIFICATIONS', 'NUM_FARMERS', 'NUM_FARMERS_EXTRAPOLATED', 'TOTAL_FARMERS_NONTRADER',
       'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 'TOTAL_FARMERS', 'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME', 'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'DISCL_NUMBER_FARMERS', 'DISCL_VOLUMES',
       'DISCL_VOLUMES_UNIT', 'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 'CCTN_COOP_ID', 'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED', 'NB_FARMERS_COMPANY_YEAR', 'NON_TRADER', 'IS_TRADER', '!NOT_RFA', 'unique_rfa_link', 'LVL_4_NAME',
       'REPEATED_FROM_PAST_YEAR'],
      dtype='object')
# - COOPERATIVE SIMPLIFIEE TIPHERET DE TOULEPLEU is in Toulépleu, Cavally
# Union Inter Régional des coopératives du Grand Ouest avec conseil d`Administration (UJRECGO-COOP-CA), location is correct but not long name, and J needs replaced with I
# UDEKO is in Sinfra
# UPRAD is in SASSANDRA 


# - make cam coopyear

# - rerun SEI-PCS
import pandas as pd
import numpy as np
import re

def standardize_coop_data(df):
    """
    Cleans cooperative names, standardizes districts, and resolves COOP_IDs 
    to the earliest assigned ID, maintaining original dataframe structure.
    """
    # Create a copy to avoid modifying the original dataframe in place unexpectedly
    df_clean = df.copy()

    # Save the exact original columns and their order
    original_columns = df_clean.columns.tolist()

    # --- HELPER FUNCTIONS ---
    def get_longest_name(series):
        # Filter out NaNs and literal 'NULL' strings
        valid_names = series.dropna()[series.dropna().astype(str).str.strip().str.upper() != 'NULL']
        if valid_names.empty:
            return np.nan
        return max(valid_names.astype(str), key=len)

    def extract_core_name(fullname):
        if pd.isna(fullname) or str(fullname).strip().upper() == 'NULL':
            return ""
        name = str(fullname).upper()
        legal_terms = [
            r"COOPERATIVE AVEC CONSEIL D'ADMINISTRATION",
            r"SOCIETE COOPERATIVES AGRICOLE DE",
            r"SOCIETE COOPERATIVE SIMPLIFIEE",
            r"SOCIETE COOPERATIVE",
            r"COOP-CA",
            r"COOP CA",
            r"SCOOPS",
            r"COOPERATIVE",
            r"COOP\."
        ]
        for term in legal_terms:
            name = re.sub(term, "", name)
        name = re.sub(r"[^\w\s\d']", " ", name)
        return " ".join(name.split())

    def get_most_frequent_district(series):
        valid_districts = series.dropna()[series.dropna().astype(str).str.upper() != 'NULL']
        if valid_districts.empty:
            return 'UNKNOWN_DISTRICT'
        return valid_districts.mode().iloc[0]


    # --- PIPELINE EXECUTION ---

    # 1. Clean short name for grouping (temporary column)
    df_clean['temp_short_name'] = df_clean['SUPPLIER_ABRVNAME'].astype(str).str.strip().str.upper()
    df_clean['temp_short_name'] = df_clean['temp_short_name'].replace({'NULL': np.nan, 'NAN': np.nan})

    # 2. Fix Long Names: Broadcast the longest full name based on the short name group
    fixed_fullnames = df_clean.groupby('temp_short_name')['SUPPLIER_FULLNAME'].transform(get_longest_name)
    # Overwrite original column, keeping original data if the fix was NaN
    df_clean['SUPPLIER_FULLNAME'] = fixed_fullnames.fillna(df_clean['SUPPLIER_FULLNAME'])

    # 3. Extract Core Name from the newly fixed full names (temporary column)
    df_clean['temp_core_name'] = df_clean['SUPPLIER_FULLNAME'].apply(extract_core_name)

    # Fill remaining NaNs in short name with empty strings so groupby doesn't drop them
    df_clean['temp_short_name'] = df_clean['temp_short_name'].fillna('')

    # 4. Standardize Districts: Broadcast the most frequent district per cooperative
    best_districts = df_clean.groupby(['temp_short_name', 'temp_core_name'])['DISTRICT_NAME'].transform(get_most_frequent_district)
    # Overwrite original column
    df_clean['DISTRICT_NAME'] = best_districts.fillna(df_clean['DISTRICT_NAME'])

    # 5. Assign Earliest COOP_ID
    # Sort logically to ensure the first row of any group is the earliest year
    df_sorted = df_clean.sort_values(by=['temp_short_name', 'temp_core_name', 'DISTRICT_NAME', 'YEAR'])

    earliest_ids = df_sorted.groupby(['temp_short_name', 'temp_core_name', 'DISTRICT_NAME'])['COOP_ID'].first().reset_index()
    earliest_ids.rename(columns={'COOP_ID': 'temp_STD_COOP_ID'}, inplace=True)

    # Merge the earliest IDs back into the main working dataframe
    df_clean = df_clean.merge(earliest_ids, on=['temp_short_name', 'temp_core_name', 'DISTRICT_NAME'], how='left')

    # Overwrite original column
    df_clean['COOP_ID'] = df_clean['temp_STD_COOP_ID']

    # --- CLEANUP ---
    # Strip away all 'temp_' columns and enforce the exact original schema
    df_clean = df_clean[original_columns]

    return df_clean

# Run the pipeline on your dataframe
df_final = standardize_coop_data(df)
len(df_final.COOP_ID.unique())
4444
df_final
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
0 2009-1 448 2009 CAAP COOPERATIVE ANOUANZE DE PETIT BOUAKE 5.890 -6.376 SOUBRE CI-2.2.5_1 CARGILL ... False True False 277.0 False True False False SOUBRE False
1 2009-2 726 2009 CAFHS COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA 6.891 -6.461 DALOA CI-9.1.1_1 CARGILL ... False True True 838.0 False True False False DALOA False
2 2009-3 1734 2009 COOPAGA COOPERATIVE AGRICOLE GENERALE DE GAGNOA 5.022 -6.550 SAN-PEDRO CI-2.3.1_1 CARGILL ... False True True 1190.0 False True False False SAN-PEDRO False
3 2009-4 2906 2009 ECASO COOPERATIVE DES AGRICULTEURS DU SUD-OUEST 5.785 -6.597 SOUBRE CI-2.2.5_1 CARGILL ... False True True 735.0 False True False False SOUBRE False
4 2009-5 4783 2009 SOCAB COOPERATIVE DES AGRICULTEURS DE BOUAFLE 6.263 -6.999 BUYO CI-2.2.1_1 CARGILL ... False False False 237.0 False True False False BUYO False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
35081 2024-7421 5792 2024 ECAM SASSANDRA ENTREPRISE COOPERATIVE DES AGRICULTEURS DE MEA... NaN NaN SASSANDRA CI-2.1.2_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-2.1.2_1 False
35082 2024-7422 6000 2024 SUF COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... 6.785 -7.393 DUEKOUE CI-8.2.2_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.2.2_1 False
35083 2024-7423 6185 2024 TOULEPLEU SOCIETE COOPERATIVE AVEC CONSEIL DADMINISTRATI... NaN NaN TOULEUPLEU CI-8.1.4_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.1.4_1 False
35084 2024-7424 5789 2024 SANAB SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... 4.957 -6.085 BANGOLO CI-8.2.1_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.2.1_1 False
35085 2024-7425 6339 2024 CCAPUO SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747 -7.362 DUEKOUE CI-8.2.2_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.2.2_1 False

35086 rows × 43 columns

# The na=False prevents errors if there are empty/NaN rows
df_final[df_final['SUPPLIER_ABRVNAME'].str.contains('YAT', na=False)]
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
151 2013-24 1198 2013 CAYAT COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... 6.099 -3.852 ADZOPE CI-7.3.1_1 CARGILL ... False True True 1408.0 False True False False ADZOPE False
248 2014-24 1198 2014 CAYAT COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... 6.099 -3.852 ADZOPE CI-7.3.1_1 CARGILL ... False True True 1408.0 False True False False ADZOPE False
345 2015-24 1198 2015 CAYAT COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... 6.099 -3.852 ADZOPE CI-7.3.1_1 CARGILL ... False True True 1408.0 False True False False ADZOPE False
448 2016-27 1198 2016 CAYAT COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... 6.099 -3.852 ADZOPE CI-7.3.1_1 CARGILL ... False True True 1408.0 False True False False ADZOPE False
566 2017-29 1198 2017 CAYAT COOPERATIVE DE DEVELOPPEMENT AGRICOLE DE YAKAS... 6.099 -3.852 ADZOPE CI-7.3.1_1 CARGILL ... False True True 1408.0 False True False False ADZOPE False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
34047 2024-6387 5727 2024 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU 6.220 -3.575 YAKASSE-ATTOBROU CI-7.1.4_1 SUCDEN ... False False False 1297.0 NaN False False False CI-7.1.4_1 False
34048 2024-6388 5727 2024 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU NaN NaN YAKASSE-ATTOBROU NaN SUCDEN ... False False False NaN False True False False NaN True
34049 2024-6389 5727 2024 YAT SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU NaN NaN YAKASSE-ATTOBROU NaN OLAM ... False False False NaN False True False False NaN True
34991 2024-7331 6274 2024 SSYAT SCOOPSYAT 6.186 -3.660 YAKASSE-ATTOBROU CI-7.3.4_1 CARGILL ... False False True NaN NaN False False False CI-7.3.4_1 False
34992 2024-7332 6274 2024 SSYAT SCOOPSYAT 6.186 -3.660 YAKASSE-ATTOBROU CI-7.3.4_1 MARS ... False False True NaN NaN False False False CI-7.3.4_1 False

68 rows × 43 columns

import numpy as np

# List all the columns you want to be identical for the same COOP_ID
columns_to_sync = [
    'SUPPLIER_ABRVNAME', 
    'SUPPLIER_FULLNAME', 
    'LATITUDE', 
    'LONGITUDE', 
    'DISTRICT_NAME', 
    'DISTRICT_GEOCODE'
]

def get_best_value(series):
    """Finds the most frequent non-null value in a series."""
    # Filter out actual NaNs and literal 'NULL' strings
    valid_data = series.dropna()[series.dropna().astype(str).str.strip().str.upper() != 'NULL']

    if valid_data.empty:
        return np.nan

    # Return the mode (most frequent value). If there's a tie, it picks the first one.
    return valid_data.mode().iloc[0]

# Apply the synchronization
for col in columns_to_sync:
    df_final[col] = df_final.groupby('COOP_ID')[col].transform(get_best_value)

# Visualize your fix
columns_to_view = ['COOP_ID', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LATITUDE', 'LONGITUDE', 'DISTRICT_NAME']
print(df_final[df_final['COOP_ID'] == 5727][columns_to_view].head())
       COOP_ID SUPPLIER_ABRVNAME                                  SUPPLIER_FULLNAME  LATITUDE  LONGITUDE     DISTRICT_NAME
5663      5727               YAT  SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU      6.22     -3.575  YAKASSE-ATTOBROU
5664      5727               YAT  SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU      6.22     -3.575  YAKASSE-ATTOBROU
5665      5727               YAT  SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU      6.22     -3.575  YAKASSE-ATTOBROU
5666      5727               YAT  SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU      6.22     -3.575  YAKASSE-ATTOBROU
10542     5727               YAT  SOCIETE COOPERATIVES AGRICOLE DE YAKASSE ATTOBROU      6.22     -3.575  YAKASSE-ATTOBROU
# The na=False prevents errors if there are empty/NaN rows
df_final[df_final['SUPPLIER_ABRVNAME']=='WEBE']
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
5646 2019-4844 5701 2019 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... True True True NaN False False NaN NaN ABIDJAN False
10525 2020-4825 5701 2020 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
15502 2021-4923 5701 2021 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
21611 2022-6055 5701 2022 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
21612 2022-6056 5701 2022 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... False False True NaN False False True True NaN False
27595 2023-5922 5701 2023 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... True True NaN NaN NaN NaN NaN NaN NaN True
27596 2023-5923 5701 2023 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... False False NaN NaN NaN NaN NaN NaN NaN True
34018 2024-6358 5701 2024 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 OFI ... False False False NaN NaN False False False CI-2.2.5_1 False
34019 2024-6359 5701 2024 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 SUCDEN ... False False False NaN NaN False False False CI-2.2.5_1 False
34020 2024-6360 5701 2024 WEBE SCOOPS WEBE 5.381 -3.986 SOUBRE CI-1.1.1_1 NaN ... False False NaN NaN NaN NaN NaN NaN NaN True

10 rows × 43 columns

import pandas as pd
import numpy as np
import re

# 1. Create a mapping dictionary from your geodataframe
# This maps the location name (e.g., "OUANINOU") to its code (e.g., "CI-12.1.2_1")
location_map = dict(zip(gdf['LVL_4_NAME'].str.upper(), gdf['LVL_4_CODE']))

# Sort the location names by length, longest first. 
sorted_locations = sorted([loc for loc in location_map.keys() if pd.notna(loc)], key=len, reverse=True)

def extract_geodata_from_name(longname):
    """Searches a string for known Ivorian district names and returns the name and code."""
    if pd.isna(longname) or str(longname).strip().upper() == 'NULL':
        return np.nan, np.nan

    name_upper = str(longname).upper()

    for loc in sorted_locations:
        # Use regex word boundaries (\b) for exact word matches
        pattern = rf"\b{re.escape(str(loc))}\b"

        if re.search(pattern, name_upper):
            return loc, location_map[loc]

    # Return NaNs if no district name is found in the long name
    return np.nan, np.nan

# 2. Apply the extraction function to ALL rows
print(f"Scanning all {len(df_final)} rows for geographic data in the long names...")

# Create a temporary DataFrame to hold the extracted results
extracted_df = df_final['SUPPLIER_FULLNAME'].apply(
    lambda x: pd.Series(extract_geodata_from_name(x), index=['EXTRACTED_NAME', 'EXTRACTED_CODE'])
)

# 3. Find exactly which rows had a successful match
found_mask = extracted_df['EXTRACTED_NAME'].notna()

# 4. Overwrite the existing columns ONLY where a match was found
# If found_mask is False for a row, pandas leaves the original value untouched
df_final.loc[found_mask, 'DISTRICT_NAME'] = extracted_df.loc[found_mask, 'EXTRACTED_NAME']
df_final.loc[found_mask, 'DISTRICT_GEOCODE'] = extracted_df.loc[found_mask, 'EXTRACTED_CODE']

print(f"Successfully updated geodata for {found_mask.sum()} rows!")
Scanning all 35086 rows for geographic data in the long names...
Successfully updated geodata for 10775 rows!
df_final.loc[fixed_indices]
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
1339 2019-537 603 2019 CADB ISSIA CADB ISSIA NaN NaN ISSIA CI-9.1.2_1 NaN ... False True True NaN False False NaN NaN NaN False
1433 2019-631 710 2019 CAETD COOPERATIVE AGRICOLE ELIE TIKPA DE DALOA NaN NaN DALOA CI-9.1.1_1 NaN ... True True True NaN False False NaN NaN NaN False
1535 2019-733 823 2019 CAKS COOPERATIVE AGRICOLE KOUADO DE SIPILOU NaN NaN SIPILOU CI-8.3.4_1 NaN ... True True True NaN False False NaN NaN NaN False
1618 2019-816 915 2019 CANOVA COOPERATIVE AGRICOLE NON-TABA DE VAVOUA NaN NaN VAVOUA CI-9.1.3_1 NaN ... False True True NaN False False NaN NaN NaN False
1661 2019-859 969 2019 CAPEN DIVO COOPERATIVE AGRICOLE N'ZASSA DE TIASSALE NaN NaN TIASSALE CI-7.1.4_1 NaN ... True True True NaN False False NaN NaN NaN False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33963 2024-6303 5653 2024 UNION DES PRODUCTEURS CAFE ET CACAO DE GAGNOA UNION DES PRODUCTEURS CAFE ET CACAO DE GAGNOA NaN NaN GAGNOA CI-5.1.1_1 NaN ... False False NaN NaN NaN NaN NaN NaN NaN True
33967 2024-6307 5657 2024 UPAD UNION DES PRODUCTEURS AGRICOLES DE DANANE NaN NaN DANANE CI-8.3.2_1 NaN ... False False NaN NaN NaN NaN NaN NaN NaN True
33973 2024-6313 5661 2024 UPB BANGOLO UPB BANGOLO NaN NaN BANGOLO CI-8.2.1_1 NaN ... False False NaN NaN NaN NaN NaN NaN NaN True
34035 2024-6375 5716 2024 WENDSONGDO COOPERATIVE SIMPLIFIEE WENDSONGDO DE TAI NaN NaN TAI CI-8.1.3_1 NaN ... False False NaN NaN NaN NaN NaN NaN NaN True
35021 2024-7361 6293 2024 CABG /MANKONO CABG COOP-CA/MANKONO NaN NaN MANKONO CI-12.2.3_1 SUCDEN ... False False False NaN NaN True False False NaN False

500 rows × 43 columns

df_final.to_csv('/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_camlong_marchfixes.csv', index=False)
df_long = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_camlong_marchfixes.csv")
df_coopyear = pd.read_csv("/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_cam_coopyear_febfixes.csv")
<positron-console-cell-99>:1: DtypeWarning: Columns (33,34) have mixed types. Specify dtype option on import or set low_memory=False.
<positron-console-cell-99>:2: DtypeWarning: Columns (26,27) have mixed types. Specify dtype option on import or set low_memory=False.
df_coopyear
COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE CERTIFICATIONS TOTAL_FARMERS_NONTRADER ... SIMPLIF_ABRVNAME CCTN_COOP_ID IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
0 448 2009 CAAP CAAP 5.890 -6.376 SOUBRE CI-2.2.5_1 NaN 0.0 ... CAAP CAAP_-6.4_5.9_CAAP False True False 277.0 False False SOUBRE False
1 726 2009 CAFHS COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA 6.891 -6.461 DALOA CI-9.1.1_1 CARGILL COCOA PROMISE + RAINFOREST ALLIANCE 0.0 ... CAFHS CAFHS_-6.5_6.9_COOPERATIVE AGRICOLE FIEDIFOUE ... False True True 838.0 False False DALOA False
2 1734 2009 COOPAGA COOPERATIVE AGRICOLE DE GABIADJI 5.022 -6.550 SAN-PEDRO CI-2.3.1_1 CARGILL COCOA PROMISE + RAINFOREST ALLIANCE 0.0 ... COOPAGA COOPAGA_-6.6_5_COOPERATIVE AGRICOLE DE GABIADJI False True True 1190.0 False False SAN-PEDRO False
3 2906 2009 ECASO COOPERATIVE DES AGRICULTEURS DU SUD-OUEST 5.785 -6.597 SOUBRE CI-2.2.5_1 CARGILL COCOA PROMISE + RAINFOREST ALLIANCE 0.0 ... ECASO ECASO_-6.6_5.8_COOPERATIVE DES AGRICULTEURS DU... False True True 735.0 False False SOUBRE False
4 4783 2009 SOCAB COOPERATIVE DE LA BIA 6.263 -6.999 BUYO CI-2.2.1_1 NaN 0.0 ... SOCAB SOCAB_-7_6.3_COOPERATIVE DE LA BIA False False False 237.0 False False BUYO False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23765 6329 2024 ANOU COOPERATIVE AGRICOLE DE COOPANOU NaN NaN TABOU CI-2.3.2_1 NaN NaN ... COOPANOU COOPANOU_NA_NA_COOPERATIVE AGRICOLE DE COOPANOU False True False NaN False False CI-2.3.2_1 False
23766 6330 2024 SCAETHS SOCIETE COOPERATIVE AGRICOLE ELIE TIKPA DE DALOA 6.874 -6.458 DALOA CI-9.1.1_1 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN CI-9.1.1_1 False
23767 6331 2024 SS_DAKUA SCOOP-DAKUA NaN NaN GUIGLO CI-8.1.2_1 NaN NaN ... DAKUA DAKUA_NA_NA_SCOOP-DAKUA False False False NaN False False CI-8.1.2_1 False
23768 6332 2024 SORANHS SOCIETE COOPERATIVE DES PRODUCTEURS AGRICOLES ... 6.880 -6.448 DALOA CI-9.1.1_1 NaN NaN ... SOCOOPRANHS SOCOOPRANHS_NA_NA_COOPERATIVE DES PRODUCTEURS ... False False False NaN False False CI-9.1.1_1 False
23769 6339 2024 CCAPUO SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747 -7.362 DUEKOUE CI-8.2.2_1 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN CI-8.2.2_1 False

23770 rows × 34 columns

df_long
FLOW_ID COOP_ID YEAR SUPPLIER_ABRVNAME SUPPLIER_FULLNAME LATITUDE LONGITUDE DISTRICT_NAME DISTRICT_GEOCODE BUYER ... IS_ALL_CAM_V3 IS_ANY_CAM_V3 CERTIFIED NB_FARMERS_COMPANY_YEAR NON_TRADER IS_TRADER !NOT_RFA unique_rfa_link LVL_4_NAME REPEATED_FROM_PAST_YEAR
0 2009-1 448 2009 CAAP COOPERATIVE ANOUANZE DE PETIT BOUAKE 5.890 -6.376 BOUAKE CI-11.1.3_1 CARGILL ... False True False 277.0 False True False False SOUBRE False
1 2009-2 726 2009 CAFHS COOPERATIVE AGRICOLE FIEDIFOUE DU HAUT SASSANDRA 6.891 -6.461 SASSANDRA CI-2.1.2_1 CARGILL ... False True True 838.0 False True False False DALOA False
2 2009-3 1734 2009 COOPAGA COOPERATIVE AGRICOLE GENERALE DE GAGNOA 5.022 -6.550 GAGNOA CI-5.1.1_1 CARGILL ... False True True 1190.0 False True False False SAN-PEDRO False
3 2009-4 2906 2009 ECASO COOPERATIVE DES AGRICULTEURS DU SUD-OUEST 5.785 -6.597 SOUBRE CI-2.2.5_1 CARGILL ... False True True 735.0 False True False False SOUBRE False
4 2009-5 4783 2009 SOCAB COOPERATIVE DES AGRICULTEURS DE BOUAFLE 6.263 -6.999 BOUAFLE CI-9.2.1_1 CARGILL ... False False False 237.0 False True False False BUYO False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
35081 2024-7421 5792 2024 ECAM SASSANDRA ENTREPRISE COOPERATIVE DES AGRICULTEURS DE MEA... NaN NaN MEAGUI CI-2.2.3_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-2.1.2_1 False
35082 2024-7422 6000 2024 SUF COOPERATIVE AGRICOLE DES PLANTEURS UNIS DE FEN... 6.739 -7.353 DUEKOUE CI-8.2.2_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.2.2_1 False
35083 2024-7423 6185 2024 TOULEPLEU SOCIETE COOPERATIVE AVEC CONSEIL DADMINISTRATI... 6.492 -6.585 TOULEUPLEU CI-8.1.4_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.1.4_1 False
35084 2024-7424 5789 2024 SANAB SOCIETE COOPERATIVE AGRICOLE NOUVELLE ALLIANCE... 4.957 -6.085 BANGOLO CI-8.2.1_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.2.1_1 False
35085 2024-7425 6339 2024 CCAPUO SOCIETE COOPERATIVE AVEC CONSEIL D`ADMINISTRAT... 6.747 -7.362 DUEKOUE CI-8.2.2_1 ECOM ... NaN NaN NaN NaN NaN False NaN NaN CI-8.2.2_1 False

35086 rows × 43 columns

# 1. Define the exact list of columns you requested
target_columns = [
    'COOP_ID', 'YEAR', 'SUPPLIER_ABRVNAME', 'SUPPLIER_FULLNAME', 'LATITUDE', 
    'LONGITUDE', 'DISTRICT_NAME', 'DISTRICT_GEOCODE', 'CERTIFICATIONS', 
    'TOTAL_FARMERS_NONTRADER', 'TOTAL_FARMERS_TRADER', 'TOTAL_FARMERS_RFA', 
    'TOTAL_FARMERS', 'DISCL_COUNTRY_NAME', 'DISCL_AREA_NAME',
    'DISCL_SUPPLIER_ABRVNAME', 'DISCL_SUPPLIER_FULLNAME', 'DISCL_SUPPLIER_TYPE', 
    'DISCL_LONGITUDE', 'DISCL_LATITUDE', 'DISCL_VOLUMES', 'DISCL_VOLUMES_UNIT', 
    'DISCL_CERTIFICATION_NAME', 'DISCL_CERTIFICATION_PCT', 'SIMPLIF_ABRVNAME', 
    'CCTN_COOP_ID', 'IS_ALL_CAM_V3', 'IS_ANY_CAM_V3', 'CERTIFIED', 
    'NB_FARMERS_COMPANY_YEAR', '!NOT_RFA', 'unique_rfa_link', 'LVL_4_NAME', 
    'REPEATED_FROM_PAST_YEAR'
]

# 2. Create a new DataFrame with only those columns
# We use .copy() to ensure it's a completely independent dataframe, preventing warnings later
df_yearly = df_final[target_columns].copy()

# 3. Keep only the unique combinations of COOP_ID and YEAR
# 'keep="first"' tells pandas to retain the first occurrence it finds and drop the rest
df_yearly = df_yearly.drop_duplicates(subset=['COOP_ID', 'YEAR'], keep='first')

# 4. Sort it so it's clean and easy to read
df_yearly = df_yearly.sort_values(by=['COOP_ID', 'YEAR']).reset_index(drop=True)

# Verify the shape to see how many unique coop/year combinations you have
print(f"Created df_yearly with {df_yearly.shape[0]} rows and {df_yearly.shape[1]} columns.")
Created df_yearly with 23819 rows and 34 columns.
df_yearly.to_csv("/Users/niamhfrench/repos/TRASE/trase/2024_coops_like_camcoopyear_marchfixes.csv")