Spatial aggregations production

View or edit on GitHub

This page is synchronized from trase/data/cote_divoire/cocoa/metrics/q3_2025/spatial_aggregations_production.ipynb. Last modified on 2026-02-03 10:30 CET by Jason J. Benedict. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import pandas as pd

prod_stats = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\ci_combine_regional_metrics_def_prod.csv", sep=";") [['LVL_4_CODE','PRODUCTION_TON', 'YEAR']]
cocoa_area = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\ci_combine_regional_metrics_area_cdef.csv", sep=";")[['LVL_4_CODE','COCOA_HA', 'YEAR']]
def ci_lvl4_to_trase_id(lvl4_code: str) -> str:
    """
    Convert LVL_4_CODE (e.g., 'CI-1.1.1_1') to TRASE_ID (e.g., 'CI-010101').
    """
    prefix, rest = lvl4_code.split('-')
    nums = rest.split('_')[0].split('.')
    trase_id = prefix + '-' + ''.join([n.zfill(2) for n in nums])
    return trase_id


prod_stats['TRASE_ID'] = prod_stats['LVL_4_CODE'].apply(ci_lvl4_to_trase_id)
cocoa_area['TRASE_ID'] = cocoa_area['LVL_4_CODE'].apply(ci_lvl4_to_trase_id)

df = pd.merge(prod_stats, cocoa_area, on=['TRASE_ID', 'YEAR'], how='outer', suffixes=('_prod', '_area'))[['YEAR','TRASE_ID','PRODUCTION_TON','COCOA_HA']]
df
def aggregate_spatial_levels(df, country_code='CI'):

    def extract_region(trase_id):
        return trase_id[:5]
    def extract_department(trase_id):
        return trase_id[:7]


    df['REGION_ID'] = df['TRASE_ID'].apply(extract_region)
    if country_code == 'CI':
        df['DEPARTMENT_ID'] = df['TRASE_ID'].apply(extract_department)
    df['NATIONAL_ID'] = country_code

    metric_cols = [col for col in df.columns if col not in ['TRASE_ID', 'YEAR', 'REGION_ID', 'DEPARTMENT_ID', 'NATIONAL_ID', 'level']]

    if country_code == 'CI':
        dept_df = df.groupby(['DEPARTMENT_ID', 'YEAR'])[metric_cols].sum().reset_index()
        dept_df['TRASE_ID'] = dept_df['DEPARTMENT_ID']
        dept_df.drop(columns=['DEPARTMENT_ID'], inplace=True)

    region_df = df.groupby(['REGION_ID', 'YEAR'])[metric_cols].sum().reset_index()
    region_df['TRASE_ID'] = region_df['REGION_ID']
    region_df.drop(columns=['REGION_ID'], inplace=True)

    national_df = df.groupby(['NATIONAL_ID', 'YEAR'])[metric_cols].sum().reset_index()
    national_df['TRASE_ID'] = national_df['NATIONAL_ID']
    national_df.drop(columns=['NATIONAL_ID'], inplace=True)

    if country_code == 'CI':
        agg_df = pd.concat([df.drop(columns=['REGION_ID', 'DEPARTMENT_ID', 'NATIONAL_ID']),
                            dept_df,
                            region_df,
                            national_df],
                        ignore_index=True)
    else:
        agg_df = pd.concat([df.drop(columns=['REGION_ID', 'NATIONAL_ID']),
                            region_df,
                            national_df],
                        ignore_index=True)

    return agg_df

agg_df = aggregate_spatial_levels(df, country_code='CI')
display(agg_df)
def fill_na_and_zero(group):
    # Get valid (non-null and non-zero) values
    valid = group[(group.notna()) & (group != 0)]

    if not valid.empty:
        fill_value = valid.iloc[0]
    else:
        fill_value = 0  # Fallback if no valid value exists

    return group.fillna(0).replace(0, fill_value)

agg_df['COCOA_HA'] = agg_df.groupby('TRASE_ID')['COCOA_HA'].transform(fill_na_and_zero)
agg_df["COCOA_YIELD_TON_HA"] = agg_df['PRODUCTION_TON'] / agg_df['COCOA_HA']
agg_df['COCOA_YIELD_TON_HA'] = agg_df.groupby('TRASE_ID')['COCOA_YIELD_TON_HA'].transform(fill_na_and_zero)
agg_df.describe()
agg_df[agg_df['PRODUCTION_TON'] == 0][['PRODUCTION_TON', 'COCOA_HA', 'COCOA_YIELD_TON_HA']]
out_fp = "C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\ci_combine_regional_metrics_prod.parquet"
agg_df.to_parquet(out_fp, index=False)
df = pd.read_parquet('C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\gh_combine_regional_metrics_c_def_emission_multilevel.parquet')
ghana_cocoa_area = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\gh_combine_regional_metrics_cocoa_area_multilevel.csv", sep=";")
ghana_cocoa_area.drop(columns=['level'], inplace=True)
ghana_cocoa_area = aggregate_spatial_levels(ghana_cocoa_area, country_code='GH')
ghana_cocoa_area.drop_duplicates(subset=['TRASE_ID', 'YEAR'], inplace=True)
ghana_cocoa_area.to_parquet("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\gh_combine_regional_metrics_cocoa_area_multilevel.parquet", index=False)