Spatial aggregations production

View or edit on GitHub

This page is synchronized from trase/data/cote_divoire/cocoa/metrics/q3_2025/spatial_aggregations_production.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

def ci_lvl4_to_trase_id(lvl4_code: str) -> str:
    """
    Convert LVL_4_CODE (e.g., 'CI-1.1.1_1') to TRASE_ID (e.g., 'CI-010101').
    """
    prefix, rest = lvl4_code.split('-')
    nums = rest.split('_')[0].split('.')
    trase_id = prefix + '-' + ''.join([n.zfill(2) for n in nums])
    return trase_id
import pandas as pd

prod_stats = pd.read_csv("cocoa_production_2019_2024_q4_2025.csv", sep=",")
cocoa_area = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\ci_combine_regional_metrics_area_cdef.csv", sep=";")[['LVL_4_CODE','COCOA_HA', 'YEAR']]


# Identify the columns to pivot
prod_cols = [col for col in prod_stats.columns if col.startswith('production_')]
id_cols = [col for col in prod_stats.columns if col not in prod_cols]

prod_stats = (
    prod_stats
    # pivot_longer equivalent
    .melt(
        id_vars=id_cols, 
        value_vars=prod_cols,
        var_name='YEAR',
        value_name='PRODUCTION_TON'
    )
    # mutate equivalent
    .assign(
        YEAR=lambda df: df['YEAR'].str.replace('production_', '').astype(int),
    )
    # select equivalent
    [['LVL_4_CODE', 'YEAR', 'PRODUCTION_TON']]
)

prod_stats['TRASE_ID'] = prod_stats['LVL_4_CODE'].apply(ci_lvl4_to_trase_id)
def aggregate_spatial_levels(df, country_code='CI'):

    def extract_region(trase_id):
        return trase_id[:5]
    def extract_department(trase_id):
        return trase_id[:7]


    df['REGION_ID'] = df['TRASE_ID'].apply(extract_region)
    if country_code == 'CI':
        df['DEPARTMENT_ID'] = df['TRASE_ID'].apply(extract_department)
    df['NATIONAL_ID'] = country_code

    metric_cols = [col for col in df.columns if col not in ['TRASE_ID', 'YEAR', 'REGION_ID', 'DEPARTMENT_ID', 'NATIONAL_ID', 'level']]

    if country_code == 'CI':
        dept_df = df.groupby(['DEPARTMENT_ID', 'YEAR'])[metric_cols].sum().reset_index()
        dept_df['TRASE_ID'] = dept_df['DEPARTMENT_ID']
        dept_df.drop(columns=['DEPARTMENT_ID'], inplace=True)

    region_df = df.groupby(['REGION_ID', 'YEAR'])[metric_cols].sum().reset_index()
    region_df['TRASE_ID'] = region_df['REGION_ID']
    region_df.drop(columns=['REGION_ID'], inplace=True)

    national_df = df.groupby(['NATIONAL_ID', 'YEAR'])[metric_cols].sum().reset_index()
    national_df['TRASE_ID'] = national_df['NATIONAL_ID']
    national_df.drop(columns=['NATIONAL_ID'], inplace=True)

    if country_code == 'CI':
        agg_df = pd.concat([df.drop(columns=['REGION_ID', 'DEPARTMENT_ID', 'NATIONAL_ID']),
                            dept_df,
                            region_df,
                            national_df],
                        ignore_index=True)
    else:
        agg_df = pd.concat([df.drop(columns=['REGION_ID', 'NATIONAL_ID']),
                            region_df,
                            national_df],
                        ignore_index=True)

    return agg_df

agg_df = aggregate_spatial_levels(prod_stats, country_code='CI')
display(agg_df)
LVL_4_CODE YEAR PRODUCTION_TON TRASE_ID
0 CI-1.1.1_1 2019 8.705134e+02 CI-010101
1 CI-2.1.2_1 2019 1.236141e+05 CI-020102
2 CI-2.1.1_1 2019 3.108117e+04 CI-020101
3 CI-2.2.1_1 2019 3.618921e+04 CI-020201
4 CI-2.2.3_1 2019 7.319666e+04 CI-020203
... ... ... ... ...
931 CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... 2020 2.105064e+06 CI
932 CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... 2021 2.248069e+06 CI
933 CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... 2022 2.121065e+06 CI
934 CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... 2023 2.241068e+06 CI
935 CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... 2024 1.674051e+06 CI

936 rows × 4 columns

def fill_na_and_zero(group):
    # Get valid (non-null and non-zero) values
    valid = group[(group.notna()) & (group != 0)]

    if not valid.empty:
        fill_value = valid.iloc[0]
    else:
        fill_value = 0  # Fallback if no valid value exists

    return group.fillna(0).replace(0, fill_value)
agg_df.describe()
YEAR PRODUCTION_TON
count 936.000000 9.360000e+02
mean 2021.500000 5.360420e+04
std 1.708738 1.817791e+05
min 2019.000000 0.000000e+00
25% 2020.000000 2.544852e+01
50% 2021.500000 8.335500e+03
75% 2023.000000 5.163369e+04
max 2024.000000 2.248069e+06
agg_df[agg_df['PRODUCTION_TON'] == 0][['PRODUCTION_TON', 'COCOA_HA', 'COCOA_YIELD_TON_HA']]
old_df = pd.read_parquet("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/metrics/q3_2025/old_ci_combine_regional_metrics_production_multilevel.parquet")
old_df
YEAR TRASE_ID PRODUCTION_TON COCOA_YIELD_TON_HA
0 2019 CI-010101 8.706336e+02 0.572106
1 2020 CI-010101 8.508281e+02 0.559092
2 2021 CI-010101 9.086278e+02 0.597073
3 2022 CI-010101 8.572952e+02 0.563342
4 2019 CI-020101 3.108490e+04 0.551566
... ... ... ... ...
619 2022 CI-14 1.640554e+04 0.472519
620 2019 CI 2.153856e+06 0.493715
621 2020 CI 2.104859e+06 0.482483
622 2021 CI 2.247849e+06 0.515260
623 2022 CI 2.120858e+06 0.486151

624 rows × 4 columns

out_fp = "ci_combine_regional_metrics_prod.parquet"
agg_df.to_parquet(out_fp, index=False)
df = pd.read_parquet('C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\gh_combine_regional_metrics_c_def_emission_multilevel.parquet')
ghana_cocoa_area = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\gh_combine_regional_metrics_cocoa_area_multilevel.csv", sep=";")
ghana_cocoa_area.drop(columns=['level'], inplace=True)
ghana_cocoa_area = aggregate_spatial_levels(ghana_cocoa_area, country_code='GH')
ghana_cocoa_area.drop_duplicates(subset=['TRASE_ID', 'YEAR'], inplace=True)
ghana_cocoa_area.to_parquet("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\gh_combine_regional_metrics_cocoa_area_multilevel.parquet", index=False)