Spatial aggregations production
View or edit on GitHub
This page is synchronized from trase/data/cote_divoire/cocoa/metrics/q3_2025/spatial_aggregations_production.ipynb. Last modified on 2026-06-21 06:35 CEST by GitHub Actions.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
def ci_lvl4_to_trase_id(lvl4_code: str) -> str:
"""
Convert LVL_4_CODE (e.g., 'CI-1.1.1_1') to TRASE_ID (e.g., 'CI-010101').
"""
prefix, rest = lvl4_code.split('-')
nums = rest.split('_')[0].split('.')
trase_id = prefix + '-' + ''.join([n.zfill(2) for n in nums])
return trase_id
import pandas as pd
prod_stats = pd.read_csv("cocoa_production_2019_2024_q4_2025.csv", sep=",")
cocoa_area = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\ci_combine_regional_metrics_area_cdef.csv", sep=";")[['LVL_4_CODE','COCOA_HA', 'YEAR']]
# Identify the columns to pivot
prod_cols = [col for col in prod_stats.columns if col.startswith('production_')]
id_cols = [col for col in prod_stats.columns if col not in prod_cols]
prod_stats = (
prod_stats
# pivot_longer equivalent
.melt(
id_vars=id_cols,
value_vars=prod_cols,
var_name='YEAR',
value_name='PRODUCTION_TON'
)
# mutate equivalent
.assign(
YEAR=lambda df: df['YEAR'].str.replace('production_', '').astype(int),
)
# select equivalent
[['LVL_4_CODE', 'YEAR', 'PRODUCTION_TON']]
)
prod_stats['TRASE_ID'] = prod_stats['LVL_4_CODE'].apply(ci_lvl4_to_trase_id)
def aggregate_spatial_levels(df, country_code='CI'):
def extract_region(trase_id):
return trase_id[:5]
def extract_department(trase_id):
return trase_id[:7]
df['REGION_ID'] = df['TRASE_ID'].apply(extract_region)
if country_code == 'CI':
df['DEPARTMENT_ID'] = df['TRASE_ID'].apply(extract_department)
df['NATIONAL_ID'] = country_code
metric_cols = [col for col in df.columns if col not in ['TRASE_ID', 'YEAR', 'REGION_ID', 'DEPARTMENT_ID', 'NATIONAL_ID', 'level']]
if country_code == 'CI':
dept_df = df.groupby(['DEPARTMENT_ID', 'YEAR'])[metric_cols].sum().reset_index()
dept_df['TRASE_ID'] = dept_df['DEPARTMENT_ID']
dept_df.drop(columns=['DEPARTMENT_ID'], inplace=True)
region_df = df.groupby(['REGION_ID', 'YEAR'])[metric_cols].sum().reset_index()
region_df['TRASE_ID'] = region_df['REGION_ID']
region_df.drop(columns=['REGION_ID'], inplace=True)
national_df = df.groupby(['NATIONAL_ID', 'YEAR'])[metric_cols].sum().reset_index()
national_df['TRASE_ID'] = national_df['NATIONAL_ID']
national_df.drop(columns=['NATIONAL_ID'], inplace=True)
if country_code == 'CI':
agg_df = pd.concat([df.drop(columns=['REGION_ID', 'DEPARTMENT_ID', 'NATIONAL_ID']),
dept_df,
region_df,
national_df],
ignore_index=True)
else:
agg_df = pd.concat([df.drop(columns=['REGION_ID', 'NATIONAL_ID']),
region_df,
national_df],
ignore_index=True)
return agg_df
agg_df = aggregate_spatial_levels(prod_stats, country_code='CI')
display(agg_df)
| LVL_4_CODE | YEAR | PRODUCTION_TON | TRASE_ID | |
|---|---|---|---|---|
| 0 | CI-1.1.1_1 | 2019 | 8.705134e+02 | CI-010101 |
| 1 | CI-2.1.2_1 | 2019 | 1.236141e+05 | CI-020102 |
| 2 | CI-2.1.1_1 | 2019 | 3.108117e+04 | CI-020101 |
| 3 | CI-2.2.1_1 | 2019 | 3.618921e+04 | CI-020201 |
| 4 | CI-2.2.3_1 | 2019 | 7.319666e+04 | CI-020203 |
| ... | ... | ... | ... | ... |
| 931 | CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... | 2020 | 2.105064e+06 | CI |
| 932 | CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... | 2021 | 2.248069e+06 | CI |
| 933 | CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... | 2022 | 2.121065e+06 | CI |
| 934 | CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... | 2023 | 2.241068e+06 | CI |
| 935 | CI-1.1.1_1CI-2.1.2_1CI-2.1.1_1CI-2.2.1_1CI-2.2... | 2024 | 1.674051e+06 | CI |
936 rows × 4 columns
def fill_na_and_zero(group):
# Get valid (non-null and non-zero) values
valid = group[(group.notna()) & (group != 0)]
if not valid.empty:
fill_value = valid.iloc[0]
else:
fill_value = 0 # Fallback if no valid value exists
return group.fillna(0).replace(0, fill_value)
agg_df.describe()
| YEAR | PRODUCTION_TON | |
|---|---|---|
| count | 936.000000 | 9.360000e+02 |
| mean | 2021.500000 | 5.360420e+04 |
| std | 1.708738 | 1.817791e+05 |
| min | 2019.000000 | 0.000000e+00 |
| 25% | 2020.000000 | 2.544852e+01 |
| 50% | 2021.500000 | 8.335500e+03 |
| 75% | 2023.000000 | 5.163369e+04 |
| max | 2024.000000 | 2.248069e+06 |
agg_df[agg_df['PRODUCTION_TON'] == 0][['PRODUCTION_TON', 'COCOA_HA', 'COCOA_YIELD_TON_HA']]
old_df = pd.read_parquet("/Users/niamhfrench/repos/TRASE/trase/data/cote_divoire/cocoa/metrics/q3_2025/old_ci_combine_regional_metrics_production_multilevel.parquet")
old_df
| YEAR | TRASE_ID | PRODUCTION_TON | COCOA_YIELD_TON_HA | |
|---|---|---|---|---|
| 0 | 2019 | CI-010101 | 8.706336e+02 | 0.572106 |
| 1 | 2020 | CI-010101 | 8.508281e+02 | 0.559092 |
| 2 | 2021 | CI-010101 | 9.086278e+02 | 0.597073 |
| 3 | 2022 | CI-010101 | 8.572952e+02 | 0.563342 |
| 4 | 2019 | CI-020101 | 3.108490e+04 | 0.551566 |
| ... | ... | ... | ... | ... |
| 619 | 2022 | CI-14 | 1.640554e+04 | 0.472519 |
| 620 | 2019 | CI | 2.153856e+06 | 0.493715 |
| 621 | 2020 | CI | 2.104859e+06 | 0.482483 |
| 622 | 2021 | CI | 2.247849e+06 | 0.515260 |
| 623 | 2022 | CI | 2.120858e+06 | 0.486151 |
624 rows × 4 columns
out_fp = "ci_combine_regional_metrics_prod.parquet"
agg_df.to_parquet(out_fp, index=False)
df = pd.read_parquet('C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\gh_combine_regional_metrics_c_def_emission_multilevel.parquet')
ghana_cocoa_area = pd.read_csv("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\production\\gh_combine_regional_metrics_cocoa_area_multilevel.csv", sep=";")
ghana_cocoa_area.drop(columns=['level'], inplace=True)
ghana_cocoa_area = aggregate_spatial_levels(ghana_cocoa_area, country_code='GH')
ghana_cocoa_area.drop_duplicates(subset=['TRASE_ID', 'YEAR'], inplace=True)
ghana_cocoa_area.to_parquet("C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\gh_combine_regional_metrics_cocoa_area_multilevel.parquet", index=False)