Spatial metrics quality assurance
This page is synchronized from trase/data/cote_divoire/cocoa/metrics/q3_2025/spatial_metrics_quality_assurance.ipynb. Last modified on 2026-02-03 10:30 CET by Jason J. Benedict.
Please view or edit the original file there; changes should be reflected here after the nightly build (midnight CET) or after manually triggering one with the GitHub action (link).
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import os
from functools import reduce
import re
ci_departments = 'C:\\Users\\NiamhFrench\\Documents\\cocoa_map_comparison_fdap0.5\\ci_departments_geojson_export.geojson'
gh_departments = 'C:\\Users\\NiamhFrench\\Documents\\cocoa_map_comparison_fdap0.5\\gh_departments_geojson_export.geojson'
ci_gdf = gpd.read_file(ci_departments)
gh_gdf = gpd.read_file(gh_departments)
data_dir = 'C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data'
parquet_files = [f for f in os.listdir(data_dir) if f.endswith('.parquet')]
ci_files = [os.path.join(data_dir, f) for f in parquet_files if f.startswith('ci')]
gh_files = [os.path.join(data_dir, f) for f in parquet_files if f.startswith('gh')]
# Read and merge on TRASE_ID and YEAR
def merge_parquet_files(file_list):
    dfs = [pd.read_parquet(f) for f in file_list]
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = pd.merge(merged_df, df, on=['TRASE_ID', 'YEAR'], how='outer')
    return merged_df
ci_df = merge_parquet_files(ci_files)
gh_df = merge_parquet_files(gh_files)
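# Minimal sanity check (a sketch): the outer merges above assume that
# (TRASE_ID, YEAR) uniquely identifies rows in every parquet file; duplicated
# key pairs here would mean the joins have multiplied rows.
for name, df in [('ci_df', ci_df), ('gh_df', gh_df)]:
    n_dupes = df.duplicated(subset=['TRASE_ID', 'YEAR']).sum()
    print(f"{name}: {len(df)} rows, {n_dupes} duplicated (TRASE_ID, YEAR) pairs")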
# Load CSVs from last year's update, to get standardised column names
ci_2024_cocoa = pd.read_csv(os.path.join(data_dir, 'q4_2024', 'ci_combine_regional_metrics_c_def_emission.csv'), sep=';')
ci_2024_def = pd.read_csv(os.path.join(data_dir, 'q4_2024', 'ci_combine_regional_metrics_def.csv'), sep=';')
ci_2024_emissions = pd.read_csv(os.path.join(data_dir, 'q4_2024', 'ci_combine_regional_metrics_emission.csv'), sep=';')
ci_2024 = pd.merge(ci_2024_cocoa, ci_2024_def, on=['TRASE_ID', 'YEAR'], how='outer')
ci_2024 = pd.merge(ci_2024, ci_2024_emissions, on=['TRASE_ID', 'YEAR'], how='outer')
# Load CSVs from last year's update, to get standardised column names
gh_2024_cocoa = pd.read_csv(os.path.join(data_dir, 'q4_2024', 'gh_combine_regional_metrics_c_def_emission_multilevel.csv'), sep=';')
gh_2024_def = pd.read_csv(os.path.join(data_dir, 'q4_2024', 'gh_combine_regional_metrics_def_multilevel.csv'), sep=';')
gh_2024_emissions = pd.read_csv(os.path.join(data_dir, 'q4_2024', 'gh_combine_regional_metrics_emission_multilevel.csv'), sep=';')
gh_2024 = pd.merge(gh_2024_cocoa, gh_2024_def, on=['TRASE_ID', 'YEAR'], how='outer')
gh_2024 = pd.merge(gh_2024, gh_2024_emissions, on=['TRASE_ID', 'YEAR'], how='outer')
def ci_lvl4_to_trase_id(lvl4_code: str) -> str:
    """
    Convert LVL_4_CODE (e.g., 'CI-1.1.1_1') to TRASE_ID (e.g., 'CI-010101').
    """
    prefix, rest = lvl4_code.split('-')
    # Drop the trailing '_<n>' suffix and zero-pad each hierarchy level to two digits
    nums = rest.split('_')[0].split('.')
    trase_id = prefix + '-' + ''.join(n.zfill(2) for n in nums)
    return trase_id
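# Quick illustration of the conversion, using the sample code from the docstring:
assert ci_lvl4_to_trase_id('CI-1.1.1_1') == 'CI-010101'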
def gid2_to_trase_id_and_level(gid2: str):
    # Split into main code and version suffix
    main, lvl = gid2.split('_')
    # Extract country code and numeric levels, dropping the separator dot after the country code
    country = main[:3]
    nums = main[3:].lstrip('.').split('.')
    # Determine level and format TRASE_ID
    if len(nums) == 2:
        trase_id = f"{country[:2]}-{int(nums[0]):02d}.{int(nums[1]):02d}"
        level = "district"
    elif len(nums) == 1:
        trase_id = f"{country[:2]}-{int(nums[0]):02d}"
        level = "region"
    else:
        trase_id = gid2
        level = "unknown"
    return trase_id, level
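# Illustration with assumed GADM-style GID_2 codes (the real values come from
# gh_departments_geojson_export.geojson); district codes map to a dotted
# two-level TRASE_ID, region codes to a single two-digit one:
assert gid2_to_trase_id_and_level('GHA.1.1_1') == ('GH-01.01', 'district')
assert gid2_to_trase_id_and_level('GHA.1_1') == ('GH-01', 'region')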
ci_gdf['TRASE_ID'] = ci_gdf['LVL_4_CODE'].apply(ci_lvl4_to_trase_id)
gh_gdf['TRASE_ID'] = gh_gdf['GID_2'].apply(lambda g: gid2_to_trase_id_and_level(g)[0])
metrics = [
'COCOA_DEFORESTATION_ANNUAL',
'COCOA_DEFORESTATION_15_YEARS_TOTAL',
'COCOA_GROSS_EMISSIONS_ANNUAL',
'COCOA_GROSS_EMISSIONS_15_YEARS_TOTAL',
'COCOA_NET_EMISSIONS_ANNUAL',
'COCOA_NET_EMISSIONS_15_YEARS_TOTAL',
'CO2_GROSS_EMISSIONS_TERRITORIAL_DEFORESTATION',
'TERRITORIAL_DEFORESTATION',
'TERRITORIAL_UNDISTURBED_FOREST'
]
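# Lightweight pre-flight check (a sketch): confirm every metric column exists in
# both updates for each country, so renamed or missing columns surface before the
# comparison plots are generated.
for name, df in [('ci_2024', ci_2024), ('ci_df', ci_df), ('gh_2024', gh_2024), ('gh_df', gh_df)]:
    missing = [m for m in metrics if m not in df.columns]
    if missing:
        print(f"{name} is missing metric columns: {missing}")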
def plot_aggregated_comparison(df_2024, df_2025, metrics, year_col='YEAR', agg='median', out_dir='time_series_charts'):
    # Create output directory if it doesn't exist
    os.makedirs(out_dir, exist_ok=True)
    if agg not in ('sum', 'mean', 'median'):
        raise ValueError("agg must be 'sum', 'mean' or 'median'")
    for metric in metrics:
        plt.figure(figsize=(8, 5))
        agg_2024 = df_2024.groupby(year_col)[metric].agg(agg)
        agg_2025 = df_2025.groupby(year_col)[metric].agg(agg)
        years_2024 = agg_2024.index.astype(int)
        years_2025 = agg_2025.index.astype(int)
        plt.plot(years_2024, agg_2024.values, marker='o', label='2024 update')
        plt.plot(years_2025, agg_2025.values, marker='s', label='2025 update')
        # Shade ±10% and ±20% tolerance bands around each series
        for idx, (label, years, agg_data) in enumerate([('2024 update', years_2024, agg_2024),
                                                        ('2025 update', years_2025, agg_2025)]):
            color = f"C{idx}"
            plt.fill_between(years, agg_data.values * 0.9, agg_data.values * 1.1,
                             color=color, alpha=0.15, label=f"{label} ±10%")
            plt.fill_between(years, agg_data.values * 0.8, agg_data.values * 1.2,
                             color=color, alpha=0.08, label=f"{label} ±20%")
        plt.title(f"{agg.capitalize()} of {metric}: 2024 update vs 2025 update")
        plt.xlabel("Year")
        plt.ylabel(f"{agg.capitalize()} {metric}")
        plt.xticks(sorted(set(years_2024).union(years_2025)), rotation=0)
        plt.legend()
        plt.tight_layout()
        # Save figure instead of showing
        plt.savefig(os.path.join(out_dir, f"{metric}_{agg}_comparison.png"))
        plt.close()
plot_aggregated_comparison(ci_2024, ci_df, metrics, agg='mean')
plot_aggregated_comparison(gh_2024, gh_df, metrics, agg='mean')
def plot_side_by_side_choropleths(gdf, df_2024, df_2025, metrics, year_col='YEAR', out_dir='map_charts'):
    os.makedirs(out_dir, exist_ok=True)
    for metric in metrics:
        # Only map years with non-null values in both updates
        years_2024 = set(df_2024.loc[df_2024[metric].notna(), year_col])
        years_2025 = set(df_2025.loc[df_2025[metric].notna(), year_col])
        years = sorted(years_2024 & years_2025)
        for year in years:
            df_2024_year = df_2024[df_2024[year_col] == year][['TRASE_ID', metric]]
            gdf_2024 = gdf.merge(df_2024_year, on='TRASE_ID', how='left')
            df_2025_year = df_2025[df_2025[year_col] == year][['TRASE_ID', metric]]
            gdf_2025 = gdf.merge(df_2025_year, on='TRASE_ID', how='left')
            # Share the colour scale so both maps are directly comparable
            combined = pd.concat([gdf_2024[metric], gdf_2025[metric]])
            vmin = combined.min()
            vmax = combined.max()
            fig, axes = plt.subplots(2, 1, figsize=(10, 14))
            gdf_2024.plot(column=metric, ax=axes[0], legend=True, cmap='BuPu',
                          edgecolor='black', linewidth=0.3,
                          vmin=vmin, vmax=vmax,
                          missing_kwds={"color": "lightgrey", "label": "Missing"})
            axes[0].set_title(f"{metric} ({year}) - 2024 update")
            axes[0].axis('off')
            gdf_2025.plot(column=metric, ax=axes[1], legend=True, cmap='BuPu',
                          edgecolor='black', linewidth=0.3,
                          vmin=vmin, vmax=vmax,
                          missing_kwds={"color": "lightgrey", "label": "Missing"})
            axes[1].set_title(f"{metric} ({year}) - 2025 update")
            axes[1].axis('off')
            plt.suptitle(f"{metric} ({year}) comparison", fontsize=16)
            plt.tight_layout()
            # Save figure instead of showing
            plt.savefig(os.path.join(out_dir, f"{metric}_{year}_comparison.png"))
            plt.close()
plot_side_by_side_choropleths(ci_gdf, ci_2024, ci_df, metrics, year_col='YEAR')
plot_side_by_side_choropleths(gh_gdf, gh_2024, gh_df, metrics, year_col='YEAR')