Spatial metrics quality assurance

View or edit on GitHub

This page is synchronized from trase/data/cote_divoire/cocoa/metrics/q3_2025/spatial_metrics_quality_assurance.ipynb. Last modified on 2026-02-03 10:30 CET by Jason J. Benedict. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import os
from functools import reduce
import re

ci_departments = ('C:\\Users\\NiamhFrench\\Documents\\cocoa_map_comparison_fdap0.5\\ci_departments_geojson_export.geojson')
gh_departments = ('C:\\Users\\NiamhFrench\\Documents\\cocoa_map_comparison_fdap0.5\\gh_departments_geojson_export.geojson')
ci_gdf = gpd.read_file(ci_departments)
gh_gdf = gpd.read_file(gh_departments)
data_dir = 'C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data'
parquet_files = [f for f in os.listdir(data_dir) if f.endswith('.parquet')]
ci_files = [os.path.join(data_dir, f) for f in parquet_files if f.startswith('ci')]
gh_files = [os.path.join(data_dir, f) for f in parquet_files if f.startswith('gh')]

# Read and merge on TRASE_ID and YEAR
def merge_parquet_files(file_list):
    dfs = [pd.read_parquet(f) for f in file_list]
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = pd.merge(merged_df, df, on=['TRASE_ID', 'YEAR'], how='outer')
    return merged_df

ci_df = merge_parquet_files(ci_files)
gh_df = merge_parquet_files(gh_files)

# Load CSVs from last year's update, to get standardised column names

ci_2024_cocoa = pd.read_csv('C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\q4_2024\\ci_combine_regional_metrics_c_def_emission.csv', sep=';')
ci_2024_def = pd.read_csv('C:\\Users\\NiamhFrench\Documents\\spatial_metrics_QA_civ_ghana\\data\\q4_2024\\ci_combine_regional_metrics_def.csv', sep=';')
ci_2024_emissions = pd.read_csv('C:\\Users\\NiamhFrench\Documents\\spatial_metrics_QA_civ_ghana\\data\\q4_2024\\ci_combine_regional_metrics_emission.csv', sep=';')
ci_2024 = pd.merge(ci_2024_cocoa, ci_2024_def, on=['TRASE_ID', 'YEAR'], how='outer')
ci_2024 = pd.merge(ci_2024, ci_2024_emissions, on=['TRASE_ID', 'YEAR'], how='outer')

# Load CSVs from last year's update, to get standardised column names

gh_2024_cocoa = pd.read_csv('C:\\Users\\NiamhFrench\\Documents\\spatial_metrics_QA_civ_ghana\\data\\q4_2024\\gh_combine_regional_metrics_c_def_emission_multilevel.csv', sep=';')
gh_2024_def = pd.read_csv('C:\\Users\\NiamhFrench\Documents\\spatial_metrics_QA_civ_ghana\\data\\q4_2024\\gh_combine_regional_metrics_def_multilevel.csv', sep=';')
gh_2024_emissions = pd.read_csv('C:\\Users\\NiamhFrench\Documents\\spatial_metrics_QA_civ_ghana\\data\\q4_2024\\gh_combine_regional_metrics_emission_multilevel.csv', sep=';')
gh_2024 = pd.merge(gh_2024_cocoa, gh_2024_def, on=['TRASE_ID', 'YEAR'], how='outer')
gh_2024 = pd.merge(gh_2024, gh_2024_emissions, on=['TRASE_ID', 'YEAR'], how='outer')

def ci_lvl4_to_trase_id(lvl4_code: str) -> str:
    """
    Convert LVL_4_CODE (e.g., 'CI-1.1.1_1') to TRASE_ID (e.g., 'CI-010101').
    """
    prefix, rest = lvl4_code.split('-')
    nums = rest.split('_')[0].split('.')
    trase_id = prefix + '-' + ''.join([n.zfill(2) for n in nums])
    return trase_id


def gid2_to_trase_id_and_level(gid2: str):
    # Split into main and level part
    main, lvl = gid2.split('_')
    # Extract country code and numbers
    country = main[:3]
    nums = main[3:].split('.')
    # Determine level and format TRASE_ID
    if len(nums) == 2:
        trase_id = f"{country[:2]}-{int(nums[0]):02d}.{int(nums[1]):02d}"
        level = "district"
    elif len(nums) == 1:
        trase_id = f"{country[:2]}-{int(nums[0]):02d}"
        level = "region"
    else:
        trase_id = gid2
        level = "unknown"
    return trase_id

ci_gdf['TRASE_ID'] = ci_gdf['LVL_4_CODE'].apply(ci_lvl4_to_trase_id)
gh_gdf['TRASE_ID'] = gh_gdf['GID_2'].apply(gid2_to_trase_id_and_level)

metrics = [
    'COCOA_DEFORESTATION_ANNUAL',
    'COCOA_DEFORESTATION_15_YEARS_TOTAL',
    'COCOA_GROSS_EMISSIONS_ANNUAL',
    'COCOA_GROSS_EMISSIONS_15_YEARS_TOTAL',
    'COCOA_NET_EMISSIONS_ANNUAL', 
    'COCOA_NET_EMISSIONS_15_YEARS_TOTAL',
    'CO2_GROSS_EMISSIONS_TERRITORIAL_DEFORESTATION',
    'TERRITORIAL_DEFORESTATION',
    'TERRITORIAL_UNDISTURBED_FOREST'
]

def plot_aggregated_comparison(df_2024, df_2025, metrics, year_col='YEAR', agg='median', out_dir='time_series_charts'):
    # Create output directory if it doesn't exist
    os.makedirs(out_dir, exist_ok=True)
    for metric in metrics:
        plt.figure(figsize=(8, 5))
        if agg == 'sum':
            agg_2024 = df_2024.groupby(year_col)[metric].sum()
            agg_2025 = df_2025.groupby(year_col)[metric].sum()
        elif agg == 'mean':
            agg_2024 = df_2024.groupby(year_col)[metric].mean()
            agg_2025 = df_2025.groupby(year_col)[metric].mean()
        elif agg == 'median':
            agg_2024 = df_2024.groupby(year_col)[metric].median()
            agg_2025 = df_2025.groupby(year_col)[metric].median()
        else:
            raise ValueError("agg must be 'sum' or 'mean' or 'median'")

        years_2024 = agg_2024.index.astype(int)
        years_2025 = agg_2025.index.astype(int)

        plt.plot(years_2024, agg_2024.values, marker='o', label='2024 update')
        plt.plot(years_2025, agg_2025.values, marker='s', label='2025 update')

        for idx, (years, agg_data) in enumerate([(years_2024, agg_2024), (years_2025, agg_2025)]):
            color = f"C{idx}"
            lower_10 = agg_data.values * 0.9
            upper_10 = agg_data.values * 1.1
            plt.fill_between(years, lower_10, upper_10, color=color, alpha=0.15, label=f"{agg_data.name} ±10%")
            lower_20 = agg_data.values * 0.8
            upper_20 = agg_data.values * 1.2
            plt.fill_between(years, lower_20, upper_20, color=color, alpha=0.08, label=f"{agg_data.name} ±20%")

        plt.title(f"{agg.capitalize()} of {metric}: 2024 update vs 2025 update")
        plt.xlabel("Year")
        plt.ylabel(f"{agg.capitalize()} {metric}")
        plt.xticks(sorted(set(years_2024).union(years_2025)), rotation=0)
        plt.legend()
        plt.tight_layout()
        # Save figure instead of showing
        plt.savefig(os.path.join(out_dir, f"{metric}_{agg}_comparison.png"))
        plt.close()


plot_aggregated_comparison(ci_2024, ci_df, metrics, agg='mean')
plot_aggregated_comparison(gh_2024, gh_df, metrics, agg='mean')

def plot_side_by_side_choropleths(ci_gdf, df_2024, df_2025, metrics, year_col='YEAR', out_dir='map_charts'):
    os.makedirs(out_dir, exist_ok=True)
    for metric in metrics:
        years_2024 = set(df_2024.loc[df_2024[metric].notna(), year_col])
        years_2025 = set(df_2025.loc[df_2025[metric].notna(), year_col])
        years = sorted(years_2024 & years_2025)
        for year in years:
            df_2024_year = df_2024[df_2024[year_col] == year][['TRASE_ID', metric]]
            gdf_2024 = ci_gdf.merge(df_2024_year, on='TRASE_ID', how='left')
            df_2025_year = df_2025[df_2025[year_col] == year][['TRASE_ID', metric]]
            gdf_2025 = ci_gdf.merge(df_2025_year, on='TRASE_ID', how='left')

            combined = pd.concat([gdf_2024[metric], gdf_2025[metric]])
            vmin = combined.min()
            vmax = combined.max()

            fig, axes = plt.subplots(2, 1, figsize=(10, 14))
            gdf_2024.plot(column=metric, ax=axes[0], legend=True, cmap='BuPu',
                          edgecolor='black', linewidth=0.3,
                          vmin=vmin, vmax=vmax,
                          missing_kwds={"color": "lightgrey", "label": "Missing"})
            axes[0].set_title(f"{metric} ({year}) - 2024 update")
            axes[0].axis('off')

            gdf_2025.plot(column=metric, ax=axes[1], legend=True, cmap='BuPu',
                          edgecolor='black', linewidth=0.3,
                          vmin=vmin, vmax=vmax,
                          missing_kwds={"color": "lightgrey", "label": "Missing"})
            axes[1].set_title(f"{metric} ({year}) - 2025 update")
            axes[1].axis('off')

            plt.suptitle(f"{metric} ({year}) comparison", fontsize=16)
            plt.tight_layout()
            # Save figure instead of showing
            plt.savefig(os.path.join(out_dir, f"{metric}_{year}_comparison.png"))
            plt.close()

plot_side_by_side_choropleths(ci_gdf, ci_2024, ci_df, metrics, year_col='YEAR')
plot_side_by_side_choropleths(gh_gdf, gh_2024, gh_df, metrics, year_col='YEAR')