Skip to content

View or edit on GitHub

This page is synchronized from trase/models/brazil/soy/brazil_soy_v2_6_1_and_0_indicators_comparison.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Brazil Soy v2.6.1 vs 2.6.0 indicators comparison

This notebook reviews the following: * Is the difference in total exposure of v2_6_1 similar (within 3-5%) of previous data in v2_6_0? * Does exporter groups ranking on exposures match previous ranking?

This notebook loads the dbt tests run for brazil_soy_v2_6_1, in trase/database/dbt/models/supply_chains/_schema_brazil_soy_nicolas.yml.

For the check_exposure_ranking_between_realeases tests, it uses the test defined in trase/database/dbt/tests/generic/supply_chains/brazil/soy/check_exposure_ranking_between_realeases.sql.

Note: currently working in branch brazil_soy_dbt_qa

source DBT tests last run: 2024-10-23 10:15 CET

# Load libraries, data, and create helper functions

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import textwrap

from trase.tools.pcs.connect import CNX

rename_columns = {
    "col_1": "year",
    "expression": "2_6_1_exposure",
    "compare_expression": "2_6_0_exposure",
    "expression_difference": "difference",
    "expression_difference_percent": "difference_percent",
}

## Load exposure aggregation checks from dbt tests
total_def_exposure_check = pd.read_sql(
    "select * from trase.dbt_test__audit.deforestation_exposure_aggregation_check_brazil_soy_v2_6_1",
    con=CNX.cnx,
)
total_def_exposure_check = total_def_exposure_check.rename(columns=rename_columns)
total_def_exposure_check["difference_percent"] = (
    total_def_exposure_check["difference_percent"] * 100
).round(2)
total_co2_net_exposure_check = pd.read_sql(
    "select * from trase.dbt_test__audit.co2_net_exposure_aggregation_check_brazil_soy_v2_6_1",
    con=CNX.cnx,
)
total_co2_net_exposure_check = total_co2_net_exposure_check.rename(
    columns=rename_columns
)
total_co2_net_exposure_check["difference_percent"] = (
    total_co2_net_exposure_check["difference_percent"] * 100
).round(2)
total_co2_gross_exposure_check = pd.read_sql(
    "select * from trase.dbt_test__audit.co2_gross_exposure_aggregation_check_brazil_soy_v2_6_1",
    con=CNX.cnx,
)
total_co2_gross_exposure_check = total_co2_gross_exposure_check.rename(
    columns=rename_columns
)
total_co2_gross_exposure_check["difference_percent"] = (
    total_co2_gross_exposure_check["difference_percent"] * 100
).round(2)


## Load exporter group ranking check from dbt tests
exporter_group_ranking_check = pd.read_sql(
    "select * from trase.dbt_test__audit.deforestation_exposure_ranking_check_brazil_soy_v2_6_1",
    con=CNX.cnx,
)

# Take only the last 4 years (2020 - 2017)
exporter_group_ranking_check = exporter_group_ranking_check[
    exporter_group_ranking_check["year"] >= 2017
]
# Create a set with 'exporter_group' names iterating through the different 'years' and add the first 15 'exporter_gruoup' values per year to the set
exporter_group_set = set()
for year in exporter_group_ranking_check["year"].unique():
    exporter_group_set.update(
        exporter_group_ranking_check[exporter_group_ranking_check["year"] == year]
        .head(15)["exporter_group"]
        .values
    )
# Create a dictionary with the 'exporter_group' names as keys and a sum(abs(rank_change)) as values
exporter_group_ranking_check_dict = {}
for exporter_group in exporter_group_set:
    exporter_group_ranking_check_dict[exporter_group] = (
        exporter_group_ranking_check[
            exporter_group_ranking_check["exporter_group"] == exporter_group
        ]["rank_change"]
        .abs()
        .sum()
    )
# Create a list with the exporter_group_ranking_check_dict keys and sort it by its values descending order
# Later on we will iterate showing the details of these exporter_groups deforestation exposure through the years
exporter_group_ranking_check_list = list(exporter_group_ranking_check_dict.keys())
exporter_group_ranking_check_list.sort(
    key=lambda x: exporter_group_ranking_check_dict[x], reverse=True
)


def thousands_formatter(x, pos):
    return f"{int(x):,}"


def plot_aggregated_exposure(df, title):
    # Melt the DataFrame to long-form
    melted_df = pd.melt(
        df,
        id_vars=["year"],
        value_vars=["2_6_1_exposure", "2_6_0_exposure"],
        var_name="Exposure_Type",
        value_name="Exposure",
    )

    # Plot using seaborn
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(
        x="year",
        y="Exposure",
        hue="Exposure_Type",
        data=melted_df,
        hue_order=["2_6_1_exposure", "2_6_0_exposure"],
    )

    # Format the y-axis with thousands separator
    ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))

    # Annotate bars with the actual values
    for p in ax.patches:
        ax.annotate(
            f"{int(p.get_height()):,}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 10),
            textcoords="offset points",
        )

    # Customize legend labels
    handles, labels = ax.get_legend_handles_labels()
    labels = ["2_6_1", "2_6_0"]
    ax.legend(handles, labels, title="Version")

    plt.ylabel("Exposure")
    plt.xlabel("Year")
    plt.title(title, pad=20)
    plt.show()

1. Difference in exposure totals (deforestation and emissions) between versions by year

1.1. Difference in deforestation exposure

The graph below shows the following difference in totals: * 2020: v2_6_1 has 34% more deforestation exposure than v2_6_0 * 2019: v_2_6_1 has 26% more * 2018: v_2_6_1 has 18% more * 2017: v_2_6_1 has 18% more

plot_aggregated_exposure(
    total_def_exposure_check,
    "Deforestation exposure difference between v2.6.1 and v2.6.0",
)

1.2. Difference in CO2 net exposure

The graph below shows the following difference in totals: * 2020: v2_6_1 has 31% more CO2 net exposure than v2_6_0 * 2019: v_2_6_1 has 18% more * 2018: v_2_6_1 has 5% more * 2017: v_2_6_1 has 15% more

plot_aggregated_exposure(
    total_co2_net_exposure_check,
    "CO2 net exposure difference between v2.6.1 and v2.6.0",
)

1.3. Difference in CO2 gross exposure

The graph below shows the following difference in totals: * 2020: v2_6_1 has 31% more CO2 gross exposure than v2_6_0 * 2019: v_2_6_1 has 19% more * 2018: v_2_6_1 has 6% more * 2017: v_2_6_1 has 15% more

plot_aggregated_exposure(
    total_co2_gross_exposure_check,
    "CO2 gross exposure difference between v2.6.1 and v2.6.0",
)

2. Comparison of deforestation exposure by exporter group

First we compare the actual changes in deforestation exposure, and then the changes in ranking

2.1. Changes in exporter_group deforestation exposure between release versions, for years 2020 - 2017

melted_top_exporters = pd.melt(
    exporter_group_ranking_check,
    id_vars=["year", "exporter_group"],
    value_vars=["current_exposure", "previous_exposure"],
    var_name="Exposure_Type",
    value_name="Exposure",
)


# Function to wrap text labels for long exporter group names
def wrap_labels(label, width=40):
    return "\n".join(textwrap.wrap(label, width))


# Apply text wrapping to exporter_group column using .loc[]
melted_top_exporters.loc[:, "exporter_group"] = melted_top_exporters[
    "exporter_group"
].apply(lambda x: wrap_labels(x))

# Get unique years
years = melted_top_exporters["year"].unique()

# Create subplots
fig, axes = plt.subplots(len(years), 1, figsize=(12, 10 * len(years)), sharex=True)

# Plot each year in a separate subplot
for ax, year in zip(axes, years):
    data = melted_top_exporters[melted_top_exporters["year"] == year]
    sns.barplot(
        x="Exposure",
        y="exporter_group",
        hue="Exposure_Type",
        hue_order=["current_exposure", "previous_exposure"],
        data=data,
        ax=ax,
        orient="h",
    )

    # Format the x-axis with thousands separator
    ax.xaxis.set_major_formatter(FuncFormatter(thousands_formatter))

    # Annotate bars with the actual values
    for p in ax.patches:
        ax.annotate(
            f"{int(p.get_width()):,}",
            (p.get_width(), p.get_y() + p.get_height() / 2.0),
            ha="center",
            va="center",
            xytext=(10, 0),
            textcoords="offset points",
            fontsize=8,
        )

    # Customize legend labels
    handles, labels = ax.get_legend_handles_labels()
    labels = ["v2_6_1", "v2_6_0"]
    ax.legend(handles, labels, title="Exposure in ha")

    ax.set_title(f"Year: {year}")
    ax.set_ylabel("Exporter Group")
    ax.set_xlabel("Exposure")

    ax.tick_params(axis="y", labelsize=6)

plt.tight_layout(
    rect=[0, 0, 1, 0.95], h_pad=2
)  # Add padding between subplots for better separation
fig.suptitle(
    "2.1. Top exporters exposure comparison btw v2_6_1 and v2_6_0 for 2020 - 2017",
    fontsize=14,
)
plt.show()

2.2. Comparing the differences in ranking between version v2_6_1 and v2_6_0 for years 2020 - 2017

# Changes in ranking between versions / years
# Take the first 15 exporter groups from the list 'exporter_group_ranking_check_list' and plot the
# changes in ranking for each year (one plot per exporter group, with a line showing the changes through the years of each version)
top_ranking_changes = exporter_group_ranking_check[
    exporter_group_ranking_check["exporter_group"].isin(
        exporter_group_ranking_check_list[:15]
    )
]

# Create subplots
fig, axes = plt.subplots(
    len(top_ranking_changes["exporter_group"].unique()),
    1,
    figsize=(8, 3 * len(top_ranking_changes["exporter_group"].unique())),
    sharex=False,
)

# Plot each exporter group in a separate subplot
for ax, exporter_group in zip(axes, top_ranking_changes["exporter_group"].unique()):
    data = top_ranking_changes[top_ranking_changes["exporter_group"] == exporter_group]

    # Plot current_rank and previous_rank
    sns.lineplot(x="year", y="current_rank", data=data, ax=ax, label="v2_6_1")
    sns.lineplot(x="year", y="previous_rank", data=data, ax=ax, label="v2_6_0")

    ax.set_title(f"Exporter Group: {exporter_group}", fontsize=10)
    ax.set_ylabel("Rank")
    ax.set_xlabel("Year")  # Add x-label for each subplot
    ax.set_xticks([2020, 2019, 2018, 2017])  # Set specific years as x-axis labels
    ax.invert_xaxis()  # Start with 2020 and go down to 2017
    # Ensure y-axis only has integer ticks
    y_min, y_max = ax.get_ylim()  # Get the current y-axis limits
    ax.set_yticks(range(int(y_min), int(y_max) + 1))  # Set y-ticks to be integers
    ax.tick_params(axis="y", labelsize=8)

# Adjust layout to reduce white space and ensure labels fit well
plt.tight_layout(rect=[0, 0, 1, 0.96], h_pad=0.5)  # Reduce padding between subplots
fig.suptitle(
    "Top exporters ranking changes btw v2_6_1 and v2_6_0 for 2020 - 2017", fontsize=14
)

plt.show()
# Show the details of the dataframe
exporter_group_ranking_check