View or edit on GitHub
This page is synchronized from trase/models/brazil/soy/brazil_soy_v2_6_1_and_0_indicators_comparison.ipynb. Last modified on 2025-12-14 23:19 CET by Trase Admin.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
Brazil Soy v2.6.1 vs 2.6.0 indicators comparison
This notebook reviews the following: * Is the difference in total exposure of v2_6_1 similar (within 3-5%) of previous data in v2_6_0? * Does exporter groups ranking on exposures match previous ranking?
This notebook loads the dbt tests run for brazil_soy_v2_6_1, in trase/database/dbt/models/supply_chains/_schema_brazil_soy_nicolas.yml.
For the check_exposure_ranking_between_realeases tests, it uses the test defined in trase/database/dbt/tests/generic/supply_chains/brazil/soy/check_exposure_ranking_between_realeases.sql.
Note: currently working in branch brazil_soy_dbt_qa
source DBT tests last run: 2024-10-23 10:15 CET
# Load libraries, data, and create helper functions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import textwrap
from trase.tools.pcs.connect import CNX
rename_columns = {
"col_1": "year",
"expression": "2_6_1_exposure",
"compare_expression": "2_6_0_exposure",
"expression_difference": "difference",
"expression_difference_percent": "difference_percent",
}
## Load exposure aggregation checks from dbt tests
total_def_exposure_check = pd.read_sql(
"select * from trase.dbt_test__audit.deforestation_exposure_aggregation_check_brazil_soy_v2_6_1",
con=CNX.cnx,
)
total_def_exposure_check = total_def_exposure_check.rename(columns=rename_columns)
total_def_exposure_check["difference_percent"] = (
total_def_exposure_check["difference_percent"] * 100
).round(2)
total_co2_net_exposure_check = pd.read_sql(
"select * from trase.dbt_test__audit.co2_net_exposure_aggregation_check_brazil_soy_v2_6_1",
con=CNX.cnx,
)
total_co2_net_exposure_check = total_co2_net_exposure_check.rename(
columns=rename_columns
)
total_co2_net_exposure_check["difference_percent"] = (
total_co2_net_exposure_check["difference_percent"] * 100
).round(2)
total_co2_gross_exposure_check = pd.read_sql(
"select * from trase.dbt_test__audit.co2_gross_exposure_aggregation_check_brazil_soy_v2_6_1",
con=CNX.cnx,
)
total_co2_gross_exposure_check = total_co2_gross_exposure_check.rename(
columns=rename_columns
)
total_co2_gross_exposure_check["difference_percent"] = (
total_co2_gross_exposure_check["difference_percent"] * 100
).round(2)
## Load exporter group ranking check from dbt tests
exporter_group_ranking_check = pd.read_sql(
"select * from trase.dbt_test__audit.deforestation_exposure_ranking_check_brazil_soy_v2_6_1",
con=CNX.cnx,
)
# Take only the last 4 years (2020 - 2017)
exporter_group_ranking_check = exporter_group_ranking_check[
exporter_group_ranking_check["year"] >= 2017
]
# Create a set with 'exporter_group' names iterating through the different 'years' and add the first 15 'exporter_gruoup' values per year to the set
exporter_group_set = set()
for year in exporter_group_ranking_check["year"].unique():
exporter_group_set.update(
exporter_group_ranking_check[exporter_group_ranking_check["year"] == year]
.head(15)["exporter_group"]
.values
)
# Create a dictionary with the 'exporter_group' names as keys and a sum(abs(rank_change)) as values
exporter_group_ranking_check_dict = {}
for exporter_group in exporter_group_set:
exporter_group_ranking_check_dict[exporter_group] = (
exporter_group_ranking_check[
exporter_group_ranking_check["exporter_group"] == exporter_group
]["rank_change"]
.abs()
.sum()
)
# Create a list with the exporter_group_ranking_check_dict keys and sort it by its values descending order
# Later on we will iterate showing the details of these exporter_groups deforestation exposure through the years
exporter_group_ranking_check_list = list(exporter_group_ranking_check_dict.keys())
exporter_group_ranking_check_list.sort(
key=lambda x: exporter_group_ranking_check_dict[x], reverse=True
)
def thousands_formatter(x, pos):
return f"{int(x):,}"
def plot_aggregated_exposure(df, title):
# Melt the DataFrame to long-form
melted_df = pd.melt(
df,
id_vars=["year"],
value_vars=["2_6_1_exposure", "2_6_0_exposure"],
var_name="Exposure_Type",
value_name="Exposure",
)
# Plot using seaborn
sns.set_theme(style="whitegrid")
plt.figure(figsize=(12, 6))
ax = sns.barplot(
x="year",
y="Exposure",
hue="Exposure_Type",
data=melted_df,
hue_order=["2_6_1_exposure", "2_6_0_exposure"],
)
# Format the y-axis with thousands separator
ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
# Annotate bars with the actual values
for p in ax.patches:
ax.annotate(
f"{int(p.get_height()):,}",
(p.get_x() + p.get_width() / 2.0, p.get_height()),
ha="center",
va="center",
xytext=(0, 10),
textcoords="offset points",
)
# Customize legend labels
handles, labels = ax.get_legend_handles_labels()
labels = ["2_6_1", "2_6_0"]
ax.legend(handles, labels, title="Version")
plt.ylabel("Exposure")
plt.xlabel("Year")
plt.title(title, pad=20)
plt.show()
1. Difference in exposure totals (deforestation and emissions) between versions by year
1.1. Difference in deforestation exposure
The graph below shows the following difference in totals:
* 2020: v2_6_1 has 34% more deforestation exposure than v2_6_0
* 2019: v_2_6_1 has 26% more
* 2018: v_2_6_1 has 18% more
* 2017: v_2_6_1 has 18% more
plot_aggregated_exposure(
total_def_exposure_check,
"Deforestation exposure difference between v2.6.1 and v2.6.0",
)
1.2. Difference in CO2 net exposure
The graph below shows the following difference in totals:
* 2020: v2_6_1 has 31% more CO2 net exposure than v2_6_0
* 2019: v_2_6_1 has 18% more
* 2018: v_2_6_1 has 5% more
* 2017: v_2_6_1 has 15% more
plot_aggregated_exposure(
total_co2_net_exposure_check,
"CO2 net exposure difference between v2.6.1 and v2.6.0",
)
1.3. Difference in CO2 gross exposure
The graph below shows the following difference in totals:
* 2020: v2_6_1 has 31% more CO2 gross exposure than v2_6_0
* 2019: v_2_6_1 has 19% more
* 2018: v_2_6_1 has 6% more
* 2017: v_2_6_1 has 15% more
plot_aggregated_exposure(
total_co2_gross_exposure_check,
"CO2 gross exposure difference between v2.6.1 and v2.6.0",
)
2. Comparison of deforestation exposure by exporter group
First we compare the actual changes in deforestation exposure, and then the changes in ranking
2.1. Changes in exporter_group deforestation exposure between release versions, for years 2020 - 2017
melted_top_exporters = pd.melt(
exporter_group_ranking_check,
id_vars=["year", "exporter_group"],
value_vars=["current_exposure", "previous_exposure"],
var_name="Exposure_Type",
value_name="Exposure",
)
# Function to wrap text labels for long exporter group names
def wrap_labels(label, width=40):
return "\n".join(textwrap.wrap(label, width))
# Apply text wrapping to exporter_group column using .loc[]
melted_top_exporters.loc[:, "exporter_group"] = melted_top_exporters[
"exporter_group"
].apply(lambda x: wrap_labels(x))
# Get unique years
years = melted_top_exporters["year"].unique()
# Create subplots
fig, axes = plt.subplots(len(years), 1, figsize=(12, 10 * len(years)), sharex=True)
# Plot each year in a separate subplot
for ax, year in zip(axes, years):
data = melted_top_exporters[melted_top_exporters["year"] == year]
sns.barplot(
x="Exposure",
y="exporter_group",
hue="Exposure_Type",
hue_order=["current_exposure", "previous_exposure"],
data=data,
ax=ax,
orient="h",
)
# Format the x-axis with thousands separator
ax.xaxis.set_major_formatter(FuncFormatter(thousands_formatter))
# Annotate bars with the actual values
for p in ax.patches:
ax.annotate(
f"{int(p.get_width()):,}",
(p.get_width(), p.get_y() + p.get_height() / 2.0),
ha="center",
va="center",
xytext=(10, 0),
textcoords="offset points",
fontsize=8,
)
# Customize legend labels
handles, labels = ax.get_legend_handles_labels()
labels = ["v2_6_1", "v2_6_0"]
ax.legend(handles, labels, title="Exposure in ha")
ax.set_title(f"Year: {year}")
ax.set_ylabel("Exporter Group")
ax.set_xlabel("Exposure")
ax.tick_params(axis="y", labelsize=6)
plt.tight_layout(
rect=[0, 0, 1, 0.95], h_pad=2
) # Add padding between subplots for better separation
fig.suptitle(
"2.1. Top exporters exposure comparison btw v2_6_1 and v2_6_0 for 2020 - 2017",
fontsize=14,
)
plt.show()
2.2. Comparing the differences in ranking between version v2_6_1 and v2_6_0 for years 2020 - 2017
# Changes in ranking between versions / years
# Take the first 15 exporter groups from the list 'exporter_group_ranking_check_list' and plot the
# changes in ranking for each year (one plot per exporter group, with a line showing the changes through the years of each version)
top_ranking_changes = exporter_group_ranking_check[
exporter_group_ranking_check["exporter_group"].isin(
exporter_group_ranking_check_list[:15]
)
]
# Create subplots
fig, axes = plt.subplots(
len(top_ranking_changes["exporter_group"].unique()),
1,
figsize=(8, 3 * len(top_ranking_changes["exporter_group"].unique())),
sharex=False,
)
# Plot each exporter group in a separate subplot
for ax, exporter_group in zip(axes, top_ranking_changes["exporter_group"].unique()):
data = top_ranking_changes[top_ranking_changes["exporter_group"] == exporter_group]
# Plot current_rank and previous_rank
sns.lineplot(x="year", y="current_rank", data=data, ax=ax, label="v2_6_1")
sns.lineplot(x="year", y="previous_rank", data=data, ax=ax, label="v2_6_0")
ax.set_title(f"Exporter Group: {exporter_group}", fontsize=10)
ax.set_ylabel("Rank")
ax.set_xlabel("Year") # Add x-label for each subplot
ax.set_xticks([2020, 2019, 2018, 2017]) # Set specific years as x-axis labels
ax.invert_xaxis() # Start with 2020 and go down to 2017
# Ensure y-axis only has integer ticks
y_min, y_max = ax.get_ylim() # Get the current y-axis limits
ax.set_yticks(range(int(y_min), int(y_max) + 1)) # Set y-ticks to be integers
ax.tick_params(axis="y", labelsize=8)
# Adjust layout to reduce white space and ensure labels fit well
plt.tight_layout(rect=[0, 0, 1, 0.96], h_pad=0.5) # Reduce padding between subplots
fig.suptitle(
"Top exporters ranking changes btw v2_6_1 and v2_6_0 for 2020 - 2017", fontsize=14
)
plt.show()
# Show the details of the dataframe
exporter_group_ranking_check