Main
Documentation
Because Cargill sources from Prolega, for the purposes of the model we've changed the NIT code on Cargill's flows (181110026) to that of Prolega (1015571020), so that Cargill's exports link to Prolega's crushing facility.
LP
While an ideal LP would be:
Bean --> Silo --> Crusher --> Port
for Bolivia, Javier believes that this will suffice:
Bean --> Crusher --> Port
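To make that concrete, here is a toy example in the shape the LP cell below uses: supply and demand as pandas Series, and costs as a Series indexed by (origin, destination) pairs. The numbers are made up, and the index/naming conventions are inferred from how the results are unpacked further down; treat it as a sketch, not the library's documented API.
import pandas as pd
from trase.tools.sps import solve_transportation_problem

# Toy data: two producing municipalities and two crushers (made-up numbers)
toy_supply = pd.Series({"MUN_A": 100.0, "MUN_B": 50.0})
toy_demand = pd.Series({"CRUSHER_1": 90.0, "CRUSHER_2": 60.0})

# Transport cost per tonne for each (origin, destination) pair
toy_costs = pd.Series(
    {
        ("MUN_A", "CRUSHER_1"): 1.0,
        ("MUN_A", "CRUSHER_2"): 4.0,
        ("MUN_B", "CRUSHER_1"): 3.0,
        ("MUN_B", "CRUSHER_2"): 2.0,
    }
)

# The LP allocates supply to demand at minimum total cost; here we expect
# MUN_A to mostly feed CRUSHER_1 and MUN_B to feed CRUSHER_2
toy_allocation = solve_transportation_problem(
    supply=toy_supply, demand=toy_demand, costs=toy_costs
)
toy_allocation[toy_allocation > 0]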
Other Notes
Javier believes that all traders in Bolivia are vertically integrated, so every trader should have a link to a processing facility.
In the case of Cargill, we have identified a link to a facility, but it is not direct (the facility is registered under a different NIT code)
Todo:
- handle flows that have no processing facility - one flow from DHL -- double-check there are no others; include and make "unknown"
- convert port of export code to trase id and upload to trase database
- Add domestic demand to crusher total demand; i.e. make total crusher demand = production
- use relative crushing facility capacity to estimate throughput; use oil + cake + residue constraints to model outputs
- check that output per facility is consistent with owner exports; i.e. total quantities, oil/cake split etc. make sense
- Ignore refineries
- Remove options to run at community level (i.e. run at municipality level)
- Check in with Clement re. the latest LP implementation, and make sure the inputs are correct
Things we had noted down to do when refining the model:
- Aggregate flows by product type before going to the LP
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

from trase.tools.sps import (
    SupplyChain,
    solve_transportation_problem,
    upload_pandas_df_to_s3,
)

from definition import years as model_years, version
from constants import HS6_TO_PRODUCT

# MODEL_RESOLUTION_BY_YEAR is used below but was never imported; it is assumed
# to live in definition alongside the other per-model settings
from definition import MODEL_RESOLUTION_BY_YEAR

# BadModelResolutionException is raised below but was never defined; a minimal
# definition so the notebook is self-contained
class BadModelResolutionException(Exception):
    pass

pd.set_option("display.float_format", "{:,.2f}".format)
A note for Maria/Simon: eventually, we want to move all the model code to a separate file called "model.py", and then we can run our models for multiple years very easily, simply by changing the values in range(year_start, year_end + 1) in the cell below.
But for now, since it's easier to have the code available in cells while editing, we can only run one year at a time. So to run results for multiple years: change the values in range(2019, 2020) below to match whatever year you want, run the whole notebook (please use hotkeys or the menu bar to do this, or I'll be turning over in my grave lol), then change the value in range() and run everything again.
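As a rough sketch of the intended shape (run_model is hypothetical; none of this exists yet):
# Hypothetical sketch of the planned model.py; run_model does not exist yet
import pandas as pd

def run_model(year: int) -> pd.DataFrame:
    """Run the full Bolivia soy model for one year and return the flows table."""
    raise NotImplementedError  # the cells below would move here

# The driver would then just be:
# for year in range(year_start, year_end + 1):
#     out = run_model(year)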
# Run supply chain model
for year in range(2019, 2020):
    print("-" * 40)
    print("Running {} {} {}...".format("Bolivia", "Soy", year))

    sc = SupplyChain("bolivia/soy", year=year, bucket="trase-storage")

    # Set up dataset
    sc.preparation()
    sc.load()

    model_resolution = MODEL_RESOLUTION_BY_YEAR[sc.context.year]
## Step 1: assign processing facilities to flows whose traders own a processing facility
flows_df = sc.get("flows")
## Convert Cargill's NIT to Prolega's so Cargill's flows link to Prolega's crushing facility, but keep exporter names separate, as they share crushing facilities
flows_df.loc[flows_df["nit"] == "181110026", "nit"] = "1015571020"
# add a derived column
flows_df["has_processing_facility"] = flows_df["nit"].isin(
    sc.get("assets")["nit_tax_id"]
)
# and convert another to the proper type
flows_df["weight_kg"] = flows_df["weight_kg"].astype(float)
# just to take a look at the data, doesn't actually change anything
flows_df.sample(3)
# Excluding the flows where no processing facility is linked to the exporter (very few)
flows_df[flows_df["has_processing_facility"] == False]
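Per the first todo above, a hedged sketch of how these flows could instead be retained with an "unknown" facility; this is not current behavior, and the variable below is unused downstream:
# Sketch only (not current behavior): keep the facility-less flows with an
# explicit "UNKNOWN" facility id instead of dropping them, per the todo above
flows_without_facility = flows_df[~flows_df["has_processing_facility"]].copy()
flows_without_facility["unique_id"] = "UNKNOWN"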
flows_with_facility = flows_df[flows_df["has_processing_facility"]]
assets_to_merge = sc.get("assets").set_index("nit_tax_id")["unique_id"]
flows_with_facility = (
    flows_with_facility.set_index("nit").join(assets_to_merge).reset_index()
)

# Total exported volume per processing facility, in tonnes
demand = (
    flows_with_facility.groupby("unique_id")["weight_kg"].sum() / 1000
).rename("weight_tn")
def get_cost_matrix_for_resolution(model_resolution):
    if model_resolution == "community":
        return (
            sc.get("distance_matrix")
            .set_index(["origin", "destination.unique_id"])["total_cost"]
            .astype(float)
        )
    if model_resolution == "municipality":
        return (
            sc.get("distance_matrix_municipality")
            .set_index(["origin", "destination.unique_id"])["total_cost"]
            .astype(float)
        )
    raise BadModelResolutionException(
        "Model resolution must be either 'community' or 'municipality'"
    )


def get_production_for_resolution(model_resolution):
    if model_resolution == "community":
        return sc.get("community_production").set_index("community_code")["soy_tn"]
    if model_resolution == "municipality":
        return sc.get("production").set_index("geocode")["value"]
    raise BadModelResolutionException(
        "Model resolution must be either 'community' or 'municipality'"
    )
# This is the actual LP, nicely packaged up in a function called solve_transportation_problem. It takes in 3 pandas Series objects.
# Series objects are like one column in a dataframe, but with row names included: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html
results = solve_transportation_problem(
    supply=get_production_for_resolution(model_resolution),
    demand=demand,
    costs=get_cost_matrix_for_resolution(model_resolution),
)
# format results
results = results[results > 0]
results = results.reset_index()
results = results.rename(columns={"sink": "processing_facility_id", "quantity": "tons"})
results = results.reset_index(drop=True).set_index("processing_facility_id")
results = results.join(demand).rename(
    columns={"weight_tn": "total_processing_facility_tn"}
)
results["proportion"] = results["tons"] / results["total_processing_facility_tn"]
flows_with_facility["weight_tn"] = flows_with_facility["weight_kg"] / 1000
In the next two cells, we link our LP results (COMMUNITY or MUNICIPALITY --> CRUSHER) back to the export flows: each flow is split across its facility's supply shed in proportion to the LP allocation.
expanded_flows = []
for ix, flow_row in tqdm(flows_with_facility.iterrows()):
    facility_id = flow_row["unique_id"]
    supply_shed = results[results.index == facility_id]
    for supply_ix, supply_row in supply_shed.iterrows():
        expanded_flows.append(
            {
                "nit": flow_row["nit"],
                "exporter_trase_id": flow_row["exporter_trase_id"],
                "hs6": flow_row["hs6"],
                "importer": flow_row["importer"],
                "port_of_export_code": flow_row["port_of_export_code"],
                "exporter": flow_row["exporter"],
                "country": flow_row["country"],
                "unique_id": flow_row["unique_id"],
                "tons": flow_row["weight_tn"] * supply_row["proportion"],
                model_resolution: supply_row["source"],
            }
        )

out = pd.DataFrame(expanded_flows)
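An aside: the row-wise loop above gets slow on large flow tables. A hedged, behavior-equivalent sketch using a single merge (untested against the real data; out_vec is a hypothetical name):
# Vectorized alternative to the loop above (untested sketch): merging on the
# facility id replicates each flow once per supply-shed source, then the
# tonnage is scaled by the LP proportion in one step
shed = results.reset_index()[["processing_facility_id", "source", "proportion"]]
out_vec = flows_with_facility.merge(
    shed, left_on="unique_id", right_on="processing_facility_id"
)
out_vec["tons"] = out_vec["weight_tn"] * out_vec["proportion"]
out_vec = out_vec.rename(columns={"source": model_resolution})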
Checking some sum totals here, as a sanity check:
out.tons.sum()
# community production should be 0 in all years except for 2019
sc.get("community_production").soy_tn.sum()
Add domestic flows below
(Maria/Simon: please double-check this logic/code. It seems to work at both resolution levels (it just assigns leftover production), but I didn't have time to QA it at all. - Andrew 7/30)
if model_resolution == "municipality":
exports_grouped_by_municipality = (
out.groupby("municipality").agg(sum).rename(columns={"tons": "export_tn"})
)
municipality_lvl_domestic_volume = (
exports_grouped_by_municipality.join(
sc.get("production").set_index("geocode")["value"],
how="outer",
)
.rename(columns={"value": "production_tn"})
.fillna(0)
)
# mostly only municipality 071005 has some domestic and some exported volume as of 7/30. Is this a problem? IDK! But worth noting.
municipality_lvl_domestic_volume["domestic_tn"] = (
municipality_lvl_domestic_volume["production_tn"]
- municipality_lvl_domestic_volume["export_tn"]
)
municipalities_with_domestic_demand = municipality_lvl_domestic_volume[
municipality_lvl_domestic_volume["domestic_tn"] > 0.0001
]
domestic_demand_rows = []
for municipality, row in municipalities_with_domestic_demand.iterrows():
dd_row = {
"nit": "DOMESTIC",
"exporter_trase_id": "DOMESTIC",
"hs6": "120100", # hard to know what product to assign to domestic flows, so I just did it as soybean. We could change this to oil or cake if we add crusher outputs/limitations to the model.
"importer": "DOMESTIC",
"port_of_export_code": "DOMESTIC",
"exporter": "DOMESTIC",
"country": "DOMESTIC",
"unique_id": "DOMESTIC",
"tons": row["domestic_tn"],
"municipality": municipality,
}
domestic_demand_rows.append(dd_row)
out = out.append(domestic_demand_rows)
if model_resolution == "community":
outputs_by_community = (
out.groupby("community").agg(sum).rename(columns={"tons": "export_tn"})
)
community_production = sc.get("community_production").set_index("community_code")[
"soy_tn"
]
community_prod_and_exports = pd.concat(
[community_production, outputs_by_community], axis=1, sort=True
)
community_prod_and_exports["export_tn"] = community_prod_and_exports[
"export_tn"
].fillna(0)
community_prod_and_exports["domestic_tn"] = (
community_prod_and_exports["soy_tn"] - community_prod_and_exports["export_tn"]
)
communities_with_domestic_demand = community_prod_and_exports[
community_prod_and_exports["domestic_tn"] > 0.0001
]
domestic_demand_rows = []
for community_code, row in communities_with_domestic_demand.iterrows():
dd_row = {
"nit": "DOMESTIC",
"exporter_trase_id": "DOMESTIC",
"hs6": "120100", # hard to know what product to assign to domestic flows, so I just did it as soybean. We could change this to oil or cake if we add crusher outputs/limitations to the model.
"importer": "DOMESTIC",
"port_of_export_code": "DOMESTIC",
"exporter": "DOMESTIC",
"country": "DOMESTIC",
"unique_id": "DOMESTIC",
"tons": row["domestic_tn"],
"community": community_code,
}
domestic_demand_rows.append(dd_row)
out = out.append(domestic_demand_rows)
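A quick mass-balance check that could be added here (a suggestion, not part of the original run): once domestic leftovers are assigned, total modelled tons should reconcile with total production for the year.
# Suggested sanity check (not in the original notebook): after adding the
# domestic flows, total modelled tons should equal total production; the
# one-tonne tolerance just allows for float noise and the 0.0001 cutoff above
total_production = get_production_for_resolution(model_resolution).sum()
assert abs(out["tons"].sum() - total_production) < 1.0, (
    "modelled flows do not reconcile with production"
)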
Link HS6 codes to products
out["product"] = out.hs6.apply(lambda hs6: HS6_TO_PRODUCT[hs6])
If model resolution is community, add municipality
def add_municipality_to_community_level_results(
    results_df: pd.DataFrame,
) -> pd.DataFrame:
    if model_resolution == "community":
        community_municipality_link = sc.get("communities")[
            ["community_code", "municipality"]
        ].rename(
            columns={
                "community_code": "community",
            }
        )
        return results_df.merge(
            community_municipality_link, how="left", on="community"
        )
    else:
        return results_df


out = add_municipality_to_community_level_results(out)
Add municipality name to municipality level results (based on geocode)
if model_resolution == "municipality":
municipality_name_and_geocode = sc.get("municipalities")[
["municipality", "geocode"]
].rename(columns={"geocode": "municipality_code"})
out = out.rename(columns={"municipality": "municipality_code"}).merge(
municipality_name_and_geocode, how="left", on="municipality_code"
)
# group by the columns used in the output to have smaller datasets
all_columns_except_tons = list(out.drop(columns="tons").columns)
out = out.groupby(all_columns_except_tons)["tons"].sum().reset_index()
Convert asset ids to Trase id format
out["asset_trase_id"] = out["unique_id"]
out = out.drop(columns="unique_id")
Write results to S3
write_results = False
if write_results:
    upload_pandas_df_to_s3(
        out,
        f"bolivia/soy/sei_pcs/results/{model_resolution}/SEIPCS_BO_SOY_{sc.context.year}_{datetime.now().strftime('%m%d%Y_%H%M')}.csv",
        bucket_name="trase-storage",
    )
Setting up a sankey just to check results
from trase.tools.jupyter import observable
com_mun_link = (
    sc.get("community_production")
    .merge(
        sc.get("municipalities")[["geocode", "municipality"]],
        left_on="municipality_code",
        right_on="geocode",
    )
    .rename(columns={"community_code": "community"})
)
# for_observable = out.set_index("community").join(com_mun_link[["community", "municipality"]].set_index("community"))
for_observable = out.merge(
    sc.get("municipalities")[["geocode", "municipality"]],
    how="left",
    left_on="municipality",
    right_on="geocode",
).rename(
    columns={"municipality_x": "municipality"}
)  # .drop('municipality_x', axis=1)
for_observable
observable.sankey(
    for_observable,
    value_column="tons",
    categorical_columns=["municipality", "exporter", "country"],
)
# observable.notebook(
#     "@trase/sankey",
#     ["chart"],
#     {
#         "data": results_map.to_dict(orient="records"),
#         "tooltipKey": "Results (TONS)",
#     },
# )