Diet Trase Subnational Regions
s3://trase-storage/diet-trase/diet_trase_subnational_regions.parquet
Dbt path: trase_production.main.diet_trase_subnational_regions
Explore on Metabase: Full table; summary statistics
Containing yaml file link: trase/data_pipeline/models/diet_trase/_schema.yml
Model file link: trase/data_pipeline/models/diet_trase/diet_trase_subnational_regions.py
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph at the bottom right, tests, and downstream dependencies)
Tags: world, diet-trase, 2020
diet_trase_subnational_regions
Description
A dictionary of canonical subnational regions (codes and names) for countries in the Diet Trase model. This is used to validate and standardise region names.
Details
| Column | Type | Description |
|---|---|---|
| geocode | VARCHAR | |
| name | VARCHAR | |
| gadm_level | FLOAT | |
| country | VARCHAR | |
Models / Seeds
source.trase_duckdb.trase-storage-raw.diet_trase_brazil_coffee_2020_jurisdictions
source.trase_duckdb.trase-storage-raw.diet_trase_colombia_coffee_2020_jurisdictions
source.trase_duckdb.trase-storage-raw.ethopia_gadm_level_1
source.trase_duckdb.trase-storage-raw.diet_trase_india_coffee_2020_jurisdictions
source.trase_duckdb.trase-storage-raw.diet_trase_indonesia_coffee_2020_jurisdictions
source.trase_duckdb.trase-storage-raw.diet_trase_peru_coffee_2020_jurisdictions
source.trase_duckdb.trase-storage-raw.diet_trase_tanzania_coffee_2020_jurisdictions
Sources
['trase-storage-raw', 'diet_trase_brazil_coffee_2020_jurisdictions']
['trase-storage-raw', 'diet_trase_colombia_coffee_2020_jurisdictions']
['trase-storage-raw', 'ethopia_gadm_level_1']
['trase-storage-raw', 'diet_trase_india_coffee_2020_jurisdictions']
['trase-storage-raw', 'diet_trase_indonesia_coffee_2020_jurisdictions']
['trase-storage-raw', 'diet_trase_peru_coffee_2020_jurisdictions']
['trase-storage-raw', 'diet_trase_tanzania_coffee_2020_jurisdictions']
No called script or script source not found.
import polars as pl
def _label_regions(regions, gadm_level, country):
    """Keep the code/name columns of ``regions`` and tag them with level and country.

    Args:
        regions: Polars DataFrame with at least ``geocode`` and ``name`` columns.
        gadm_level: Number written as a literal into the ``gadm_level`` column.
        country: Country name written as a literal into the ``country`` column.

    Returns:
        A DataFrame with the columns ``geocode``, ``name``, ``gadm_level`` and
        ``country``, in that order.
    """
    return regions.select("geocode", "name").with_columns(
        gadm_level=pl.lit(gadm_level), country=pl.lit(country)
    )


def model(dbt, cursor):
    """Build the dictionary of subnational regions for the Diet Trase countries.

    One frame is assembled per country, either read from a ``trase-storage-raw``
    dbt source or hard-coded below, and tagged with a ``gadm_level`` and a
    ``country``. The frames are then stacked into a single table.

    Args:
        dbt: dbt object used to configure the model and to read the sources.
        cursor: Second argument passed in by dbt; not used by this model.

    Returns:
        A Polars DataFrame with the columns ``geocode``, ``name``,
        ``gadm_level`` (Float32) and ``country``.
    """
    dbt.config(materialized="external")

    # NOTE(review): the `dbt.source(...)` calls are deliberately written out one
    # by one with literal arguments instead of being folded into a helper or a
    # loop - dbt is understood to discover sources by statically parsing these
    # calls. Confirm before refactoring them.

    # Brazil ------------------------------------------------------------------------- #
    brazil = _label_regions(
        dbt.source(
            "trase-storage-raw", "diet_trase_brazil_coffee_2020_jurisdictions"
        ).pl(),
        6,
        "Brazil",
    )

    # Colombia ----------------------------------------------------------------------- #
    colombia = _label_regions(
        dbt.source(
            "trase-storage-raw", "diet_trase_colombia_coffee_2020_jurisdictions"
        ).pl(),
        3,
        "Colombia",
    )

    # Ethiopia ----------------------------------------------------------------------- #
    # I'd like to read the GeoJSON but can't quite figure out how to extract, so just
    # hard-coding for now. The source is still referenced although its result is
    # discarded - presumably so that dbt keeps it in this model's lineage.
    dbt.source("trase-storage-raw", "ethopia_gadm_level_1")
    ethiopia = _label_regions(
        pl.DataFrame(
            [
                {"geocode": "ETH.1_1", "name": "Addis Ababa"},
                {"geocode": "ETH.2_1", "name": "Afar"},
                {"geocode": "ETH.3_1", "name": "Amhara"},
                {"geocode": "ETH.4_1", "name": "Benishangul-Gumuz"},
                {"geocode": "ETH.5_1", "name": "Dire Dawa"},
                {"geocode": "ETH.6_1", "name": "Gambela Peoples"},
                {"geocode": "ETH.7_1", "name": "Harari People"},
                {"geocode": "ETH.8_1", "name": "Oromia"},
                {"geocode": "ETH.9_1", "name": "Somali"},
                {
                    "geocode": "ETH.10_1",
                    "name": "Southern Nations, Nationalities and Peoples",
                },
                {"geocode": "ETH.11_1", "name": "Tigray"},
            ]
        ),
        1,
        # Previously misspelled "Ethopia"; anything downstream that matched on the
        # old spelling needs to be updated alongside this change.
        "Ethiopia",
    )

    # India -------------------------------------------------------------------------- #
    india = _label_regions(
        dbt.source(
            "trase-storage-raw", "diet_trase_india_coffee_2020_jurisdictions"
        ).pl(),
        1,
        "India",
    )

    # Indonesia ---------------------------------------------------------------------- #
    indonesia = _label_regions(
        dbt.source(
            "trase-storage-raw", "diet_trase_indonesia_coffee_2020_jurisdictions"
        ).pl(),
        1,
        "Indonesia",
    )

    # Peru --------------------------------------------------------------------------- #
    peru = _label_regions(
        dbt.source(
            "trase-storage-raw", "diet_trase_peru_coffee_2020_jurisdictions"
        ).pl(),
        2,
        "Peru",
    )

    # Tanzania ----------------------------------------------------------------------- #
    tanzania = _label_regions(
        dbt.source(
            "trase-storage-raw", "diet_trase_tanzania_coffee_2020_jurisdictions"
        ).pl(),
        1,
        "Tanzania",
    )

    # Uganda ------------------------------------------------------------------------- #
    uganda = _label_regions(
        pl.DataFrame(
            [
                {"geocode": "UG-002", "name": "Ankole"},
                {"geocode": "UG-003", "name": "Buganda"},
                {"geocode": "UG-006", "name": "Bunyoro"},
                {"geocode": "UG-007", "name": "Busoga"},
                {"geocode": "UG-016", "name": "Elgon"},
                {"geocode": "UG-009", "name": "Kigezi"},
                {"geocode": "UG-014", "name": "Tooro"},
                {"geocode": "UG-015", "name": "West Nile"},
            ]
        ),
        # The only non-integer level in the table; presumably the reason the
        # column is stored as a float.
        1.5,
        "Uganda",
    )

    # Côte d'Ivoire ------------------------------------------------------------------ #
    # There doesn't seem to be a Parquet file for this so I'm just hard-coding here for
    # now
    cote_d_ivoire = _label_regions(
        pl.DataFrame(
            [
                {"geocode": "CIV.1_1", "name": "Abidjan"},
                {"geocode": "CIV.2_1", "name": "Bas-Sassandra"},
                {"geocode": "CIV.3_1", "name": "Comoé"},
                {"geocode": "CIV.4_1", "name": "Denguélé"},
                {"geocode": "CIV.5_1", "name": "Gôh-Djiboua"},
                {"geocode": "CIV.6_1", "name": "Lacs"},
                {"geocode": "CIV.7_1", "name": "Lagunes"},
                {"geocode": "CIV.8_1", "name": "Montagnes"},
                {"geocode": "CIV.9_1", "name": "Sassandra-Marahoué"},
                {"geocode": "CIV.10_1", "name": "Savanes"},
                {"geocode": "CIV.11_1", "name": "Vallée du Bandama"},
                {"geocode": "CIV.12_1", "name": "Woroba"},
                {"geocode": "CIV.13_1", "name": "Yamoussoukro"},
                {"geocode": "CIV.14_1", "name": "Zanzan"},
            ]
        ),
        1,
        "Côte d'Ivoire",
    )

    # Vietnam ------------------------------------------------------------------------ #
    vietnam = _label_regions(
        pl.DataFrame(
            [
                {"geocode": "VN-001", "name": "Northwest"},
                {"geocode": "VN-002", "name": "Northeast"},
                {"geocode": "VN-003", "name": "Red River Delta"},
                {"geocode": "VN-004", "name": "North Central Coast"},
                {"geocode": "VN-005", "name": "South Central Coast"},
                {"geocode": "VN-006", "name": "Central Highlands"},
                {"geocode": "VN-007", "name": "Southeast"},
                {"geocode": "VN-008", "name": "Mekong River Delta"},
            ]
        ),
        1,
        "Vietnam",
    )

    # Other countries - country level ------------------------------------------------ #
    # TODO: it would be better to have one unique code per country
    other_countries = _label_regions(
        pl.DataFrame([{"geocode": "NA", "name": "Country-level"}]),
        0,
        "Global",
    )

    frames = [
        brazil,
        colombia,
        cote_d_ivoire,
        ethiopia,
        india,
        indonesia,
        other_countries,
        peru,
        tanzania,
        uganda,
        vietnam,
    ]
    # The literal levels come out with different numeric dtypes (integer vs float),
    # so align them on Float32 before stacking the frames vertically.
    return pl.concat([df.cast({"gadm_level": pl.Float32}) for df in frames])