Skip to content

Diet Trase Subnational Regions

s3://trase-storage/diet-trase/diet_trase_subnational_regions.parquet

Dbt path: trase_production.main.diet_trase_subnational_regions

Explore on Metabase: Full table; summary statistics

Containing yaml file link: trase/data_pipeline/models/diet_trase/_schema.yml

Model file link: trase/data_pipeline/models/diet_trase/diet_trase_subnational_regions.py

Dbt test runs & lineage: Test results · Lineage

Full dbt_docs page: Open in dbt docs (includes lineage graph -at the bottom right-, tests, and downstream dependencies)

Tags: world, diet-trase, 2020


diet_trase_subnational_regions

Description

A dictionary of canonical subnational regions (codes and names) for countries in the Diet Trase model. This is used to validate and standardise region names


Details

Column Type Description
geocode VARCHAR
name VARCHAR
gadm_level FLOAT
country VARCHAR

Models / Seeds

  • source.trase_duckdb.trase-storage-raw.diet_trase_brazil_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_colombia_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.ethopia_gadm_level_1
  • source.trase_duckdb.trase-storage-raw.diet_trase_india_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_indonesia_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_peru_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_tanzania_coffee_2020_jurisdictions

Sources

  • ['trase-storage-raw', 'diet_trase_brazil_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_colombia_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'ethopia_gadm_level_1']
  • ['trase-storage-raw', 'diet_trase_india_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_indonesia_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_peru_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_tanzania_coffee_2020_jurisdictions']

No called script or script source not found.

import polars as pl


def model(dbt, cursor):
    dbt.config(materialized="external")

    # Brazil ------------------------------------------------------------------------- #
    brazil = dbt.source(
        "trase-storage-raw", "diet_trase_brazil_coffee_2020_jurisdictions"
    ).pl()
    brazil = brazil.select("geocode", "name")
    brazil = brazil.with_columns(gadm_level=pl.lit(6), country=pl.lit("Brazil"))

    # Colombia ----------------------------------------------------------------------- #
    colombia = dbt.source(
        "trase-storage-raw", "diet_trase_colombia_coffee_2020_jurisdictions"
    ).pl()
    colombia = colombia.select("geocode", "name")
    colombia = colombia.with_columns(gadm_level=pl.lit(3), country=pl.lit("Colombia"))

    # Ethiopia ----------------------------------------------------------------------- #
    # I'd like to read the GeoJSON but can't quite figure out how to extract, so just
    # hard-coding for now
    dbt.source("trase-storage-raw", "ethopia_gadm_level_1")
    ethiopia = pl.DataFrame(
        [
            {"geocode": "ETH.1_1", "name": "Addis Ababa"},
            {"geocode": "ETH.2_1", "name": "Afar"},
            {"geocode": "ETH.3_1", "name": "Amhara"},
            {"geocode": "ETH.4_1", "name": "Benishangul-Gumuz"},
            {"geocode": "ETH.5_1", "name": "Dire Dawa"},
            {"geocode": "ETH.6_1", "name": "Gambela Peoples"},
            {"geocode": "ETH.7_1", "name": "Harari People"},
            {"geocode": "ETH.8_1", "name": "Oromia"},
            {"geocode": "ETH.9_1", "name": "Somali"},
            {
                "geocode": "ETH.10_1",
                "name": "Southern Nations, Nationalities and Peoples",
            },
            {"geocode": "ETH.11_1", "name": "Tigray"},
        ]
    )
    ethiopia = ethiopia.with_columns(gadm_level=pl.lit(1), country=pl.lit("Ethopia"))

    # India -------------------------------------------------------------------------- #
    india = dbt.source(
        "trase-storage-raw", "diet_trase_india_coffee_2020_jurisdictions"
    ).pl()
    india = india.select("geocode", "name")
    india = india.with_columns(gadm_level=pl.lit(1), country=pl.lit("India"))

    # Indonesia ---------------------------------------------------------------------- #
    indonesia = dbt.source(
        "trase-storage-raw", "diet_trase_indonesia_coffee_2020_jurisdictions"
    ).pl()
    indonesia = indonesia.select("geocode", "name")
    indonesia = indonesia.with_columns(
        gadm_level=pl.lit(1), country=pl.lit("Indonesia")
    )

    # Peru --------------------------------------------------------------------------- #
    peru = dbt.source(
        "trase-storage-raw", "diet_trase_peru_coffee_2020_jurisdictions"
    ).pl()
    peru = peru.select("geocode", "name")
    peru = peru.with_columns(gadm_level=pl.lit(2), country=pl.lit("Peru"))

    # Tanzania ----------------------------------------------------------------------- #
    tanzania = dbt.source(
        "trase-storage-raw", "diet_trase_tanzania_coffee_2020_jurisdictions"
    ).pl()
    tanzania = tanzania.select("geocode", "name")
    tanzania = tanzania.with_columns(gadm_level=pl.lit(1), country=pl.lit("Tanzania"))

    # Uganda ------------------------------------------------------------------------- #
    uganda = pl.DataFrame(
        [
            {"geocode": "UG-002", "name": "Ankole"},
            {"geocode": "UG-003", "name": "Buganda"},
            {"geocode": "UG-006", "name": "Bunyoro"},
            {"geocode": "UG-007", "name": "Busoga"},
            {"geocode": "UG-016", "name": "Elgon"},
            {"geocode": "UG-009", "name": "Kigezi"},
            {"geocode": "UG-014", "name": "Tooro"},
            {"geocode": "UG-015", "name": "West Nile"},
        ]
    )
    uganda = uganda.with_columns(gadm_level=pl.lit(1.5), country=pl.lit("Uganda"))

    # Côte d'Ivoire ------------------------------------------------------------------ #
    # There doesn't seem to be a Parquet file for this so I'm just hard-coding here for
    # now
    cote_d_ivoire = pl.DataFrame(
        [
            {"geocode": "CIV.1_1", "name": "Abidjan"},
            {"geocode": "CIV.2_1", "name": "Bas-Sassandra"},
            {"geocode": "CIV.3_1", "name": "Comoé"},
            {"geocode": "CIV.4_1", "name": "Denguélé"},
            {"geocode": "CIV.5_1", "name": "Gôh-Djiboua"},
            {"geocode": "CIV.6_1", "name": "Lacs"},
            {"geocode": "CIV.7_1", "name": "Lagunes"},
            {"geocode": "CIV.8_1", "name": "Montagnes"},
            {"geocode": "CIV.9_1", "name": "Sassandra-Marahoué"},
            {"geocode": "CIV.10_1", "name": "Savanes"},
            {"geocode": "CIV.11_1", "name": "Vallée du Bandama"},
            {"geocode": "CIV.12_1", "name": "Woroba"},
            {"geocode": "CIV.13_1", "name": "Yamoussoukro"},
            {"geocode": "CIV.14_1", "name": "Zanzan"},
        ]
    )
    cote_d_ivoire = cote_d_ivoire.with_columns(
        gadm_level=pl.lit(1),
        country=pl.lit("Côte d'Ivoire"),
    )

    # Vietnam ------------------------------------------------------------------------ #
    vietnam = pl.DataFrame(
        [
            {"geocode": "VN-001", "name": "Northwest"},
            {"geocode": "VN-002", "name": "Northeast"},
            {"geocode": "VN-003", "name": "Red River Delta"},
            {"geocode": "VN-004", "name": "North Central Coast"},
            {"geocode": "VN-005", "name": "South Central Coast"},
            {"geocode": "VN-006", "name": "Central Highlands"},
            {"geocode": "VN-007", "name": "Southeast"},
            {"geocode": "VN-008", "name": "Mekong River Delta"},
        ]
    )
    vietnam = vietnam.with_columns(gadm_level=pl.lit(1), country=pl.lit("Vietnam"))

    # Other countries - country level ------------------------------------------------ #
    # TODO: it would be better to have one unique code per country
    other_countries = pl.DataFrame(
        [
            {
                "geocode": "NA",
                "name": "Country-level",
                "gadm_level": 0,
                "country": "Global",
            }
        ]
    )

    return pl.concat(
        df.cast({"gadm_level": pl.Float32})
        for df in [
            brazil,
            colombia,
            cote_d_ivoire,
            ethiopia,
            india,
            indonesia,
            other_countries,
            peru,
            tanzania,
            uganda,
            vietnam,
        ]
    )