Skip to content

DBT: Diet Trase Subnational Regions

File location: s3://trase-storage/diet-trase/diet_trase_subnational_regions.parquet

DBT model name: diet_trase_subnational_regions

Explore on Metabase: Full table; summary statistics

DBT details


Description

A dictionary of canonical subnational regions (codes and names) for countries in the Diet Trase model. This is used to validate and standardise region names


Details

Column Type Description
geocode VARCHAR
name VARCHAR
gadm_level FLOAT
country VARCHAR

Models / Seeds

  • source.trase_duckdb.trase-storage-raw.diet_trase_brazil_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_colombia_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.ethopia_gadm_level_1
  • source.trase_duckdb.trase-storage-raw.diet_trase_india_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_indonesia_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_peru_coffee_2020_jurisdictions
  • source.trase_duckdb.trase-storage-raw.diet_trase_tanzania_coffee_2020_jurisdictions

Sources

  • ['trase-storage-raw', 'diet_trase_brazil_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_colombia_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'ethopia_gadm_level_1']
  • ['trase-storage-raw', 'diet_trase_india_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_indonesia_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_peru_coffee_2020_jurisdictions']
  • ['trase-storage-raw', 'diet_trase_tanzania_coffee_2020_jurisdictions']

No called script or script source not found.

import polars as pl


def model(dbt, cursor):
    dbt.config(materialized="external")

    # Brazil ------------------------------------------------------------------------- #
    brazil = dbt.source(
        "trase-storage-raw", "diet_trase_brazil_coffee_2020_jurisdictions"
    ).pl()
    brazil = brazil.select("geocode", "name")
    brazil = brazil.with_columns(gadm_level=pl.lit(6), country=pl.lit("Brazil"))

    # Colombia ----------------------------------------------------------------------- #
    colombia = dbt.source(
        "trase-storage-raw", "diet_trase_colombia_coffee_2020_jurisdictions"
    ).pl()
    colombia = colombia.select("geocode", "name")
    colombia = colombia.with_columns(gadm_level=pl.lit(3), country=pl.lit("Colombia"))

    # Ethiopia ----------------------------------------------------------------------- #
    # I'd like to read the GeoJSON but can't quite figure out how to extract, so just
    # hard-coding for now
    dbt.source("trase-storage-raw", "ethopia_gadm_level_1")
    ethiopia = pl.DataFrame(
        [
            {"geocode": "ETH.1_1", "name": "Addis Ababa"},
            {"geocode": "ETH.2_1", "name": "Afar"},
            {"geocode": "ETH.3_1", "name": "Amhara"},
            {"geocode": "ETH.4_1", "name": "Benishangul-Gumuz"},
            {"geocode": "ETH.5_1", "name": "Dire Dawa"},
            {"geocode": "ETH.6_1", "name": "Gambela Peoples"},
            {"geocode": "ETH.7_1", "name": "Harari People"},
            {"geocode": "ETH.8_1", "name": "Oromia"},
            {"geocode": "ETH.9_1", "name": "Somali"},
            {
                "geocode": "ETH.10_1",
                "name": "Southern Nations, Nationalities and Peoples",
            },
            {"geocode": "ETH.11_1", "name": "Tigray"},
        ]
    )
    ethiopia = ethiopia.with_columns(gadm_level=pl.lit(1), country=pl.lit("Ethopia"))

    # India -------------------------------------------------------------------------- #
    india = dbt.source(
        "trase-storage-raw", "diet_trase_india_coffee_2020_jurisdictions"
    ).pl()
    india = india.select("geocode", "name")
    india = india.with_columns(gadm_level=pl.lit(1), country=pl.lit("India"))

    # Indonesia ---------------------------------------------------------------------- #
    indonesia = dbt.source(
        "trase-storage-raw", "diet_trase_indonesia_coffee_2020_jurisdictions"
    ).pl()
    indonesia = indonesia.select("geocode", "name")
    indonesia = indonesia.with_columns(
        gadm_level=pl.lit(1), country=pl.lit("Indonesia")
    )

    # Peru --------------------------------------------------------------------------- #
    peru = dbt.source(
        "trase-storage-raw", "diet_trase_peru_coffee_2020_jurisdictions"
    ).pl()
    peru = peru.select("geocode", "name")
    peru = peru.with_columns(gadm_level=pl.lit(2), country=pl.lit("Peru"))

    # Tanzania ----------------------------------------------------------------------- #
    tanzania = dbt.source(
        "trase-storage-raw", "diet_trase_tanzania_coffee_2020_jurisdictions"
    ).pl()
    tanzania = tanzania.select("geocode", "name")
    tanzania = tanzania.with_columns(gadm_level=pl.lit(1), country=pl.lit("Tanzania"))

    # Uganda ------------------------------------------------------------------------- #
    uganda = pl.DataFrame(
        [
            {"geocode": "UG-002", "name": "Ankole"},
            {"geocode": "UG-003", "name": "Buganda"},
            {"geocode": "UG-006", "name": "Bunyoro"},
            {"geocode": "UG-007", "name": "Busoga"},
            {"geocode": "UG-016", "name": "Elgon"},
            {"geocode": "UG-009", "name": "Kigezi"},
            {"geocode": "UG-014", "name": "Tooro"},
            {"geocode": "UG-015", "name": "West Nile"},
        ]
    )
    uganda = uganda.with_columns(gadm_level=pl.lit(1.5), country=pl.lit("Uganda"))

    # Côte d'Ivoire ------------------------------------------------------------------ #
    # There doesn't seem to be a Parquet file for this so I'm just hard-coding here for
    # now
    cote_d_ivoire = pl.DataFrame(
        [
            {"geocode": "CIV.1_1", "name": "Abidjan"},
            {"geocode": "CIV.2_1", "name": "Bas-Sassandra"},
            {"geocode": "CIV.3_1", "name": "Comoé"},
            {"geocode": "CIV.4_1", "name": "Denguélé"},
            {"geocode": "CIV.5_1", "name": "Gôh-Djiboua"},
            {"geocode": "CIV.6_1", "name": "Lacs"},
            {"geocode": "CIV.7_1", "name": "Lagunes"},
            {"geocode": "CIV.8_1", "name": "Montagnes"},
            {"geocode": "CIV.9_1", "name": "Sassandra-Marahoué"},
            {"geocode": "CIV.10_1", "name": "Savanes"},
            {"geocode": "CIV.11_1", "name": "Vallée du Bandama"},
            {"geocode": "CIV.12_1", "name": "Woroba"},
            {"geocode": "CIV.13_1", "name": "Yamoussoukro"},
            {"geocode": "CIV.14_1", "name": "Zanzan"},
        ]
    )
    cote_d_ivoire = cote_d_ivoire.with_columns(
        gadm_level=pl.lit(1),
        country=pl.lit("Côte d'Ivoire"),
    )

    # Vietnam ------------------------------------------------------------------------ #
    vietnam = pl.DataFrame(
        [
            {"geocode": "VN-001", "name": "Northwest"},
            {"geocode": "VN-002", "name": "Northeast"},
            {"geocode": "VN-003", "name": "Red River Delta"},
            {"geocode": "VN-004", "name": "North Central Coast"},
            {"geocode": "VN-005", "name": "South Central Coast"},
            {"geocode": "VN-006", "name": "Central Highlands"},
            {"geocode": "VN-007", "name": "Southeast"},
            {"geocode": "VN-008", "name": "Mekong River Delta"},
        ]
    )
    vietnam = vietnam.with_columns(gadm_level=pl.lit(1), country=pl.lit("Vietnam"))

    # Other countries - country level ------------------------------------------------ #
    # TODO: it would be better to have one unique code per country
    other_countries = pl.DataFrame(
        [
            {
                "geocode": "NA",
                "name": "Country-level",
                "gadm_level": 0,
                "country": "Global",
            }
        ]
    )

    return pl.concat(
        df.cast({"gadm_level": pl.Float32})
        for df in [
            brazil,
            colombia,
            cote_d_ivoire,
            ethiopia,
            india,
            indonesia,
            other_countries,
            peru,
            tanzania,
            uganda,
            vietnam,
        ]
    )