CWE 2019
s3://trase-storage/brazil/auxiliary/cwe/CWE_2019.csv
Dbt path: trase_production.main_brazil.cwe_2019
Explore on Metabase: Full table; summary statistics
Containing YAML file: trase/data_pipeline/models/brazil/auxiliary/cwe/_schema.yml
Model file: trase/data_pipeline/models/brazil/auxiliary/cwe/cwe_2019.py
Calls script: trase/data/brazil/auxiliary/cwe/CWE_2019.py
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph at the bottom right, tests, and downstream dependencies)
Tags: mock_model, auxiliary, brazil, cwe
cwe_2019
Description
This model was auto-generated from the .yml 'lineage' files in S3. The dbt model itself just raises an error; the actual script that created the data lives elsewhere, at trase/data/brazil/auxiliary/cwe/CWE_2019.py [permalink]. It was last run by Harry Biddle.
Details
| Column | Type | Description |
|---|---|---|
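| STATE_ID | text | Two-digit IBGE state code, zero-padded |
| NAME | text | State name, converted to ASCII and upper-cased |
| HEADS_TOTAL | integer | Head count slaughtered across all four quarters |
| CARCASS_WEIGHT_TOTAL | integer | Total carcass weight in kilograms across all four quarters |
| CW_KG_PER_HEAD | float | Carcass weight equivalence: kilograms per head |
| type | text | Animal type, "pork" or "chicken" |
| YEAR | integer | Survey year, always 2019 |

No columns are documented in the schema file; the rows above are inferred from the script reproduced below, and the types are best guesses.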
Models / Seeds
source.trase_duckdb.trase-storage-raw.igbe_animal_chicken_quarterly_survey_2019
source.trase_duckdb.trase-storage-raw.igbe_animal_pig_quarterly_survey_2019
Sources
['trase-storage-raw', 'igbe_animal_chicken_quarterly_survey_2019']
['trase-storage-raw', 'igbe_animal_pig_quarterly_survey_2019']
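Referenced script (trase/data/brazil/auxiliary/cwe/CWE_2019.py):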
"""
Compute "carcass weight equivalences" from the IGBE animal quarterly slaughter survey.
This is done by dividing the total weight in kilograms slaughtered in the whole year by
the head count slaughtered in the whole year, on a per-state basis.
This script was first used for 2019}: previous years were done by Javier.
"""
import pandas as pd
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.aws.metadata import write_csv_for_upload
# These are the columns that we will read from the Excel file. Note that in
# the Excel file the headers are spread across multiple rows!
COLUMNS = {
("Cód.", "Unnamed: 1_level_1", "Unnamed: 1_level_2"): "STATE_ID",
(
"Brasil e Unidade da Federação",
"Unnamed: 2_level_1",
"Unnamed: 2_level_2",
): "NAME",
("Tipo de inspeção", "Unnamed: 3_level_1", "Unnamed: 3_level_2"): "INSPECTION_TYPE",
(
"Trimestre x Referência temporal",
"1º trimestre 2019",
"Total do trimestre",
): "Q1",
(
"Trimestre x Referência temporal",
"2º trimestre 2019",
"Total do trimestre",
): "Q2",
(
"Trimestre x Referência temporal",
"3º trimestre 2019",
"Total do trimestre",
): "Q3",
(
"Trimestre x Referência temporal",
"4º trimestre 2019",
"Total do trimestre",
): "Q4",
}
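# Note (an assumption about the helper, not confirmed here): get_pandas_df_once
# forwards sheet_name/header to pandas.read_excel, so header=[2, 3, 4] yields a
# three-level column MultiIndex and each tuple key above addresses exactly one
# column, e.g.
# df[("Trimestre x Referência temporal", "1º trimestre 2019", "Total do trimestre")].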
def main():
df_pork = compute_carcass_weight_equivalences(
"brazil/production/slaughter/igbe/animal_slaughter_quarterly_survey/IGBE_ANIMAL_PIG_QUARTERLY_SURVEY_2019.xlsx",
"pork",
)
df_chicken = compute_carcass_weight_equivalences(
"brazil/production/slaughter/igbe/animal_slaughter_quarterly_survey/IGBE_ANIMAL_CHICKEN_QUARTERLY_SURVEY_2019.xlsx",
type_string="chicken",
)
df = pd.concat([df_pork, df_chicken]).assign(YEAR=2019)
write_csv_for_upload(df, "brazil/auxiliary/cwe/CWE_2019.csv")
def read_survey_data(s3_key, sheet, column):
"""Extract and parse survery data from a specific sheet"""
df = get_pandas_df_once(s3_key, sheet_name=sheet, header=[2, 3, 4], xlsx=True)
# select a subset of columns and then rename them
df = df[COLUMNS.keys()]
df.columns = COLUMNS.values()
# remove empty lines
df = df.dropna(how="all")
# select sum of all inspections
df = df[df.pop("INSPECTION_TYPE") == "Total"]
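    # (DataFrame.pop returns the column and removes it in place, so this one
    # line both filters to the "Total" rows and drops the helper column.)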
# ignore Brazil
df = df[df["STATE_ID"] != 1.0]
    # Sanitise the state names: convert UTF-8 characters to ASCII and
    # upper-case them
df["NAME"] = (
df["NAME"]
.str.normalize("NFKD")
.str.encode("ascii", errors="ignore")
.str.decode("utf-8")
.str.upper()
)
# state codes were stored as integers by excel: pad them to a two-digit code
df["STATE_ID"] = df["STATE_ID"].astype(int).astype(str).str.rjust(2, "0")
# replace a bunch of things that are placeholders for zero
df = df.replace("-", 0)
df = df.replace("X", 0) # this is when there are too few inspectors in the state
df = df.replace("...", 0)
# sum values across all quarters
columns = ["Q1", "Q2", "Q3", "Q4"]
df[column] = df[columns].astype(int).sum(axis=1)
df = df.drop(columns=columns)
assert not df.empty
return df
def compute_carcass_weight_equivalences(s3_key, type_string):
df_heads = read_survey_data(s3_key, sheet="Animais abatidos", column="HEADS_TOTAL")
df_weights = read_survey_data(
s3_key, sheet="Peso total das carcaças", column="CARCASS_WEIGHT_TOTAL"
)
    # Join together on states. In order to join without losing or duplicating
    # data, the states should match in both datasets. (The assertion below is
    # slightly too strict, since a different row order would also be fine.)
join_columns = ["STATE_ID", "NAME"]
assert not any(df_heads[join_columns].duplicated())
assert df_heads[join_columns].equals(df_weights[join_columns])
df = pd.merge(df_heads, df_weights, on=join_columns)
# compute ratio
df["CW_KG_PER_HEAD"] = (df["CARCASS_WEIGHT_TOTAL"] / df["HEADS_TOTAL"]).fillna(0)
# add type column
df["type"] = type_string
return df
if __name__ == "__main__":
main()
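Model file (trase/data_pipeline/models/brazil/auxiliary/cwe/cwe_2019.py):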
import pandas as pd
def model(dbt, cursor):
    # Declare the sources so dbt records the lineage shown above.
    dbt.source("trase-storage-raw", "igbe_animal_chicken_quarterly_survey_2019")
    dbt.source("trase-storage-raw", "igbe_animal_pig_quarterly_survey_2019")
    # Mock model: fail loudly if run. The data is actually produced by the
    # script above, so the return below is never reached.
    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})
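For downstream checks, the published table can be read straight back from S3. A minimal sketch, assuming s3fs is installed and AWS credentials are configured so that pandas accepts s3:// URLs (the column names come from the script above):
import pandas as pd
cwe = pd.read_csv("s3://trase-storage/brazil/auxiliary/cwe/CWE_2019.csv")
# One row per state and animal type; kilograms per head in CW_KG_PER_HEAD.
print(cwe[cwe["type"] == "pork"][["STATE_ID", "NAME", "CW_KG_PER_HEAD"]].head())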