CWE 2019
s3://trase-storage/brazil/auxiliary/cwe/CWE_2019.csv
Dbt path: trase_production.main_brazil.cwe_2019
Explore on Metabase: Full table; summary statistics
Containing YAML file: trase/data_pipeline/models/brazil/auxiliary/cwe/_schema.yml
Model file: trase/data_pipeline/models/brazil/auxiliary/cwe/cwe_2019.py
Calls script: trase/data/brazil/auxiliary/cwe/CWE_2019.py
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph at the bottom right, tests, and downstream dependencies)
Tags: mock_model, auxiliary, brazil, cwe
cwe_2019
Description
This model was auto-generated from the .yml 'lineage' files in S3. The dbt model itself just raises an error; the actual script that created the data lives elsewhere, at trase/data/brazil/auxiliary/cwe/CWE_2019.py [permalink]. It was last run by Harry Biddle.
Details
| Column | Type | Description |
|---|---|---|
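| STATE_ID | text | Two-digit IBGE state code, zero-padded |
| NAME | text | State name, converted to ASCII and upper-cased |
| HEADS_TOTAL | integer | Head count slaughtered across all four quarters |
| CARCASS_WEIGHT_TOTAL | integer | Total carcass weight in kilograms across all four quarters |
| CW_KG_PER_HEAD | float | Carcass weight equivalence: kilograms per head |
| type | text | Animal type, "pork" or "chicken" |
| YEAR | integer | Survey year, always 2019 |

No columns are documented in the schema file; the rows above are inferred from the script reproduced below, and the types are best guesses.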
Models / Seeds
source.trase_duckdb.trase-storage-raw.igbe_animal_chicken_quarterly_survey_2019
source.trase_duckdb.trase-storage-raw.igbe_animal_pig_quarterly_survey_2019
Sources
['trase-storage-raw', 'igbe_animal_chicken_quarterly_survey_2019']
['trase-storage-raw', 'igbe_animal_pig_quarterly_survey_2019']
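Referenced script (trase/data/brazil/auxiliary/cwe/CWE_2019.py):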
"""
Compute "carcass weight equivalences" from the IGBE animal quarterly slaughter survey.
This is done by dividing the total weight in kilograms slaughtered in the whole year by
the head count slaughtered in the whole year, on a per-state basis.
This script was first used for 2019}: previous years were done by Javier.
"""
import pandas as pd
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.aws.metadata import write_csv_for_upload
# These are the columns that we will read from the Excel file. Note that in
# the Excel file the headers are spread across multiple rows!
COLUMNS = {
("Cód.", "Unnamed: 1_level_1", "Unnamed: 1_level_2"): "STATE_ID",
(
"Brasil e Unidade da Federação",
"Unnamed: 2_level_1",
"Unnamed: 2_level_2",
): "NAME",
("Tipo de inspeção", "Unnamed: 3_level_1", "Unnamed: 3_level_2"): "INSPECTION_TYPE",
(
"Trimestre x Referência temporal",
"1º trimestre 2019",
"Total do trimestre",
): "Q1",
(
"Trimestre x Referência temporal",
"2º trimestre 2019",
"Total do trimestre",
): "Q2",
(
"Trimestre x Referência temporal",
"3º trimestre 2019",
"Total do trimestre",
): "Q3",
(
"Trimestre x Referência temporal",
"4º trimestre 2019",
"Total do trimestre",
): "Q4",
}
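# Note (an assumption about the helper, not confirmed here): get_pandas_df_once
# forwards sheet_name/header to pandas.read_excel, so header=[2, 3, 4] yields a
# three-level column MultiIndex and each tuple key above addresses exactly one
# column, e.g.
# df[("Trimestre x Referência temporal", "1º trimestre 2019", "Total do trimestre")].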
def main():
df_pork = compute_carcass_weight_equivalences(
"brazil/production/slaughter/igbe/animal_slaughter_quarterly_survey/IGBE_ANIMAL_PIG_QUARTERLY_SURVEY_2019.xlsx",
"pork",
)
df_chicken = compute_carcass_weight_equivalences(
"brazil/production/slaughter/igbe/animal_slaughter_quarterly_survey/IGBE_ANIMAL_CHICKEN_QUARTERLY_SURVEY_2019.xlsx",
type_string="chicken",
)
df = pd.concat([df_pork, df_chicken]).assign(YEAR=2019)
write_csv_for_upload(df, "brazil/auxiliary/cwe/CWE_2019.csv")
def read_survey_data(s3_key, sheet, column):
"""Extract and parse survery data from a specific sheet"""
df = get_pandas_df_once(s3_key, sheet_name=sheet, header=[2, 3, 4], xlsx=True)
# select a subset of columns and then rename them
df = df[COLUMNS.keys()]
df.columns = COLUMNS.values()
# remove empty lines
df = df.dropna(how="all")
# select sum of all inspections
df = df[df.pop("INSPECTION_TYPE") == "Total"]
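    # (DataFrame.pop returns the column and removes it in place, so this one
    # line both filters to the "Total" rows and drops the helper column.)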
# ignore Brazil
df = df[df["STATE_ID"] != 1.0]
    # Sanitise the state names: convert UTF-8 characters to ASCII and
    # upper-case them
df["NAME"] = (
df["NAME"]
.str.normalize("NFKD")
.str.encode("ascii", errors="ignore")
.str.decode("utf-8")
.str.upper()
)
# state codes were stored as integers by excel: pad them to a two-digit code
df["STATE_ID"] = df["STATE_ID"].astype(int).astype(str).str.rjust(2, "0")
# replace a bunch of things that are placeholders for zero
df = df.replace("-", 0)
df = df.replace("X", 0) # this is when there are too few inspectors in the state
df = df.replace("...", 0)
# sum values across all quarters
columns = ["Q1", "Q2", "Q3", "Q4"]
df[column] = df[columns].astype(int).sum(axis=1)
df = df.drop(columns=columns)
assert not df.empty
return df
def compute_carcass_weight_equivalences(s3_key, type_string):
df_heads = read_survey_data(s3_key, sheet="Animais abatidos", column="HEADS_TOTAL")
df_weights = read_survey_data(
s3_key, sheet="Peso total das carcaças", column="CARCASS_WEIGHT_TOTAL"
)
    # Join together on states. In order to join without losing or duplicating
    # data, the states should match in both datasets. (The assertion below is
    # slightly too strict, since a different row order would also be fine.)
join_columns = ["STATE_ID", "NAME"]
assert not any(df_heads[join_columns].duplicated())
assert df_heads[join_columns].equals(df_weights[join_columns])
df = pd.merge(df_heads, df_weights, on=join_columns)
# compute ratio
df["CW_KG_PER_HEAD"] = (df["CARCASS_WEIGHT_TOTAL"] / df["HEADS_TOTAL"]).fillna(0)
# add type column
df["type"] = type_string
return df
if __name__ == "__main__":
main()
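Model file (trase/data_pipeline/models/brazil/auxiliary/cwe/cwe_2019.py):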
import pandas as pd
def model(dbt, cursor):
    # Declare the sources so dbt records the lineage shown above.
    dbt.source("trase-storage-raw", "igbe_animal_chicken_quarterly_survey_2019")
    dbt.source("trase-storage-raw", "igbe_animal_pig_quarterly_survey_2019")
    # Mock model: fail loudly if run. The data is actually produced by the
    # script above, so the return below is never reached.
    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})
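For downstream checks, the published table can be read straight back from S3. A minimal sketch, assuming s3fs is installed and AWS credentials are configured so that pandas accepts s3:// URLs (the column names come from the script above):
import pandas as pd
cwe = pd.read_csv("s3://trase-storage/brazil/auxiliary/cwe/CWE_2019.csv")
# One row per state and animal type; kilograms per head in CW_KG_PER_HEAD.
print(cwe[cwe["type"] == "pork"][["STATE_ID", "NAME", "CW_KG_PER_HEAD"]].head())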