Old Sif 2019
s3://trase-storage/brazil/production/statistics/sigsif/out/old/SIF_2019.csv
Dbt path: trase_production.main_brazil.old_sif_2019
Explore on Metabase: Full table; summary statistics
Containing yaml file link: trase/data_pipeline/models/brazil/production/statistics/sigsif/out/old/_schema.yml
Model file link: trase/data_pipeline/models/brazil/production/statistics/sigsif/out/old/old_sif_2019.py
Calls script: trase/data/brazil/production/statistics/sigsif/out/archive/SIF_2019.py
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes lineage graph -at the bottom right-, tests, and downstream dependencies)
Tags: mock_model, brazil, old, out, production, sigsif, statistics
old_sif_2019
Description
This model was auto-generated based on the .yml 'lineage' files in S3. The DBT model just raises an error; the actual script that created the data lives elsewhere. The script is located at trase/data/brazil/production/statistics/sigsif/out/archive/SIF_2019.py [permalink]. It was last run by Harry Biddle.
Details
| Column | Type | Description |
|---|---|---|
Models / Seeds
source.trase_duckdb.trase-storage-raw.id_mun_master_a; source.trase_duckdb.trase-storage-raw.sigsif_slaughter_2019
Sources
['trase-storage-raw', 'id_mun_master_a']; ['trase-storage-raw', 'sigsif_slaughter_2019']
"""
Filter and clean the SIGSIF data, which we assume has already been extracted from the
PDF using Tabula.
"""
import pandas as pd
from trase.tools.aws import get_pandas_df
from trase.tools.aws.metadata import write_csv_for_upload
########################################################################################
# Clean raw SIF data
########################################################################################
# Load the Tabula-extracted CSV entirely as strings so pandas' type inference cannot
# mangle the quantity column (it uses "." as a thousands separator).
df = get_pandas_df(
    "brazil/production/statistics/sigsif/original/SIGSIF_SLAUGHTER_2019.csv",
    na_values=[""],
    sep=",",
    dtype=str,
)
df.columns = ["STATE_SLAUGHTER", "MUNICIPALITY", "DESCRIPTION", "QUANTITY"]
# remove lines that are completely empty
df = df.dropna(how="all")
# fill empty quantities with 0 (must happen BEFORE the forward-fill below, otherwise
# a missing quantity would inherit the previous row's value)
df["QUANTITY"] = df["QUANTITY"].fillna("0")
# Forward-fill the remaining empty cells — the source table only prints
# state/municipality on the first row of each group.
# NOTE: fillna(method="ffill") is deprecated and removed in pandas 3.0; use ffill().
df = df.ffill()
# Split strings like "Água Fria/BA" into two columns
df[["MUNICIPALITY", "STATE"]] = df["MUNICIPALITY"].str.split("/", expand=True)
# Upper-case the municipality
df["MUNICIPALITY"] = df["MUNICIPALITY"].str.upper()
# get rid of accented characters (NFKD-decompose, then strip the non-ASCII marks)
df["MUNICIPALITY"] = df["MUNICIPALITY"].str.normalize("NFKD")
df["MUNICIPALITY"] = df["MUNICIPALITY"].str.encode("ascii", errors="ignore")
df["MUNICIPALITY"] = df["MUNICIPALITY"].str.decode("utf-8")
# replace some names so that it matches ID_MUN_MASTER_A.csv
for old_name, state, new_name in [
    ("POXOREU", "MT", "POXOREO"),
    ("ITABIRINHA DE MANTENA", "MG", "ITABIRINHA"),
    ("VILA BELA DA SANTISSIMA TRINDADE", "MT", "VILA BELA SANTISSIMA TRINDADE"),
]:
    df.loc[
        (df["MUNICIPALITY"] == old_name) & (df["STATE"] == state),
        "MUNICIPALITY",
    ] = new_name
# Drop rows without a municipality
df = df[df["MUNICIPALITY"] != "MUNICIPIO NAO CADASTRADO"]
# Parse quantity: strip the "." thousands separator literally.
# regex=False is essential — under pandas < 2.0 the default was regex=True, in which
# case "." matches ANY character and every quantity would be blanked out.
df["QUANTITY"] = df["QUANTITY"].str.replace(".", "", regex=False).astype(int)
########################################################################################
# Add a "TYPE" column (chicken/pork/etc.)
########################################################################################
# Map each Portuguese slaughter category to an English commodity type. Kept as an
# explicit, exhaustive mapping so any new category in the source data fails loudly
# in the assert below rather than slipping through unclassified.
_TYPE_TO_DESCRIPTIONS = {
    "CATTLE": [
        "BOVINO FEMEA",
        "BOVINO MACHO",
        "NOVILHA INTERMEDIARIA",
        "NOVILHA PRECOCE",
        "NOVILHAO",
        "NOVILHO INTERMEDIARIO",
        "NOVILHO PRECOCE",
        "NOVILHONA",
        "TOURO/TOURUNO",
        "VACA",
        "VITELO",
    ],
    "CHICKEN": [
        "FRANGO ESPECIAL",
        "FRANGO",
    ],
    "PORK": [
        "LEITAO",
        "LEITOA",
        "PORCA MATRIZEIRA",
        "PORCO MATRIZEIRO",
        "PORCO NAO CLASSIFICADO",
        "SUINO FEMEA",
        "SUINO MACHO",
        "SUINO TIPO BANHA, FEMEA",
        "SUINO TIPO BANHA, MACHO",
        "SUINO TIPO CARNE, FEMEA",
        "SUINO TIPO CARNE, MACHO",
    ],
    "OTHER": [
        # Animal products that are not relevant to us.
        # Intentionally keeping this as an explicit list!
        "RÃ",
        "ASININO FEMEA",
        "ASININO MACHO",
        "AVESTRUZ",
        "BUBALINO JOVEM",
        "BUFALA",
        "BUFALO",
        "CABRITO",
        "CAPAO",
        "CAPRINO FEMEA",
        "CAPRINO MACHO",
        "COELHO",
        "CORDEIRA",
        "CORDEIRO",
        "EQUINO FEMEA",
        "EQUINO MACHO",
        "FAIZÃO",
        "GANSO",
        "MARRECO",
        "MUAR FEMEA",
        "MUAR MACHO",
        "NONATO",
        "OVELHA",
        "OVINO FEMEA",
        "OVINO MACHO",
        "PATO",
        "PERDIZ",
        "PERU",
        # These products are chicken, but they are intentionally excluded:
        # we only look for broilers / meat birds.
        "GALINHA DE POSTURA",
        "GALINHA",
        "GALO",
        "GALETO",
    ],
}
# Flatten the mapping into a two-column lookup table for the merge.
df_descriptions = pd.DataFrame(
    [
        (commodity_type, description)
        for commodity_type, descriptions in _TYPE_TO_DESCRIPTIONS.items()
        for description in descriptions
    ],
    columns=["TYPE", "DESCRIPTION"],
)
df = df.merge(df_descriptions, on="DESCRIPTION", how="left")
# Every description must be classified; an unmapped one would leave TYPE as NaN.
assert df["TYPE"].notnull().all()
# Discard the uninteresting categories.
df = df[df["TYPE"] != "OTHER"]
########################################################################################
# Add a GEOCODE column for municipalities
########################################################################################
# Attach the IBGE geocode by joining on (MUNICIPALITY, STATE) against the master list.
rename_map = {"ID_MUN_IBGE": "GEOCODE", "MUN_NAME2": "MUNICIPALITY", "STATE_ID": "STATE"}
df_municipality = get_pandas_df(
    "brazil/metadata/ID_MUN_MASTER_A.csv",
    encoding="latin-1",
    usecols=rename_map,
    sep=";",
).rename(columns=rename_map)
# The master list must not contain duplicate (geocode, name, state) rows,
# otherwise the left-merge below would fan out.
assert not df_municipality.duplicated().any()
df = df.merge(df_municipality, on=["MUNICIPALITY", "STATE"], how="left")
# Every municipality/state pair must resolve to a geocode.
assert df["GEOCODE"].notnull().all()
########################################################################################
# Final check and write to disk
########################################################################################
# Sanity check: (geocode, state of slaughter, description) uniquely identifies a row.
key_columns = ["GEOCODE", "STATE_SLAUGHTER", "DESCRIPTION"]
assert not df.duplicated(subset=key_columns).any()
# write to disk (we leave it to the user to do the upload)
write_csv_for_upload(df, "brazil/production/statistics/sigsif/out/SIF_2019.csv")
import pandas as pd
def model(dbt, cursor):
    """Mock dbt model: register lineage to the upstream sources, then fail on purpose.

    The actual data is produced by an external script (see the file header); this
    model exists only so dbt tracks the dependencies declared below. Running it
    always raises NotImplementedError.
    """
    # Declare the upstream sources so dbt's lineage graph stays accurate.
    for table in ("id_mun_master_a", "sigsif_slaughter_2019"):
        dbt.source("trase-storage-raw", table)
    # Placeholder model — executing it is always an error.
    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})  # unreachable; kept from the template