Sicasq 2021
s3://trase-storage/brazil/logistics/sicasq/out/SICASQ_2021.csv
Dbt path: trase_production.main_brazil.sicasq_2021
Explore on Metabase: Full table; summary statistics
Containing yaml file link: trase/data_pipeline/models/brazil/logistics/sicasq/out/_schema.yml
Model file link: trase/data_pipeline/models/brazil/logistics/sicasq/out/sicasq_2021.py
Calls script: trase/data/brazil/logistics/sicasq/out/SICASQ_2021.py
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph at the bottom right, tests, and downstream dependencies)
Tags: mock_model, brazil, logistics, out, sicasq
sicasq_2021
Description
This model was auto-generated from the .yml 'lineage' files in S3. The dbt model itself only raises an error; the actual script that created the data lives outside dbt, at trase/data/brazil/logistics/sicasq/out/SICASQ_2021.py [permalink]. It was last run by Harry Biddle.
Details
| Column | Type | Description |
|---|---|---|
Models / Seeds
source.trase_duckdb.trase-storage-raw.br_sicasq_original_20210717
Sources
trase-storage-raw.br_sicasq_original_20210717
"""
Brazil - SICASQ (Sistema de Cadastro dos Agentes da Cadeira Produtiva de Vegetais)
"""
from datetime import datetime
import pandas as pd
from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
from trase.tools.aws.metadata import write_csv_for_upload
from trase.tools.sei_pcs.pandas_utilities import rename
def main():
start_time = datetime.now()
print("Script started:", start_time)
df = load_data()
df = translate_products_name(df)
df = fix_countries_destination_names(df)
df = organize_data_structure(df)
write_csv_for_upload(df, "brazil/logistics/sicasq/out/SICASQ_2021.csv")
print("Script took ", datetime.now() - start_time, "to run")
def load_data():
"""Load data from S3 bucket"""
s3_key = "brazil/logistics/sicasq/ori/br_sicasq_original_20210717.csv"
df = get_pandas_df_once(s3_key, sep=";", dtype=str, keep_default_na=False)
df = rename(
df,
columns={
"Estabelecimento": "COMPANY",
"Produto": "PRODUCT",
"CNPJ/CPF": "CNPJ",
"Situação do Estabelecimento": "STATUS",
"País de Destino": "DESTINATION",
"Bloco Econômico": "ECONOMIC BLOC",
"Validade do Registro": "DATE OF VALIDITY",
"Endereço(UF,Município,Bairro,Logradouro,Complemento,Número)": "ADDRESS",
"E-Mail": "EMAIL",
},
)
df["SICASQ_YEAR"] = "2021"
df["SICASQ_DOWNLOAD_DATE"] = "2021-07-17"
df["CNPJ_2"] = (
df["CNPJ"].str.replace("/", "").str.replace(".", "").str.replace("-", "")
)
return df
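# Illustrative example (made-up CNPJ, not taken from the data): after the
# replacements above, a formatted identifier such as "12.345.678/0001-95" in
# CNPJ becomes the digits-only string "12345678000195" in CNPJ_2, e.g.:
#
#     pd.Series(["12.345.678/0001-95"]).str.replace("/", "", regex=False) \
#         .str.replace(".", "", regex=False).str.replace("-", "", regex=False)
#     # 0    12345678000195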
def translate_products_name(df: pd.DataFrame):
"""Translate PT-BR product names into EN-US"""
products = {
"CASTANHA DO BRASIL": "BRAZIL NUT",
"AZEITE DE OLIVA": "OLIVE OIL",
"AÇUCAR": "SUGAR",
"SOJA": "SOY",
"ARROZ BENEFICIADO": "BENEFITED RICE",
"ARROZ EM CASCA": "RICE",
"CANJICA DE MILHO": "CORN HOMINY",
"FEIJAO": "BEANS",
"FARINHA DE MANDIOCA": "CASSAVA FLOUR",
"MILHO": "CORN",
"MAMAO": "PAPAYA",
"MANGA": "MANGO",
"PRODUTOS AMILACEOS DERIVADOS DA MANDIOCA": "STILLY PRODUCTS DERIVED FROM CASSAVA",
"ERVILHA": "GREEN PEAS",
"LENTILHA": "LENTILS",
"MILHO PIPOCA": "POPCORN CORN",
"FARINHA DE TRIGO": "WHEAT FLOUR",
"UVA FINA DE MESA": "FINE TABLE GRAPE",
"CAFE BENEFICIADO GRAO CRU": "COFFEE",
"RASPA DE MANDIOCA": "CASSAVA SCRAP",
"AMENDOIM BENEFICIADO": "BENEFITED PEANUTS",
"AMENDOIM EM CASCA": "PEANUT IN SHELL",
"MELAO": "MELON",
"PIMENTA DO REINO": "BLACK PEPPER",
"FARELO DE SOJA": "SOY CAKE",
"OLEO DE SOJA BRUTO E DEGOMADO": "GROSS AND DEGUMMED SOYBEAN OIL",
"OLEO DE SOJA REFINADO": "REFINED SOYBEAN OIL",
"UVA": "GRAPES",
"GUARANA": "GUARANA",
"ALPISTE": "BIRDSEED",
"GIRASSOL": "SUNFLOWER",
"-": "-",
"MARGARINA": "MARGARINA",
"CASTANHA DE CAJU": "CASHEW NUT",
"MAÇA": "APPLE",
"AMENDOA DA CASTANHA DE CAJU": "CAJU NUT ALMONDS",
"ABACAXI": "PINEAPPLE",
"GOIABA": "GOIABA",
"OLEO DE BAGAÇO DE OLIVA": "OLIVE POSTAGE OIL",
"TRIGO": "WHEAT",
"OLEO DE ALGODAO REFINADO": "REFINED COTTON OIL",
"OLEO DE CANOLA REFINADO": "REFINED CANOLA OIL",
"OLEO DE GIRASSOL REFINADO": "REFINED SUNFLOWER OIL",
"OLEO DE MILHO REFINADO": "REFINED CORN OIL",
"CRAVO DA INDIA": "CLOVE",
"AÇAÍ": "ACAI",
"ACEROLA": "ACEROLA",
"AMORA": "BLACKBERRY",
"CAJU": "CAJU",
"MARACUJÁ": "PASSION FRUIT",
"MIRTILO": "BLUEBERRY",
"MORANGO": "STRAWBERRY",
"TANGERINA": "TANGERINE",
"ALGODAO EM PLUMA": "FEATHER COTTON",
"CAROÇO DE ALGODAO": "COTTONSEED",
"ALHO": "GARLIC",
"BATATA": "POTATO",
"ABACATE": "AVOCADO",
"ABÓBORA": "PUMPKIN",
"ABOBRINHA": "ZUCCHINI",
"BANANA": "BANANA",
"BATATA DOCE": "SWEET POTATO",
"CACAU": "CACAO",
"LARANJA": "ORANGE",
"AVEIA": "OAT",
"AMENDOIM": "PEANUT",
"CEVADA": "BARLEY",
"CENTEIO": "RYE",
"CEBOLA": "ONION",
"SORGO": "SORGHUM",
"OLEO DE MENTA": "MINT OIL",
"AGRIÃO": "CRESS",
"ALFACE": "LETUCCE",
"BETERRABA": "BEETROOT",
"COUVE": "CABBAGE",
}
df["PRODUCTS"] = df["PRODUCT"].map(products)
return df
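# Note: Series.map() yields NaN for any product name missing from the dictionary
# above, so a new product in a later SICASQ export would silently become a missing
# value here and only surface in the checks at the end of organize_data_structure().
# A hedged sketch of an earlier, more explicit guard (illustrative, not part of the
# original script):
#
#     unmapped = sorted(set(df.loc[df["PRODUCTS"].isna(), "PRODUCT"]))
#     assert not unmapped, f"Unmapped product names: {unmapped}"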
def fix_countries_destination_names(df: pd.DataFrame):
"""Rename Countries' Names"""
countries = {
"BRASIL": "BRAZIL",
"CHINA, REPUBLICA POPULAR": "CHINA",
"RÚSSIA, FEDERAÇÃO DA": "RUSSIA",
"COLÔMBIA": "COLOMBIA",
"EMIRADOS ÁRABES UNIDOS": "UNITED ARAB EMIRATES",
"ARÁBIA SAUDITA": "SAUDI ARABIA",
"BAHREIN, ILHAS": "BAHREIN",
"OMA": "OMA",
"QATAR": "QATAR",
"KUWAIT": "KUWAIT",
"-": "-",
}
df["DESTINATION"] = df["DESTINATION"].map(countries)
return df
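# The dictionary above only covers the destination countries present in the 2021
# download, and the .map() call overwrites DESTINATION in place, so any destination
# outside the list is lost and replaced with NaN. A hedged sketch of a coverage
# check one could add just before the .map() call (illustrative only):
#
#     missing = sorted(set(df["DESTINATION"]) - set(countries))
#     assert not missing, f"Unmapped destination countries: {missing}"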
def organize_data_structure(df: pd.DataFrame):
"""Organizes the data into the desirable final structure"""
df = df.drop(columns="PRODUCT", errors="raise")
df = rename(df, columns={"PRODUCTS": "PRODUCT"})
df = df[
[
"COMPANY",
"CNPJ",
"CNPJ_2",
"UF",
"PRODUCT",
"DESTINATION",
"ADDRESS",
"EMAIL",
"STATUS",
"DATE OF VALIDITY",
"SICASQ_YEAR",
"SICASQ_DOWNLOAD_DATE",
]
]
    # Check that no column contains missing, empty, or malformed values
    for column in df.columns:
        assert not df[column].isna().any()
        assert not (df[column] == "").any()
        assert not df[column].str.contains(",").any()
        assert not (df[column] == "NAN").any()
return df
if __name__ == "__main__":
main()
Model file (trase/data_pipeline/models/brazil/logistics/sicasq/out/sicasq_2021.py):

import pandas as pd


def model(dbt, cursor):
    # Declare the upstream table so it shows in the dbt lineage graph, then fail:
    # the data is produced by the external script above, not by dbt.
    dbt.source("trase-storage-raw", "br_sicasq_original_20210717")
    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})  # unreachable; dbt Python models must return a DataFrame
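
For ad-hoc checks of the published table outside dbt, the output CSV can presumably be read back with the same cached S3 helper the script itself uses. A minimal sketch, assuming the out/ key can be read the same way as the ori/ key and that the output is comma-separated (neither is confirmed on this page):

from trase.tools.aws.aws_helpers_cached import get_pandas_df_once

# Read the published output back from S3 (key taken from the header above).
sicasq = get_pandas_df_once(
    "brazil/logistics/sicasq/out/SICASQ_2021.csv", sep=",", dtype=str
)
# Example: number of distinct registered companies per product.
print(sicasq.groupby("PRODUCT")["COMPANY"].nunique().sort_values(ascending=False))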