Br Bovine Slaughter 2015 2022
s3://trase-storage/brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.csv
Dbt path: trase_production.main_brazil.br_bovine_slaughter_2015_2022
Explore on Metabase: Full table; summary statistics
Containing yaml file link: trase/data_pipeline/models/brazil/production/statistics/sigsif/out/_schema.yml
Model file link: trase/data_pipeline/models/brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.py
Calls script: trase/data/brazil/production/statistics/sigsif/clean_bovine_sigsif_municipal_slaughter_data.Rmd
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph at the bottom right, tests, and downstream dependencies)
Tags: mock_model, brazil, out, production, sigsif, statistics
br_bovine_slaughter_2015_2022
Description
Municipal origin of cattle slaughtered in SIF slaughterhouses per state of slaughter, 2015-2022, with per-municipality sourcing proportions (PROP_FLOWS), head counts (QUANTITY), and biome classifications. Produced by the R Markdown script below.
Details
| Column | Type | Description |
|---|---|---|
Models / Seeds
source.trase_duckdb.trase-storage-raw.municipalities_bra_geojson
source.trase_duckdb.trase-storage-raw.brazil_munic_biome
model.trase_duckdb.sif_2015_2017
model.trase_duckdb.sif_2018
model.trase_duckdb.sif_2019
model.trase_duckdb.sif_2020
model.trase_duckdb.sif_2021
model.trase_duckdb.sif_2022
Sources
['trase-storage-raw', 'municipalities_bra_geojson']
['trase-storage-raw', 'brazil_munic_biome']
---
title: "Prepare bovine SIGSIF slaughter data for SEI-PCS"
author: "Erasmus zu Ermgassen"
date: "2022-06-20"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = F, message = F, fig.width = 14, fig.align = 'center')
options(encoding = "utf8")
library(tidyverse)
library(aws.s3)
library(sf)
library(viridis)
aws.signature::use_credentials()
Sys.setenv("AWS_DEFAULT_REGION" = "eu-west-1")
ts <- "trase-storage"
# Helper: upper-case and strip accents (Latin-ASCII transliteration)
str_trans <- function(x) {
x %>%
stringi::stri_trans_toupper() %>%
stringi::stri_trans_general("Latin-ASCII")
}
```
This script makes a clean version of the bovine SIGSIF data on the municipal origin of cattle slaughtered in SIF slaughterhouses, per state of slaughter.
The output has the following columns (an illustrative row is sketched below):
- STATE_SLAUGHTER
- GEOCODE
- PROP_FLOWS
- QUANTITY
- BIOME
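For orientation, a hypothetical example row in the target schema (values are illustrative only, not taken from the data):

```{r}
# Illustrative only; the real values are computed in the steps below.
tibble::tibble(
  STATE_SLAUGHTER = "MT",  # state where the animals were slaughtered
  GEOCODE = "5103403",     # IBGE geocode of the municipality of origin
  PROP_FLOWS = 0.012,      # share of the state's slaughter sourced from this municipality
  QUANTITY = 15000,        # heads slaughtered
  BIOME = "AMAZONIA"       # biome of the municipality of origin
)
```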
# Load data
```{r}
# [A] Load raw SIGSIF for 2015-2019
# Note: these data are semicolon-delimited
path <- "brazil/production/statistics/sigsif/out/"
keys <- map_chr(get_bucket(ts, prefix = path), "Key")
keys <- keys[!grepl("old", keys)]
keys_semi_colon <- keys[grepl("SIF_2015_2017|SIF_2018|SIF_2019", keys)]
keys_semi_colon <- keys_semi_colon[grepl("csv", keys_semi_colon)]
sigsif_2015_2019 <-
map_df(keys_semi_colon,
~ s3read_using(
object = .x,
FUN = read_delim,
col_types = cols(GEOCODE = col_character()),
delim = ";",
bucket = ts,
opts = c("check_region" = T)
)
)
# [B] Load raw SIGSIF for 2020, 2021, 2022
# Note: these data are also semicolon-delimited, not comma-delimited
keys_comma <- keys[grepl("SIF_2020|SIF_2021|SIF_2022", keys)]
keys_comma <- keys_comma[grepl("csv", keys_comma)]
sigsif_2020_2022 <-
map_df(keys_comma,
~ s3read_using(
object = .x,
FUN = read_delim,
col_types = cols(GEOCODE = col_character()),
delim = ";",
bucket = ts,
opts = c("check_region" = T)
)
)
# [C] Join 2015-2022 data together
sigsif <- bind_rows(sigsif_2015_2019, sigsif_2020_2022)
stopifnot(!any(is.na(sigsif$GEOCODE)))
# [D] Load spatial data
spatial_path <- "brazil/spatial/BOUNDARIES/ibge/old/boundaries municipalities/"
munis <- get_object(paste0(spatial_path, "MUNICIPALITIES_BRA.geojson"), "trase-storage", check_region = T)
munis <- read_sf(rawToChar(munis))
brazil <- st_union(munis)
muni_centroids <- st_centroid(munis)
# muni/biome classifications
muni_biomes <-
read_delim(
get_object("brazil/spatial/BOUNDARIES/boundaries biomes/old/MUNIC_BIOME.csv",
"trase-storage",
check_region = T),
col_types = cols(GEOCODMU = col_character()),
delim = ";"
) %>%
select(GEOCODE = GEOCODMU,
BIOME)
# tidy up
rm(path, keys, keys_comma, keys_semi_colon, sigsif_2015_2019, sigsif_2020_2022, spatial_path)
```
# Process data
```{r}
# Check for consistency in SIGSIF data across years
sigsif %>%
group_by(YEAR, TYPE) %>%
summarise(SUM_HEAD = sum(QUANTITY)) %>%
ungroup() %>%
drop_na(TYPE) %>%
ggplot(aes(YEAR, SUM_HEAD)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~TYPE, scales = "free_y") +
theme_classic() +
labs(title = "Check consistency",
subtitle = "2015-2021")
Next, I check how consistent sourcing is across years by regressing PROP_FLOWS against GEOCODE for each state (i.e. any unexplained variation is because of changes in sourcing between years).
```{r}
# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_annual <-
sigsif %>%
filter(TYPE == "CATTLE") %>%
group_by(YEAR, STATE_SLAUGHTER) %>%
mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
group_by(YEAR, STATE_SLAUGHTER, GEOCODE) %>%
summarise(PROP_FLOWS = sum(QUANTITY) / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
ungroup()
extract_sq_per_state <- function(state_name, data) {
data <- data %>% filter(STATE_SLAUGHTER == state_name)
if(length(unique(data$YEAR)) > 1 & length(unique(data$GEOCODE)) > 1) {
m1 <-
lm(data = data,
PROP_FLOWS ~ GEOCODE)
model_output <- summary(m1)
model_r_sq <- model_output$r.squared
tibble("STATE" = state_name,
"R_SQ" = model_r_sq)
} else {
NULL
}
}
res <-
map_df(
.x = unique(sigsif_bov_annual$STATE_SLAUGHTER),
.f = extract_sq_per_state,
data = sigsif_bov_annual
)
```
The sourcing is very consistent across years (R-sq > 0.9) for most states, except SE (`r res %>% filter(STATE == "SE") %>% pull(R_SQ) %>% round(., 2)`), SC (`r res %>% filter(STATE == "SC") %>% pull(R_SQ) %>% round(., 2)`), and SP (`r res %>% filter(STATE == "SP") %>% pull(R_SQ) %>% round(., 2)`).
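As a toy illustration of why this R-squared captures year-to-year consistency (hypothetical data, not SIGSIF): if the sourcing proportions are identical in every year, GEOCODE explains all of the variance and the R-squared is exactly 1.

```{r}
# Hypothetical example: two years with identical sourcing proportions.
toy <- tibble::tibble(
  YEAR = rep(c(2015, 2016), each = 3),
  GEOCODE = rep(c("A", "B", "C"), times = 2),
  PROP_FLOWS = rep(c(0.5, 0.3, 0.2), times = 2)
)
# Perfectly stable sourcing: R-squared is 1.
summary(lm(PROP_FLOWS ~ GEOCODE, data = toy))$r.squared
```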
```{r}
# Visualise the consistency across years
ggplot(data = res,
aes(STATE, R_SQ)) +
geom_point() +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() +
geom_hline(yintercept = 0.9, linetype = "dashed") +
labs(title = "R-sq of municipal-level sourcing of SIFs per state",
subtitle = "R-sq = 0.9 shown as dashed line")
An example of the variation between years is SP, where sourcing from PA and RO appears in some years but not others:
```{r}
chk <- inner_join(munis, sigsif_bov_annual %>% filter(STATE_SLAUGHTER == "SP"), by = "GEOCODE")
ggplot() +
geom_sf(data = brazil, fill = "lightgrey") +
geom_sf(data = chk, aes(fill = PROP_FLOWS), col = NA) +
facet_wrap(~YEAR) +
theme_minimal() +
coord_sf(datum = NA) +
scale_fill_viridis(option="magma",
name = "% SOURCING",
na.value="white",
direction = -1)
```
I choose to aggregate these data across years (i.e. have one municipal-level supply shed per state) because:
- [i] the analysis above shows that, for the most part, there is little variation between years;
- [ii] the main use of these data is for SEI-PCS/the dPaP app, where they are paired with GTA data to identify sourcing per logistics hub. The GTA data are also aggregated across years (in that case because of patchy data), so aggregating SIGSIF as well leads to more methodological consistency;
- [iii] we only load SIGSIF data from 2015-2022 here, though we intend to map exports for 2010-2020; aggregating across years avoids the need to go back and download SIGSIF data for earlier years (i.e. 2010-2014).
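Concretely, aggregation computes one pooled share per municipality $g$ and slaughter state $s$: its quantity summed over years, divided by the state's total over years:

$$\mathrm{PROP\_FLOWS}_{g,s} = \frac{\sum_{y} Q_{g,s,y}}{\sum_{g'} \sum_{y} Q_{g',s,y}}$$

where $Q_{g,s,y}$ is the QUANTITY (heads) sourced from municipality $g$ by SIF slaughterhouses in state $s$ in year $y$. This is what the code below computes.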
```{r}
# Make cleaned output dataset
# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_summary <-
sigsif %>%
filter(TYPE == "CATTLE") %>%
group_by(STATE_SLAUGHTER) %>%
mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
group_by(STATE_SLAUGHTER, GEOCODE) %>%
summarise(SUM_QUANTITY = sum(QUANTITY),
PROP_FLOWS = SUM_QUANTITY / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
ungroup()
```
Below I plot these supply sheds, as a visual check that the data are OK:
```{r}
# Check it looks OK
plot_sourcing <- function(state_name, data){
munis_to_plot <-
inner_join(munis, data %>% filter(STATE_SLAUGHTER == state_name), by = "GEOCODE")
p <- ggplot() +
geom_sf(data = brazil, fill = 'grey') +
geom_sf(data = munis_to_plot,
aes(fill = PROP_FLOWS * 100)) +
labs(title = paste("Municipal-level sourcing of SIFs in", state_name)) +
theme_minimal() +
coord_sf(datum = NA) +
scale_fill_viridis(option = "magma",
name = "% SOURCING",
na.value = "white",
direction = -1)
rm(munis_to_plot)
return(p)
}
map(
.x = unique(sigsif_bov_summary$STATE_SLAUGHTER),
.f = plot_sourcing,
data = sigsif_bov_summary
)
```
And then export the data.
Note: before I export the data, I add the BIOME because this information is required in SEI-PCS when aggregating small flows (to avoid losing information about deforestation rates across biomes). I also rename the SUM_QUANTITY column to QUANTITY to match the br_bovine_slaughter_2015_2019_addQuantity.csv file on S3.
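As a hedged illustration of the biome point (hypothetical flows; the actual small-flow aggregation happens downstream in SEI-PCS): keeping BIOME means small flows can be lumped within, rather than across, biomes.

```{r}
# Hypothetical small flows in one state: aggregating within BIOME preserves
# the AMAZONIA/CERRADO split that per-biome deforestation rates rely on.
small_flows <- tibble::tibble(
  STATE_SLAUGHTER = "MT",
  GEOCODE = c("A", "B", "C"),
  QUANTITY = c(10, 20, 30),
  BIOME = c("AMAZONIA", "AMAZONIA", "CERRADO")
)
small_flows %>%
  group_by(STATE_SLAUGHTER, BIOME) %>%
  summarise(QUANTITY = sum(QUANTITY), .groups = "drop")
```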
```{r}
# Add biome
sigsif_bov_summary <-
left_join(sigsif_bov_summary, muni_biomes, by = "GEOCODE") %>%
mutate(BIOME =
case_when(GEOCODE == "4212650" ~ "MATA ATLANTICA",
TRUE ~ BIOME)
)
# Fill NA in BIOME with UNKNOWN
sigsif_bov_summary <- sigsif_bov_summary %>%
  mutate(BIOME = replace_na(BIOME, "UNKNOWN"))
stopifnot(!any(is.na(sigsif_bov_summary$BIOME)))
# export
sigsif_bov_summary %>%
rename(QUANTITY = SUM_QUANTITY) %>%
s3write_using(
x = .,
object = "brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.csv",
FUN = write_delim,
delim = ";",
bucket = ts,
opts = c("check_region" = T)
)
```
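As a final sanity check (not part of the original script), the exported file can be re-read and validated; a minimal sketch using the same aws.s3 helpers as above:

```{r}
# Hedged sketch: re-read the exported CSV, verify the schema, and check that
# proportions sum to ~1 within each slaughter state.
chk_out <- s3read_using(
  object = "brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.csv",
  FUN = read_delim,
  col_types = cols(GEOCODE = col_character()),
  delim = ";",
  bucket = ts,
  opts = c("check_region" = T)
)
stopifnot(setequal(names(chk_out),
                   c("STATE_SLAUGHTER", "GEOCODE", "PROP_FLOWS", "QUANTITY", "BIOME")))
chk_sums <- chk_out %>%
  group_by(STATE_SLAUGHTER) %>%
  summarise(S = sum(PROP_FLOWS), .groups = "drop")
stopifnot(all(abs(chk_sums$S - 1) < 1e-6))
```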
```python
import pandas as pd

def model(dbt, cursor):
    # Mock model: it only declares refs/sources so that dbt can build the
    # lineage graph. The actual cleaning is done by the R Markdown script
    # above (clean_bovine_sigsif_municipal_slaughter_data.Rmd), which writes
    # the output CSV to S3.
    dbt.ref("sif_2015_2017")
    dbt.ref("sif_2018")
    dbt.ref("sif_2019")
    dbt.ref("sif_2020")
    dbt.ref("sif_2021")
    dbt.ref("sif_2022")
    dbt.source("trase-storage-raw", "municipalities_bra_geojson")
    dbt.source("trase-storage-raw", "brazil_munic_biome")
    # Deliberately unimplemented; the returned frame is never reached.
    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})
```