Br Bovine Slaughter 2015 2022
s3://trase-storage/brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.csv
Dbt path: trase_production.main_brazil.br_bovine_slaughter_2015_2022
Explore on Metabase: Full table; summary statistics
Containing yaml file link: trase/data_pipeline/models/brazil/production/statistics/sigsif/out/_schema.yml
Model file link: trase/data_pipeline/models/brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.py
Calls script: trase/data/brazil/production/statistics/sigsif/clean_bovine_sigsif_municipal_slaughter_data.Rmd
Dbt test runs & lineage: Test results · Lineage
Full dbt_docs page: Open in dbt docs (includes the lineage graph at the bottom right, tests, and downstream dependencies)
Tags: mock_model, brazil, out, production, sigsif, statistics
br_bovine_slaughter_2015_2022
Description
Municipal origin of cattle slaughtered in SIF slaughterhouses per state of slaughter, 2015-2022, with per-municipality sourcing proportions (PROP_FLOWS), head counts (QUANTITY), and biome classifications. Produced by the R Markdown script below.
Details
| Column | Type | Description |
|---|---|---|
Models / Seeds
source.trase_duckdb.trase-storage-raw.municipalities_bra_geojson
source.trase_duckdb.trase-storage-raw.brazil_munic_biome
model.trase_duckdb.sif_2015_2017
model.trase_duckdb.sif_2018
model.trase_duckdb.sif_2019
model.trase_duckdb.sif_2020
model.trase_duckdb.sif_2021
model.trase_duckdb.sif_2022
Sources
['trase-storage-raw', 'municipalities_bra_geojson']
['trase-storage-raw', 'brazil_munic_biome']
---
title: "Prepare bovine SIGSIF slaughter data for SEI-PCS"
author: "Erasmus zu Ermgassen"
date: "2022-06-20"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = F, message = F, fig.width = 14, fig.align = 'center')
options(encoding = "utf8")
library(tidyverse)
library(aws.s3)
library(sf)
library(viridis)
aws.signature::use_credentials()
Sys.setenv("AWS_DEFAULT_REGION" = "eu-west-1")
ts <- "trase-storage"
# Helper: upper-case and strip accents (Latin-ASCII transliteration)
str_trans <- function(x) {
x %>%
stringi::stri_trans_toupper() %>%
stringi::stri_trans_general("Latin-ASCII")
}
```
This script makes a clean version of the bovine SIGSIF data on the municipal origin of cattle slaughtered in SIF slaughterhouses, per state of slaughter.
The output has the following columns (an illustrative row is sketched below):
- STATE_SLAUGHTER
- GEOCODE
- PROP_FLOWS
- QUANTITY
- BIOME
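For orientation, a hypothetical example row in the target schema (values are illustrative only, not taken from the data):

```{r}
# Illustrative only; the real values are computed in the steps below.
tibble::tibble(
  STATE_SLAUGHTER = "MT",  # state where the animals were slaughtered
  GEOCODE = "5103403",     # IBGE geocode of the municipality of origin
  PROP_FLOWS = 0.012,      # share of the state's slaughter sourced from this municipality
  QUANTITY = 15000,        # heads slaughtered
  BIOME = "AMAZONIA"       # biome of the municipality of origin
)
```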
# Load data
```{r}
# [A] Load raw SIGSIF for 2015-2019
# Note: these data are semicolon-delimited
path <- "brazil/production/statistics/sigsif/out/"
keys <- map_chr(get_bucket(ts, prefix = path), "Key")
keys <- keys[!grepl("old", keys)]
keys_semi_colon <- keys[grepl("SIF_2015_2017|SIF_2018|SIF_2019", keys)]
keys_semi_colon <- keys_semi_colon[grepl("csv", keys_semi_colon)]
sigsif_2015_2019 <-
map_df(keys_semi_colon,
~ s3read_using(
object = .x,
FUN = read_delim,
col_types = cols(GEOCODE = col_character()),
delim = ";",
bucket = ts,
opts = c("check_region" = T)
)
)
# [B] Load raw SIGSIF for 2020, 2021, 2022
# Note: these data are also semicolon-delimited, not comma-delimited
keys_comma <- keys[grepl("SIF_2020|SIF_2021|SIF_2022", keys)]
keys_comma <- keys_comma[grepl("csv", keys_comma)]
sigsif_2020_2022 <-
map_df(keys_comma,
~ s3read_using(
object = .x,
FUN = read_delim,
col_types = cols(GEOCODE = col_character()),
delim = ";",
bucket = ts,
opts = c("check_region" = T)
)
)
# [C] Join 2015-2022 data together
sigsif <- bind_rows(sigsif_2015_2019, sigsif_2020_2022)
stopifnot(!any(is.na(sigsif$GEOCODE)))
# [D] Load spatial data
spatial_path <- "brazil/spatial/BOUNDARIES/ibge/old/boundaries municipalities/"
munis <- get_object(paste0(spatial_path, "MUNICIPALITIES_BRA.geojson"), "trase-storage", check_region = T)
munis <- read_sf(rawToChar(munis))
brazil <- st_union(munis)
muni_centroids <- st_centroid(munis)
# muni/biome classifications
muni_biomes <-
read_delim(
get_object("brazil/spatial/BOUNDARIES/boundaries biomes/old/MUNIC_BIOME.csv",
"trase-storage",
check_region = T),
col_types = cols(GEOCODMU = col_character()),
delim = ";"
) %>%
select(GEOCODE = GEOCODMU,
BIOME)
# tidy up
rm(path, keys, keys_comma, keys_semi_colon, sigsif_2015_2019, sigsif_2020_2022, spatial_path)
```
# Process data
```{r}
# Check for consistency in SIGSIF data across years
sigsif %>%
group_by(YEAR, TYPE) %>%
summarise(SUM_HEAD = sum(QUANTITY)) %>%
ungroup() %>%
drop_na(TYPE) %>%
ggplot(aes(YEAR, SUM_HEAD)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~TYPE, scales = "free_y") +
theme_classic() +
labs(title = "Check consistency",
subtitle = "2015-2021")
Next, I check how consistent sourcing is across years by regressing PROP_FLOWS against GEOCODE for each state (i.e. any unexplained variation is because of changes in sourcing between years).
```{r}
# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_annual <-
sigsif %>%
filter(TYPE == "CATTLE") %>%
group_by(YEAR, STATE_SLAUGHTER) %>%
mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
group_by(YEAR, STATE_SLAUGHTER, GEOCODE) %>%
summarise(PROP_FLOWS = sum(QUANTITY) / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
ungroup()
extract_sq_per_state <- function(state_name, data) {
data <- data %>% filter(STATE_SLAUGHTER == state_name)
if(length(unique(data$YEAR)) > 1 & length(unique(data$GEOCODE)) > 1) {
m1 <-
lm(data = data,
PROP_FLOWS ~ GEOCODE)
model_output <- summary(m1)
model_r_sq <- model_output$r.squared
tibble("STATE" = state_name,
"R_SQ" = model_r_sq)
} else {
NULL
}
}
res <-
map_df(
.x = unique(sigsif_bov_annual$STATE_SLAUGHTER),
.f = extract_sq_per_state,
data = sigsif_bov_annual
)
```
The sourcing is very consistent across years (R-sq > 0.9) for most states, except SE (`r res %>% filter(STATE == "SE") %>% pull(R_SQ) %>% round(., 2)`), SC (`r res %>% filter(STATE == "SC") %>% pull(R_SQ) %>% round(., 2)`), and SP (`r res %>% filter(STATE == "SP") %>% pull(R_SQ) %>% round(., 2)`).
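As a toy illustration of why this R-squared captures year-to-year consistency (hypothetical data, not SIGSIF): if the sourcing proportions are identical in every year, GEOCODE explains all of the variance and the R-squared is exactly 1.

```{r}
# Hypothetical example: two years with identical sourcing proportions.
toy <- tibble::tibble(
  YEAR = rep(c(2015, 2016), each = 3),
  GEOCODE = rep(c("A", "B", "C"), times = 2),
  PROP_FLOWS = rep(c(0.5, 0.3, 0.2), times = 2)
)
# Perfectly stable sourcing: R-squared is 1.
summary(lm(PROP_FLOWS ~ GEOCODE, data = toy))$r.squared
```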
```{r}
# Visualise the consistency across years
ggplot(data = res,
aes(STATE, R_SQ)) +
geom_point() +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() +
geom_hline(yintercept = 0.9, linetype = "dashed") +
labs(title = "R-sq of municipal-level sourcing of SIFs per state",
subtitle = "R-sq = 0.9 shown as dashed line")
An example of the variation between years is SP, where sourcing from PA and RO appears in some years but not others:
```{r}
chk <- inner_join(munis, sigsif_bov_annual %>% filter(STATE_SLAUGHTER == "SP"), by = "GEOCODE")
ggplot() +
geom_sf(data = brazil, fill = "lightgrey") +
geom_sf(data = chk, aes(fill = PROP_FLOWS), col = NA) +
facet_wrap(~YEAR) +
theme_minimal() +
coord_sf(datum = NA) +
scale_fill_viridis(option="magma",
name = "% SOURCING",
na.value="white",
direction = -1)
```
I choose to aggregate these data across years (i.e. have one municipal-level supply shed per state) because:
- [i] the analysis above shows that, for the most part, there is little variation between years;
- [ii] the main use of these data is for SEI-PCS/the dPaP app, where they are paired with GTA data to identify sourcing per logistics hub. The GTA data are also aggregated across years (in that case because of patchy data), so aggregating SIGSIF as well leads to more methodological consistency;
- [iii] we only load SIGSIF data from 2015-2022 here, though we intend to map exports for 2010-2020; aggregating across years avoids the need to go back and download SIGSIF data for earlier years (i.e. 2010-2014).
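Concretely, aggregation computes one pooled share per municipality $g$ and slaughter state $s$: its quantity summed over years, divided by the state's total over years:

$$\mathrm{PROP\_FLOWS}_{g,s} = \frac{\sum_{y} Q_{g,s,y}}{\sum_{g'} \sum_{y} Q_{g',s,y}}$$

where $Q_{g,s,y}$ is the QUANTITY (heads) sourced from municipality $g$ by SIF slaughterhouses in state $s$ in year $y$. This is what the code below computes.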
```{r}
# Make cleaned output dataset
# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_summary <-
sigsif %>%
filter(TYPE == "CATTLE") %>%
group_by(STATE_SLAUGHTER) %>%
mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
group_by(STATE_SLAUGHTER, GEOCODE) %>%
summarise(SUM_QUANTITY = sum(QUANTITY),
PROP_FLOWS = SUM_QUANTITY / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
ungroup()
```
Below I plot these supply sheds, as a visual check that the data are OK:
```{r}
# Check it looks OK
plot_sourcing <- function(state_name, data){
munis_to_plot <-
inner_join(munis, data %>% filter(STATE_SLAUGHTER == state_name), by = "GEOCODE")
p <- ggplot() +
geom_sf(data = brazil, fill = 'grey') +
geom_sf(data = munis_to_plot,
aes(fill = PROP_FLOWS * 100)) +
labs(title = paste("Municipal-level sourcing of SIFs in", state_name)) +
theme_minimal() +
coord_sf(datum = NA) +
scale_fill_viridis(option = "magma",
name = "% SOURCING",
na.value = "white",
direction = -1)
rm(munis_to_plot)
return(p)
}
map(
.x = unique(sigsif_bov_summary$STATE_SLAUGHTER),
.f = plot_sourcing,
data = sigsif_bov_summary
)
```
And then export the data.
Note: before I export the data, I add the BIOME because this information is required in SEI-PCS when aggregating small flows (to avoid losing information about deforestation rates across biomes). I also rename the SUM_QUANTITY column to QUANTITY to match the br_bovine_slaughter_2015_2019_addQuantity.csv file on S3.
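As a hedged illustration of the biome point (hypothetical flows; the actual small-flow aggregation happens downstream in SEI-PCS): keeping BIOME means small flows can be lumped within, rather than across, biomes.

```{r}
# Hypothetical small flows in one state: aggregating within BIOME preserves
# the AMAZONIA/CERRADO split that per-biome deforestation rates rely on.
small_flows <- tibble::tibble(
  STATE_SLAUGHTER = "MT",
  GEOCODE = c("A", "B", "C"),
  QUANTITY = c(10, 20, 30),
  BIOME = c("AMAZONIA", "AMAZONIA", "CERRADO")
)
small_flows %>%
  group_by(STATE_SLAUGHTER, BIOME) %>%
  summarise(QUANTITY = sum(QUANTITY), .groups = "drop")
```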
```{r}
# Add biome
sigsif_bov_summary <-
left_join(sigsif_bov_summary, muni_biomes, by = "GEOCODE") %>%
mutate(BIOME =
case_when(GEOCODE == "4212650" ~ "MATA ATLANTICA",
TRUE ~ BIOME)
)
# Fill NA in BIOME with UNKNOWN
sigsif_bov_summary <- sigsif_bov_summary %>%
  mutate(BIOME = replace_na(BIOME, "UNKNOWN"))
stopifnot(!any(is.na(sigsif_bov_summary$BIOME)))
# export
sigsif_bov_summary %>%
rename(QUANTITY = SUM_QUANTITY) %>%
s3write_using(
x = .,
object = "brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.csv",
FUN = write_delim,
delim = ";",
bucket = ts,
opts = c("check_region" = T)
)
```
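As a final sanity check (not part of the original script), the exported file can be re-read and validated; a minimal sketch using the same aws.s3 helpers as above:

```{r}
# Hedged sketch: re-read the exported CSV, verify the schema, and check that
# proportions sum to ~1 within each slaughter state.
chk_out <- s3read_using(
  object = "brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2022.csv",
  FUN = read_delim,
  col_types = cols(GEOCODE = col_character()),
  delim = ";",
  bucket = ts,
  opts = c("check_region" = T)
)
stopifnot(setequal(names(chk_out),
                   c("STATE_SLAUGHTER", "GEOCODE", "PROP_FLOWS", "QUANTITY", "BIOME")))
chk_sums <- chk_out %>%
  group_by(STATE_SLAUGHTER) %>%
  summarise(S = sum(PROP_FLOWS), .groups = "drop")
stopifnot(all(abs(chk_sums$S - 1) < 1e-6))
```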
```python
import pandas as pd

def model(dbt, cursor):
    # Mock model: it only declares refs/sources so that dbt can build the
    # lineage graph. The actual cleaning is done by the R Markdown script
    # above (clean_bovine_sigsif_municipal_slaughter_data.Rmd), which writes
    # the output CSV to S3.
    dbt.ref("sif_2015_2017")
    dbt.ref("sif_2018")
    dbt.ref("sif_2019")
    dbt.ref("sif_2020")
    dbt.ref("sif_2021")
    dbt.ref("sif_2022")
    dbt.source("trase-storage-raw", "municipalities_bra_geojson")
    dbt.source("trase-storage-raw", "brazil_munic_biome")
    # Deliberately unimplemented; the returned frame is never reached.
    raise NotImplementedError()
    return pd.DataFrame({"hello": ["world"]})
```