Prepare bovine SIGSIF slaughter data for SEI-PCS

Script to make a clean version of bovine SIGSIF data on the municipal origin of cattle slaughtered in SIF slaughterhouses per state.

The output has the columns: STATE_SLAUGHTER GEOCODE PROP_FLOWS QUANTITY BIOME

Load data

# [A] Load raw SIGSIF for 2015-2019
# Note: these data are semi_colon delimted
path <- "brazil/production/statistics/sigsif/out/"
keys <- map_chr(get_bucket(ts, prefix = path), "Key")
keys_semi_colon <- keys[grepl("SIF_2015_2017|SIF_2018|SIF_2019", keys)]
keys_semi_colon <- keys_semi_colon[grepl("csv", keys_semi_colon)]
sigsif_2015_2019 <- 
  map_df(keys_semi_colon, 
         ~ s3read_using(
           object = .x, 
           FUN = read_delim, 
           col_types = cols(GEOCODE = col_character()),
           delim = ";",
           bucket = ts, 
           opts = c("check_region" = T)
         )
  )


# [B] Load raw SIGSIF for 2020 & 2021
# Note: these data are comma delimted
keys_comma <- keys[grepl("SIF_2020|SIF_2021", keys)]
keys_comma <- keys_comma[grepl("csv", keys_comma)]
sigsif_2020_2021 <- 
  map_df(keys_comma, 
         ~ s3read_using(
           object = .x, 
           FUN = read_delim, 
           col_types = cols(GEOCODE = col_character()),
           delim = ",",
           bucket = ts, 
           opts = c("check_region" = T)
         )
  )


# [C] Join 2015-2021 data together
sigsif <- bind_rows(sigsif_2015_2019, sigsif_2020_2021)


# Fix missing GEOCODEs
sigsif <- sigsif %>%
  mutate(GEOCODE = case_when(
    is.na(GEOCODE) & MUNICIPALITY == "MIRASSOL D OESTE" ~ "5105622",
    TRUE ~ GEOCODE
  ))
stopifnot(any(is.na(sigsif$GEOCODE)) == FALSE)


# [D] Load spatial data
spatial_path <- "brazil/spatial/BOUNDARIES/ibge/old/boundaries municipalities/"
munis <- get_object(paste0(spatial_path, "MUNICIPALITIES_BRA.geojson"), "trase-storage", check_region = T)
munis <- read_sf(rawToChar(munis))
brazil <- st_union(munis)
muni_centroids <- st_centroid(munis)


# muni/biome classifications
muni_biomes <-
  read_delim(
    get_object("brazil/spatial/BOUNDARIES/boundaries biomes/old/MUNIC_BIOME.csv",
               "trase-storage",
               check_region = T),
    col_types = cols(GEOCODMU = col_character()),
    delim = ";"
  ) %>%
  select(GEOCODE = GEOCODMU,
         BIOME)


# tidy up 
rm(path, key, keys_comma, keys_semi_colon, sigsif_2015_2019, sigsif_2020_2021, spatial_path)

Process data

First, we drop the 2021 SIGSIF data - which is woefully incomplete! (Only having ca. 10% of the expected number of slaughtered head).

# Check for consistency in SIGSIF data across years
sigsif %>% 
  group_by(YEAR, TYPE) %>% 
  summarise(SUM_HEAD = sum(QUANTITY)) %>% 
  ungroup() %>%
  drop_na(TYPE) %>%
  ggplot(aes(YEAR, SUM_HEAD)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  facet_wrap(~TYPE, scales = "free_y") +
  theme_classic() +
  labs(title = "2021 data on slaughtered heads are incomplete",
       subtitle = "We therefore drop these data to focus on 2015-2020")

# Drop extra years
sigsif <- sigsif %>%
  drop_na(TYPE) %>%
  filter(YEAR != 2021)

Here I check how consistent the data are between years, by evaluating the R-sq of a model of PROP_FLOWS per GEOCODE (i.e. any variation is because of changes in sourcing between years).

# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_annual <- 
  sigsif %>%
  filter(TYPE == "CATTLE") %>%
  group_by(YEAR, STATE_SLAUGHTER) %>%
  mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
  group_by(YEAR, STATE_SLAUGHTER, GEOCODE) %>%
  summarise(PROP_FLOWS = sum(QUANTITY) / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
  ungroup()

extract_sq_per_state <- function(state_name, data) {
  data <- data %>% filter(STATE_SLAUGHTER == state_name)
  if(length(unique(data$YEAR)) > 1 & length(unique(data$GEOCODE)) > 1) {
    m1 <-
      lm(data = data,
         PROP_FLOWS ~ GEOCODE)
    model_output <- summary(m1)
    model_r_sq <- model_output$r.squared
    tibble("STATE" = state_name,
           "R_SQ" = model_r_sq)
  } else {
    NULL
  }
}
res <- 
  map_df(
    .x = unique(sigsif_bov_annual$STATE_SLAUGHTER), 
    .f = extract_sq_per_state, 
    data = sigsif_bov_annual
  )

The sourcing is very consistent across years (R-sq > 0.9) for most states, except SE (0.75), SC (0.85), and SP (0.86).

# Visualise the consistency across years
ggplot(data = res,
       aes(STATE, R_SQ)) + 
  geom_point() + 
  scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  geom_hline(yintercept = 0.9, linetype = "dashed") + 
  labs(title = "R-sq of municipal-level sourcing of SIFs per state", 
       subtitle = "R-sq = 0.9 shown as dashed line")

An example of the variation between years is in SP, where sourcing from PA and RO appears in some years, but not others

chk <- inner_join(munis, sigsif_bov_annual %>% filter(STATE_SLAUGHTER == "SP"), by = "GEOCODE")
ggplot() + 
  geom_sf(data = brazil, fill = "lightgrey") + 
  geom_sf(data = chk, aes(fill = PROP_FLOWS), col = NA) +
  facet_wrap(~YEAR) + 
  theme_minimal() + 
  coord_sf(datum = NA) +
  scale_fill_viridis(option="magma",
                     name = "% SOURCING",
                     na.value="white",
                     direction = -1)

I choose to aggregate these data across years (i.e. have one municipal-level supply shed per state), because:

[i] the above show that, for the most part, there’s little variation between years;
[ii] the main use of these data are for SEI-PCS/the dPaP app, where they are paired with GTA data to identify sourcing per logistics hub. Here, the GTA data are also aggregated across years (in that case, because of patchy data), also aggregating SIGSIF therefore leads to more methodological consistency;
[iii] here we only load SIGSIF data from 2015-2020, though we intend to map exports for 2010-2020, aggregating across years avoids the need to go back and download SIGSIF data for earlier years (i.e. 2010-2014).

# Make cleaned output dataset
# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_summary <- 
  sigsif %>%
  filter(TYPE == "CATTLE") %>%
  group_by(STATE_SLAUGHTER) %>%
  mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
  group_by(STATE_SLAUGHTER, GEOCODE) %>%
  summarise(SUM_QUANTITY = sum(QUANTITY), 
            PROP_FLOWS = SUM_QUANTITY / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
  ungroup()

Below I plot these supply sheds, as a visual check that the data are OK:

# Check it looks OK
plot_sourcing <- function(state_name, data){
    munis_to_plot <- 
    inner_join(munis, sigsif_bov_summary %>% filter(STATE_SLAUGHTER == state_name), by = "GEOCODE") 
  p <- ggplot() + 
    geom_sf(data = brazil, fill = 'grey') +
    geom_sf(data = munis_to_plot, 
            aes(fill = PROP_FLOWS * 100)) +
    labs(title = paste("Municipal-level sourcing of SIFs in", state_name)) +
    theme_minimal() +
    coord_sf(datum = NA) +
    scale_fill_viridis(option="magma",
                     name = "% SOURCING",
                     na.value="white",
                     direction = -1) +
  rm(munis_to_plot)
  return(p)
}
map(
  .x = unique(sigsif_bov_summary$STATE_SLAUGHTER),
  .f = plot_sourcing,
  data = sigsif_bov_summary
)

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

And then export the data.

Note: before I export the data, I add the BIOME because this information is required in SEI-PCS when aggregating small flows (to avoid losing information about deforestation rates across biomes). I also rename the SUM_QUANTITY column to match the br_bovine_slaughter_2015_2019_addQuantity.csv file S3.

# Add biome
sigsif_bov_summary <-
  left_join(sigsif_bov_summary, muni_biomes, by = "GEOCODE") %>%
  mutate(BIOME = 
           case_when(GEOCODE == "4212650" ~ "MATA ATLANTICA", 
                     TRUE ~ BIOME)
  )
stopifnot(!any(is.na(sigsif_bov_summary$BIOME)))


# export
sigsif_bov_summary %>%
  rename(QUANTITY = SUM_QUANTITY) %>%
s3write_using(
  x = ., 
  object = "brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2020.csv", 
  FUN = write_delim,
  delim = ";",
  bucket = ts,
  opts = c("check_region" = T)
)

Prepare bovine SIGSIF slaughter data for SEI-PCS

Erasmus zu Ermgassen

12/17/2020

Load data

Process data