Script to make a clean version of bovine SIGSIF data on the municipal origin of cattle slaughtered in SIF slaughterhouses per state.

The output has the columns: STATE_SLAUGHTER GEOCODE PROP_FLOWS QUANTITY BIOME

Load data

# [A] Load raw SIGSIF for 2015-2019
# Note: these data are semi_colon delimted
path <- "brazil/production/statistics/sigsif/out/"
keys <- map_chr(get_bucket(ts, prefix = path), "Key")
keys_semi_colon <- keys[grepl("SIF_2015_2017|SIF_2018|SIF_2019", keys)]
keys_semi_colon <- keys_semi_colon[grepl("csv", keys_semi_colon)]
sigsif_2015_2019 <- 
  map_df(keys_semi_colon, 
         ~ s3read_using(
           object = .x, 
           FUN = read_delim, 
           col_types = cols(GEOCODE = col_character()),
           delim = ";",
           bucket = ts, 
           opts = c("check_region" = T)
         )
  )


# [B] Load raw SIGSIF for 2020 & 2021
# Note: these data are comma delimted
keys_comma <- keys[grepl("SIF_2020|SIF_2021", keys)]
keys_comma <- keys_comma[grepl("csv", keys_comma)]
sigsif_2020_2021 <- 
  map_df(keys_comma, 
         ~ s3read_using(
           object = .x, 
           FUN = read_delim, 
           col_types = cols(GEOCODE = col_character()),
           delim = ",",
           bucket = ts, 
           opts = c("check_region" = T)
         )
  )


# [C] Join 2015-2021 data together
sigsif <- bind_rows(sigsif_2015_2019, sigsif_2020_2021)


# Fix missing GEOCODEs
sigsif <- sigsif %>%
  mutate(GEOCODE = case_when(
    is.na(GEOCODE) & MUNICIPALITY == "MIRASSOL D OESTE" ~ "5105622",
    TRUE ~ GEOCODE
  ))
stopifnot(any(is.na(sigsif$GEOCODE)) == FALSE)


# [D] Load spatial data
spatial_path <- "brazil/spatial/BOUNDARIES/ibge/old/boundaries municipalities/"
munis <- get_object(paste0(spatial_path, "MUNICIPALITIES_BRA.geojson"), "trase-storage", check_region = T)
munis <- read_sf(rawToChar(munis))
brazil <- st_union(munis)
muni_centroids <- st_centroid(munis)


# muni/biome classifications
muni_biomes <-
  read_delim(
    get_object("brazil/spatial/BOUNDARIES/boundaries biomes/old/MUNIC_BIOME.csv",
               "trase-storage",
               check_region = T),
    col_types = cols(GEOCODMU = col_character()),
    delim = ";"
  ) %>%
  select(GEOCODE = GEOCODMU,
         BIOME)


# tidy up 
rm(path, key, keys_comma, keys_semi_colon, sigsif_2015_2019, sigsif_2020_2021, spatial_path)

Process data

First, we drop the 2021 SIGSIF data - which is woefully incomplete! (Only having ca. 10% of the expected number of slaughtered head).

# Check for consistency in SIGSIF data across years
sigsif %>% 
  group_by(YEAR, TYPE) %>% 
  summarise(SUM_HEAD = sum(QUANTITY)) %>% 
  ungroup() %>%
  drop_na(TYPE) %>%
  ggplot(aes(YEAR, SUM_HEAD)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  facet_wrap(~TYPE, scales = "free_y") +
  theme_classic() +
  labs(title = "2021 data on slaughtered heads are incomplete",
       subtitle = "We therefore drop these data to focus on 2015-2020")

# Drop extra years
sigsif <- sigsif %>%
  drop_na(TYPE) %>%
  filter(YEAR != 2021)

Here I check how consistent the data are between years, by evaluating the R-sq of a model of PROP_FLOWS per GEOCODE (i.e. any variation is because of changes in sourcing between years).

# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_annual <- 
  sigsif %>%
  filter(TYPE == "CATTLE") %>%
  group_by(YEAR, STATE_SLAUGHTER) %>%
  mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
  group_by(YEAR, STATE_SLAUGHTER, GEOCODE) %>%
  summarise(PROP_FLOWS = sum(QUANTITY) / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
  ungroup()

extract_sq_per_state <- function(state_name, data) {
  data <- data %>% filter(STATE_SLAUGHTER == state_name)
  if(length(unique(data$YEAR)) > 1 & length(unique(data$GEOCODE)) > 1) {
    m1 <-
      lm(data = data,
         PROP_FLOWS ~ GEOCODE)
    model_output <- summary(m1)
    model_r_sq <- model_output$r.squared
    tibble("STATE" = state_name,
           "R_SQ" = model_r_sq)
  } else {
    NULL
  }
}
res <- 
  map_df(
    .x = unique(sigsif_bov_annual$STATE_SLAUGHTER), 
    .f = extract_sq_per_state, 
    data = sigsif_bov_annual
  )

The sourcing is very consistent across years (R-sq > 0.9) for most states, except SE (0.75), SC (0.85), and SP (0.86).

# Visualise the consistency across years
ggplot(data = res,
       aes(STATE, R_SQ)) + 
  geom_point() + 
  scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  geom_hline(yintercept = 0.9, linetype = "dashed") + 
  labs(title = "R-sq of municipal-level sourcing of SIFs per state", 
       subtitle = "R-sq = 0.9 shown as dashed line")

An example of the variation between years is in SP, where sourcing from PA and RO appears in some years, but not others

chk <- inner_join(munis, sigsif_bov_annual %>% filter(STATE_SLAUGHTER == "SP"), by = "GEOCODE")
ggplot() + 
  geom_sf(data = brazil, fill = "lightgrey") + 
  geom_sf(data = chk, aes(fill = PROP_FLOWS), col = NA) +
  facet_wrap(~YEAR) + 
  theme_minimal() + 
  coord_sf(datum = NA) +
  scale_fill_viridis(option="magma",
                     name = "% SOURCING",
                     na.value="white",
                     direction = -1) 

I choose to aggregate these data across years (i.e. have one municipal-level supply shed per state), because:

# Make cleaned output dataset
# Identify the proportion per GEOCODE, per STATE_SLAUGHTER
sigsif_bov_summary <- 
  sigsif %>%
  filter(TYPE == "CATTLE") %>%
  group_by(STATE_SLAUGHTER) %>%
  mutate(TOTAL_SLAUGHTERED_HEADS = sum(QUANTITY)) %>%
  group_by(STATE_SLAUGHTER, GEOCODE) %>%
  summarise(SUM_QUANTITY = sum(QUANTITY), 
            PROP_FLOWS = SUM_QUANTITY / unique(TOTAL_SLAUGHTERED_HEADS)) %>%
  ungroup()

Below I plot these supply sheds, as a visual check that the data are OK:

# Check it looks OK
plot_sourcing <- function(state_name, data){
    munis_to_plot <- 
    inner_join(munis, sigsif_bov_summary %>% filter(STATE_SLAUGHTER == state_name), by = "GEOCODE") 
  p <- ggplot() + 
    geom_sf(data = brazil, fill = 'grey') +
    geom_sf(data = munis_to_plot, 
            aes(fill = PROP_FLOWS * 100)) +
    labs(title = paste("Municipal-level sourcing of SIFs in", state_name)) +
    theme_minimal() +
    coord_sf(datum = NA) +
    scale_fill_viridis(option="magma",
                     name = "% SOURCING",
                     na.value="white",
                     direction = -1) +
  rm(munis_to_plot)
  return(p)
}
map(
  .x = unique(sigsif_bov_summary$STATE_SLAUGHTER),
  .f = plot_sourcing,
  data = sigsif_bov_summary
)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

And then export the data.

Note: before I export the data, I add the BIOME because this information is required in SEI-PCS when aggregating small flows (to avoid losing information about deforestation rates across biomes). I also rename the SUM_QUANTITY column to match the br_bovine_slaughter_2015_2019_addQuantity.csv file S3.

# Add biome
sigsif_bov_summary <-
  left_join(sigsif_bov_summary, muni_biomes, by = "GEOCODE") %>%
  mutate(BIOME = 
           case_when(GEOCODE == "4212650" ~ "MATA ATLANTICA", 
                     TRUE ~ BIOME)
  )
stopifnot(!any(is.na(sigsif_bov_summary$BIOME)))


# export
sigsif_bov_summary %>%
  rename(QUANTITY = SUM_QUANTITY) %>%
s3write_using(
  x = ., 
  object = "brazil/production/statistics/sigsif/out/br_bovine_slaughter_2015_2020.csv", 
  FUN = write_delim,
  delim = ";",
  bucket = ts,
  opts = c("check_region" = T)
)