Skip to content

View or edit on GitHub

This page is synchronized from trase/data/indonesia/palm_oil/indicators/q1_2024/quality_assessment/QA_area.md. Last modified on 2026-02-03 10:30 CET by Jason J. Benedict. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Metrics QA for Indonesia Oil palm - AREA

Here I compare and check differences bewteen previous and current palm areas for embedding

  • correlation (visual), mean, and sd differences in area and percent
  • evaluating which units did not produce palm before but now produce and vice versa (check GEE script fro visualization)
  • evaluating outliers

differences at national and ffb level

national_area_diff <- test_palm_area %>%
    group_by(YEAR_HARVEST) %>%
    summarise(
        curr_PALM_HA = sum(palm_ha),
        prev_PALM_HA = sum(prev_PALM_HA)
    ) %>%
    mutate(difference_curr_minus_prev_ha = curr_PALM_HA - prev_PALM_HA, difference_perc = ((curr_PALM_HA / prev_PALM_HA) * 100) - 100)
ggplot(national_area_diff, aes(x = as.factor(YEAR_HARVEST), y = difference_curr_minus_prev_ha / 1000)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    labs(title = "NATIONAL LEVEL: palm area difference")

ggplot(national_area_diff, aes(x = as.factor(YEAR_HARVEST), y = difference_perc)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    labs(title = "NATIONAL LEVEL:  perc palm area difference")

# xy plot
ggplot(test_palm_area) +
    geom_point(aes(x = palm_ha / 1000, prev_PALM_HA / 1000, col = YEAR_HARVEST)) +
    labs(title = "FFB-leve: prev to current area") +
    theme_bw()

# at ffb level
ggplot(test_palm_area %>% mutate(diff = palm_ha - prev_PALM_HA) %>% group_by(YEAR_HARVEST) %>% summarize(mean_diff_curr_minus_prev_ha = mean(diff))) +
    geom_bar(aes(y = mean_diff_curr_minus_prev_ha, x = as.factor(YEAR_HARVEST)), stat = "identity") +
    labs(title = "FFB-leve: Mean area difference HA") +
    theme_bw()

ggplot(test_palm_area %>% group_by(YEAR_HARVEST) %>% summarize(SD_diff_curr_minus_prev_ha = sd(palm_ha - prev_PALM_HA))) +
    geom_bar(aes(y = SD_diff_curr_minus_prev_ha, x = as.factor(YEAR_HARVEST)), stat = "identity") +
    labs(title = "FFB-leve: SD area difference HA") +
    theme_bw()

At national level, the difference is small in percent! and the mean area at ffb level is also fine :)
but the standrd deviation (sd) is high for the recent & relevant years (2018-2020)

Lets look at those that do not have palm previously but have palm now and vice versa

ggplot(test_palm_area %>% group_by(YEAR_HARVEST) %>% summarize(n_no_prev_production = sum((palm_ha != 0 & prev_PALM_HA == 0)))) +
    geom_bar(aes(y = n_no_prev_production, x = as.factor(YEAR_HARVEST)), stat = "identity") +
    labs(title = "ffbs with No palm before ") +
    theme_bw()

# check which ones and sorted based on ha (first 50)
test_palm_area[which(test_palm_area$palm_ha != 0 & test_palm_area$prev_PALM_HA == 0), c("ffb_code", "palm_ha")] %>%
    group_by(ffb_code) %>%
    summarize(max_area = max(palm_ha)) %>%
    arrange(desc(max_area)) %>%
    filter(max_area > 100) %>%
    print(n = 50)
## # A tibble: 15 × 2
##    ffb_code  max_area
##    <chr>        <dbl>
##  1 FFB-11626    5285.
##  2 FFB-09567    1101.
##  3 FFB-07197     808.
##  4 FFB-09224     474.
##  5 FFB-09016     356.
##  6 FFB-04834     306.
##  7 FFB-09569     253.
##  8 FFB-09570     231.
##  9 FFB-03371     213.
## 10 FFB-06555     172.
## 11 FFB-11022     155.
## 12 FFB-07396     141.
## 13 FFB-01236     122.
## 14 FFB-06547     115.
## 15 FFB-03605     104.
ggplot(test_palm_area %>% group_by(YEAR_HARVEST) %>% summarize(n_no_current_production = sum((palm_ha == 0 & prev_PALM_HA != 0)))) +
    geom_bar(aes(y = n_no_current_production, x = as.factor(YEAR_HARVEST)), stat = "identity") +
    labs(title = "ffbs with No current palm, but before") +
    theme_bw()

# chech which ones and how many - sorted baased on ha
test_palm_area[which(test_palm_area$palm_ha == 0 & test_palm_area$prev_PALM_HA != 0), c("ffb_code", "prev_PALM_HA")] %>%
    group_by(ffb_code) %>%
    summarize(max_area = max(prev_PALM_HA)) %>%
    arrange(desc(max_area)) %>%
    print(n = 50)
## # A tibble: 3 × 2
##   ffb_code  max_area
##   <chr>        <dbl>
## 1 FFB-08724     1484
## 2 FFB-10529      100
## 3 FFB-05691        3

Lets look at the outliers >2sd difference

sd_difference_ha <- test_palm_area %>%
    group_by(YEAR_HARVEST) %>%
    summarize(SD_diff_curr_minus_prev_ha = sd(palm_ha - prev_PALM_HA))

test_palm_area_sd <- test_palm_area %>%
    select(ffb_code, YEAR_HARVEST, palm_ha, prev_PALM_HA) %>%
    left_join(sd_difference_ha) %>%
    mutate(
        diff_ha = palm_ha - prev_PALM_HA,
        diff_2SD = if_else(diff_ha > (2 * SD_diff_curr_minus_prev_ha), TRUE, FALSE),
        diff_1SD = if_else(diff_ha > SD_diff_curr_minus_prev_ha, TRUE, FALSE)
    )

print(test_palm_area_sd %>% filter(diff_2SD == TRUE) %>%
    select(ffb_code, YEAR_HARVEST, SD_diff_curr_minus_prev_ha, diff_ha, diff_2SD) %>%
    arrange(desc(diff_ha)), n = 20)
## # A tibble: 3,378 × 5
##    ffb_code  YEAR_HARVEST SD_diff_curr_minus_prev_ha diff_ha diff_2SD
##    <chr>            <dbl>                      <dbl>   <dbl> <lgl>   
##  1 FFB-07065         2020                      246.    6606. TRUE    
##  2 FFB-07065         2019                      214.    6509. TRUE    
##  3 FFB-11609         2018                      183.    6018. TRUE    
##  4 FFB-11609         2019                      214.    6018. TRUE    
##  5 FFB-11609         2020                      246.    6018. TRUE    
##  6 FFB-10903         2020                      246.    5589. TRUE    
##  7 FFB-10903         2019                      214.    5540. TRUE    
##  8 FFB-11609         2017                      161.    5535. TRUE    
##  9 FFB-10903         2018                      183.    5502. TRUE    
## 10 FFB-10903         2017                      161.    5402. TRUE    
## 11 FFB-11626         2019                      214.    5285. TRUE    
## 12 FFB-11626         2020                      246.    5285. TRUE    
## 13 FFB-11609         2016                      136.    5274. TRUE    
## 14 FFB-11626         2018                      183.    5267. TRUE    
## 15 FFB-10903         2016                      136.    5246. TRUE    
## 16 FFB-10903         2015                      114.    5125. TRUE    
## 17 FFB-11626         2017                      161.    4917. TRUE    
## 18 FFB-10903         2014                      100.    4889. TRUE    
## 19 FFB-10903         2013                       95.0   4772. TRUE    
## 20 FFB-10903         2012                       92.7   4710. TRUE    
## # ℹ 3,358 more rows
test_palm_area_sd_agg <- test_palm_area_sd %>%
    group_by(YEAR_HARVEST) %>%
    summarize(n_2sd = sum(diff_2SD == TRUE), n_1sd = sum(diff_1SD == TRUE), sd_1 = first(SD_diff_curr_minus_prev_ha))
test_palm_area_sd_agg
## # A tibble: 18 × 4
##    YEAR_HARVEST n_2sd n_1sd  sd_1
##           <dbl> <int> <int> <dbl>
##  1         2003   164   335  93.6
##  2         2004   172   348  90.3
##  3         2005   186   367  86.7
##  4         2006   189   369  87.2
##  5         2007   189   377  87.7
##  6         2008   191   387  87.7
##  7         2009   203   403  87.6
##  8         2010   210   412  88.9
##  9         2011   219   422  89.8
## 10         2012   234   431  92.7
## 11         2013   236   436  95.0
## 12         2014   236   434 100. 
## 13         2015   227   430 114. 
## 14         2016   180   390 136. 
## 15         2017   156   347 161. 
## 16         2018   150   336 183. 
## 17         2019   124   311 214. 
## 18         2020   112   288 246.
ggplot(test_palm_area_sd_agg, aes(x = as.factor(YEAR_HARVEST), y = n_2sd)) +
    geom_bar(stat = "identity") +
    labs(title = "Number ffb area outliers >2sd (based on yearly difference)") +
    theme_bw()

We have a lot of outliers (between 112-150 for 2018-2020 >2sd) that may affect embedding