View or edit on GitHub

This page is synchronized from trase/data/indonesia/palm_oil/indicators/q1_2024/quality_assessment/QA_area.md. Last modified on 2026-02-03 10:30 CET by Jason J. Benedict. Please view or edit the original file there; changes should be reflected here after a midnight build (CET time), or manually triggering it with a GitHub action (link).

Metrics QA for Indonesia Oil palm - AREA

Here I compare and check the differences between the previous and current palm areas used for embedding:

correlation (visual), mean, and sd differences in area and percent
evaluating which units did not produce palm before but do now, and vice versa (check the GEE script for visualization)
evaluating outliers

differences at national and ffb level

# National totals per harvest year: summed current and previous palm area,
# followed by the absolute (current - previous) and percentage change.
national_area_diff <- test_palm_area %>%
    group_by(YEAR_HARVEST) %>%
    summarise(
        curr_PALM_HA = sum(palm_ha),
        prev_PALM_HA = sum(prev_PALM_HA)
    ) %>%
    mutate(
        difference_curr_minus_prev_ha = curr_PALM_HA - prev_PALM_HA,
        # Percentage change relative to the previous total.
        difference_perc = ((curr_PALM_HA / prev_PALM_HA) * 100) - 100
    )
# Bar chart of the national (current - previous) area difference per harvest
# year; the plotted value is the difference divided by 1000.
national_area_diff %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = difference_curr_minus_prev_ha / 1000)) +
    geom_col() +
    labs(title = "NATIONAL LEVEL: palm area difference") +
    theme_bw()

# Bar chart of the national percentage change in palm area per harvest year.
national_area_diff %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = difference_perc)) +
    geom_col() +
    labs(title = "NATIONAL LEVEL:  perc palm area difference") +
    theme_bw()

# Scatter plot, one point per row of test_palm_area: current area on x,
# previous area on y (both divided by 1000), coloured by harvest year.
ggplot(
    test_palm_area,
    aes(x = palm_ha / 1000, y = prev_PALM_HA / 1000, colour = YEAR_HARVEST)
) +
    geom_point() +
    theme_bw() +
    labs(title = "FFB-leve: prev to current area")

# FFB level: mean of the per-row (current - previous) area difference,
# computed separately for each harvest year and shown as bars.
test_palm_area %>%
    group_by(YEAR_HARVEST) %>%
    summarize(mean_diff_curr_minus_prev_ha = mean(palm_ha - prev_PALM_HA)) %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = mean_diff_curr_minus_prev_ha)) +
    geom_col() +
    theme_bw() +
    labs(title = "FFB-leve: Mean area difference HA")

# FFB level: standard deviation of the per-row (current - previous) area
# difference within each harvest year, shown as bars.
test_palm_area %>%
    mutate(area_change = palm_ha - prev_PALM_HA) %>%
    group_by(YEAR_HARVEST) %>%
    summarize(SD_diff_curr_minus_prev_ha = sd(area_change)) %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = SD_diff_curr_minus_prev_ha)) +
    geom_col() +
    theme_bw() +
    labs(title = "FFB-leve: SD area difference HA")

At the national level, the difference is small in percentage terms, and the mean area difference at FFB level is also fine :)
However, the standard deviation (SD) is high for the recent and relevant years (2018-2020).

Let's look at the units that had no palm previously but have palm now, and vice versa.

# Per harvest year, count the rows that have palm area now (palm_ha != 0)
# but had none previously (prev_PALM_HA == 0), and plot the counts as bars.
test_palm_area %>%
    mutate(newly_producing = palm_ha != 0 & prev_PALM_HA == 0) %>%
    group_by(YEAR_HARVEST) %>%
    summarize(n_no_prev_production = sum(newly_producing)) %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = n_no_prev_production)) +
    geom_col() +
    theme_bw() +
    labs(title = "ffbs with No palm before ")

# List the units with palm now but none before: take each unit's largest
# current area, keep those above 100, sort descending and print up to 50 rows.
test_palm_area %>%
    filter(palm_ha != 0, prev_PALM_HA == 0) %>%
    select(ffb_code, palm_ha) %>%
    group_by(ffb_code) %>%
    summarize(max_area = max(palm_ha)) %>%
    filter(max_area > 100) %>%
    arrange(desc(max_area)) %>%
    print(n = 50)

## # A tibble: 15 × 2
##    ffb_code  max_area
##    <chr>        <dbl>
##  1 FFB-11626    5285.
##  2 FFB-09567    1101.
##  3 FFB-07197     808.
##  4 FFB-09224     474.
##  5 FFB-09016     356.
##  6 FFB-04834     306.
##  7 FFB-09569     253.
##  8 FFB-09570     231.
##  9 FFB-03371     213.
## 10 FFB-06555     172.
## 11 FFB-11022     155.
## 12 FFB-07396     141.
## 13 FFB-01236     122.
## 14 FFB-06547     115.
## 15 FFB-03605     104.

# Per harvest year, count the rows that have no palm area now (palm_ha == 0)
# but had some previously (prev_PALM_HA != 0), and plot the counts as bars.
test_palm_area %>%
    mutate(stopped_producing = palm_ha == 0 & prev_PALM_HA != 0) %>%
    group_by(YEAR_HARVEST) %>%
    summarize(n_no_current_production = sum(stopped_producing)) %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = n_no_current_production)) +
    geom_col() +
    theme_bw() +
    labs(title = "ffbs with No current palm, but before")

# Check which units have no palm now but had some before, and how many:
# take each unit's largest previous area, sort descending, print up to 50 rows.
test_palm_area %>%
    filter(palm_ha == 0, prev_PALM_HA != 0) %>%
    select(ffb_code, prev_PALM_HA) %>%
    group_by(ffb_code) %>%
    summarize(max_area = max(prev_PALM_HA)) %>%
    arrange(desc(max_area)) %>%
    print(n = 50)

## # A tibble: 3 × 2
##   ffb_code  max_area
##   <chr>        <dbl>
## 1 FFB-08724     1484
## 2 FFB-10529      100
## 3 FFB-05691        3

Let's look at the outliers (difference greater than 2 SD).

# Standard deviation of the per-row (current - previous) area difference,
# one value per harvest year; used below as the outlier threshold.
sd_difference_ha <- test_palm_area %>%
    mutate(area_change = palm_ha - prev_PALM_HA) %>%
    group_by(YEAR_HARVEST) %>%
    summarize(SD_diff_curr_minus_prev_ha = sd(area_change))

# Attach each year's SD of the area difference to every row and flag rows whose
# (current - previous) difference exceeds one or two of those SDs.
# NOTE(review): the comparison is one-sided and not centred on the yearly mean,
# so only increases (current > previous) beyond the threshold are flagged;
# equally large decreases are not. Confirm this is the intended outlier rule.
test_palm_area_sd <- test_palm_area %>%
    select(ffb_code, YEAR_HARVEST, palm_ha, prev_PALM_HA) %>%
    # Explicit key: YEAR_HARVEST is the only column shared by both tables, so
    # this matches the former implicit natural join without its message and
    # without the risk of silently picking up extra keys later.
    left_join(sd_difference_ha, by = "YEAR_HARVEST") %>%
    mutate(
        diff_ha = palm_ha - prev_PALM_HA,
        # The comparisons already yield TRUE/FALSE/NA, so no if_else() wrapper.
        diff_2SD = diff_ha > (2 * SD_diff_curr_minus_prev_ha),
        diff_1SD = diff_ha > SD_diff_curr_minus_prev_ha
    )

# Show the rows flagged as >2 SD, largest difference first (first 20 rows).
test_palm_area_sd %>%
    filter(diff_2SD) %>%
    select(ffb_code, YEAR_HARVEST, SD_diff_curr_minus_prev_ha, diff_ha, diff_2SD) %>%
    arrange(desc(diff_ha)) %>%
    print(n = 20)

## # A tibble: 3,378 × 5
##    ffb_code  YEAR_HARVEST SD_diff_curr_minus_prev_ha diff_ha diff_2SD
##    <chr>            <dbl>                      <dbl>   <dbl> <lgl>   
##  1 FFB-07065         2020                      246.    6606. TRUE    
##  2 FFB-07065         2019                      214.    6509. TRUE    
##  3 FFB-11609         2018                      183.    6018. TRUE    
##  4 FFB-11609         2019                      214.    6018. TRUE    
##  5 FFB-11609         2020                      246.    6018. TRUE    
##  6 FFB-10903         2020                      246.    5589. TRUE    
##  7 FFB-10903         2019                      214.    5540. TRUE    
##  8 FFB-11609         2017                      161.    5535. TRUE    
##  9 FFB-10903         2018                      183.    5502. TRUE    
## 10 FFB-10903         2017                      161.    5402. TRUE    
## 11 FFB-11626         2019                      214.    5285. TRUE    
## 12 FFB-11626         2020                      246.    5285. TRUE    
## 13 FFB-11609         2016                      136.    5274. TRUE    
## 14 FFB-11626         2018                      183.    5267. TRUE    
## 15 FFB-10903         2016                      136.    5246. TRUE    
## 16 FFB-10903         2015                      114.    5125. TRUE    
## 17 FFB-11626         2017                      161.    4917. TRUE    
## 18 FFB-10903         2014                      100.    4889. TRUE    
## 19 FFB-10903         2013                       95.0   4772. TRUE    
## 20 FFB-10903         2012                       92.7   4710. TRUE    
## # ℹ 3,358 more rows

# Per harvest year: number of rows flagged at >2 SD and at >1 SD, plus the
# SD value itself (constant within a year, so the first value is taken).
test_palm_area_sd_agg <- test_palm_area_sd %>%
    group_by(YEAR_HARVEST) %>%
    summarize(
        n_2sd = sum(diff_2SD),
        n_1sd = sum(diff_1SD),
        sd_1 = first(SD_diff_curr_minus_prev_ha)
    )
test_palm_area_sd_agg

## # A tibble: 18 × 4
##    YEAR_HARVEST n_2sd n_1sd  sd_1
##           <dbl> <int> <int> <dbl>
##  1         2003   164   335  93.6
##  2         2004   172   348  90.3
##  3         2005   186   367  86.7
##  4         2006   189   369  87.2
##  5         2007   189   377  87.7
##  6         2008   191   387  87.7
##  7         2009   203   403  87.6
##  8         2010   210   412  88.9
##  9         2011   219   422  89.8
## 10         2012   234   431  92.7
## 11         2013   236   436  95.0
## 12         2014   236   434 100. 
## 13         2015   227   430 114. 
## 14         2016   180   390 136. 
## 15         2017   156   347 161. 
## 16         2018   150   336 183. 
## 17         2019   124   311 214. 
## 18         2020   112   288 246.

# Bar chart of the number of rows flagged at >2 SD in each harvest year.
test_palm_area_sd_agg %>%
    ggplot(aes(x = as.factor(YEAR_HARVEST), y = n_2sd)) +
    geom_col() +
    theme_bw() +
    labs(title = "Number ffb area outliers >2sd (based on yearly difference)")

We have a lot of outliers (between 112 and 150 per year above 2 SD for 2018-2020) that may affect embedding.