View or edit on GitHub
This page is synchronized from trase/data/indonesia/palm_oil/indicators/q1_2024/quality_assessment/QA_area.md. Last modified on 2026-02-03 10:30 CET by Jason J. Benedict.
Please view or edit the original file there; changes should be reflected here after a midnight build (CET time),
or manually triggering it with a GitHub action (link).
Metrics QA for Indonesia Oil palm - AREA
Here I compare and check differences between previous and current palm areas for embedding
- correlation (visual), mean, and sd differences in area and percent
- evaluating which units did not produce palm before but now produce and vice versa (check GEE script for visualization)
- evaluating outliers
differences at national and ffb level
# National totals per harvest year: summed current and previous palm area,
# followed by the absolute (current - previous) and relative (%) change.
national_area_diff <- test_palm_area %>%
  group_by(YEAR_HARVEST) %>%
  summarise(
    curr_PALM_HA = sum(palm_ha),
    prev_PALM_HA = sum(prev_PALM_HA)
  ) %>%
  mutate(
    difference_curr_minus_prev_ha = curr_PALM_HA - prev_PALM_HA,
    difference_perc = ((curr_PALM_HA / prev_PALM_HA) * 100) - 100
  )
# Bar chart of the absolute national difference (current - previous) per
# harvest year; the y value is divided by 1000 for readability.
national_area_diff %>%
  ggplot(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = difference_curr_minus_prev_ha / 1000
    )
  ) +
  geom_col() +
  theme_bw() +
  labs(title = "NATIONAL LEVEL: palm area difference")

# Bar chart of the relative national difference (percent change from the
# previous to the current total) per harvest year.
national_area_diff %>%
  ggplot(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = difference_perc
    )
  ) +
  geom_col() +
  theme_bw() +
  labs(title = "NATIONAL LEVEL: perc palm area difference")

# Scatter plot, one point per row of test_palm_area: current area on x,
# previous area on y (both divided by 1000), coloured by harvest year.
test_palm_area %>%
  ggplot() +
  geom_point(
    aes(
      x = palm_ha / 1000,
      y = prev_PALM_HA / 1000,
      colour = YEAR_HARVEST
    )
  ) +
  labs(title = "FFB-leve: prev to current area") +
  theme_bw()

# FFB level: mean of the row-wise difference (current - previous) per
# harvest year, shown as a bar chart.
test_palm_area %>%
  group_by(YEAR_HARVEST) %>%
  summarize(mean_diff_curr_minus_prev_ha = mean(palm_ha - prev_PALM_HA)) %>%
  ggplot() +
  geom_col(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = mean_diff_curr_minus_prev_ha
    )
  ) +
  labs(title = "FFB-leve: Mean area difference HA") +
  theme_bw()

# FFB level: standard deviation of the row-wise difference
# (current - previous) per harvest year, shown as a bar chart.
test_palm_area %>%
  group_by(YEAR_HARVEST) %>%
  summarize(SD_diff_curr_minus_prev_ha = sd(palm_ha - prev_PALM_HA)) %>%
  ggplot() +
  geom_col(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = SD_diff_curr_minus_prev_ha
    )
  ) +
  labs(title = "FFB-leve: SD area difference HA") +
  theme_bw()

At national level, the difference is small in percent! and the mean area
at ffb level is also fine :)
but the standard deviation (sd) is high for the recent & relevant years
(2018-2020)
Let's look at those that did not have palm previously but have palm now, and vice versa
# Per harvest year, count rows with non-zero current area but zero
# previous area, and plot the counts as bars.
test_palm_area %>%
  group_by(YEAR_HARVEST) %>%
  summarize(
    n_no_prev_production = sum((palm_ha != 0 & prev_PALM_HA == 0))
  ) %>%
  ggplot() +
  geom_col(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = n_no_prev_production
    )
  ) +
  labs(title = "ffbs with No palm before ") +
  theme_bw()

# List the FFBs that have palm now but had none before: take the largest
# current area per FFB, keep those above 100, sort descending and print
# up to 50 rows.
test_palm_area %>%
  filter(palm_ha != 0 & prev_PALM_HA == 0) %>%
  select(ffb_code, palm_ha) %>%
  group_by(ffb_code) %>%
  summarize(max_area = max(palm_ha)) %>%
  arrange(desc(max_area)) %>%
  filter(max_area > 100) %>%
  print(n = 50)
## # A tibble: 15 × 2
## ffb_code max_area
## <chr> <dbl>
## 1 FFB-11626 5285.
## 2 FFB-09567 1101.
## 3 FFB-07197 808.
## 4 FFB-09224 474.
## 5 FFB-09016 356.
## 6 FFB-04834 306.
## 7 FFB-09569 253.
## 8 FFB-09570 231.
## 9 FFB-03371 213.
## 10 FFB-06555 172.
## 11 FFB-11022 155.
## 12 FFB-07396 141.
## 13 FFB-01236 122.
## 14 FFB-06547 115.
## 15 FFB-03605 104.
# Per harvest year, count rows with zero current area but non-zero
# previous area, and plot the counts as bars.
test_palm_area %>%
  group_by(YEAR_HARVEST) %>%
  summarize(
    n_no_current_production = sum((palm_ha == 0 & prev_PALM_HA != 0))
  ) %>%
  ggplot() +
  geom_col(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = n_no_current_production
    )
  ) +
  labs(title = "ffbs with No current palm, but before") +
  theme_bw()

# List the FFBs that had palm before but have none now: take the largest
# previous area per FFB, sort descending and print up to 50 rows.
test_palm_area %>%
  filter(palm_ha == 0 & prev_PALM_HA != 0) %>%
  select(ffb_code, prev_PALM_HA) %>%
  group_by(ffb_code) %>%
  summarize(max_area = max(prev_PALM_HA)) %>%
  arrange(desc(max_area)) %>%
  print(n = 50)
## # A tibble: 3 × 2
## ffb_code max_area
## <chr> <dbl>
## 1 FFB-08724 1484
## 2 FFB-10529 100
## 3 FFB-05691 3
Let's look at the outliers: FFB-year rows whose area difference (current minus previous) exceeds 2 SD of that year's differences
# Per-year standard deviation of the row-wise area difference
# (current - previous); used below as the outlier threshold.
sd_difference_ha <- test_palm_area %>%
  group_by(YEAR_HARVEST) %>%
  summarize(SD_diff_curr_minus_prev_ha = sd(palm_ha - prev_PALM_HA))

# Attach the yearly SD to every FFB-year row and flag rows whose difference
# exceeds 1x / 2x that SD. The raw (signed) difference is compared, so only
# rows where the current area is larger than the previous one can be flagged.
test_palm_area_sd <- test_palm_area %>%
  select(ffb_code, YEAR_HARVEST, palm_ha, prev_PALM_HA) %>%
  # Explicit join key: silences the "Joining with `by = ...`" message and
  # prevents an accidental join on extra columns if the inputs ever share
  # more names than YEAR_HARVEST.
  left_join(sd_difference_ha, by = "YEAR_HARVEST") %>%
  mutate(
    diff_ha = palm_ha - prev_PALM_HA,
    # The comparisons already yield logical vectors; no if_else() needed.
    diff_2SD = diff_ha > (2 * SD_diff_curr_minus_prev_ha),
    diff_1SD = diff_ha > SD_diff_curr_minus_prev_ha
  )
# Show the rows flagged as >2 SD, largest difference first (20 rows printed).
test_palm_area_sd %>%
  filter(diff_2SD) %>%
  select(
    ffb_code,
    YEAR_HARVEST,
    SD_diff_curr_minus_prev_ha,
    diff_ha,
    diff_2SD
  ) %>%
  arrange(desc(diff_ha)) %>%
  print(n = 20)
## # A tibble: 3,378 × 5
## ffb_code YEAR_HARVEST SD_diff_curr_minus_prev_ha diff_ha diff_2SD
## <chr> <dbl> <dbl> <dbl> <lgl>
## 1 FFB-07065 2020 246. 6606. TRUE
## 2 FFB-07065 2019 214. 6509. TRUE
## 3 FFB-11609 2018 183. 6018. TRUE
## 4 FFB-11609 2019 214. 6018. TRUE
## 5 FFB-11609 2020 246. 6018. TRUE
## 6 FFB-10903 2020 246. 5589. TRUE
## 7 FFB-10903 2019 214. 5540. TRUE
## 8 FFB-11609 2017 161. 5535. TRUE
## 9 FFB-10903 2018 183. 5502. TRUE
## 10 FFB-10903 2017 161. 5402. TRUE
## 11 FFB-11626 2019 214. 5285. TRUE
## 12 FFB-11626 2020 246. 5285. TRUE
## 13 FFB-11609 2016 136. 5274. TRUE
## 14 FFB-11626 2018 183. 5267. TRUE
## 15 FFB-10903 2016 136. 5246. TRUE
## 16 FFB-10903 2015 114. 5125. TRUE
## 17 FFB-11626 2017 161. 4917. TRUE
## 18 FFB-10903 2014 100. 4889. TRUE
## 19 FFB-10903 2013 95.0 4772. TRUE
## 20 FFB-10903 2012 92.7 4710. TRUE
## # ℹ 3,358 more rows
# Per harvest year: number of rows flagged at 2 SD and at 1 SD, plus the
# SD value itself (constant within a year, so the first value is taken).
test_palm_area_sd_agg <- test_palm_area_sd %>%
  group_by(YEAR_HARVEST) %>%
  summarize(
    n_2sd = sum(diff_2SD),
    n_1sd = sum(diff_1SD),
    sd_1 = first(SD_diff_curr_minus_prev_ha)
  )
test_palm_area_sd_agg
## # A tibble: 18 × 4
## YEAR_HARVEST n_2sd n_1sd sd_1
## <dbl> <int> <int> <dbl>
## 1 2003 164 335 93.6
## 2 2004 172 348 90.3
## 3 2005 186 367 86.7
## 4 2006 189 369 87.2
## 5 2007 189 377 87.7
## 6 2008 191 387 87.7
## 7 2009 203 403 87.6
## 8 2010 210 412 88.9
## 9 2011 219 422 89.8
## 10 2012 234 431 92.7
## 11 2013 236 436 95.0
## 12 2014 236 434 100.
## 13 2015 227 430 114.
## 14 2016 180 390 136.
## 15 2017 156 347 161.
## 16 2018 150 336 183.
## 17 2019 124 311 214.
## 18 2020 112 288 246.
# Bar chart of the number of >2 SD outlier rows per harvest year.
test_palm_area_sd_agg %>%
  ggplot(
    aes(
      x = as.factor(YEAR_HARVEST),
      y = n_2sd
    )
  ) +
  geom_col() +
  labs(title = "Number ffb area outliers >2sd (based on yearly difference)") +
  theme_bw()

We have a lot of outliers (between 112 and 150 per year exceeding 2 SD in 2018-2020) that may affect embedding