Annex I: Compendium of R scripts
This annex compiles the complete set of scripts used in the soil sampling design workflow.
Script 1: Introduction to R
# Introduction to R
# 0. Playground =================================================
# learn important keyboard shortcuts
# Ctrl + enter for running code
# tab after writing the first three
# characters of the function name
# F1 to access the help
# explore the use of <-, $, [], ==, !=, c(), :,
# data.frame(), list(), as.factor()
a <- 10:15
a[2]
a[2:3]
b <- c("1", "a", a )
length(b)
df <- data.frame(column_a = 1:8, column_b = b)
df[,1]
df$column_b
as.numeric(df$column_b)
plot(df)
df[1:3,]
df[,1]
as.factor(b)
d <- list(a, b, df)
d
names(d)
names(d) <- c("numeric_vector", "character_vector", "dataframe")
d
d[[1]]
d$numeric_vector
a == b
a != b
# 1. Set working directory ======================================
# Set working directory to a specific location
setwd("C:/GIT/Digital-Soil-Mapping/")
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# 2. Install and load packages ==================================
# Install and load the readxl, tidyverse, and data.table packages
# using install.packages() and library()
install.packages("tidyverse")
install.packages("readxl")
install.packages("data.table")
library(tidyverse)
library(readxl)
library(data.table)
# Install packages from a repository other than CRAN
install.packages("remotes")
remotes::install_github("lemuscanovas/synoptReg")
# 3. Import a spreadsheet =======================================
## 3.1 Read the MS Excel file -----------------------------------
# Read the soil_data.xlsx file, sheet 2, using read_excel()
read_excel(path = "data/other/soil_data.xlsx", sheet = 2)
## 3.2 Read the csv file with the native function ---------------
# data/other/soil_profile_data.csv
read.csv("data/other/soil_profile_data.csv")
## 3.3 Read the csv file with the tidyverse function ------------
read_csv("data/other/soil_profile_data.csv")
## 3.4 Read the csv file with the data.table function -----------
fread("data/other/soil_profile_data.csv")
## 3.5 Assign the dataframe to an object called dat -------------
dat <- read_csv("data/other/soil_profile_data.csv")
# 4. Tidyverse functions ========================================
## 4.1 Select id_prof, id_hor, top, bottom, ph_h2o, cec from dat ----
dat_1 <- dat %>%
select(id_prof, id_hor, top, bottom, ph_h2o, cec)
## 4.2 Filter: pick observations by their values ----------------
# filter observations with cec > 30 cmolc/100g
dat_2 <- dat_1 %>%
filter(cec > 30)
dat_2
## 4.3 Mutate: create a new variable ----------------------------
# thickness = bottom - top
dat_3 <- dat_2 %>%
mutate(thickness = bottom - top)
## 4.4 Group_by and summarise -----------------------------------
# group by variable id_prof
# summarise taking the mean of pH and cec
dat_4 <- dat_3 %>%
group_by(id_prof) %>%
summarise(mean_ph = mean(ph_h2o),
mean_cec = mean(cec))
## 4.5 Reshape the table using pivot_longer ---------------------
# use dat_3
# put the names of the variables ph_h2o, cec and thickness in the
# column soil_property and their values in the column value,
# keeping the rest of the table. Save the result in dat_5
dat_5 <- dat_3 %>%
pivot_longer(ph_h2o:thickness, names_to = "soil_property",
values_to = "value")
## 4.6 Join the table sites.csv with dat_3 ----------------------
# Load soil_phys_data030_2.csv (in the data/other folder)
# Join its columns with dat_3 keeping all the rows of dat_3
# save the result as dat_6
phys <- read_csv("data/other/soil_phys_data030_2.csv")
phys <- phys %>% rename(id_prof = "ProfID")
dat_6 <- dat_3 %>%
left_join(phys)
# or
dat_6 <- phys %>%
right_join(dat_3)
# 5. Data visualization with ggplot2 ============================
## 5.1 1D plot: histograms --------------------------------------
# histograms of cec and ph_h2o
ggplot(dat_3, aes(x = cec)) + geom_histogram()
ggplot(dat_3, aes(x = ph_h2o)) + geom_histogram()
## 5.2 2D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o
ggplot(dat_3, aes(x = bottom, y = ph_h2o)) +
geom_point()
# add a fitting line
ggplot(dat_3, aes(x = bottom, y = ph_h2o)) +
geom_point() +
geom_smooth(method = "lm" )
## 5.3 3D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o, mapping cec to color and size
# inside the function aes()
ggplot(dat_3,
aes(x = bottom, y = ph_h2o, color = cec, size = cec)) +
geom_point()
# 6. Geospatial data with terra =================================
## Load packages (install them if needed)
library(terra)
## 6.1 Load a raster and a vector layer -------------------------
# Load data/other/landcover2013.tif using rast() function, then plot it
# Load data/shapes/landcover.shp using vect() function and
# plot it
# explore the attributes of these layers
r <- rast("data/other/landcover2013.tif")
plot(r)
v <- vect("data/shapes/landcover.shp")
plot(v)
## 6.2 Reproject a raster and a vector layer --------------------
# Check the current CRS (EPSG) of the raster and the vector.
# Find a *projected* CRS in http://epsg.io for Vietnam and
# copy the number
# Check the Arguments of function project (?project) that need to
# be defined
# Save the new object as r_proj and v_proj
# plot both objects
r_proj <- project(x = r, y = "epsg:3405", method = "near",
res = 250)
plot(r_proj)
v_proj <- project(x = v, y = "epsg:3405")
plot(v_proj, add = TRUE)
## 6.3 Cropping and masking a raster ----------------------------
# Compute the area of the polygons in v_proj
# (search for a function) and
# assign the values to a new column named area
# select the largest polygon using [], $, == and max() func.
# and save it as pol
# crop the raster with pol using the crop() function and save
#it as r_pol
# mask the raster r_pol with the polygon pol and save it
# with the same name
# plot each result
v_proj$area <- expanse(v_proj, unit = "ha")
pol <- v_proj[v_proj$area == max(v_proj$area)]
plot(pol)
r_pol <- crop(r_proj, pol)
plot(r_pol)
plot(pol, add = TRUE)
r_pol <- mask(r_pol, pol)
plot(r_pol)
## 6.4 Replace values in a raster by filtering their cells ------
# Explore the following link to understand how terra
#manage cell values
# https://rspatial.org/terra/pkg/4-algebra.html
# Replace values lower than 5 in r_pol by 0
r_pol[r_pol$landcover2013 < 5] <- 0
plot(r_pol)
## 6.5 Rasterize a vector layer ---------------------------------
# Use rasterize() function to convert v_proj to raster
# Use r_proj as reference raster
# Use field landcover to assign cell values, and plot the new map
v_class <- rasterize(x = v_proj, y = r_proj, field = "landcover" )
plot(v_class)
v_class
activeCat(v_class) <- 1
## 6.6 Extracting raster values using points --------------------
# Convert dat_6 to spatial points using vect() function
# (check help of vect())
# Note that the EPSG number is 3405
# Save the points as s
# Plot s and r_proj together in the same map (Argument add=TRUE)
# Extract the values of the raster using extract()
# function (check the help)
# Remove the ID column of the extracted values
# merge the extracted data with s using cbind() function
# Convert s as a dataframe
s <- vect(dat_6, geom=c("x", "y"), crs = "epsg:4326")
#s <- project(x = s, y = "epsg:3405")
plot(r_proj)
plot(s, add=TRUE)
x <- extract(r_proj,s, ID=FALSE)
s <- cbind(s,x)
d <- as.data.frame(s)
d
#GGally::ggscatmat(d)
## 6.7 Zonal statistics using polygons and rasters --------------
# Use the extract() func. to estimate the mean value of
# distance_proj at each polygon
# Use the fun= argument (check the help)
# Use the cbind() func. to merge v_proj and the extracted values
# convert v_proj to a dataframe
# Create a ggplot boxplot (geom_boxplot) with x=landcover
# and y=dist2access
distance <- rast("data/other/nghe_d2roads.tif")
plot(distance)
distance_proj <- project(x = distance, y = "epsg:3405", method = "bilinear",
res = 250)
plot(distance_proj)
x <- extract(distance_proj, v_proj, fun = mean, ID=FALSE)
v_proj <- cbind(v_proj, x)
d <- as_tibble(v_proj)
d %>%
ggplot(aes(x =landcover, y = dist2access, fill = landcover)) +
geom_boxplot() +
ylab("Distance to roads")
## END
Script 2: Download environmental covariates
var assets =
["projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio1",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio12",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio13",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio14",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio16",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio17",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio5",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio6",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/ngd10",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_min",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_range",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_range",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/snow_cover",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/swir_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/crops",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/flooded_vegetation",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/grass",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/shrub_and_scrub",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/trees",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_curvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_downslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm2_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_elevation_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_mrn_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_neg_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_pos_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_slope_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_tpi_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_twi_500m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_upslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_vbf_250m"];
// Load borders
/// Using UN 2020 (replace the countries to download)
/// var ISO = ['ITA'];
/// var aoi =
/// ee.FeatureCollection(
/// 'projects/digital-soil-mapping-gsp-fao/assets/UN_BORDERS/BNDA_CTY'
/// )
/// .filter(ee.Filter.inList('ISO3CD', ISO));
/// var region = aoi.geometry();
/// Using a shapefile
/// 1. Upload the borders of your countries as an asset
/// 2. Replace 'your_shapefile' with the path to your shapefile
var shapefile = ee.FeatureCollection('projects/digital-soil-mapping-gsp-fao/assets/Nghe_An');
var region = shapefile.geometry().bounds();
// Load assets as ImageCollection
var assetsCollection = ee.ImageCollection(assets);
// Clip each image in the collection to the region of interest
var clippedCollection = assetsCollection.map(function(img){
return img.clip(region).toFloat();
});
// Function to replace masked values with zeroes for fpar bands
function replaceMaskedFpar(img) {
var allBands = img.bandNames();
var fparBands =
allBands.filter(ee.Filter.stringStartsWith('item', 'fpar'));
var nonFparBands = allBands.removeAll(fparBands);
var fparImg = img.select(fparBands).unmask(0);
var nonFparImg = img.select(nonFparBands);
// If there are no fpar bands, return the original image
var result = ee.Algorithms.If(fparBands.length().eq(0),
img,
nonFparImg.addBands(fparImg));
return ee.Image(result);
}
// Clip each image in the collection to the region of
//interest and replace masked values for fpar bands
var clippedCollection = assetsCollection.map(function(img){
var clippedImg = img.clip(region).toFloat();
return replaceMaskedFpar(clippedImg);
});
// Stack the layers and maintain the
//layer names in the final file
var stacked = clippedCollection.toBands();
// Get the list of asset names
var assetNames = ee.List(assets).map(function(asset) {
return ee.String(asset).split('/').get(-1);
});
// Rename the bands with asset names
var renamed = stacked.rename(assetNames);
print(renamed, 'Covariates to be exported')
// Visualize the result
// Set a visualization parameter
// (you can adjust the colors as desired)
var visParams = {
bands: 'bio1',
min: 0,
max: 1,
palette: ['blue', 'green', 'yellow', 'red']
};
// Add the layer to the map
Map.centerObject(renamed, 6)
Map.addLayer(renamed, visParams, 'Covariates');
// Export the stacked image to Google Drive
Export.image.toDrive({
image: renamed,
description: 'covariates',
folder: 'GEE',
scale: 250,
maxPixels: 1e13,
region: region
});
/* Create mask for croplands ----------------------------*/
// Load the Copernicus Global Land Service image collection
var imageCollection =
ee.Image("COPERNICUS/Landcover/100m/Proba-V-C3/Global/2019")
.select("discrete_classification")
.clip(region)
var crs = 'EPSG:4326'; // WGS84
var res = 250; // Resolution (scale) in metres
// Default resampling is nearest neighbor
var image1 = imageCollection.resample()
.reproject({
crs: crs, // Add your desired CRS here
scale: res // Add your desired scale here
});
// Reclassify the land cover classes
var inList = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100,
111, 112, 113, 114, 115, 116,
121, 122, 123, 124, 125, 126, 200];
var outList = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
var FAO_lu = image1.remap(inList, outList)
.toDouble()
.clip(region);
// print(FAO_lu)
// Convert 0 to NA
var mask = FAO_lu.neq(0);
print(mask)
FAO_lu = FAO_lu.updateMask(mask);
print(FAO_lu, "Mask")
var visParams = {
bands: 'remapped',
min: 0,
max: 1,
palette: ['green', 'yellow']
};
// Add the layer to the map
Map.addLayer(FAO_lu,visParams ,'Mask');
// Export the land cover image as a raster to Google Drive
Export.image.toDrive({
image: FAO_lu,
folder: 'GEE',
description: 'mask',
scale: res, // Add your desired scale here
region: region,
crs: crs, // Add your desired CRS here
maxPixels: 1e13 // Add a maximum number of pixels
//for export if needed
});
Script 3: Evaluate Legacy Data
#
# Digital Soil Mapping
# Soil Sampling Design
# Evaluation of Legacy Data
#
# GSP-Secretariat
# Contact: Luis.RodriguezLado@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# Script for evaluating the degree of representativeness of a soil legacy dataset
# relative to the diversity of the environmental conditions described in a set
# of raster covariates (a toy illustration of the convex-hull check used in
# step 7 follows this header block).
#
# 0 - Set working directory and load packages
# 1 - User-defined variables
# 2 - Prepare data
# 3 - Extract environmental data from rasters at soil locations
# 4 - Compute variability matrix in covariates
# 5 - Calculate hypercube of "covariates" distribution (P)
# 6 - Calculate hypercube of "sample" distribution (Q)
# 7 - Calculate Representativeness of the Legacy Dataset
#________________________________________________________________
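# Toy illustration of the convex-hull representativeness check (step 7).
# Minimal, self-contained sketch on synthetic 2D data, not part of the original
# workflow: the convex hull of the sample is built in covariate (or PC) space
# and the share of the study-area "population" falling inside it is taken as a
# measure of representativeness.
library(tripack)
library(sp)
set.seed(1)
pop_toy <- cbind(x = rnorm(500), y = rnorm(500))           # "population" (study area)
smp_toy <- pop_toy[sample(nrow(pop_toy), 30), ]            # "legacy sample"
tr_toy  <- tri.mesh(smp_toy[, 1], smp_toy[, 2], "remove")  # Delaunay triangulation
ch_toy  <- convex.hull(tr_toy, plot.it = FALSE)            # convex hull of the sample
in_toy  <- point.in.polygon(pop_toy[, 1], pop_toy[, 2], ch_toy$x, ch_toy$y)
sum(in_toy > 0) / nrow(pop_toy) * 100                      # % of population inside the hull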
## 0 - Set working directory and load packages =================================
#remotes::install_github("lemuscanovas/synoptReg")
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
# List of packages
packages <- c("sp","terra","raster","sf","clhs", "sgsR","entropy", "tripack",
"manipulate","dplyr","synoptReg")
# Load packages
lapply(packages, require, character.only = TRUE)
# Remove object to save memory space
rm(packages)
## 1 - User-defined variables ==================================================
# Path to rasters
raster.path <- "data/rasters"
# Path to shapes
shp.path <- "data/shapes"
# Path to results
results.path <- "data/results/"
# Path to additional data
other.path <- "data/other/"
# Aggregation factor for up-scaling raster covariates (optional)
agg.factor = 10
## 2 - Prepare data ============================================================
## Load raster covariate data
# Read Spatial data covariates as rasters with terra
cov.dat <- list.files(raster.path, pattern = "tif$", recursive = TRUE, full.names = TRUE)
cov.dat <- terra::rast(cov.dat) # SpatRaster from terra
# Aggregate stack to simplify data rasters for calculations
cov.dat <- aggregate(cov.dat, fact=agg.factor, fun="mean")
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path,"/Nghe_An.shp")),quiet=TRUE)
# Crop covariates on administrative boundary
cov.dat <- crop(cov.dat, nghe, mask=TRUE)
# Transform raster information with PCA
pca <- raster_pca(cov.dat)
# Get SpatRaster layers
cov.dat <- pca$PCA
# Create a raster stack to be used as input in the clhs::clhs function
cov.dat.ras <- raster::stack(cov.dat)
# Subset rasters
cov.dat <- pca$PCA[[1:first(which(pca$summaryPCA[3,]>0.99))]]
cov.dat.ras <- cov.dat.ras[[1:first(which(pca$summaryPCA[3,]>0.99))]]
## 3 - Extract environmental data from rasters at soil locations ===============
# Load legacy soil data
p.dat <- terra::vect(file.path(paste0(shp.path,"/legacy_soils.shp")))
# Extract data
p.dat_I <- terra::extract(cov.dat, p.dat)
p.dat_I <- na.omit(p.dat_I) # Remove soil points outside study area
p.dat_I.df <- p.dat_I[,-1]
str(p.dat_I.df)
## 4 - Compute variability matrix in the covariates ====================================
# Define Number of bins
nb <- 25
# Quantile matrix of the covariate data
q.mat <- matrix(NA, nrow = (nb + 1), ncol = nlyr(cov.dat))
j = 1
for (i in 1:nlyr(cov.dat)) {
ran1 <- minmax(cov.dat[[i]])[2] - minmax(cov.dat[[i]])[1]
step1 <- ran1 / nb
q.mat[, j] <-
seq(minmax(cov.dat[[i]])[1],
to = minmax(cov.dat[[i]])[2],
by = step1)
j <- j + 1
}
q.mat
## 5 - Calculate hypercube of "covariates" distribution (P) ===================
# Convert SpatRaster to dataframe for calculations
cov.dat.df <- as.data.frame(cov.dat)
cov.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
cov.dat.mx <- as.matrix(cov.dat.df)
for (i in 1:nrow(cov.dat.mx)) {
for (j in 1:ncol(cov.dat.mx)) {
dd <- cov.dat.mx[[i, j]]
if (!is.na(dd)) {
for (k in 1:nb) {
kl <- q.mat[k, j]
ku <- q.mat[k + 1, j]
if (dd >= kl && dd <= ku) {
cov.mat[k, j] <- cov.mat[k, j] + 1
}
}
}
}
}
cov.mat
## 6 - Calculate hypercube of "sample" distribution (Q) ========================
h.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
for (i in 1:nrow(p.dat_I.df)) {
for (j in 1:ncol(p.dat_I.df)) {
dd <- p.dat_I.df[i, j]
if (!is.na(dd)) {
for (k in 1:nb) {
kl <- q.mat[k, j]
ku <- q.mat[k + 1, j]
if (dd >= kl && dd <= ku) {
h.mat[k, j] <- h.mat[k, j] + 1
}
}
}
}
}
h.mat
## 7 - Calculate Representativeness of the Legacy Dataset ==================
## Calculate the proportion of "variables" in the covariate spectra that fall within the convex hull of variables in the "environmental sample space"
# Principal component of the legacy data sample
pca.s = prcomp(p.dat_I[,2:(ncol(cov.dat.df)+1)],scale=TRUE, center=TRUE)
scores_pca1 = as.data.frame(pca.s$x)
# Plot the first 2 principal components and convex hull
rand.tr <- tri.mesh(scores_pca1[,1],scores_pca1[,2],"remove") # Delaunay triangulation
rand.ch <- convex.hull(rand.tr, plot.it=F) # convex hull
pr_poly = cbind(x=c(rand.ch$x),y=c(rand.ch$y)) # save the convex hull vertices
plot(scores_pca1[,1], scores_pca1[,2], xlab="PCA 1", ylab="PCA 2", xlim=c(min(scores_pca1[,1:2]), max(scores_pca1[,1:2])),ylim=c(min(scores_pca1[,1:2]), max(scores_pca1[,1:2])), main='Convex hull of soil legacy data')
lines(c(rand.ch$x,rand.ch$x[1]), c(rand.ch$y,rand.ch$y[1]),col="red",lwd=1) # draw the convex hull (domain of legacy data)
# PCA projection of study area population onto the principal components
PCA_projection <- predict(pca.s, cov.dat.df) # Project study area population onto sample PC
newScores = cbind(x=PCA_projection[,1],y=PCA_projection[,2]) # PC scores of projected population
# Plot the polygon and all points to be checked
plot(newScores, xlab="PCA 1", ylab="PCA 2", xlim=c(min(newScores[,1:2]), max(newScores[,1:2])), ylim=c(min(newScores[,1:2]), max(newScores[,1:2])), col='black', main='Environmental space plots over the convex hull of soil legacy data')
polygon(pr_poly,col='#99999990')
# Check which points fall within the polygon
pip <- point.in.polygon(newScores[,2], newScores[,1], pr_poly[,2],pr_poly[,1],mode.checked=FALSE)
newScores <- data.frame(cbind(newScores, pip))
# Plot points outside convex hull
points(newScores[which(newScores$pip==0),1:2],pch='X', col='red')
# Proportion of the conditions in the study area that fall within the convex hull of sample conditions
sum(nrow(newScores[newScores$pip>0,]))/nrow(newScores)*100
## END
Script 4: Calculate Minimum and Optimal Sample Sizes
#
# Digital Soil Mapping
# Soil Sampling Design
# Optimizing Sample Size
#
# GSP-Secretariat
# Contact: Luis.RodriguezLado@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# The goal of this script is to determine the minimum sample size required
# to describe an area while retaining 95% coincidence with the environmental
# variability of the covariates in the area (a toy illustration of the
# KL-divergence comparison follows this header block)
#
# 0 - Set working directory and load necessary packages
# 1 - User-defined variables
# 2 - Import national data
# 3 - Calculate the minimum sample size to describe the area
# 4 - Plot covariate diversity as PCA scores
# 5 - KL divergence and % similarity results for growing N samples
# 6 - Model KL divergence
# 7 - Determine the minimum sample size for 95% coincidence
# 8 - Determine the optimal iteration according to the minimum N size
# 9 - Plot minimum points from best iteration
# 10 - Calculate COOBS (sgsR)
# 11 - Calculate minimum and optimal sample size with opendsm
#________________________________________________________________
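# Toy illustration of the Kullback-Leibler (KL) comparison used in steps 5-6.
# Minimal, self-contained sketch on synthetic data, not part of the workflow
# below: the binned distribution of a covariate over the whole area (P) is
# compared with that of a candidate sample (Q); the smaller the KL divergence,
# the better the sample reproduces the covariate distribution.
library(entropy)
set.seed(1)
pop_toy <- rnorm(10000)                        # one covariate over the "study area"
smp_toy <- sample(pop_toy, 100)                # candidate sample
brk_toy <- seq(min(pop_toy), max(pop_toy), length.out = 26)          # 25 bins
P_toy   <- hist(pop_toy, breaks = brk_toy, plot = FALSE)$counts + 1  # +1 avoids zero counts
Q_toy   <- hist(smp_toy, breaks = brk_toy, plot = FALSE)$counts + 1
KL.empirical(P_toy, Q_toy)                     # KL divergence between area and sample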
## 0 - Set working directory and load necessary packages =======================
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
# List of packages
packages <- c("sp","terra","raster","sf","clhs",
"sgsR","entropy", "tripack",
"manipulate","dplyr","plotly","synoptReg")
# Load packages
lapply(packages, require, character.only = TRUE)
rm(packages) # Remove object to save memory space
## 1 - User-defined variables ==================================================
# Path to rasters
raster.path <- "data/rasters/"
# Path to shapes
shp.path <- "data/shapes/"
# Path to results
results.path <- "data/results/"
# Path to additional data
other.path <- "data/other/"
# Aggregation factor for up-scaling raster covariates (optional)
agg.factor = 10
# Define parameters to determine minimum sampling size
initial.n <- 60 # Initial sampling size to test
final.n <- 360 # Final sampling size to test
by.n <- 20 # Increment size
iters <- 5 # Number of trials on each size
## 2 - Import national data ====================================================
## Load raster covariate data
# Read Spatial data covariates as rasters with terra
cov.dat <- list.files(raster.path, pattern = "tif$", recursive = TRUE, full.names = TRUE)
cov.dat <- terra::rast(cov.dat) # SpatRaster from terra
# Aggregate stack to simplify data rasters for calculations
cov.dat <- aggregate(cov.dat, fact=agg.factor, fun="mean")
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path,"/Nghe_An.shp")),quiet=TRUE)
# Crop covariates on administrative boundary
cov.dat <- crop(cov.dat, nghe, mask=TRUE)
# Store elevation and slope separately
elevation <- cov.dat$dtm_elevation_250m
slope <- cov.dat$dtm_slope_250m
# Load roads
roads <- vect(file.path(paste0(shp.path,"/roads.shp")))
roads <- crop(roads, nghe)
# Simplify raster information with PCA
pca <- raster_pca(cov.dat)
# Get SpatRaster layers
cov.dat <- pca$PCA
# Subset rasters to main PC (var. explained <=0.99)
n_comps <- first(which(pca$summaryPCA[3,]>0.99))
cov.dat <- pca$PCA[[1:n_comps]]
# Plot covariates
plot(cov.dat)
# 3 - Calculate the minimum sample size to describe the area ===================
# Start computations ----
# Initialize empty vectors to store results
number_of_samples <- c()
prop_explained <- c()
klo_samples <- c()
samples_storage <- list()
# Convert SpatRaster to dataframe
cov.dat.df <- as.data.frame(cov.dat)
# Start evaluation with growing sample sizes
for (trial in seq(initial.n, final.n, by = by.n)) {
for (iteration in 1:iters) {
# Generate stratified clhs samples
p.dat_I <- sample_clhs(cov.dat,
nSamp = trial, iter = 10000,
progress = FALSE, simple = FALSE)
# Get covariate values as dataframe and delete NAs, avoid geometry
p.dat_I.df <- as.data.frame(p.dat_I) %>%
dplyr::select(!(geometry)) %>%
na.omit()
# Store samples as list
samples_storage[[paste0("N", trial, "_", iteration)]] <- p.dat_I
## Comparison of population and sample distributions - Kullback-Leibler (KL) divergence
# Define quantiles of the study area (number of bins)
nb <- 25
# Quantile matrix of the covariate data
q.mat <- matrix(NA, nrow = (nb + 1), ncol = nlyr(cov.dat))
j = 1
for (i in 1:nlyr(cov.dat)) {
ran1 <- minmax(cov.dat[[i]])[2] - minmax(cov.dat[[i]])[1]
step1 <- ran1 / nb
q.mat[, j] <-
seq(minmax(cov.dat[[i]])[1],
to = minmax(cov.dat[[i]])[2],
by = step1)
j <- j + 1
}
q.mat
# Hypercube of covariates in study area
# Initialize the covariate matrix
cov.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
cov.dat.mx <- as.matrix(cov.dat.df)
for (i in 1:nrow(cov.dat.mx)) {
for (j in 1:ncol(cov.dat.mx)) {
dd <- cov.dat.mx[[i, j]]
if (!is.na(dd)) {
for (k in 1:nb) {
kl <- q.mat[k, j]
ku <- q.mat[k + 1, j]
if (dd >= kl && dd <= ku) {
cov.mat[k, j] <- cov.mat[k, j] + 1
}
}
}
}
}
cov.mat
# Compare whole study area covariate space with the selected sample
# Sample data hypercube (the same as for the raster data but on the sample data)
h.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
for (i in 1:nrow(p.dat_I.df)) {
for (j in 1:ncol(p.dat_I.df)) {
dd <- p.dat_I.df[i, j]
if (!is.na(dd)) {
for (k in 1:nb) {
kl <- q.mat[k, j]
ku <- q.mat[k + 1, j]
if (dd >= kl && dd <= ku) {
h.mat[k, j] <- h.mat[k, j] + 1
}
}
}
}
}
h.mat
## Compute Kullback-Leibler (KL) divergence
kl.index <- c()
for (i in 1:ncol(cov.dat.df)) {
kl <- KL.empirical(c(cov.mat[, i]), c(h.mat[, i]))
kl.index <- c(kl.index, kl)
klo <- mean(kl.index)
}
## Calculate the proportion of "env. variables" in the covariate spectra that fall within the convex hull of variables in the "environmental sample space"
# Principal component of the data sample
pca.s = prcomp(p.dat_I.df, scale = TRUE, center = TRUE)
scores_pca1 = as.data.frame(pca.s$x)
# Plot the first 2 principal components and convex hull
rand.tr <-
tri.mesh(scores_pca1[, 1], scores_pca1[, 2], "remove") # Delaunay triangulation
rand.ch <- convex.hull(rand.tr, plot.it = F) # convex hull
pr_poly <-
cbind(x = c(rand.ch$x), y = c(rand.ch$y)) # save the convex hull vertices
# PCA projection of study area population onto the principal components
PCA_projection <-
predict(pca.s, cov.dat.df) # Project study area population onto sample PC
newScores = cbind(x = PCA_projection[, 1], y = PCA_projection[, 2]) # PC scores of projected population
# Check which points fall within the polygon
pip <-
point.in.polygon(newScores[, 2], newScores[, 1], pr_poly[, 2], pr_poly[, 1], mode.checked =
FALSE)
newScores <- data.frame(cbind(newScores, pip))
klo_samples <- c(klo_samples, klo)
prop_explained <-
c(prop_explained, sum(newScores$pip) / nrow(newScores) * 100)
number_of_samples <- c(number_of_samples, trial)
print(
paste(
"N samples = ",
trial,
" out of ",
final.n,
"; iteration = ",
iteration,
"; KL = ",
klo,
"; Proportion = ",
sum(newScores$pip) / nrow(newScores) * 100
)
)
}
}
# 4 - Plot covariate diversity as PCA scores ===================================
plot(newScores[,1:2],
xlab = "PCA 1",
ylab = "PCA 2",
xlim = c(min(newScores[,1:2], na.rm = T), max(newScores[,1:2], na.rm = T)),
ylim = c(min(newScores[,1:2], na.rm = T), max(newScores[,1:2], na.rm = T)),
col = 'black',
main = 'Environmental space plots on convex hull of soil samples')
polygon(pr_poly, col = '#99999990')
# # Plot points outside convex hull
points(newScores[which(newScores$pip == 0), 1:2],
col = 'red',
pch = 12,
cex = 1)
# 5 - KL divergence and % coincidence for growing N samples =============
# Merge data from number of samples, KL divergence and % coincidence
results <- data.frame(number_of_samples, klo_samples, prop_explained)
names(results) <- c("N", "KL", "Perc")
# Calculate mean results by N size
mean_result <- results %>%
group_by(N) %>%
summarize_all(mean)
mean_result
## Plot dispersion on KL and % by N
par(mar = c(5, 4, 5, 5))
boxplot(
Perc ~ N,
data = results,
col = rgb(1, 0.1, 0, alpha = 0.5),
ylab = "% coincidence"
)
mtext("KL divergence", side = 4, line = 3)
# Add new plot
par(new = TRUE, mar = c(5, 4, 5, 5))
# Box plot
boxplot(
KL ~ N,
data = results,
axes = FALSE,
outline = FALSE,
col = rgb(0, 0.8, 1, alpha = 0.5),
ylab = ""
)
axis(
4,
at = seq(0.02, 0.36, by = .06),
label = seq(0.02, 0.36, by = .06),
las = 3
)
# Draw legend
par(xpd=TRUE)
legend("top", inset=c(1,-.15) ,c("% coincidence", "KL divergence"), horiz=T,cex=.9,
box.lty=0,fill = c(rgb(1, 0.1, 0, alpha = 0.5), rgb(0, 0.8, 1, alpha = 0.5)))
par(xpd=FALSE)
# 6 - Model KL divergence ======================================================
# Create an exponential decay function of the KL divergence
x <- mean_result$N
y = (mean_result$KL - min(mean_result$KL)) / (max(mean_result$KL) - min(mean_result$KL)) #KL
# Parametrize Exponential decay function
start <- list() # Initialize an empty list for the starting values
#fit function
k = 2
b0 = 0.01
b1 = 0.01
fit1 <-
nls(
y ~ k * exp(-b1 * x) + b0,
start = list(k = k, b0 = b0, b1 = b1),
control = list(maxiter = 500),
trace = T
)
summary(fit1)
# Plot fit
xx <- seq(1, final.n, 1)
plot(x, y)
lines(xx, predict(fit1, list(x = xx)))
# Predict with vfit function
jj <- predict(fit1, list(x = xx))
normalized = 1 - (jj - min(jj)) / (max(jj) - min(jj))
# 7 - Determine the minimum sample size to account for 95% of cumulative probability of the covariate diversity ====
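# minimum_n is the smallest N (xx runs over 1:final.n) at which the normalized
# (1 - KL) curve first reaches 0.95 of its full range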
minimum_n <- length(which(normalized < 0.95)) + 1
# Plot cdf and minimum sampling point
x <- xx
y <- normalized
mydata <- data.frame(x, y)
opti <- mydata[mydata$x == minimum_n, ]
plot_ly(
mydata,
x = ~ x,
y = ~ normalized,
mode = "lines+markers",
type = "scatter",
name = "CDF (1–KL divergence)"
) %>%
add_trace(
x = ~ x,
y = ~ jj,
mode = "lines+markers",
type = "scatter",
yaxis = "y2",
name = "KL divergence"
) %>%
add_trace(
x = ~ opti$x,
y = ~ opti$y,
yaxis = "y",
mode = "markers",
name = "Minimum N",
marker = list(
size = 8,
color = '#d62728',
line = list(color = 'black', width = 1)
)
) %>%
layout(
xaxis = list(
title = "N",
showgrid = T,
dtick = 50,
tickfont = list(size = 11)
),
yaxis = list(title = "1–KL divergence (% CDF)", showgrid = F),
yaxis2 = list(
title = "KL divergence",
overlaying = "y",
side = "right"
),
legend = list(
orientation = "h",
y = 1.2,
x = 0.1,
traceorder = "normal"
),
margin = list(
t = 50,
b = 50,
r = 100,
l = 80
),
hovermode = 'x'
) %>%
config(displayModeBar = FALSE)
# 8 - Determine the optimal iteration according to the minimum N size ==========
optimal_iteration <- results[which(abs(results$N - minimum_n) == min(abs(results$N - minimum_n))), ] %>%
mutate(IDX = 1:n()) %>%
filter(Perc == max(Perc))
# 9 - Plot minimum points from best iteration ==================================
N_final <- samples_storage[paste0("N", optimal_iteration$N, "_", optimal_iteration$IDX)][[1]]
plot(cov.dat[[1]])
plot(N_final,add=T,col="red")
## 10 - Calculate COOBS (sgsR) =================================================
#' COOBS (count of observations) is an algorithm that maps which areas of a
#' spatial domain are adequately sampled and which are under-sampled by an
#' existing sampling pattern.
#' COOBS can be used to design an additional survey by limiting the cLHS
#' algorithm to areas where the COOBS value is below some specified threshold.
coobs <- calculate_coobs(
mraster = cov.dat,
existing = N_final,
plot = TRUE,
cores = 4,
filename = paste0(results.path,"/coobs_SampleSize.tif"),
overwrite = TRUE
)
# Plot COOBS
plot(coobs[[2]], main="Number of samples in similar environment")
plot(nghe[1], col="transparent", add=TRUE)
## 11 - Calculate minimum and optimal sample size with opendsm =================
#install.packages('~/Downloads/DescTools_0.99.45.tar', repos=NULL, type='source')
#install.packages('DescTools')
library("DescTools")
#' Source scripts from https://github.com/newdale/opendsm/tree/main
#'
#' Determine minimum sample size for the clhs algorithm
#' This script is based on the publication:
#' Determining minimum sample size for the conditioned Latin hypercube sampling algorithm
#' DOI: https://doi.org/10.1016/j.pedsph.2022.09.001
source("scripts/clhs_min.R")
#' Optimal sample size based on the publication:
#' Divergence metrics for determining optimal training sample size in digital soil mapping
#' DOI: https://doi.org/10.1016/j.geoderma.2023.116553
source("scripts/opt_sample.R")
#' Perform Sequential Variable Inflation Factor Analysis
source("scripts/seqVIF.R")
#' Function to calculate the KL-divergence
#' score between two probability distributions, P and Q.
source("scripts/kldiv.R")
#' Function to calculate the Jensen-Shannon Distance
#' score between two probability distributions, P and Q.
source("scripts/jsdist.R")
#' Function to calculate the Jensen-Shannon-divergence
#' score between two probability distributions, P and Q.
source("scripts/jsdiv.R")
# Convert raster covariates to dataframe
cov.dat.df <- data.frame(cov.dat)
# Calculate minimum sample size
min_N <-clhs_min(cov.dat.df)
# Calculate optimal sample size using normalized KL-div, JS-div and JS distance
opt_N <- opt_sample(alg="clhs",s_min=150,s_max=500, s_step=50, s_reps=4, covs = cov.dat.df,clhs_iter=100, cpus=NULL, conf=0.95)
## END
Script 5: Stratified sampling on vector and raster data
This script uses soil and landcover classes to define polygon strata for an area-weighted stratified sampling design.
#
# Digital Soil Mapping
# Soil Sampling Design
# Stratified Sampling on 'soil-landcover' strata
#
# Contact: Luis.RodriguezLado@fao.org
# Marcos.Angelini@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script =======================================================
# The goal of this script is to perform random and regular stratified
# soil sampling designs using soil/landcover classes as 'strata'
# Samples are distributed among the strata in proportion to stratum area,
# with a minimum of 2 sampling points per stratum (a toy illustration of
# this allocation follows this header block). Small polygons (< 100 ha)
# are eliminated from the calculations.
# Only strata classes suitable for sampling are included in the analyses.
# The final sample data includes 'target' points - i.e. primary sampling points -
# and 'replacement' points - i.e. points to be used as replacements in case
# the corresponding 'target' point cannot be sampled for any reason.
#
# 0 - Set working directory and load packages
# 1 - User-defined variables
# 2 - Import data
# 3 - Delineate soil strata
# 4 - Delineate land-use strata
# 5 - Create sampling strata
# 6 - Accommodate strata to requirements
# 7 - Stratified random sampling
# 8 - Calculate replacement areas for points
# 9 - Stratified regular sampling
# 10 - Random Sampling based on a stratified raster
#________________________________________________________________
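# Toy illustration of the area-weighted allocation used in section 7.
# Minimal, self-contained sketch with made-up stratum areas, not part of the
# workflow below: every stratum first receives 2 samples and the remaining
# budget is shared in proportion to stratum area.
toy_area <- c(A = 50000, B = 12000, C = 3000)   # stratum areas (ha)
n_toy    <- 25                                  # total sample size
alloc    <- 2 + round(toy_area / sum(toy_area) * (n_toy - 2 * length(toy_area)))
alloc
sum(alloc)  # the while-loop in section 7 then nudges counts by +/- 1 until the sum equals n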
## 0 - Set working directory and load packages =================================
# Load packages as a vector objects
packages <- c("sf", "terra", "tidyverse", "rmapshaper",
"units","plyr", "mapview", "leaflet", "stars","sgsR")
lapply(packages, require, character.only = TRUE) # Load packages
rm(packages) # Remove object to save memory space
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
## 1 - User-defined variables ==================================================
# Path to rasters
raster.path <- "data/rasters"
# Path to shapes
shp.path <- "data/shapes"
# Path to results
results.path <- "data/results/"
# Path to additional data
other.path <- "data/other/"
# Set number of samples
n <- 250
## 2 - Import data =============================================================
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path,"/Nghe_An.shp")),quiet=TRUE)
# Load soil data
soil <- st_read(paste0(shp.path,"/soils.shp"),promote_to_multi = FALSE)
# Load Landcover data
lc <- st_read(paste0(shp.path,"/landcover.shp"))
# Define a bounding area (optional).
# If BOUNDING = TRUE, then a bounding–area shapefile (bb) must be specified
# and the line in the code below must be uncommented
BOUNDING = FALSE
#bb <- st_read(paste0(shp.path,"/your_bounding_area.shp"))
# Compute replacement areas (optional).
distance.buffer = 500
## 3 - Delineate soil strata ===================================================
# Clip soil data by bounding area if defined
if (BOUNDING) {
soil <- st_intersection(soil, bb)
} else {
soil <- soil
}
# Check geometry
soil <- st_make_valid(soil)
soil <- st_cast(soil, "MULTIPOLYGON")
# Aggregate information by reference soil group (RSG)
#unique(soil$RSG)
# Remove NAs (water bodies are deleted)
soil <- soil[!is.na(soil$RSG),]
#unique(soil$RSG)
# Dissolve polygons by reference soil group
soil <- soil %>% group_by(RSG) %>% dplyr::summarize()
# Write soil strata to disk
st_write(soil, paste0(results.path,"soil_classes.shp"), delete_dsn = TRUE)
## Plot aggregated soil classes
map = leaflet(options = leafletOptions(minZoom = 8)) %>%
addTiles()
mv <- mapview(soil["RSG"], alpha=0, homebutton=T, layer.name = "soils", map=map)
mv@map
#ggplot() + geom_sf(data=soil, aes(fill = factor(RSG)))
## 4 - Delineate land-use strata ===============================================
# Clip landcover data by bounding area if exist
if (BOUNDING) {
lc <- st_intersection(lc, bb)
} else {
lc <- lc
}
# Check geometry
lc <- st_make_valid(lc)
# Aggregate landcover units
unique(lc$landcover)
lc$landcover[lc$landcover=="Broadleaf forest"]<- "Forest"
lc$landcover[lc$landcover=="Needleleaf forest"]<- "Forest"
lc$landcover[lc$landcover=="Mixed forest"]<- "Forest"
lc <- lc[lc$landcover!="Water body",]
unique(lc$landcover)
# Dissolve polygons by landcover class
lc <- lc %>% group_by(landcover) %>% dplyr::summarize()
# Write landcover strata as shapefile
st_write(lc, paste0(results.path,"lc_classes.shp"), delete_dsn = TRUE)
# Plot map with the land cover information
map = leaflet() %>%
addTiles()
mv <- mapview(lc["landcover"], alpha=0, homebutton=T, layer.name = "Landcover", map=map)
mv@map
#ggplot() + geom_sf(data=lc, aes(fill = factor(landcover)))
## 5 - Create sampling strata ==================================================
# Combine soil and landcover layers
sf_use_s2(FALSE) # use in case st_intersection cannot be run
soil_lc <- st_intersection(st_make_valid(soil), st_make_valid(lc))
soil_lc$soil_lc <- paste0(soil_lc$RSG, "_", soil_lc$landcover)
soil_lc <- soil_lc %>% dplyr::select(soil_lc, geometry)
## 6 - Accommodate strata to requirements ======================================
# Select by area: convert area to ha and select polygons larger than 100 ha
soil_lc$area <- st_area(soil_lc)/10000
soil_lc$area <- as.vector(soil_lc$area)
soil_lc <- soil_lc %>%
group_by(soil_lc) %>%
mutate(area = sum(area))
soil_lc <- soil_lc[soil_lc$area > 100,]
plot(soil_lc[1])
# Replace blank spaces with underscore symbol to keep names uniform
soil_lc$soil_lc <- str_replace_all(soil_lc$soil_lc, " ", "_")
# Create a column of strata numeric codes
soil_lc$code <- as.character(as.numeric(as.factor(soil_lc$soil_lc)))
# List final strata
unique(soil_lc$soil_lc)
# Write final sampling strata map
st_write(soil_lc, paste0(results.path,"strata.shp"), delete_dsn = TRUE)
# Plot strata
# Plot final map of stratum
map = leaflet(options = leafletOptions(minZoom = 8.3)) %>%
addTiles()
mv <- mapview(soil_lc["soil_lc"], alpha=0, homebutton=T, layer.name = "Strata", map=map)
mv@map
## 7 - Stratified random sampling ==============================================
# Read already created strata shapefile
polygons <- st_read(paste0(results.path,"strata.shp"))
if(BOUNDING){
polygons = st_intersection(polygons,bb)
}
# Calculate the area of each polygon
polygons$area <- st_area(polygons)
# Create a new column to group polygons by a common attribute
polygons$group <- polygons$soil_lc
# Drop units to allow computations
polygons <- drop_units(polygons)
# Calculate the total area of all polygons in each group
group_areas <- polygons %>%
dplyr::group_by(group) %>%
dplyr::summarize(total_area = sum(area))
# Add a code to each group
group_codes <- polygons %>% group_by(group) %>%
dplyr::summarize(code = first(code))
# Join polygon strata and codes
group_areas <- left_join(group_areas,st_drop_geometry(group_codes), by = "group")
# Ensure minimum of 2 samples at each polygon in each group
group_areas$sample_count <- 2
# Calculate the number of samples per group based on relative area
group_areas$sample_count <-
group_areas$sample_count +
round(group_areas$total_area/sum(group_areas$total_area) *
(n-sum(group_areas$sample_count)))
# Adjust sample size on classes
while (sum(group_areas$sample_count) != n) {
if (sum(group_areas$sample_count) > n) {
# Reduce sample count for the largest polygon until total count is n
max_index <- which.max(group_areas$sample_count)
group_areas$sample_count[max_index] <- group_areas$sample_count[max_index] - 1
} else {
# Increase sample count for the smallest polygon until total count is n
min_index <- which.min(group_areas$sample_count)
group_areas$sample_count[min_index] <- group_areas$sample_count[min_index] + 1
}
}
# Count the total samples. It must be equal to the sampling size
sum(group_areas$sample_count)
polygons <- left_join(polygons, st_drop_geometry(group_areas),
by = c("soil_lc"="group"))
polygons <- dplyr::select(polygons, soil_lc, code.x, sample_count, geometry)
# Generate random points within each strata of size 3 times
#the required samples for each strata
x <- spatSample(x = vect(group_areas),
size = group_areas$sample_count * 3, method = "random")
# Compute sampling points for strata
z <- x %>%
st_as_sf() %>%
dplyr::group_by(code) %>%
dplyr::mutate(sample_count = as.numeric(sample_count),
order = seq_along(code),
ID = paste0(code, ".", order),
type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
vect()
# Find classes with missing samples
missing.data <- left_join(group_areas,data.frame(z) %>%
dplyr::filter(type=="Target") %>%
dplyr::group_by(code) %>%
tally()) %>%
dplyr::mutate(diff=sample_count-n)
# Determine missing sampled strata
missing.strata <- which(is.na(missing.data$diff))
# Determine missing sampling point in strata (undersampled strata)
missing.sample = which(missing.data$diff != 0)
missing.number <- as.numeric(unlist(st_drop_geometry(missing.data[(missing.sample <- which(missing.data$diff != 0)),7])))
# Compute sampling points for missing sampled strata
x.missing.strata <- x[1]
x.missing.strata$sample_count<- 0
for(i in missing.strata){
xx.missing.strata <- x[1]
xx.missing.strata$sample_count<- 0
nn=0
while (sum(xx.missing.strata$sample_count) <
group_areas[group_areas$code==i,][["sample_count"]]*5) {
while(nn < group_areas[group_areas$code==i,][["sample_count"]]*3){
my.missing.strata <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
size = group_areas[group_areas$code==i,][["sample_count"]]*5,
method = "random")
nn <- nn + nrow(data.frame(my.missing.strata))
}
xx.missing.strata <- rbind(xx.missing.strata,my.missing.strata)
print(sum(xx.missing.strata$sample_count))
}
print(i)
print(xx.missing.strata)
x.missing.strata <- rbind(x.missing.strata,xx.missing.strata)
}
# Join initial sampling with missing sampling strata data
x <- rbind(x, x.missing.strata)
# Compute sampling points for missing samples (random sampling)
x.missing.sample <- x[1]
for(i in missing.sample){
xx.missing.sample <- x[1]
xx.missing.sample$sample_count<- 0
while (sum(xx.missing.sample$sample_count) < (group_areas[group_areas$code==i,][["sample_count"]]*3)) {
my.missing.sample <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
size = as.numeric(vect(group_areas[group_areas$code %in% i,])[[4]])+
(group_areas[group_areas$code==i,][["sample_count"]]*3), method = "random")
xx.missing.sample <- rbind(xx.missing.sample,my.missing.sample)
print(sum(xx.missing.sample$sample_count))
}
print(i)
print(xx.missing.sample)
x.missing.sample <- rbind(x.missing.sample,xx.missing.sample)
}
# Join initial sampling with missing sampling strata data and with missing samples
x <- rbind(x, x.missing.sample)
# Remove extra artificial replacements
x <- x[x$sample_count > 0,]
# Convert and export to shapefile
z <- x %>%
st_as_sf() %>%
dplyr::group_by(code) %>%
dplyr::mutate(sample_count = as.numeric(sample_count),
order = seq_along(code),
ID = paste0(code, ".", order),
type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
vect()
writeVector(z,
filename = paste0(results.path,"strat_randm_samples.shp"), overwrite=TRUE)
# Check if the number of initial target points equals the final target points
n==nrow(z[z$type=="Target",])
n;nrow(z[z$type=="Target",])
# Plot points over strata
map = leaflet(options = leafletOptions(minZoom = 11.4)) %>%
addTiles()
mv <- mapview(soil_lc["soil_lc"], alpha=0, homebutton=T, layer.name = "Strata") +
mapview(sf::st_as_sf(z), zcol = 'type', color = "white", col.regions = c('royalblue', 'tomato'), cex=3, legend = TRUE,layer.name = "Samples")
mv@map
## 8 - Calculate replacement areas for points ==================================
# Load strata
soil_lc <- st_read(paste0(results.path,"strata.shp"))
# Read sampling points from previous step
z <- st_read(paste0(results.path,"strat_randm_samples.shp"))
z <- z[z$type=="Target",]
# Apply buffer of 500 meters
buf.samples <- st_buffer(z, dist=distance.buffer)
# Intersect buffers
samples_buffer = st_intersection(soil_lc, buf.samples)
samples_buffer <- samples_buffer[samples_buffer$type=="Target",]
samples_buffer <- samples_buffer[samples_buffer$soil_lc==samples_buffer$group,]
# Save Sampling areas
st_write(samples_buffer, paste0(results.path,"replacement_areas.shp"),
delete_dsn = TRUE)
# Write target points only
targets <- z[z$type=="Target",]
st_write(targets, paste0(results.path,"target_points.shp"), delete_dsn = TRUE)
## 9 - Stratified regular sampling =============================================
# Read strata shapefile
polygons <- st_read(paste0(results.path,"strata.shp"))
# Calculate the area of each polygon
polygons$area <- st_area(polygons)
# Create a new column to group polygons by a common attribute
polygons$group <- polygons$soil_lc
# Drop units to allow computations
polygons <- drop_units(polygons)
# Calculate the total area of all polygons in each group
group_areas <- polygons %>%
dplyr::group_by(group) %>%
dplyr::summarize(total_area = sum(area))
# Add a code to each group
group_codes <- polygons %>% group_by(group) %>%
dplyr::summarize(code = first(code))
group_areas <- left_join(group_areas,st_drop_geometry(group_codes), by = "group")
# Ensure minimum of 2 samples at each polygon in each group
group_areas$sample_count <- 2
# Calculate the number of samples per group based on relative area
group_areas$sample_count <- group_areas$sample_count+round(group_areas$total_area/sum(group_areas$total_area) *
(n-sum(group_areas$sample_count)))
while (sum(group_areas$sample_count) != n) {
if (sum(group_areas$sample_count) > n) {
# Reduce sample count for the largest polygon until total count is n
max_index <- which.max(group_areas$sample_count)
group_areas$sample_count[max_index] <- group_areas$sample_count[max_index] - 1
} else {
# Increase sample count for the smallest polygon until total count is n
min_index <- which.min(group_areas$sample_count)
group_areas$sample_count[min_index] <- group_areas$sample_count[min_index] + 1
}
}
#sum(group_areas$sample_count)
polygons <- left_join(polygons, st_drop_geometry(group_areas), by = c("soil_lc"="group"))
polygons <- dplyr::select(polygons, soil_lc, code.x, sample_count, geometry)
# Generate regular points within each strata of size 3 times the required samples for each strata ----
x <- spatSample(x = vect(group_areas), size = group_areas$sample_count * 3, method = "regular")
# Compute sampling points for strata
z <- x %>%
st_as_sf() %>%
dplyr::group_by(code) %>%
dplyr::mutate(sample_count = as.numeric(sample_count),
order = seq_along(code),
ID = paste0(code, ".", order),
type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
vect()
# Find missing samples
missing.data <- left_join(group_areas,data.frame(z) %>%
dplyr::filter(type=="Target") %>%
dplyr::group_by(code) %>%
tally()) %>%
dplyr::mutate(diff=sample_count-n)
# Determine missing sampled strata
missing.strata <- which(is.na(missing.data$diff))
# Determine missing sampling point in strata (undersampled strata)
missing.sample = which(missing.data$diff != 0)
missing.number <- as.numeric(unlist(st_drop_geometry(missing.data[(missing.sample <- which(missing.data$diff != 0)),7])))
# Compute sampling points for missing sampled strata
x.missing.strata <- x[1]
x.missing.strata$sample_count<- 0
for(i in missing.strata){
xx.missing.strata <- x[1]
xx.missing.strata$sample_count<- 0
nn=0
while (sum(xx.missing.strata$sample_count) <
group_areas[group_areas$code==i,][["sample_count"]]*5) {
while(nn < group_areas[group_areas$code==i,][["sample_count"]]*3){
my.missing.strata <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
size = group_areas[group_areas$code==i,][["sample_count"]]*5,
method = "random")
nn <- nn + nrow(data.frame(my.missing.strata))
}
xx.missing.strata <- rbind(xx.missing.strata,my.missing.strata)
print(sum(xx.missing.strata$sample_count))
}
print(i)
print(xx.missing.strata)
x.missing.strata <- rbind(x.missing.strata,xx.missing.strata)
}
# Join initial sampling with missing sampling strata data
x <- rbind(x, x.missing.strata)
# Compute sampling points for missing samples (regular sampling)
x.missing.sample <- x[1]
for(i in missing.sample){
xx.missing.sample <- x[1]
xx.missing.sample$sample_count<- 0
while (sum(xx.missing.sample$sample_count) < (group_areas[group_areas$code==i,][["sample_count"]]*3)) {
my.missing.sample <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
size = as.numeric(vect(group_areas[group_areas$code %in% i,])[[4]])+
(group_areas[group_areas$code==i,][["sample_count"]]*3), method = "regular")
xx.missing.sample <- rbind(xx.missing.sample,my.missing.sample)
print(sum(xx.missing.sample$sample_count))
}
print(i)
print(xx.missing.sample)
x.missing.sample <- rbind(x.missing.sample,xx.missing.sample)
}
# Join initial sampling with missing sampling strata data and with missing samples
x <- rbind(x, x.missing.sample)
# Remove extra artificial replacements
x <- x[x$sample_count > 0,]
# Convert to Shapefile
z <- x %>%
st_as_sf() %>%
dplyr::group_by(code) %>%
dplyr::mutate(sample_count = as.numeric(sample_count),
order = seq_along(code),
ID = paste0(code, ".", order),
type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
vect()
# Export data to Shapefile ----
# Write sampling points to shp
writeVector(z, paste0(results.path,"regular_str_points.shp"), overwrite=TRUE)
# Check if the number of initial target points equals the final target points
n==nrow(z[z$type=="Target",])
n;nrow(z[z$type=="Target",])
# Plot results
map = leaflet(options = leafletOptions(minZoom = 11.4)) %>%
addTiles()
mv <- mapview(soil_lc["soil_lc"], alpha=0, homebutton=T, layer.name = "Strata") +
mapview(sf::st_as_sf(z), zcol = 'type', color = "white", col.regions = c('royalblue', 'tomato'), cex=3, legend = TRUE,layer.name = "Samples")
mv@map
# 10 - Random Sampling based on a stratified raster =============================
strata <- st_read(paste0(results.path,"strata.shp"),quiet = TRUE)
strata$code <- as.integer(strata$code)
# Create stratification raster
strata <- rast(st_rasterize(strata["code"],st_as_stars(st_bbox(strata), nx = 250, ny = 250)))
names(strata) <- "strata"
strata <- crop(strata, nghe, mask=TRUE)
# Create stratified random sampling
# Target points
target <- sample_strat(
sraster = strata,
nSamp = n
)
target$type <- "target"
# Replacement points
replacement <- sample_strat(
sraster = strata,
nSamp = nrow(target)*3
)
replacement$type <- "replacement"
# Histogram of frequencies for targets
calculate_representation(
sraster = strata,
existing = target,
drop=0,
plot = TRUE
)
# Histogram of frequencies for replacements
calculate_representation(
sraster = strata,
existing = replacement,
drop=0,
plot = TRUE
)
# Add index by strata to targets
target <- target %>%
st_as_sf() %>%
dplyr::group_by(strata) %>%
dplyr::mutate(order = seq_along(strata),
ID = paste0(strata, ".", order)) %>%
vect()
# Add index by strata to replacements
replacement <- replacement %>%
st_as_sf() %>%
dplyr::group_by(strata) %>%
dplyr::mutate(order = seq_along(strata),
ID = paste0(strata, ".", order)) %>%
vect()
# Merge target and replacement points in a single object
sampling.points <- rbind(target,replacement)
# Plot samples over strata
map = leaflet(options = leafletOptions(minZoom = 8.3)) %>%
addTiles()
mv <- mapview(st_as_stars(strata),homebutton=T, layer.name = "Strata") +
mapview(sampling.points, zcol = 'type', color = "white", col.regions = c('royalblue', 'tomato'), cex=3, legend = TRUE,layer.name = "Samples")
mv@map
# Plot samples over strata
# plot(strata, main="Strata and random samples")
# plot(nghe[1], col="transparent", add=TRUE)
# points(target,col="red")
# points(replacement,col="dodgerblue")
# legend("topleft", legend = c("target","replacement"), pch = 20, xpd=NA, bg="white", col=c("red","dodgerblue"))
## Export to shapefile
writeVector(sampling.points,
paste0(results.path,"pts_raster.shp"), overwrite=TRUE)Script 6: Conditional Latin Hypercube Sampling
#
# Digital Soil Mapping
# Soil Sampling Design
# conditional Latin Hypercube Sampling
#
# GSP-Secretariat
# Contact: Luis.RodriguezLado@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# Script for creating a sampling design based on conditioned Latin Hypercube Sampling.
# Given a suite of covariates this algorithm will assess the optimal location of
# samples based on the amount of information in the set of covariates.
#
# 0 - Set working directory and load packages
# 1 - User-defined variables
# 2 - Import national data
# 3 - Compute clhs
# 4 - Identify under-sampled areas
# 5 - Including existing legacy data in a cLHS sampling design
# 6 - Working with large raster data
# 7 - Cost–constrained cLHS sampling
# 8 - Replacement areas in cLHS design
# 9 - Polygonize replacement areas by similarity
# 10 - Constrained cLHS sampling accounting for accessibility and legacy data
# 11 - Plot sample density over covariates
# 12 - Calculate minimum and optimal sample size with opendsm
#________________________________________________________________
start_time <- Sys.time()
## 0 - Set working directory and load packages =================================
#remotes::install_github("lemuscanovas/synoptReg")
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
# List of packages
packages <- c("sp","terra","raster","sf","clhs", "sgsR","entropy", "tripack",
"manipulate","dplyr","synoptReg")
# Load packages
lapply(packages, require, character.only = TRUE)
# Remove object to save memory space
rm(packages)
## 1 - User-defined variables ==================================================
# Path to rasters
raster.path <- "data/rasters/"
# Path to shapes
shp.path <- "data/shapes/"
# Path to results
results.path <- "data/results/"
# Path to additional data
other.path <- "data/other/"
# Buffer distance for replacement areas (clhs)
D <- 1000 # Buffer distance to calculate replacement areas
# Define the minimum sample size. By default it uses the value calculated previously
minimum_n <- 180
# Aggregation factor for up-scaling raster covariates (optional)
agg.factor = 5
## 2 - Import national data ====================================================
# Read Spatial data covariates as rasters with terra
cov.dat <- list.files(raster.path, pattern = "tif$", recursive = TRUE, full.names = TRUE)
cov.dat <- terra::rast(cov.dat) # SpatRaster from terra
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path,"/Nghe_An.shp")),quiet=TRUE)
# Crop covariates on administrative boundary
cov.dat <- crop(cov.dat, nghe, mask=TRUE)
# Store elevation and slope separately
elevation <- cov.dat$dtm_elevation_250m
slope <- cov.dat$dtm_slope_250m
# Load roads
roads <- sf::st_read(file.path(paste0(shp.path,"/roads.shp")),quiet=TRUE)
roads <- st_intersection(roads, nghe)
# Simplify raster information with PCA
pca <- raster_pca(cov.dat)
# Get SpatRaster layers
cov.dat <- pca$PCA
# Subset the stack to the principal components that explain ~99% of the variance
n_comps <- first(which(pca$summaryPCA[3,]>0.99))
cov.dat <- pca$PCA[[1:n_comps]]
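# Optional check (a sketch; like the selection above, it assumes the cumulative
# proportion of explained variance is stored in the third row of summaryPCA):
# print the variance retained by the selected components
round(pca$summaryPCA[3, 1:n_comps], 3)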
# Remove pca stack
rm(pca)
# Aggregate stack to simplify data rasters for calculations
# cov.dat <- aggregate(cov.dat, fact=10, fun="mean")
# Plot of covariates
plot(cov.dat)
## 3 - Compute clhs ============================================================
# Distribute sampling points with clhs
pts <- sample_clhs(cov.dat, nSamp = 100, iter = 10000, progress = FALSE, simple = FALSE)
# Plot cLHS samples on map
plot(cov.dat[[1]], main="cLHS samples")
points(pts, col="red", pch = 1)
## 4 - Identify under-sampled areas ============================================
#' COOBS (count of observations) is an algorithm to map relatively
#' adequate and under-sampled areas in a sampling pattern.
#' It allows one to understand which areas in a spatial domain are adequately or under-sampled.
#' COOBS can be used to design an additional survey by limiting the cLHS algorithm to areas where the COOBS
#' value is below some specified threshold.
coobs <- calculate_coobs(
mraster = cov.dat,
existing = pts,
plot = TRUE,
cores = 4,
filename = paste0(results.path,"/coobs_clhs.tif"),
overwrite = TRUE
)
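# A possible follow-up (a sketch, not part of the original design): restrict an
# additional cLHS survey to under-sampled areas. It assumes the first layer of the
# exported COOBS raster holds the observation count and uses an arbitrary threshold of 2
# coobs.r <- terra::rast(paste0(results.path, "/coobs_clhs.tif"))[[1]]
# undersampled <- coobs.r < 2
# cov.under <- terra::mask(cov.dat, undersampled, maskvalues = FALSE)
# extra.pts <- sample_clhs(cov.under, nSamp = 25, iter = 10000, progress = FALSE)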
## 5 - Including existing legacy data in a cLHS sampling design ================
# Create an artificial random legacy dataset of 50 samples over the study area as an example
legacy.data <- sample_srs( raster = cov.dat, nSamp = 50)
# Calculate clhs 100 points plus locations of legacy data
res <- sample_clhs(cov.dat, nSamp = 150, existing=legacy.data, iter = 10000, progress = FALSE)
# Plot points
plot(cov.dat[[1]], main="cLHS samples (blue circles) and legacy samples (red diamonds)")
points(res, col="navy", pch = 1)
points(res[res$type=="existing",], col="red", pch = 5, cex=2)
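# Quick check (sketch): tally how many points are newly placed by cLHS vs. existing legacy samples
table(res$type)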
## 6 - Working with large raster data ==========================================
# Scaling covariates
# Aggregation of covariates by a factor of 10.
# The original grid resolution is up-scaled using the mean value of the pixels in the grid
cov.dat2 <- aggregate(cov.dat, fact=10, fun="mean")
# Create cLHS samples on the resampled rasters
resampled.clhs <- sample_clhs(cov.dat2, nSamp = 150, iter = 10000, progress = FALSE)
# Plot the points over the 1st raster
plot(cov.dat2[[1]], main="Regular resampled data")
points(resampled.clhs , col="red", pch = 1)
rm(cov.dat2)
# Sampling to regular points
# Create a regular grid of 1000 points on the covariate space
regular.sample <- spatSample(cov.dat, size = 1000, xy=TRUE, method="regular", na.rm=TRUE)
# plot the points over the 1st raster
plot(cov.dat[[1]], main="Regular resampled data")
points(regular.sample, col="red", pch = 1)
# Create clhs samples upon the regular grid
regular.sample.clhs <- clhs(regular.sample, size = 100, progress = FALSE, iter = 10000, simple = FALSE)
# Plot points of clhs samples
points <- regular.sample.clhs$sampled_data # Get point coordinates of clhs sampling
plot(cov.dat[[1]], main="cLHS samples (red) and covariate resampled points (blue)")
points(regular.sample, col="navy", pch = 1)
points(points, col="red", cex=1)
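# Optional convergence check (a sketch): with simple = FALSE the clhs result keeps the
# objective function trace, which should level off once the sampling has converged
plot(regular.sample.clhs$obj, type = "l",
     xlab = "Iteration", ylab = "Objective function")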
## 7 - Cost-constrained cLHS sampling ==========================================
# Use 'Distance to roads' as cost surface
# Calculate distance to roads with the same spatial resolution as the covariates
# dist2access <- terra::distance(cov.dat[[1]], roads, progress=TRUE)
# names(dist2access) <- "dist2access"
# Save cost surface to disk
# writeRaster(dist2access, paste0(other.path,"nghe_d2roads.tif"), overwrite=TRUE)
# Load pre-calculated distance-to-roads surface
dist2access <- terra::rast(paste0(other.path,"nghe_d2roads.tif"))
# Aggregate to the same spatial resolution (optional)
# dist2access <- aggregate(dist2access, fact=10, fun="mean")
plot(dist2access)
plot(nghe, col="transparent", add=TRUE)
# Add cost surface (distance to roads) as layer
cov.dat <- c(cov.dat,dist2access)
names(cov.dat)
# Harmonize NAs: dividing the first covariate by itself gives a raster of 1s with the covariates' NA pattern, used to mask the cost layer
cov.dat$dist2access <- cov.dat$dist2access * cov.dat[[1]]/cov.dat[[1]]
plot(cov.dat$dist2access)
plot(nghe, col="transparent",add=TRUE)
# Compute sampling points
cost.clhs <- sample_clhs(cov.dat, nSamp = minimum_n, iter = 10000, progress = FALSE, cost = 'dist2access', use.cpp = TRUE)
# Plot samples
plot(cov.dat[['dist2access']], main="cLHS samples with 'cost' constraints")
plot(cost.clhs, col="red", cex=1,add=T)
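# Quick check (a sketch): the cost constraint should pull samples towards roads, so the
# mean distance to roads at the sample locations is expected to be well below the raster mean
mean(terra::extract(cov.dat$dist2access, cost.clhs)[, 2], na.rm = TRUE)
global(cov.dat$dist2access, "mean", na.rm = TRUE)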
## 8 - Replacement areas in cLHS design (requires the raster package) ==========
# Determine the similarity to points in a buffer of distance 'D'
# Compute the similarity buffers around the sampling points
gw <- similarity_buffer(raster::stack(cov.dat) , as(cost.clhs, "Spatial"), buffer = D)
# Plot results
plot(elevation, legend=TRUE,main=paste("Similarity probability over elevation"))
## Overlay points
points(cost.clhs, col = "dodgerblue", pch = 3)
## Overlay probability stack for point 1
colors <- c((RColorBrewer::brewer.pal(9, "YlOrRd")))
terra::plot(gw[[1]], add=TRUE , legend=FALSE, col=colors)
## Overlay 1st cLHS point
points(cost.clhs[1,], col = "red", pch = 12,cex=1)
## 9 - Polygonize replacement areas by similarity ==============================
# Determine a threshold break to delineate replacement areas
similarity_threshold <- 0.90
# Reclassify buffer raster data according to the threshold break of probability
# 1 = similarity >= similarity_threshold; NA = similarity < similarity_threshold
# Define a vector with the break intervals and the output values (NA,1)
breaks <- c(0, similarity_threshold, NA, similarity_threshold, 1, 1)
# Convert to a matrix
breaks <- matrix(breaks, ncol=3, byrow=TRUE)
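# For reference, with the threshold at 0.90 the reclassification matrix (from, to, becomes) reads:
#   0.0  0.9  NA   -> similarities below the threshold are dropped
#   0.9  1.0   1   -> similarities at or above the threshold become 1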
# Reclassify the probabilities in each layer to (NA, 1)
s = stack(lapply(1:raster::nlayers(gw), function(i){raster::reclassify(gw[[i]], breaks, right=FALSE)}))
# Polygonize replacement areas
s = lapply(as.list(s), rasterToPolygons, dissolve=TRUE)
s <- bind(s,keepnames=TRUE)
# Add the identifier of the corresponding target point
for(i in 1: length(s)){
s@data$ID[i] <- as.integer(stringr::str_replace(s@polygons[[i]]@ID,"1.",""))
}
# Clean the data by storing target ID data only
s@data <- s@data["ID"]
# Plot results
plot(cov.dat[[1]], main=paste("cLHS samples and replacement areas for threshold = ", similarity_threshold))
plot(s,add=TRUE, col=NA, border="gray40")
points(cost.clhs, col="red", pch = 3)
# Export replacement areas to shapefile
s <- st_as_sf(s)
st_write(s, file.path(paste0(results.path,'replacement_areas_', D, '.shp')), delete_dsn = TRUE)
# Write cLHS sampling points to shapefile
cost.clhs$ID <- row(cost.clhs)[,1] # Add identifier
out.pts <- st_as_sf(cost.clhs)
st_write(out.pts, paste0(results.path,'target_clhs.shp'), delete_dsn = TRUE)
## 10 - Constrained cLHS sampling accounting for accessibility and legacy data ==
# Load legacy data
legacy <- sf::st_read(file.path(paste0(shp.path,"/legacy_soils.shp")),quiet=TRUE)
# Add slope to the stack
cov.dat$slope <- slope
# Calculate cLHS points with legacy data, a cost surface and an access buffer around roads
# Samples are restricted to the corridor between buff_inner and buff_outer map units from a road
buff_inner <- 20
buff_outer <- 3000
# Convert roads to sf object and cast to multilinestring
roads <- st_as_sf(roads) %>%
st_cast("MULTILINESTRING")
# Calculate clhs samples using slope as cost surface, distance to roads as
# access limitations, and including existing legacy data
aa <- sgsR::sample_clhs(mraster = cov.dat, nSamp = minimum_n, existing = legacy,
iter = 10000, details = TRUE, cost="slope", access=roads,
buff_inner=buff_inner, buff_outer=buff_outer)
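# Quick check (a sketch, assuming a metric CRS): new samples should fall between
# buff_inner and buff_outer metres from the nearest road
summary(as.numeric(sf::st_distance(aa$samples[aa$samples$type == "new", ],
                                   sf::st_union(roads))))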
## Plot distances, roads, clhs points and legacy data
plot(cov.dat$dist2access, main="New (red) and existing (blue) samples")
plot(roads,add=TRUE, col="gray60")
plot(aa$samples[aa$samples$type=="new",], col= "tomato",add=TRUE)
plot(aa$samples[aa$samples$type=="existing",], col= "dodgerblue2",add=TRUE)
# Write samples as shapefile
aa$samples[c("type","dist2access")] %>%
st_write(paste0(results.path,'const_clhs.shp'), delete_dsn = TRUE)
## 11 - Plot sample density over covariates ====================================
# Distribution of elevation values on the sampling design vs. original elevation data
s0 <- data.frame(method = "Elevation", data.frame(elevation))
s1 <- extract(elevation,pts)
s1$method <-"cLHS"
# Plot the covariate elevation values as histogram bars and the sampling distribution as curve
library(ggplot2)
dens_elev <- ggplot() +
geom_histogram(data=s0, aes(x=dtm_elevation_250m,fill="red",y=after_stat(density)),binwidth=25, alpha=.5, position="identity") +
geom_density(data = s1, aes(x = dtm_elevation_250m, col = method))
dens_elev
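# The same comparison can be repeated for other covariates; a sketch for slope,
# using the slope raster stored earlier (layer name dtm_slope_250m) and an
# arbitrary bin width of 1 degree
s0_slope <- data.frame(method = "Slope", data.frame(slope))
s1_slope <- extract(slope, pts)
s1_slope$method <- "cLHS"
dens_slope <- ggplot() +
  geom_histogram(data = s0_slope, aes(x = dtm_slope_250m, y = after_stat(density)),
                 binwidth = 1, alpha = .5, fill = "grey70") +
  geom_density(data = s1_slope, aes(x = dtm_slope_250m, col = method))
dens_slope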
## 12 - Calculate minimum and optimal sample size ==============================
#' Source scripts from https://github.com/newdale/opendsm/tree/main
#'
#' Determine minimum sample size for the clhs algorithm
#' This script is based on the publication:
#' Determining minimum sample size for the conditioned Latin hypercube sampling algorithm
#' DOI: https://doi.org/10.1016/j.pedsph.2022.09.001
source("scripts/clhs_min.R")
#' Optimal sample size based on the publication:
#' Divergence metrics for determining optimal training sample size in digital soil mapping
#' DOI: https://doi.org/10.1016/j.geoderma.2023.116553
source("scripts/opt_sample.R")
#' Perform Sequential Variable Inflation Factor Analysis
source("scripts/seqVIF.R")
#' Function to calculate the KL-divergence
#' score between two probability distributions, P and Q.
source("scripts/kldiv.R")
#' Function to calculate the Jensen-Shannon Distance
#' score between two probability distributions, P and Q.
source("scripts/jsdist.R")
#' Function to calculate the Jensen-Shannon-divergence
#' score between two probability distributions, P and Q.
source("scripts/jsdiv.R")
#install.packages('DescTools')
library("DescTools")
# Convert raster covariates to dataframe
cov.dat.df <- data.frame(cov.dat[[1:n_comps]])
# Calculate minimum sample size
min_N <-clhs_min(cov.dat.df)
min_n <- as.numeric(min_N$Sample_Size[1])
# Calculate optimal sample size using normalized KL-divergence, JS-divergence and JS distance
opt_N <- opt_sample(alg="clhs",s_min = min_n, s_max = 500, s_step=50, s_reps=4, covs = cov.dat.df,clhs_iter=100, cpus=NULL, conf=0.95)