Annex I: Compendium of R scripts
This annex compiles the complete set of R scripts used in the soil sampling design process.
Script 1: Introduction to R
# Introduction to R
# 0. Playground =================================================
# learn important keyboard shortcuts
# Ctrl + enter for running code
# tab after writing the first three
# characters of the function name
# F1 to access the help
# explore the use of <-, $, [], ==, !=, c(), :,
# data.frame(), list(), as.factor()
a <- 10:15
a
a[2]
a[2:3]
b <- c("1", "a", a)
b
length(b)
df <- data.frame(column_a = 1:8, column_b = b)
df
df[,1]
df$column_b
as.numeric(df$column_b)
plot(df)
df[1:3,]
df[,1]
as.factor(b)
d <- list(a, b, df)
d
names(d)
names(d) <- c("numeric_vector", "character_vector", "dataframe")
d
d[[1]]
d$numeric_vector
a == b
a != b
# 1. Set working directory ======================================
# Set working directory to a specific location
setwd("C:/GIT/Digital-Soil-Mapping/")
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# 2. Install and load packages ==================================
# Install and load the readxl, tidyverse, and data.table packages
# using the functions install.packages() and library()
install.packages("tidyverse")
install.packages("readxl")
install.packages("data.table")
library(tidyverse)
library(readxl)
library(data.table)
# Install packages from a repository other than CRAN
install.packages("remotes")
remotes::install_github("lemuscanovas/synoptReg")
# 3. Import a spreadsheet =======================================
## 3.1 Read the MS Excel file -----------------------------------
# Read the soil_data.xlsx file, spreadsheet 2, using read_excel
read_excel(path = "data/other/soil_data.xlsx", sheet = 2)
## 3.2 Read the csv file with the native function ---------------
# data/other/soil_profile_data.csv
read.csv("data/other/soil_profile_data.csv")
## 3.3 Read the csv file with the tidyverse function ------------
read_csv("data/other/soil_profile_data.csv")
## 3.4 Read the csv file with the data.table function -----------
fread("data/other/soil_profile_data.csv")
## 3.5 Assign the dataframe to an object called dat -------------
dat <- read_csv("data/other/soil_profile_data.csv")
dat
# 4. Tidyverse functions ========================================
## 4.1 Select pid, hip, top, bottom, ph_h2o, cec from dat -------
dat_1 <- dat %>%
  select(id_prof, id_hor, top, bottom, ph_h2o, cec)
## 4.2 Filter: pick observations by their values ----------------
# filter observations with cec > 30 cmolc/100g
dat_2 <- dat_1 %>%
  filter(cec > 30)
dat_2
## 4.3 Mutate: create a new variable ----------------------------
# thickness = bottom - top
dat_3 <- dat_2 %>%
  mutate(thickness = bottom - top)
## 4.4 Group_by and summarise -----------------------------------
# group by variable pid
# summarise taking the mean of pH and cec
dat_4 <- dat_3 %>%
  group_by(id_prof) %>%
  summarise(mean_ph = mean(ph_h2o),
            mean_cec = mean(cec))
## 4.5 Reshape the table using pivot_longer ---------------------
# use dat_3
# put the names of the variables ph_h2o,
# cec and thickness in the column
# variable and keep the rest of the table. Save in dat_5
dat_5 <- dat_3 %>%
  pivot_longer(ph_h2o:thickness, names_to = "soil_property",
               values_to = "value")
## 4.6 Join the table sites.csv with dat_3 ----------------------
# Load soil_phys_data030.csv (in 01-Data folder)
# Join its columns with dat_3 keeping all the rows of dat_3
# save the result as dat_6
phys <- read_csv("data/other/soil_phys_data030_2.csv")
phys <- phys %>% rename(id_prof = "ProfID")
dat_6 <- dat_3 %>%
  left_join(phys)
# or
dat_6 <- phys %>%
  right_join(dat_3)
# 5. Data visualization with ggplot2 ============================
## 5.1 1D plot: histograms --------------------------------------
# histograms of cec and ph_h2o
ggplot(dat_3, aes(x=cec)) + geom_histogram()
## 5.2 2D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o
ggplot(dat_3, aes(x = bottom, y = ph_h2o)) +
geom_point()
# add a fitting line
ggplot(dat_3, aes(x = bottom, y = ph_h2o)) +
geom_point() +
geom_smooth(method = "lm" )
## 5.3 3D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o, add clay as color
# and size inside the
# function aes()
ggplot(dat_3,
aes(x = bottom, y = ph_h2o, color = cec, size = cec)) +
geom_point()
# 6. Geospatial data with terra =================================
## Load packages (install them if needed)
library(terra)
## 6.1 Load a raster and a vector layer -------------------------
# Load data/other/landcover2013.tif using rast() function, then plot it
# Load data/shapes/landcover.shp using vect() function and
# plot it
# explore the attributes of these layers
r <- rast("data/other/landcover2013.tif")
plot(r)
v <- vect("data/shapes/landcover.shp")
plot(v)
## 6.2 Reproject a raster and a vector layer --------------------
# Check the current CRS (EPSG) of the raster and the vector.
# Find a *projected* CRS in http://epsg.io for Vietnam and
# copy the number
# Check the Arguments of function project (?project) that need to
# be defined
# Save the new object as r_proj and v_proj
# plot both objects
r_proj <- project(x = r, y = "epsg:3405", method = "near",
                  res = 250)
plot(r_proj)
v_proj <- project(x = v, y = "epsg:3405")
plot(v_proj, add = TRUE)
## 6.3 Cropping and masking a raster ----------------------------
# Compute the area of the polygons in v_proj
# (search for a function) and
# assign the values to a new column named area
# select the largest polygon using [], $, == and max() func.
# and save it as pol
# crop the raster with pol using the crop() function and save
#it as r_pol
# mask the raster r_pol with the polygon pol and save it
# with the same name
# plot each result
v_proj$area <- expanse(v_proj, unit = "ha")
pol <- v_proj[v_proj$area == max(v_proj$area)]
plot(pol)
r_pol <- crop(r_proj, pol)
plot(r_pol)
plot(pol, add = TRUE)
r_pol <- mask(r_pol, pol)
plot(r_pol)
## 6.4 Replace values in a raster by filtering their cells ------
# Explore the following link to understand how terra
# manages cell values
# https://rspatial.org/terra/pkg/4-algebra.html
# Replace values lower than 5 in r_pol by 0
r_pol[r_pol$landcover2013 < 5] <- 0
plot(r_pol)
## 6.5 Rasterize a vector layer ---------------------------------
# Use rasterize() function to convert v_proj to raster
# Use r_proj as reference raster
# Use field landcover to assign cell values, and plot the new map
v_class <- rasterize(x = v_proj, y = r_proj, field = "landcover")
plot(v_class)
v_class
activeCat(v_class) <- 1
## 6.6 Extracting raster values using points --------------------
# Convert dat_6 to spatial points using the vect() function
# (check help of vect())
# Note that the EPSG number is 3405
# Save the points as s
# Plot s and r_proj together in the same map (Argument add=TRUE)
# Extract the values of the raster using extract()
# function (check the help)
# Remove the ID column of the extracted values
# merge the extracted data with s using cbind() function
# Convert s as a dataframe
s <- vect(dat_6, geom = c("x", "y"), crs = "epsg:4326")
#s <- project(x = s, y = "epsg:3405")
plot(r_proj)
plot(s, add = TRUE)
x <- extract(r_proj, s, ID = FALSE)
s <- cbind(s, x)
d <- as.data.frame(s)
d
#GGally::ggscatmat(d)
## 6.7 Zonal statistics using polygons and rasters --------------
# Use the extract() func. to estimate the mean value of
# distance_proj at each polygon
# Use the fun= argument (check the help)
# Use the cbind() func. to merge v_proj and the extracted values
# convert v_proj to a dataframe
# Create a ggplot boxplot (geom_boxplot) with x=landcover
# and y=dist2access
<- rast("data/other/nghe_d2roads.tif")
distance plot(distance)
<- project(x = distance, y = "epsg:3405", method = "bilinear",
distance_proj res = 250)
plot(distance_proj)
<- extract(distance_proj, v_proj, fun = mean, ID=FALSE)
x <- cbind(v_proj, x)
v_proj
<- as_tibble(v_proj)
d
%>%
d ggplot(aes(x =landcover, y = dist2access, fill = landcover)) +
geom_boxplot() +
ylab("Distance to roads")
## END
Script 2: Download environmental covariates
var assets = [
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio1",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio12",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio13",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio14",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio16",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio17",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio5",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio6",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/ngd10",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_min",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_range",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_range",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/snow_cover",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/swir_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/crops",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/flooded_vegetation",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/grass",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/shrub_and_scrub",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/trees",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_curvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_downslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm2_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_elevation_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_mrn_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_neg_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_pos_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_slope_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_tpi_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_twi_500m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_upslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_vbf_250m"];
// Load borders
/// Using UN 2020 (replace the countries to download)
/// var ISO = ['ITA'];
/// var aoi =
/// ee.FeatureCollection(
/// 'projects/digital-soil-mapping-gsp-fao/assets/UN_BORDERS/BNDA_CTY'
/// )
/// .filter(ee.Filter.inList('ISO3CD', ISO));
/// var region = aoi.geometry();
/// Using a shapefile
/// 1. Upload the borders of your countries as an asset
/// 2. Replace 'your_shapefile' with the path to your shapefile
var shapefile = ee.FeatureCollection('projects/digital-soil-mapping-gsp-fao/assets/Nghe_An');
var region = shapefile.geometry().bounds();
// Load assets as ImageCollection
var assetsCollection = ee.ImageCollection(assets);
// Clip each image in the collection to the region of interest
var clippedCollection = assetsCollection.map(function(img){
return img.clip(region).toFloat();
});
// Function to replace masked values with zeroes for fpar bands
function replaceMaskedFpar(img) {
var allBands = img.bandNames();
var fparBands = allBands.filter(ee.Filter.stringStartsWith('item', 'fpar'));
var nonFparBands = allBands.removeAll(fparBands);
var fparImg = img.select(fparBands).unmask(0);
var nonFparImg = img.select(nonFparBands);
// If there are no fpar bands, return the original image
var result = ee.Algorithms.If(fparBands.length().eq(0),
                              img,
                              nonFparImg.addBands(fparImg));
return ee.Image(result);
}
// Clip each image in the collection to the region of
//interest and replace masked values for fpar bands
var clippedCollection = assetsCollection.map(function(img){
var clippedImg = img.clip(region).toFloat();
return replaceMaskedFpar(clippedImg);
});
// Stack the layers and maintain the
//layer names in the final file
var stacked = clippedCollection.toBands();
// Get the list of asset names
var assetNames = ee.List(assets).map(function(asset) {
return ee.String(asset).split('/').get(-1);
});
// Rename the bands with asset names
var renamed = stacked.rename(assetNames);
print(renamed, 'Covariates to be exported')
// Visualize the result
// Set a visualization parameter
// (you can adjust the colors as desired)
var visParams = {
bands: 'bio1',
min: 0,
max: 1,
palette: ['blue', 'green', 'yellow', 'red']
};
// Add the layer to the map
Map.centerObject(renamed, 6)
Map.addLayer(renamed, visParams, 'Covariates');
// Export the stacked image to Google Drive
Export.image.toDrive({
  image: renamed,
  description: 'covariates',
  folder: 'GEE',
  scale: 250,
  maxPixels: 1e13,
  region: region
});
/* Create mask for croplands ----------------------------*/
// Load the Copernicus Global Land Service image collection
var imageCollection = ee.Image("COPERNICUS/Landcover/100m/Proba-V-C3/Global/2019")
  .select("discrete_classification")
  .clip(region);
var crs = 'EPSG:4326'; // WGS84
var res = 250; // Resolution in decimal degrees
// Default resampling is nearest neighbor
var image1 = imageCollection.resample()
  .reproject({
    crs: crs, // Add your desired CRS here
    scale: res // Add your desired scale here
  });
// Reclassify the land cover classes
var inList = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100,
111, 112, 113, 114, 115, 116,
121, 122, 123, 124, 125, 126, 200];
var outList = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
var FAO_lu = image1.remap(inList, outList)
.toDouble()
.clip(region);
// print(FAO_lu)
// Convert 0 to NA
var mask = FAO_lu.neq(0);
print(mask)
FAO_lu = FAO_lu.updateMask(mask);
print(FAO_lu, "Mask");
var visParams = {
bands: 'remapped',
min: 0,
max: 1,
palette: ['green', 'yellow']
};
// Add the layer to the map
Map.addLayer(FAO_lu,visParams ,'Mask');
// Export the land cover image as a raster to Google Drive
Export.image.toDrive({
  image: FAO_lu,
  folder: 'GEE',
  description: 'mask',
  scale: res, // Add your desired scale here
  region: region,
  crs: crs, // Add your desired CRS here
  maxPixels: 1e13 // Add a maximum number of pixels for export if needed
});
Script 3: Evaluate Legacy Data
#
# Digital Soil Mapping
# Soil Sampling Design
# Evaluation of Legacy Data
#
# GSP-Secretariat
# Contact: Luis.RodriguezLado@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# Script for evaluating the degree of representativeness of a soil legacy dataset
# relative to the diversity of the environmental conditions described in a set
# of raster covariates.
#
# 0 - Set working directory and load packages
# 1 - User-defined variables
# 2 - Prepare data
# 3 - Extract environmental data from rasters at soil locations
# 4 - Compute variability matrix in covariates
# 5 - Calculate hypercube of "covariates" distribution (P)
# 6 - Calculate hypercube of "sample" distribution (Q)
# 7 - Calculate Representativeness of the Legacy Dataset
#________________________________________________________________
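## Illustrative aside (not part of the original workflow): the representativeness
## measure computed in step 7 is the share of the covariate (PCA) space that falls
## inside the convex hull of the legacy samples. A minimal toy sketch of that idea
## with hypothetical 2-D scores (tripack and sp are also loaded in step 0):
library(tripack); library(sp)
set.seed(1)
toy.smp <- matrix(rnorm(40), ncol = 2)              # hypothetical sample scores
toy.pop <- matrix(rnorm(400, sd = 1.5), ncol = 2)   # hypothetical population scores
toy.hull <- convex.hull(tri.mesh(toy.smp[, 1], toy.smp[, 2]), plot.it = FALSE)
toy.in <- point.in.polygon(toy.pop[, 1], toy.pop[, 2], toy.hull$x, toy.hull$y)
mean(toy.in > 0) * 100                              # % of the population inside the hull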
## 0 - Set working directory and load packages =================================
#remotes::install_github("lemuscanovas/synoptReg")
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
# List of packages
<- c("sp","terra","raster","sf","clhs", "sgsR","entropy", "tripack",
packages "manipulate","dplyr","synoptReg")
# Load packages
lapply(packages, require, character.only = TRUE)
# Remove object to save memory space
rm(packages)
## 1 - User-defined variables ==================================================
# Path to rasters
<- "data/rasters"
raster.path # Path to shapes
<- "data/shapes"
shp.path # Path to results
<- "data/results/"
results.path # Path to additional data
<- "data/other/"
other.path # Aggregation factor for up-scaling raster covariates (optional)
= 10
agg.factor
## 2 - Prepare data ============================================================
## Load raster covariate data
# Read Spatial data covariates as rasters with terra
cov.dat <- list.files(raster.path, pattern = "tif$", recursive = TRUE, full.names = TRUE)
cov.dat <- terra::rast(cov.dat) # SpatRaster from terra
# Aggregate stack to simplify data rasters for calculations
cov.dat <- aggregate(cov.dat, fact = agg.factor, fun = "mean")

# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path, "/Nghe_An.shp")), quiet = TRUE)

# Crop covariates on administrative boundary
cov.dat <- crop(cov.dat, nghe, mask = TRUE)

# Transform raster information with PCA
pca <- raster_pca(cov.dat)

# Get SpatRaster layers
cov.dat <- pca$PCA
# Create a raster stack to be used as input in the clhs::clhs function
cov.dat.ras <- raster::stack(cov.dat)
# Subset rasters
cov.dat <- pca$PCA[[1:first(which(pca$summaryPCA[3,] > 0.99))]]
cov.dat.ras <- cov.dat.ras[[1:first(which(pca$summaryPCA[3,] > 0.99))]]
## 3 - Extract environmental data from rasters at soil locations ===============
# Load legacy soil data
p.dat <- terra::vect(file.path(paste0(shp.path, "/legacy_soils.shp")))
# Extract data
p.dat_I <- terra::extract(cov.dat, p.dat)
p.dat_I <- na.omit(p.dat_I) # Remove soil points outside study area
p.dat_I.df <- p.dat_I[,-1]
str(p.dat_I.df)
## 4 - Compute variability matrix in the covariates ====================================
# Define Number of bins
nb <- 25
# Quantile matrix of the covariate data
q.mat <- matrix(NA, nrow = (nb + 1), ncol = nlyr(cov.dat))
j = 1
for (i in 1:nlyr(cov.dat)) {
  ran1 <- minmax(cov.dat[[i]])[2] - minmax(cov.dat[[i]])[1]
  step1 <- ran1 / nb
  q.mat[, j] <- seq(minmax(cov.dat[[i]])[1],
                    to = minmax(cov.dat[[i]])[2],
                    by = step1)
  j <- j + 1
}
q.mat
## 5 - Calculate hypercube of "covariates" distribution (P) ===================
# Convert SpatRaster to dataframe for calculations
cov.dat.df <- as.data.frame(cov.dat)
cov.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
cov.dat.mx <- as.matrix(cov.dat.df)
for (i in 1:nrow(cov.dat.mx)) {
  for (j in 1:ncol(cov.dat.mx)) {
    dd <- cov.dat.mx[[i, j]]
    if (!is.na(dd)) {
      for (k in 1:nb) {
        kl <- q.mat[k, j]
        ku <- q.mat[k + 1, j]
        if (dd >= kl && dd <= ku) {
          cov.mat[k, j] <- cov.mat[k, j] + 1
        }
      }
    }
  }
}
cov.mat
## 6 - Calculate hypercube of "sample" distribution (Q) ========================
h.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
for (i in 1:nrow(p.dat_I.df)) {
  for (j in 1:ncol(p.dat_I.df)) {
    dd <- p.dat_I.df[i, j]
    if (!is.na(dd)) {
      for (k in 1:nb) {
        kl <- q.mat[k, j]
        ku <- q.mat[k + 1, j]
        if (dd >= kl && dd <= ku) {
          h.mat[k, j] <- h.mat[k, j] + 1
        }
      }
    }
  }
}
h.mat
## 7 - Calculate Representativeness of the Legacy Dataset ==================
## Calculate the proportion of "variables" in the covariate spectra that fall within the convex hull of variables in the "environmental sample space"
# Principal component of the legacy data sample
pca.s = prcomp(p.dat_I[, 2:(ncol(cov.dat.df) + 1)], scale = TRUE, center = TRUE)
scores_pca1 = as.data.frame(pca.s$x)
# Plot the first 2 principal components and convex hull
rand.tr <- tri.mesh(scores_pca1[,1], scores_pca1[,2], "remove") # Delaunay triangulation
rand.ch <- convex.hull(rand.tr, plot.it = F) # convex hull
pr_poly = cbind(x = c(rand.ch$x), y = c(rand.ch$y)) # save the convex hull vertices
plot(scores_pca1[,1], scores_pca1[,2], xlab = "PCA 1", ylab = "PCA 2",
     xlim = c(min(scores_pca1[,1:2]), max(scores_pca1[,1:2])),
     ylim = c(min(scores_pca1[,1:2]), max(scores_pca1[,1:2])),
     main = 'Convex hull of soil legacy data')
lines(c(rand.ch$x, rand.ch$x[1]), c(rand.ch$y, rand.ch$y[1]), col = "red", lwd = 1) # draw the convex hull (domain of legacy data)
# PCA projection of study area population onto the principal components
PCA_projection <- predict(pca.s, cov.dat.df) # Project study area population onto sample PC
newScores = cbind(x = PCA_projection[,1], y = PCA_projection[,2]) # PC scores of projected population

# Plot the polygon and all points to be checked
plot(newScores, xlab = "PCA 1", ylab = "PCA 2",
     xlim = c(min(newScores[,1:2]), max(newScores[,1:2])),
     ylim = c(min(newScores[,1:2]), max(newScores[,1:2])),
     col = 'black',
     main = 'Environmental space plots over the convex hull of soil legacy data')
polygon(pr_poly, col = '#99999990')
# Check which points fall within the polygon
pip <- point.in.polygon(newScores[,2], newScores[,1], pr_poly[,2], pr_poly[,1], mode.checked = FALSE)
newScores <- data.frame(cbind(newScores, pip))
# Plot points outside convex hull
points(newScores[which(newScores$pip == 0), 1:2], pch = 'X', col = 'red')
# Proportion of the conditions in the study area that fall within the convex hull of sample conditions
sum(nrow(newScores[newScores$pip>0,]))/nrow(newScores)*100
## END
Script 4: Calculate Minimum and Optimal Sample Sizes
#
# Digital Soil Mapping
# Soil Sampling Design
# Optimizing Sample Size
#
# GSP-Secretariat
# Contact: Luis.RodriguezLado@fao.org
#________________________________________________________________
#Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# The goal of this script is to determine the minimum sample size required to describe an area
# while retaining 95% coincidence with the environmental variability of the covariates
# in the area
#
# 0 - Set working directory and load necessary packages
# 1 - User-defined variables
# 2 - Import national data
# 3 - Calculate the minimum sample size to describe the area
# 4 - Plot covariate diversity as PCA scores
# 5 - KL divergence and % similarity results for growing N samples
# 6 - Model KL divergence
# 7 - Determine the minimum sample size for 95% coincidence
# 8 - Determine the optimal iteration according to the minimum N size
# 9 - Plot minimum points from best iteration
#________________________________________________________________
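## Illustrative aside (not part of the original workflow): the similarity between the
## area-wide covariate distribution and a candidate sample is scored below with the
## Kullback-Leibler divergence of their binned counts. A minimal toy sketch with
## hypothetical counts over the same five bins (entropy is also loaded in step 0):
library(entropy)
toy.pop.counts <- c(40, 25, 15, 12, 8)        # hypothetical population histogram
toy.smp.counts <- c(12, 9, 2, 1, 1)           # hypothetical sample histogram
KL.empirical(toy.pop.counts, toy.smp.counts)  # 0 would mean identical distributions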
## 0 - Set working directory and load necessary packages =======================
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
# List of packages
<- c("sp","terra","raster","sf","clhs",
packages "sgsR","entropy", "tripack",
"manipulate","dplyr","plotly","synoptReg")
# Load packages
lapply(packages, require, character.only = TRUE)
rm(packages) # Remove object to save memory space
## 1 - User-defined variables ==================================================
# Path to rasters
<- "data/rasters/"
raster.path # Path to shapes
<- "data/shapes/"
shp.path # Path to results
<- "data/results/"
results.path # Path to additional data
<- "data/other/"
other.path # Aggregation factor for up-scaling raster covariates (optional)
= 10
agg.factor
# Define parameters to determine minimum sampling size
<- 60 # Initial sampling size to test
initial.n <- 360 # Final sampling size to test
final.n <- 20 # Increment size
by.n <- 5 # Number of trials on each size
iters
## 2 - Import national data ====================================================
## Load raster covariate data
# Read Spatial data covariates as rasters with terra
cov.dat <- list.files(raster.path, pattern = "tif$", recursive = TRUE, full.names = TRUE)
cov.dat <- terra::rast(cov.dat) # SpatRaster from terra
# Aggregate stack to simplify data rasters for calculations
cov.dat <- aggregate(cov.dat, fact = agg.factor, fun = "mean")
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path, "/Nghe_An.shp")), quiet = TRUE)

# Crop covariates on administrative boundary
cov.dat <- crop(cov.dat, nghe, mask = TRUE)
# Store elevation and slope separately
elevation <- cov.dat$dtm_elevation_250m
slope <- cov.dat$dtm_slope_250m

# Load roads
roads <- vect(file.path(paste0(shp.path, "/roads.shp")))
roads <- crop(roads, nghe)

# Simplify raster information with PCA
pca <- raster_pca(cov.dat)

# Get SpatRaster layers
cov.dat <- pca$PCA

# Subset rasters to main PC (var. explained <=0.99)
n_comps <- first(which(pca$summaryPCA[3,] > 0.99))
cov.dat <- pca$PCA[[1:n_comps]]
# Plot covariates
plot(cov.dat)
# 3 - Calculate the minimum sample size to describe the area ===================
# Start computations ----
# Initialize empty vectors to store results
number_of_samples <- c()
prop_explained <- c()
klo_samples <- c()
samples_storage <- list()

# Convert SpatRaster to dataframe
cov.dat.df <- as.data.frame(cov.dat)
# Start evaluation with growing sample sizes
for (trial in seq(initial.n, final.n, by = by.n)) {
for (iteration in 1:iters) {
# Generate stratified clhs samples
    p.dat_I <- sample_clhs(cov.dat,
                           nSamp = trial, iter = 10000,
                           progress = FALSE, simple = FALSE)
    # Get covariate values as dataframe and delete NAs, avoid geometry
    p.dat_I.df <- as.data.frame(p.dat_I) %>%
      dplyr::select(!(geometry)) %>%
      na.omit()
    # Store samples as list
    samples_storage[[paste0("N", trial, "_", iteration)]] <- p.dat_I

    ## Comparison of population and sample distributions - Kullback-Leibler (KL) divergence
    # Define quantiles of the study area (number of bins)
    nb <- 25
    # Quantile matrix of the covariate data
    q.mat <- matrix(NA, nrow = (nb + 1), ncol = nlyr(cov.dat))
    j = 1
    for (i in 1:nlyr(cov.dat)) {
      ran1 <- minmax(cov.dat[[i]])[2] - minmax(cov.dat[[i]])[1]
      step1 <- ran1 / nb
      q.mat[, j] <- seq(minmax(cov.dat[[i]])[1],
                        to = minmax(cov.dat[[i]])[2],
                        by = step1)
      j <- j + 1
    }
    q.mat

    # Hypercube of covariates in study area
    # Initialize the covariate matrix
    cov.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
    cov.dat.mx <- as.matrix(cov.dat.df)
    for (i in 1:nrow(cov.dat.mx)) {
      for (j in 1:ncol(cov.dat.mx)) {
        dd <- cov.dat.mx[[i, j]]
        if (!is.na(dd)) {
          for (k in 1:nb) {
            kl <- q.mat[k, j]
            ku <- q.mat[k + 1, j]
            if (dd >= kl && dd <= ku) {
              cov.mat[k, j] <- cov.mat[k, j] + 1
            }
          }
        }
      }
    }
    cov.mat

    # Compare whole study area covariate space with the selected sample
    # Sample data hypercube (the same as for the raster data but on the sample data)
    h.mat <- matrix(1, nrow = nb, ncol = ncol(q.mat))
    for (i in 1:nrow(p.dat_I.df)) {
      for (j in 1:ncol(p.dat_I.df)) {
        dd <- p.dat_I.df[i, j]
        if (!is.na(dd)) {
          for (k in 1:nb) {
            kl <- q.mat[k, j]
            ku <- q.mat[k + 1, j]
            if (dd >= kl && dd <= ku) {
              h.mat[k, j] <- h.mat[k, j] + 1
            }
          }
        }
      }
    }
    h.mat

    ## Compute Kullback-Leibler (KL) divergence
    kl.index <- c()
    for (i in 1:ncol(cov.dat.df)) {
      kl <- KL.empirical(c(cov.mat[, i]), c(h.mat[, i]))
      kl.index <- c(kl.index, kl)
      klo <- mean(kl.index)
    }

    ## Calculate the proportion of "env. variables" in the covariate spectra that fall
    ## within the convex hull of variables in the "environmental sample space"
    # Principal component of the data sample
    pca.s = prcomp(p.dat_I.df, scale = TRUE, center = TRUE)
    scores_pca1 = as.data.frame(pca.s$x)
    # Plot the first 2 principal components and convex hull
    rand.tr <- tri.mesh(scores_pca1[, 1], scores_pca1[, 2], "remove") # Delaunay triangulation
    rand.ch <- convex.hull(rand.tr, plot.it = F) # convex hull
    pr_poly <- cbind(x = c(rand.ch$x), y = c(rand.ch$y)) # save the convex hull vertices
    # PCA projection of study area population onto the principal components
    PCA_projection <- predict(pca.s, cov.dat.df) # Project study area population onto sample PC
    newScores = cbind(x = PCA_projection[, 1], y = PCA_projection[, 2]) # PC scores of projected population
    # Check which points fall within the polygon
    pip <- point.in.polygon(newScores[, 2], newScores[, 1], pr_poly[, 2], pr_poly[, 1], mode.checked = FALSE)
    newScores <- data.frame(cbind(newScores, pip))
    klo_samples <- c(klo_samples, klo)
    prop_explained <- c(prop_explained, sum(newScores$pip) / nrow(newScores) * 100)
    number_of_samples <- c(number_of_samples, trial)
    print(
      paste(
        "N samples = ",
        trial, " out of ",
        final.n, "; iteration = ",
        iteration, "; KL = ",
        klo, "; Proportion = ",
        sum(newScores$pip) / nrow(newScores) * 100
      )
    )
}
}
# 4 - Plot covariate diversity as PCA scores ===================================
plot(newScores[,1:2],
xlab = "PCA 1",
ylab = "PCA 2",
xlim = c(min(newScores[,1:2], na.rm = T), max(newScores[,1:2], na.rm = T)),
ylim = c(min(newScores[,1:2], na.rm = T), max(newScores[,1:2], na.rm = T)),
col = 'black',
main = 'Environmental space plots on convex hull of soil samples')
polygon(pr_poly, col = '#99999990')
# # Plot points outside convex hull
points(newScores[which(newScores$pip == 0), 1:2],
col = 'red',
pch = 12,
cex = 1)
# 5 - KL divergence and % coincidence for growing N samples =============
# Merge data from number of samples, KL divergence and % coincidence
results <- data.frame(number_of_samples, klo_samples, prop_explained)
names(results) <- c("N", "KL", "Perc")
# Calculate mean results by N size
mean_result <- results %>%
  group_by(N) %>%
  summarize_all(mean)
mean_result
## Plot dispersion on KL and % by N
par(mar = c(5, 4, 5, 5))
boxplot(
  Perc ~ N,
  data = results,
  col = rgb(1, 0.1, 0, alpha = 0.5),
  ylab = "% coincidence"
)
mtext("KL divergence", side = 4, line = 3)
# Add new plot
par(new = TRUE, mar = c(5, 4, 5, 5))
# Box plot
boxplot(
  KL ~ N,
  data = results,
  axes = FALSE,
  outline = FALSE,
  col = rgb(0, 0.8, 1, alpha = 0.5),
  ylab = ""
)
axis(
  4,
  at = seq(0.02, 0.36, by = .06),
  label = seq(0.02, 0.36, by = .06),
  las = 3
)
# Draw legend
par(xpd=TRUE)
legend("top", inset=c(1,-.15) ,c("% coincidence", "KL divergence"), horiz=T,cex=.9,
box.lty=0,fill = c(rgb(1, 0.1, 0, alpha = 0.5), rgb(0, 0.8, 1, alpha = 0.5)))
par(xpd=FALSE)
# 6 - Model KL divergence ======================================================
# Create an exponential decay function of the KL divergence
x <- mean_result$N
y = (mean_result$KL - min(mean_result$KL)) / (max(mean_result$KL) - min(mean_result$KL)) # KL

# Parametrize exponential decay function
start <- list() # Initialize an empty list for the starting values

# Fit function
k = 2
b0 = 0.01
b1 = 0.01

fit1 <- nls(
  y ~ k * exp(-b1 * x) + b0,
  start = list(k = k, b0 = b0, b1 = b1),
  control = list(maxiter = 500),
  trace = T
)
summary(fit1)
# Plot fit
xx <- seq(1, final.n, 1)
plot(x, y)
lines(xx, predict(fit1, list(x = xx)))
# Predict with fit function
jj <- predict(fit1, list(x = xx))
normalized = 1 - (jj - min(jj)) / (max(jj) - min(jj))
normalized
# 7 - Determine the minimum sample size to account for 95% of cumulative probability of the covariate diversity ====
minimum_n <- length(which(normalized < 0.95)) + 1

# Plot cdf and minimum sampling point
x <- xx
y <- normalized
mydata <- data.frame(x, y)
opti <- mydata[mydata$x == minimum_n, ]
plot_ly(
  mydata,
  x = ~ x,
  y = ~ normalized,
  mode = "lines+markers",
  type = "scatter",
  name = "CDF (1–KL divergence)"
) %>%
  add_trace(
    x = ~ x,
    y = ~ jj,
    mode = "lines+markers",
    type = "scatter",
    yaxis = "y2",
    name = "KL divergence"
  ) %>%
  add_trace(
    x = ~ opti$x,
    y = ~ opti$y,
    yaxis = "y",
    mode = "markers",
    name = "Minimum N",
    marker = list(
      size = 8,
      color = '#d62728',
      line = list(color = 'black', width = 1)
    )
  ) %>%
  layout(
    xaxis = list(
      title = "N",
      showgrid = T,
      dtick = 50,
      tickfont = list(size = 11)
    ),
    yaxis = list(title = "1–KL divergence (% CDF)", showgrid = F),
    yaxis2 = list(
      title = "KL divergence",
      overlaying = "y",
      side = "right"
    ),
    legend = list(
      orientation = "h",
      y = 1.2,
      x = 0.1,
      traceorder = "normal"
    ),
    margin = list(
      t = 50,
      b = 50,
      r = 100,
      l = 80
    ),
    hovermode = 'x'
  ) %>%
  config(displayModeBar = FALSE)
# 8 - Determine the optimal iteration according to the minimum N size ==========
optimal_iteration <- results[which(abs(results$N - minimum_n) == min(abs(results$N - minimum_n))), ] %>%
  mutate(IDX = 1:n()) %>%
  filter(Perc == max(Perc))
# 9 - Plot minimum points from best iteration ==================================
N_final <- samples_storage[paste0("N", optimal_iteration$N, "_", optimal_iteration$IDX)][[1]]
plot(cov.dat[[1]])
plot(N_final, add = T, col = "red")
## 10 - Calculate COOBS (sgsR) =================================================
#' COOBS (count of observations) is an algorithm to map relatively
#' adequately sampled and under-sampled areas for a given sampling pattern.
#' It allows one to understand which areas in a spatial domain are adequately sampled and which are under-sampled.
#' COOBS can be used to design an additional survey by limiting the cLHS algorithm to areas where the COOBS
#' value is below some specified threshold.
coobs <- calculate_coobs(
  mraster = cov.dat,
  existing = N_final,
  plot = TRUE,
  cores = 4,
  filename = paste0(results.path, "/coobs_SampleSize.tif"),
  overwrite = TRUE
)
# Plot COOBS
plot(coobs[[2]], main="Number of samples in similar environment")
plot(nghe[1], col="transparent", add=TRUE)
## 11 - Calculate minimum and optimal sample size with opendsm =================
#install.packages('~/Downloads/DescTools_0.99.45.tar', repos=NULL, type='source')
#install.packages('DescTools')
library("DescTools")
#' Source scripts from https://github.com/newdale/opendsm/tree/main
#'
#' Determine minimum sample size for the clhs algorithm
#' This script is based on the publication:
#' Determining minimum sample size for the conditioned Latin hypercube sampling algorithm
#' DOI: https://doi.org/10.1016/j.pedsph.2022.09.001
source("scripts/clhs_min.R")
#' Optimal sample size based on the publication:
#' Divergence metrics for determining optimal training sample size in digital soil mapping
#' DOI: https://doi.org/10.1016/j.geoderma.2023.116553
source("scripts/opt_sample.R")
#' Perform Sequential Variable Inflation Factor Analysis
source("scripts/seqVIF.R")
#' Function to calculate the KL-divergence
#' score between two probability distributions, P and Q.
source("scripts/kldiv.R")
#' Function to calculate the Jensen-Shannon Distance
#' score between two probability distributions, P and Q.
source("scripts/jsdist.R")
#' Function to calculate the Jensen-Shannon-divergence
#' score between two probability distributions, P and Q.
source("scripts/jsdiv.R")
# Convert raster covariates to dataframe
cov.dat.df <- data.frame(cov.dat)
# Calculate minimum sample size
min_N <- clhs_min(cov.dat.df)
# Calculate optimal sample size using normalized KL-div, JS-div and JS distance
opt_N <- opt_sample(alg = "clhs", s_min = 150, s_max = 500, s_step = 50, s_reps = 4,
                    covs = cov.dat.df, clhs_iter = 100, cpus = NULL, conf = 0.95)
## END
Script 5: Stratified sampling on vector and raster data
This script uses soil and landcover classes to define polygon ‘strata’ for an area-weighted stratified sampling design.
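As a quick illustration of the allocation rule the script applies (a minimal sketch with hypothetical stratum areas and sample size, not part of the script itself): every stratum first receives the minimum of two points, and the remaining budget is split in proportion to stratum area.
strata_area <- c(A = 5000, B = 2000, C = 500)   # hypothetical stratum areas (ha)
n_total <- 20                                   # hypothetical total sample size
n_min <- rep(2, length(strata_area))            # at least 2 points per stratum
n_extra <- round(strata_area / sum(strata_area) * (n_total - sum(n_min)))
n_alloc <- n_min + n_extra
n_alloc  # per-stratum allocation; the script additionally adjusts counts so they sum exactly to n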
#
# Digital Soil Mapping
# Soil Sampling Design
# Stratified Sampling on 'soil-landcover' strata
#
# Contact: Luis.RodriguezLado@fao.org
# Marcos.Angelini@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script =======================================================
# The goal of this script is to perform random and regular stratified
# soil sampling designs using soil/landcover classes as 'strata'
# The distribution of samples on the strata is based on a
# weighted area distribution with the requirement of at least 2 sampling points
# per stratum. Small polygons (< 100 ha) are eliminated from calculations.
# Only 'strata' classes suitable for sampling are included in the analyses.
# The final sample data includes 'target' points - i.e. primary sampling points -
# and 'replacement' points - i.e. points to be used as replacements in case
# the corresponding 'target' point cannot be sampled for any reason.
#
# 0 - Set working directory and load packages
# 1 - User-defined variables
# 2 - Import data
# 3 - Delineate soil strata
# 4 - Delineate land-use strata
# 5 - Create sampling strata
# 6 - Accommodate strata to requirements
# 7 - Stratified random sampling
# 8 - Calculate replacement areas for points
# 9 - Stratified regular sampling
# 10 - Random Sampling based on a stratified raster
#________________________________________________________________
## 0 - Set working directory and load packages =================================
# Load packages as a vector object
packages <- c("sf", "terra", "tidyverse", "rmapshaper",
              "units", "plyr", "mapview", "leaflet", "stars", "sgsR")
lapply(packages, require, character.only = TRUE) # Load packages
rm(packages) # Remove object to save memory space
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
## 1 - User-defined variables ==================================================
# Path to rasters
<- "data/rasters"
raster.path # Path to shapes
<- "data/shapes"
shp.path # Path to results
<- "data/results/"
results.path # Path to additional data
<- "data/other/"
other.path # Set number of samples
<- 250
n
## 2 - Import data =============================================================
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path, "/Nghe_An.shp")), quiet = TRUE)

# Load soil data
soil <- st_read(paste0(shp.path, "/soils.shp"), promote_to_multi = FALSE)
# Load landcover data
lc <- st_read(paste0(shp.path, "/landcover.shp"))
# Define a bounding area (optional).
# If BOUNDING = TRUE, then a bounding–area shapefile (bb) must be specified
# and the line in the code below must be uncommented
BOUNDING = FALSE
#bb <- st_read(paste0(shp.path,"/your_bounding_area.shp"))
# Compute replacement areas (optional)
distance.buffer = 500
## 3 - Delineate soil strata ===================================================
# Clip soil data by bounding area if defined
if (BOUNDING) {
  soil <- st_intersection(soil, bb)
} else {
  soil <- soil
}
# Check geometry
soil <- st_make_valid(soil)
soil <- st_cast(soil, "MULTIPOLYGON")
# Aggregate information by reference soil group (RSG)
#unique(soil$RSG)
# Remove NAs (water bodies are deleted)
soil <- soil[!is.na(soil$RSG),]
#unique(soil$RSG)
# Dissolve polygons by reference soil group
soil <- soil %>% group_by(RSG) %>% dplyr::summarize()
# Write soil strata to disk
st_write(soil, paste0(results.path,"soil_classes.shp"), delete_dsn = TRUE)
## Plot aggregated soil classes
map = leaflet(leafletOptions(minZoom = 8)) %>%
  addTiles()
mv <- mapview(soil["RSG"], alpha = 0, homebutton = T, layer.name = "soils", map = map)
mv@map
#ggplot() + geom_sf(data=soil, aes(fill = factor(RSG)))
## 4 - Delineate land-use strata ===============================================
# Clip landcover data by bounding area if exist
if (BOUNDING) {
  lc <- st_intersection(lc, bb)
} else {
  lc <- lc
}
# Check geometry
lc <- st_make_valid(lc)
# Aggregate units
unique(lc$landcover)
lc$landcover[lc$landcover == "Broadleaf forest"] <- "Forest"
lc$landcover[lc$landcover == "Needleleaf forest"] <- "Forest"
lc$landcover[lc$landcover == "Mixed forest"] <- "Forest"
lc <- lc[lc$landcover != "Water body",]
unique(lc$landcover)
# Dissolve polygons by landcover class
lc <- lc %>% group_by(landcover) %>% dplyr::summarize()
# Write landcover strata as shapefile
st_write(lc, paste0(results.path,"lc_classes.shp"), delete_dsn = TRUE)
# Plot map with the land cover information
map = leaflet() %>%
  addTiles()
mv <- mapview(lc["landcover"], alpha = 0, homebutton = T, layer.name = "Landcover", map = map)
mv@map
#ggplot() + geom_sf(data=lc, aes(fill = factor(landcover)))
## 5 - Create sampling strata ==================================================
# Combine soil and landcover layers
sf_use_s2(FALSE) # use in case st_intersection cannot be run
soil_lc <- st_intersection(st_make_valid(soil), st_make_valid(lc))
soil_lc$soil_lc <- paste0(soil_lc$RSG, "_", soil_lc$landcover)
soil_lc <- soil_lc %>% dplyr::select(soil_lc, geometry)
## 6 - Accommodate strata to requirements ======================================
# Select by area. Convert area to ha and select polygons larger than 100 ha
soil_lc$area <- st_area(soil_lc)/10000
soil_lc$area <- as.vector(soil_lc$area)
soil_lc <- soil_lc %>%
  group_by(soil_lc) %>%
  mutate(area = sum(area))
soil_lc <- soil_lc[soil_lc$area > 100,]
plot(soil_lc[1])
# Replace blank spaces with underscore symbol to keep names uniform
soil_lc$soil_lc <- str_replace_all(soil_lc$soil_lc, " ", "_")

# Create a column of strata numeric codes
soil_lc$code <- as.character(as.numeric(as.factor(soil_lc$soil_lc)))
# List final strata
unique(soil_lc$soil_lc)
# Write final sampling strata map
st_write(soil_lc, paste0(results.path,"strata.shp"), delete_dsn = TRUE)
# Plot strata
# Plot final map of strata
map = leaflet(options = leafletOptions(minZoom = 8.3)) %>%
  addTiles()
mv <- mapview(soil_lc["soil_lc"], alpha = 0, homebutton = T, layer.name = "Strata", map = map)
mv@map
## 7 - Stratified random sampling ==============================================
# Read already created strata shapefile
<- st_read(paste0(results.path,"strata.shp"))
polygons if(BOUNDING){
= st_intersection(polygons,bb)
polygons
}
# Calculate the area of each polygon
polygons$area <- st_area(polygons)

# Create a new column to group polygons by a common attribute
polygons$group <- polygons$soil_lc
# Drop units to allow computations
polygons <- drop_units(polygons)
# Calculate the total area of all polygons in each group
group_areas <- polygons %>%
  dplyr::group_by(group) %>%
  dplyr::summarize(total_area = sum(area))
# Add a code to each group
group_codes <- polygons %>% group_by(group) %>%
  dplyr::summarize(code = first(code))
# Join polygon strata and codes
group_areas <- left_join(group_areas, st_drop_geometry(group_codes), by = "group")

# Ensure minimum of 2 samples at each polygon in each group
group_areas$sample_count <- 2

# Calculate the number of samples per group based on relative area
group_areas$sample_count <-
  group_areas$sample_count +
  round(group_areas$total_area / sum(group_areas$total_area) *
          (n - sum(group_areas$sample_count)))
(n# Adjust sample size on classes
while (sum(group_areas$sample_count) != n) {
  if (sum(group_areas$sample_count) > n) {
    # Reduce sample count for the largest polygon until total count is n
    max_index <- which.max(group_areas$sample_count)
    group_areas$sample_count[max_index] <- group_areas$sample_count[max_index] - 1
  } else {
    # Increase sample count for the smallest polygon until total count is n
    min_index <- which.min(group_areas$sample_count)
    group_areas$sample_count[min_index] <- group_areas$sample_count[min_index] + 1
  }
}
# Count the total samples. It must be equal to the sampling size
sum(group_areas$sample_count)
polygons <- left_join(polygons, st_drop_geometry(group_areas),
                      by = c("soil_lc" = "group"))
polygons <- dplyr::select(polygons, soil_lc, code.x, sample_count, geometry)
# Generate random points within each stratum of size 3 times
# the required samples for each stratum
x <- spatSample(x = vect(group_areas),
                size = group_areas$sample_count * 3, method = "random")
# Compute sampling points for strata
z <- x %>%
  st_as_sf() %>%
  dplyr::group_by(code) %>%
  dplyr::mutate(sample_count = as.numeric(sample_count),
                order = seq_along(code),
                ID = paste0(code, ".", order),
                type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
  vect()
# Find classes with missing samples
missing.data <- left_join(group_areas, data.frame(z) %>%
                            dplyr::filter(type == "Target") %>%
                            dplyr::group_by(code) %>%
                            tally()) %>%
  dplyr::mutate(diff = sample_count - n)

# Determine missing sampled strata
missing.strata <- which(is.na(missing.data$diff))

# Determine missing sampling points in strata (undersampled strata)
missing.sample = which(missing.data$diff != 0)
missing.number <- as.numeric(unlist(st_drop_geometry(missing.data[(missing.sample <- which(missing.data$diff != 0)), 7])))
# Compute sampling points for missing sampled strata
x.missing.strata <- x[1]
x.missing.strata$sample_count <- 0

for (i in missing.strata) {
  xx.missing.strata <- x[1]
  xx.missing.strata$sample_count <- 0
  nn = 0
  while (sum(xx.missing.strata$sample_count) <
         group_areas[group_areas$code == i,][["sample_count"]] * 5) {
    while (nn < group_areas[group_areas$code == i,][["sample_count"]] * 3) {
      my.missing.strata <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
                                      size = group_areas[group_areas$code == i,][["sample_count"]] * 5,
                                      method = "random")
      nn <- nn + nrow(data.frame(my.missing.strata))
    }
    xx.missing.strata <- rbind(xx.missing.strata, my.missing.strata)
    print(sum(xx.missing.strata$sample_count))
  }
  print(i)
  print(xx.missing.strata)
  x.missing.strata <- rbind(x.missing.strata, xx.missing.strata)
}
# Join initial sampling with missing sampling strata data
x <- rbind(x, x.missing.strata)

# Compute sampling points for missing samples (random sampling)
x.missing.sample <- x[1]

for (i in missing.sample) {
  xx.missing.sample <- x[1]
  xx.missing.sample$sample_count <- 0
  while (sum(xx.missing.sample$sample_count) < (group_areas[group_areas$code == i,][["sample_count"]] * 3)) {
    my.missing.sample <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
                                    size = as.numeric(vect(group_areas[group_areas$code %in% i,])[[4]]) +
                                      (group_areas[group_areas$code == i,][["sample_count"]] * 3),
                                    method = "random")
    xx.missing.sample <- rbind(xx.missing.sample, my.missing.sample)
    print(sum(xx.missing.sample$sample_count))
  }
  print(i)
  print(xx.missing.sample)
  x.missing.sample <- rbind(x.missing.sample, xx.missing.sample)
}
# Join initial sampling with missing sampling strata data and with missing samples
x <- rbind(x, x.missing.sample)

# Remove extra artificial replacements
x <- x[x$sample_count > 0,]
# Convert and export to shapefile
z <- x %>%
  st_as_sf() %>%
  dplyr::group_by(code) %>%
  dplyr::mutate(sample_count = as.numeric(sample_count),
                order = seq_along(code),
                ID = paste0(code, ".", order),
                type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
  vect()
writeVector(z,
            filename = paste0(results.path, "strat_randm_samples.shp"), overwrite = TRUE)
# Check if the number of initial target points equals the final target points
n == nrow(z[z$type == "Target",])
n; nrow(z[z$type == "Target",])
# Plot points over strata
map = leaflet(options = leafletOptions(minZoom = 11.4)) %>%
  addTiles()
mv <- mapview(soil_lc["soil_lc"], alpha = 0, homebutton = T, layer.name = "Strata") +
  mapview(sf::st_as_sf(z), zcol = 'type', color = "white", col.regions = c('royalblue', 'tomato'), cex = 3, legend = TRUE, layer.name = "Samples")
mv@map
## 8 - Calculate replacement areas for points ==================================
# Load strata
soil_lc <- st_read(paste0(results.path, "strata.shp"))

# Read sampling points from previous step
z <- st_read(paste0(results.path, "strat_randm_samples.shp"))
z <- z[z$type == "Target",]

# Apply buffer of 500 meters
buf.samples <- st_buffer(z, dist = distance.buffer)

# Intersect buffers
samples_buffer = st_intersection(soil_lc, buf.samples)
samples_buffer <- samples_buffer[samples_buffer$type == "Target",]
samples_buffer <- samples_buffer[samples_buffer$soil_lc == samples_buffer$group,]

# Save sampling areas
st_write(samples_buffer, paste0(results.path, "replacement_areas.shp"),
         delete_dsn = TRUE)
# Write target points only
targets <- z[z$type == "Target",]
st_write(targets, paste0(results.path, "target_points.shp"), delete_dsn = TRUE)
## 9 - Stratified regular sampling =============================================
# Read strata shapefile
<- st_read(paste0(results.path,"strata.shp"))
polygons # Calculate the area of each polygon
$area <- st_area(polygons)
polygons
# Create a new column to group polygons by a common attribute
$group <- polygons$soil_lc
polygons# Drop units to allow computations
<- drop_units(polygons)
polygons
# Calculate the total area of all polygons in each group
group_areas <- polygons %>%
  dplyr::group_by(group) %>%
  dplyr::summarize(total_area = sum(area))
# Add a code to each group
group_codes <- polygons %>% group_by(group) %>%
  dplyr::summarize(code = first(code))

group_areas <- left_join(group_areas, st_drop_geometry(group_codes), by = "group")

# Ensure minimum of 2 samples at each polygon in each group
group_areas$sample_count <- 2

# Calculate the number of samples per group based on relative area
group_areas$sample_count <- group_areas$sample_count +
  round(group_areas$total_area / sum(group_areas$total_area) *
          (n - sum(group_areas$sample_count)))
while (sum(group_areas$sample_count) != n) {
  if (sum(group_areas$sample_count) > n) {
    # Reduce sample count for the largest polygon until total count is n
    max_index <- which.max(group_areas$sample_count)
    group_areas$sample_count[max_index] <- group_areas$sample_count[max_index] - 1
  } else {
    # Increase sample count for the smallest polygon until total count is n
    min_index <- which.min(group_areas$sample_count)
    group_areas$sample_count[min_index] <- group_areas$sample_count[min_index] + 1
  }
}
#sum(group_areas$sample_count)
polygons <- left_join(polygons, st_drop_geometry(group_areas), by = c("soil_lc" = "group"))
polygons <- dplyr::select(polygons, soil_lc, code.x, sample_count, geometry)
# Generate regular points within each stratum of size 3 times the required samples for each stratum ----
x <- spatSample(x = vect(group_areas), size = group_areas$sample_count * 3, method = "regular")

# Compute sampling points for strata
z <- x %>%
  st_as_sf() %>%
  dplyr::group_by(code) %>%
  dplyr::mutate(sample_count = as.numeric(sample_count),
                order = seq_along(code),
                ID = paste0(code, ".", order),
                type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
  vect()
# Find missing samples
missing.data <- left_join(group_areas, data.frame(z) %>%
                            dplyr::filter(type == "Target") %>%
                            dplyr::group_by(code) %>%
                            tally()) %>%
  dplyr::mutate(diff = sample_count - n)

# Determine missing sampled strata
missing.strata <- which(is.na(missing.data$diff))

# Determine missing sampling points in strata (undersampled strata)
missing.sample = which(missing.data$diff != 0)
missing.number <- as.numeric(unlist(st_drop_geometry(missing.data[(missing.sample <- which(missing.data$diff != 0)), 7])))
# Compute sampling points for missing sampled strata
x.missing.strata <- x[1]
x.missing.strata$sample_count <- 0

for (i in missing.strata) {
  xx.missing.strata <- x[1]
  xx.missing.strata$sample_count <- 0
  nn = 0
  while (sum(xx.missing.strata$sample_count) <
         group_areas[group_areas$code == i,][["sample_count"]] * 5) {
    while (nn < group_areas[group_areas$code == i,][["sample_count"]] * 3) {
      my.missing.strata <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
                                      size = group_areas[group_areas$code == i,][["sample_count"]] * 5,
                                      method = "random")
      nn <- nn + nrow(data.frame(my.missing.strata))
    }
    xx.missing.strata <- rbind(xx.missing.strata, my.missing.strata)
    print(sum(xx.missing.strata$sample_count))
  }
  print(i)
  print(xx.missing.strata)
  x.missing.strata <- rbind(x.missing.strata, xx.missing.strata)
}
# Join initial sampling with missing sampling strata data
x <- rbind(x, x.missing.strata)

# Compute sampling points for missing samples (regular sampling)
x.missing.sample <- x[1]

for (i in missing.sample) {
  xx.missing.sample <- x[1]
  xx.missing.sample$sample_count <- 0
  while (sum(xx.missing.sample$sample_count) < (group_areas[group_areas$code == i,][["sample_count"]] * 3)) {
    my.missing.sample <- spatSample(x = vect(group_areas[group_areas$code %in% i,]),
                                    size = as.numeric(vect(group_areas[group_areas$code %in% i,])[[4]]) +
                                      (group_areas[group_areas$code == i,][["sample_count"]] * 3),
                                    method = "regular")
    xx.missing.sample <- rbind(xx.missing.sample, my.missing.sample)
    print(sum(xx.missing.sample$sample_count))
  }
  print(i)
  print(xx.missing.sample)
  x.missing.sample <- rbind(x.missing.sample, xx.missing.sample)
}
# Join initial sampling with missing sampling strata data and with missing samples
x <- rbind(x, x.missing.sample)

# Remove extra artificial replacements
x <- x[x$sample_count > 0,]
# Convert to Shapefile
z <- x %>%
  st_as_sf() %>%
  dplyr::group_by(code) %>%
  dplyr::mutate(sample_count = as.numeric(sample_count),
                order = seq_along(code),
                ID = paste0(code, ".", order),
                type = ifelse(sample_count >= order, "Target", "Replacement")) %>%
  vect()
# Export data to shapefile ----
# Write sampling points to shp
writeVector(z, paste0(results.path, "regular_str_points.shp"), overwrite = TRUE)
# Check if the number of initial target points equals the final target points
n == nrow(z[z$type == "Target",])
n; nrow(z[z$type == "Target",])
# Plot results
map = leaflet(options = leafletOptions(minZoom = 11.4)) %>%
  addTiles()
mv <- mapview(soil_lc["soil_lc"], alpha = 0, homebutton = T, layer.name = "Strata") +
  mapview(sf::st_as_sf(z), zcol = 'type', color = "white", col.regions = c('royalblue', 'tomato'), cex = 3, legend = TRUE, layer.name = "Samples")
mv@map
# 10 - Random Sampling based on a stratified raster =============================
<- st_read(paste0(results.path,"strata.shp"),quiet = TRUE)
strata $code <- as.integer(strata$code)
strata
# Create stratification raster
<- rast(st_rasterize(strata["code"],st_as_stars(st_bbox(strata), nx = 250, ny = 250)))
strata names(strata) <- "strata"
<- crop(strata, nghe, mask=TRUE)
strata
# Create stratified random sampling
# Target points
target <- sample_strat(
  sraster = strata,
  nSamp = n
)
target$type <- "target"

# Replacement points
replacement <- sample_strat(
  sraster = strata,
  nSamp = nrow(target) * 3
)
replacement$type <- "replacement"

# Histogram of frequencies for targets
calculate_representation(
  sraster = strata,
  existing = target,
  drop = 0,
  plot = TRUE
)
# Histogram of frequencies for replacements
calculate_representation(
  sraster = strata,
  existing = replacement,
  drop = 0,
  plot = TRUE
)
# Add index by strata to targets
target <- target %>%
  st_as_sf() %>%
  dplyr::group_by(strata) %>%
  dplyr::mutate(order = seq_along(strata),
                ID = paste0(strata, ".", order)) %>%
  vect()
# Add index by strata to replacements
replacement <- replacement %>%
  st_as_sf() %>%
  dplyr::group_by(strata) %>%
  dplyr::mutate(order = seq_along(strata),
                ID = paste0(strata, ".", order)) %>%
  vect()
# Merge target and replacement points in a single object
sampling.points <- rbind(target, replacement)
# Plot samples over strata
map = leaflet(options = leafletOptions(minZoom = 8.3)) %>%
  addTiles()
mv <- mapview(st_as_stars(strata), homebutton = T, layer.name = "Strata") +
  mapview(sampling.points, zcol = 'type', color = "white", col.regions = c('royalblue', 'tomato'), cex = 3, legend = TRUE, layer.name = "Samples")
mv@map
# Plot samples over strata
# plot(strata, main="Strata and random samples")
# plot(nghe[1], col="transparent", add=TRUE)
# points(target,col="red")
# points(replacement,col="dodgerblue")
# legend("topleft", legend = c("target","replacement"), pch = 20, xpd=NA, bg="white", col=c("red","dodgerblue"))
## Export to shapefile
writeVector(sampling.points,
paste0(results.path,"pts_raster.shp"), overwrite=TRUE)
Script 6: Conditional Latin Hypercube Sampling
#
# Digital Soil Mapping
# Soil Sampling Design
# conditional Latin Hypercube Sampling
#
# GSP-Secretariat
# Contact: Luis.RodriguezLado@fao.org
#________________________________________________________________
# Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# Script for creating a sampling design based on conditioned Latin Hypercube Sampling.
# Given a suite of covariates this algorithm will assess the optimal location of
# samples based on the amount of information in the set of covariates.
#
# 0 - Set working directory and load packages
# 1 - User-defined variables
# 2 - Import national data
# 3 - Compute clhs
# 4 - Identify under-sampled areas
# 5 - Including existing legacy data in a cLHS sampling design
# 6 - Working with large raster data
# 7 - Cost-constrained cLHS sampling
# 8 - Replacement areas in cLHS design
# 9 - Polygonize replacement areas by similarity
# 10 - Constrained cLHS sampling accounting for accessibility and legacy data
# 11 - Plot sample density over covariates
# 12 - Calculate minimum and optimal sample size with opendsm
#________________________________________________________________
start_time <- Sys.time()
## 0 - Set working directory and load packages =================================
#remotes::install_github("lemuscanovas/synoptReg")
# Set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd("../") # Move wd down to main folder
# List of packages
<- c("sp","terra","raster","sf","clhs", "sgsR","entropy", "tripack",
packages "manipulate","dplyr","synoptReg")
# Load packages
lapply(packages, require, character.only = TRUE)
# Remove object to save memory space
rm(packages)
## 1 - User-defined variables ==================================================
# Path to rasters
raster.path <- "data/rasters/"
# Path to shapes
shp.path <- "data/shapes/"
# Path to results
results.path <- "data/results/"
# Path to additional data
other.path <- "data/other/"
# Buffer distance for replacement areas (clhs)
D <- 1000 # Buffer distance to calculate replacement areas
# Define the minimum sample size. By default it uses the value calculated previously
minimum_n <- 180
# Aggregation factor for up-scaling raster covariates (optional)
agg.factor = 5
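# Illustrative note (not in the original script): if up-scaling is needed, the
# factor defined above could be applied to the covariate stack with terra, e.g.
# cov.dat <- aggregate(cov.dat, fact = agg.factor, fun = "mean")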
## 2 - Import national data ====================================================
# Read Spatial data covariates as rasters with terra
cov.dat <- list.files(raster.path, pattern = "tif$", recursive = TRUE, full.names = TRUE)
cov.dat <- terra::rast(cov.dat) # SpatRaster from terra
# Load shape of district
nghe <- sf::st_read(file.path(paste0(shp.path, "/Nghe_An.shp")), quiet = TRUE)
# Crop covariates on administrative boundary
cov.dat <- crop(cov.dat, nghe, mask=TRUE)
# Store elevation and slope separately
elevation <- cov.dat$dtm_elevation_250m
slope <- cov.dat$dtm_slope_250m
# Load roads
roads <- sf::st_read(file.path(paste0(shp.path, "/roads.shp")), quiet = TRUE)
roads <- st_intersection(roads, nghe)
# Simplify raster information with PCA
pca <- raster_pca(cov.dat)
# Get SpatRaster layers
cov.dat <- pca$PCA
# Subset rasters to the main PCs (up to 0.99 of cumulative variance explained)
n_comps <- first(which(pca$summaryPCA[3,] > 0.99))
cov.dat <- pca$PCA[[1:n_comps]]
# Remove pca stack
rm(pca)
# Aggregate stack to simplify data rasters for calculations
# cov.dat <- aggregate(cov.dat, fact=10, fun="mean")
# Plot of covariates
plot(cov.dat)
## 3 - Compute clhs ============================================================
# Distribute sampling points with clhs
pts <- sample_clhs(cov.dat, nSamp = 100, iter = 10000, progress = FALSE, simple = FALSE)
# Plot cLHS samples on map
plot(cov.dat[[1]], main="cLHS samples")
points(pts, col="red", pch = 1)
## 4 - Identify under-sampled areas ============================================
#' COOBS (count of observations) is an algorithm to map relatively
#' adequate and under-sampled areas in a sampling pattern.
#' It allows one to understand which areas of a spatial domain are adequately sampled and which are under-sampled.
#' COOBS can be used to design an additional survey by limiting the cLHS algorithm to areas where the COOBS
#' value is below some specified threshold.
coobs <- calculate_coobs(
  mraster = cov.dat,
  existing = pts,
  plot = TRUE,
  cores = 4,
  filename = paste0(results.path, "/coobs_clhs.tif"),
  overwrite = TRUE
)
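# Illustrative sketch (not part of the original workflow): assuming the first
# layer of the 'coobs' output holds the observation counts, covariates could be
# masked to under-sampled cells (an arbitrary threshold of 2 is assumed here)
# before running an additional cLHS survey restricted to those areas:
# low.coobs <- coobs[[1]] < 2                               # TRUE where under-sampled
# cov.under <- mask(cov.dat, low.coobs, maskvalues = FALSE) # keep under-sampled cells only
# extra.pts <- sample_clhs(cov.under, nSamp = 20, iter = 10000, progress = FALSE)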
## 5 - Including existing legacy data in a cLHS sampling design ================
# Create an artificial random legacy dataset of 50 samples over the study area as an example
legacy.data <- sample_srs(raster = cov.dat, nSamp = 50)
# Calculate clhs 100 points plus locations of legacy data
res <- sample_clhs(cov.dat, nSamp = 150, existing = legacy.data, iter = 10000, progress = FALSE)
# Plot points
plot(cov.dat[[1]], main="cLHS samples (blue circles) and legacy samples (red diamonds)")
points(res, col="navy", pch = 1)
points(res[res$type=="existing",], col="red", pch = 5, cex=2)
## 6 - Working with large raster data ==========================================
# Scaling covariates
# Aggregation of covariates by a factor of 10.
# The original grid resolution is up-scaled using the mean value of the pixels in the grid
cov.dat2 <- aggregate(cov.dat, fact=10, fun="mean")
# Create clhs samples upon the resampled rasters
resampled.clhs <- sample_clhs(cov.dat2, nSamp = 150, iter = 10000, progress = FALSE)
# Plot the points over the 1st raster
plot(cov.dat2[[1]], main="Regular resampled data")
points(resampled.clhs , col="red", pch = 1)
rm(cov.dat2)
# Sampling to regular points
# Create a regular grid of 1000 points on the covariate space
regular.sample <- spatSample(cov.dat, size = 1000, xy=TRUE, method="regular", na.rm=TRUE)
# plot the points over the 1st raster
plot(cov.dat[[1]], main="Regular resampled data")
points(regular.sample, col="red", pch = 1)
# Create clhs samples upon the regular grid
regular.sample.clhs <- clhs(regular.sample, size = 100, progress = FALSE, iter = 10000, simple = FALSE)
# Plot points of clhs samples
points <- regular.sample.clhs$sampled_data # Get point coordinates of clhs sampling
plot(cov.dat[[1]], main="cLHS samples (red) and covariate resampled points (blue)")
points(regular.sample, col="navy", pch = 1)
points(points, col="red", cex=1)
## 7 - Cost-constrained cLHS sampling ==========================================
# Use 'Distance to roads' as cost surface
# Calculate distance to roads with the same spatial resolution as the covariates
# dist2access <- terra::distance(cov.dat[[1]], roads, progress=TRUE)
# names(dist2access) <- "dist2access"
# Save cost surface to disk
# writeRaster(dist2access, paste0(other.path,"nghe_d2roads.tif"), overwrite=TRUE)
# Load pre-calculated distance-to-roads surface
dist2access <- terra::rast(paste0(other.path, "nghe_d2roads.tif"))
# Aggregate to the same spatial resolution
# dist2access <- aggregate(dist2access, fact=10, fun="mean")
plot(dist2access)
plot(nghe, col="transparent", add=TRUE)
# Add cost surface (distance to roads) as layer
cov.dat <- c(cov.dat, dist2access)
names(cov.dat)
# Harmonize NAs: dividing a covariate layer by itself gives 1 wherever that layer
# has data and NA elsewhere, so multiplying propagates the covariate NA pattern
# into the cost layer
cov.dat$dist2access <- cov.dat$dist2access * cov.dat[[1]]/cov.dat[[1]]
plot(cov.dat$dist2access)
plot(nghe, col="transparent", add=TRUE)
# Compute sampling points
cost.clhs <- sample_clhs(cov.dat, nSamp = minimum_n, iter = 10000, progress = FALSE, cost = 'dist2access', use.cpp = TRUE)
# Plot samples
plot(cov.dat[['dist2access']], main="cLHS samples with 'cost' constraints")
plot(cost.clhs, col="red", cex=1,add=T)
## 8 - Replacement areas in cLHS design (requires raster package) ==============
# Determine the similarity to points in a buffer of distance 'D'
# Compute the buffers around points
gw <- similarity_buffer(raster::stack(cov.dat), as(cost.clhs, "Spatial"), buffer = D)
# Plot results
plot(elevation, legend=TRUE,main=paste("Similarity probability over elevation"))
## Overlay points
points(cost.clhs, col = "dodgerblue", pch = 3)
## Overlay probability stack for point 1
colors <- c(RColorBrewer::brewer.pal(9, "YlOrRd"))
terra::plot(gw[[1]], add=TRUE, legend=FALSE, col=colors)
## Overlay 1st cLHS point
points(cost.clhs[1,], col = "red", pch = 12,cex=1)
## 9 - Polygonize replacement areas by similarity ==============================
# Determine a threshold break to delineate replacement areas
similarity_threshold <- 0.90
# Reclassify buffer raster data according to the threshold break of probability
# 1 = similarity >= similarity_threshold; NA = similarity < similarity_threshold
# Define a vector with the break intervals and the output values (NA, 1)
breaks <- c(0, similarity_threshold, NA, similarity_threshold, 1, 1)
# Convert to a matrix (each row: from, to, new value)
breaks <- matrix(breaks, ncol=3, byrow=TRUE)
# Reclassify the data in the layers from probabilities to (NA, 1)
s = stack(lapply(1:raster::nlayers(gw), function(i){raster::reclassify(gw[[i]], breaks, right=FALSE)}))
# Polygonize replacement areas
s = lapply(as.list(s), rasterToPolygons, dissolve=TRUE)
s <- bind(s, keepnames=TRUE)
# Add the identifier of the corresponding target point
for(i in 1:length(s)){
  s@data$ID[i] <- as.integer(stringr::str_replace(s@polygons[[i]]@ID, "1.", ""))
}
# Clean the data by storing target ID data only
s@data <- s@data["ID"]
# Plot results
plot(cov.dat[[1]], main=paste("cLHS samples and replacement areas for threshold = ", similarity_threshold))
plot(s,add=TRUE, col=NA, border="gray40")
points(cost.clhs, col="red", pch = 3)
# Export replacement areas to shapefile
s <- st_as_sf(s)
st_write(s, file.path(paste0(results.path, 'replacement_areas_', D, '.shp')), delete_dsn = TRUE)
# Write cLHS sampling points to shapefile
cost.clhs$ID <- row(cost.clhs)[,1] # Add identifier
out.pts <- st_as_sf(cost.clhs)
st_write(out.pts, paste0(results.path, 'target_clhs.shp'), delete_dsn = TRUE)
## 10 - Constrained cLHS sampling taking into account accessibility and legacy data
# Load legacy data
legacy <- sf::st_read(file.path(paste0(shp.path, "/legacy_soils.shp")), quiet = TRUE)
# Add slope to the stack
cov.dat$slope <- slope
# Calculate clhs points with legacy, cost and buffer to roads
buff_inner = 20
buff_outer = 3000
# Convert roads to sf object and cast to multilinestring
roads <- st_as_sf(roads) %>%
  st_cast("MULTILINESTRING")
# Calculate clhs samples using slope as cost surface, distance to roads as
# access limitations, and including existing legacy data
aa <- sgsR::sample_clhs(mraster = cov.dat, nSamp = minimum_n, existing = legacy,
                        iter = 10000, details = TRUE, cost = "slope", access = roads,
                        buff_inner = buff_inner, buff_outer = buff_outer)
## Plot distances, roads, clhs points and legacy data
plot(cov.dat$dist2access, main="New (red) and existing (blue) samples")
plot(roads,add=TRUE, col="gray60")
plot(aa$samples[aa$samples$type=="new",], col= "tomato",add=TRUE)
plot(aa$samples[aa$samples$type=="existing",], col= "dodgerblue2",add=TRUE)
# Write samples as shapefile
aa$samples[c("type","dist2access")] %>%
  st_write(paste0(results.path, 'const_clhs.shp'), delete_dsn = TRUE)
## 11 - Plot sample density over covariates ====================================
# Distribution of elevation values on the sampling design vs. original elevation data
s0 <- rbind(data.frame(method = "Elevation", data.frame(elevation)))
s1 <- extract(elevation, pts)
s1$method <- "cLHS"
# Plot the covariate elevation values as histogram bars and the sampling distribution as curve
library(ggplot2)
dens_elev <- ggplot() +
  geom_histogram(data = s0, aes(x = dtm_elevation_250m, fill = "red", y = after_stat(density)), binwidth = 25, alpha = .5, position = "identity") +
  geom_density(data = s1, aes(x = dtm_elevation_250m, col = method))
dens_elev
## 12 - Calculate minimum and optimal sample size ==============================
#' Source scripts from https://github.com/newdale/opendsm/tree/main
#'
#' Determine minimum sample size for the clhs algorithm
#' This script is based on the publication:
#' Determining minimum sample size for the conditioned Latin hypercube sampling algorithm
#' DOI: https://doi.org/10.1016/j.pedsph.2022.09.001
source("scripts/clhs_min.R")
#' Optimal sample size based on the publication:
#' Divergence metrics for determining optimal training sample size in digital soil mapping
#' DOI: https://doi.org/10.1016/j.geoderma.2023.116553
source("scripts/opt_sample.R")
#' Perform Sequential Variance Inflation Factor (VIF) analysis
source("scripts/seqVIF.R")
#' Function to calculate the KL-divergence
#' score between two probability distributions, P and Q.
source("scripts/kldiv.R")
#' Function to calculate the Jensen-Shannon Distance
#' score between two probability distributions, P and Q.
source("scripts/jsdist.R")
#' Function to calculate the Jensen-Shannon-divergence
#' score between two probability distributions, P and Q.
source("scripts/jsdiv.R")
#install.packages('~/Downloads/DescTools_0.99.45.tar', repos=NULL, type='source')
#install.packages('DescTools')
library("DescTools")
# Convert raster covariates to dataframe
cov.dat.df <- data.frame(cov.dat[[1:n_comps]])
# Calculate minimum sample size
min_N <- clhs_min(cov.dat.df)
min_n <- as.numeric(min_N$Sample_Size[1])
# Calculate optimal sample size using normalized KL-div, JS-div and JS distance
opt_N <- opt_sample(alg="clhs", s_min = min_n, s_max = 500, s_step=50, s_reps=4, covs = cov.dat.df, clhs_iter=100, cpus=NULL, conf=0.95)