Annex I: Compendium of R scripts

This chapter contains the complete list of R scripts to run the process of mapping soil nutrient and associated soil properties.

Script 0: Introduction to R

# Introduction to R

# 0. Playground =================================================

# learn important keyboard shortcuts
# Ctrl + enter for running code
# tab after writing the first three 
# characters of the function name
# F1 to access the help

# explore the use of <-, $, [], ==, !=, c(), :, 
# data.frame(), list(), as.factor()

a <- 10:15
a[2]
a[2:3]
b <- c("1", "a", a )
length(b)
df <- data.frame(column_a = 1:8, column_b = b)

df[,1]
df$column_b
as.numeric(df$column_b)
plot(df)

df[1:3,]
df[,1]

as.factor(b)

d <- list(a, b, df)
d
names(d)
names(d) <- c("numeric_vector", "character_vector", "dataframe")
d
d[[1]]
d$numeric_vector

a == b
a != b

# 1. Set working directory ======================================
setwd("C:/GIT/Digital-Soil-Mapping/")

# 2. Install and load packages ==================================
# readxl, tidyverse, and data.table packages using the functions
install.packages("tidyverse")
install.packages("readxl")
install.packages("data.table")
library(tidyverse)
library(readxl)
library(data.table)

# 3. Import an spreadsheet ======================================
## 3.1 Read the MS Excel file -----------------------------------
#Read the soil_data.xlsx file, spreadsheet 2, using read_excel 
read_excel(path = "01-Data/soil_data.xlsx", sheet = 2)

## 3.2 Read the csv file with the native function ---------------
# 01-Data/horizon.csv
read.csv("01-Data/soil_profile_data.csv")

## 3.3 Read the csv file with the tidyverse function ------------
read_csv("01-Data/soil_profile_data.csv")

## 3.4 Read the csv file with the data.table function -----------
fread("01-Data/soil_profile_data.csv")

## 3.5 Assign the dataframe to an object called dat -------------
dat <- read_csv("01-Data/soil_profile_data.csv")

# 4. Tidyverse functions ========================================
## 4.1 Select pid, hip, top, bottom, ph_h2o, cec from dat -------
dat_1 <- dat %>% 
  select(id_prof, id_hor, top, bottom, ph_h2o, cec)

## 4.2 Filter: pick observations by their values ----------------
# filter observations with cec > 50 cmolc/100g

dat_2 <- dat_1 %>% 
  filter(cec > 30)

dat_2
## 4.3 Mutate: create a new variable ----------------------------
# thickness = top - bottom

dat_3 <- dat_2 %>% 
  mutate(thickness = bottom - top)

## 4.4 Group_by and summarise -----------------------------------
# group by variable pid
# summarise taking the mean of pH and cec

dat_4 <- dat_3 %>% 
  group_by(id_prof) %>% 
  summarise(mean_ph = mean(ph_h2o),
            mean_cec = mean(cec))

## 4.5 Reshape the table using pivot_longer ---------------------
# use dat_3
# put the names of the variables ph_h2o, 
# cec and thickness in the column 
# variable and keep the rest of the table. Save in dat_5 

dat_5 <- dat_3 %>% 
  pivot_longer(ph_h2o:thickness, names_to = "soil_property",
               values_to = "value")

## 4.6 Join the table sites.csv with dat_3 ----------------------
# Load soil_phys_data030.csv (in 01-Data folder) 
# Join its columns with dat_3 keeping all the rows of dat_3
# save the result as dat_6

phys <- read_csv("01-Data/soil_phys_data030.csv")

phys <- phys %>% rename(id_prof = "ProfID")

dat_6 <- dat_3 %>% 
  left_join(phys)
# or
dat_6 <- phys %>% 
  right_join(dat_3)

# 5. Data visualization with ggplot2 ============================
## 5.1 1D plot: histograms --------------------------------------
# histograms of cec and ph_h2o

ggplot(dat_3, aes(x=cec)) + geom_histogram()

## 5.2 2D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o

ggplot(dat_3, aes(x = bottom, y = ph_h2o)) + 
  geom_point() 

# add a fitting line

ggplot(dat_3, aes(x = bottom, y = ph_h2o)) + 
  geom_point() +
  geom_smooth(method = "lm" )

## 5.3 3D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o, add clay as color 
# and size inside the 
# function aes()

ggplot(dat_3, 
       aes(x = bottom, y = ph_h2o, color = cec, size = cec)) + 
  geom_point()

# 6. Geospatial data with terra =================================
## Load packages (install them if needed)
library(terra)
## 6.1 Load a raster and a vector layer -------------------------
# Load 01-Data/covs/grass.tif using rast() function, then plot it
# Load 01-Data/soil map/SoilTypes.shp using vect() function and 
# plot it 
# explore the attributes of these layers

r <- rast("01-Data/Macedonia/grass.tif")
plot(r)

v <- vect("01-Data/Macedonia/SoilTypes.shp")
plot(v)

## 6.2 Load a raster and a vector layer -------------------------
# Check the current CRS (EPSG) of the raster and the vector. 
# Find a *projected* CRS in http://epsg.io for Macedonia and 
# copy the number
# Check the Arguments of function project (?project) that need to
# be defined
# Save the new object as r_proj and v_proj
# plot both objects

r_proj <- project(x = r, y = "epsg:6204", method = "bilinear",
                  res = 250)
plot(r_proj)
v_proj <- project(x = v, y = "epsg:6204")
plot(v_proj, add = TRUE)

## 6.3 Cropping and masking a raster ----------------------------
# Compute the area of the polygons in v_proj 
# (search for a function) and
# assign the values to a new column named area
# select the largest polygon using [], $, == and max() func. 
# and save it as pol
# crop the raster with pol using the crop() function and save 
#it as r_pol
# mask the raster r_pol with the polygon pol and save it 
# with the same name
# plot each result

v_proj$area <- expanse(v_proj, unit = "ha")
pol <- v_proj[v_proj$area == max(v_proj$area)]
plot(pol)
r_pol <- crop(r_proj, pol)
plot(r_pol)
plot(pol, add = TRUE)
r_pol <- mask(r_pol, pol)
plot(r_pol)

## 6.4 Replace values in a raster by filtering their cells ------
# Explore the following link to understand how terra 
#manage cell values
# https://rspatial.org/terra/pkg/4-algebra.html 
# Replace values lower than 5 in r+pol by 0

r_pol[r_pol$grass < 5] <- 0
plot(r_pol)

## 6.5 Rasterize a vector layer ---------------------------------
# Use rasterize() function to convert v_proj to raster
# Use r_proj as reference raster
# Use field Symbol to assign cell values, and plot the new map

v_class <- rasterize(x = v_proj, y = r_proj, field = "Symbol" )
plot(v_class)
v_class
activeCat(v_class) <- 1

## 6.6 Extracting raster values using points --------------------
# Covert dat_6 to spatial points using vect() function 
# (check help of vect())
# Note that the EPSG number is 6204
# Save the points as s
# Plot s and r_proj together in the same map (Argument add=TRUE)
# Extract the values of the raster using extract() 
# function (check the help)
# Remove the ID column of the extracted values
# merge the extracted data with s using cbind() function
# Convert s as a dataframe

s <- vect(dat_6, geom=c("x", "y"), crs = "epsg:6204")
plot(r_proj)
plot(s, add=TRUE)
x <- extract(r_proj,s, ID=FALSE)
s <- cbind(s,x)
d <- as.data.frame(s)
d
#GGally::ggscatmat(d)

## 6.7 Zonal statistics using polygons and rasters --------------
# Use the extract() func. to estimate the mean value of 
# r_proj at each polygon
# Use the fun= argument (check the help)
# Use the cbind() func. to merge v_proj and the extracted values
# convert v_proj to a dataframe
# Create a ggplot boxplot (geom_boxplot) with x=Symbol
# and y=grass 

x <- extract(r_proj, v_proj, fun = mean, ID=FALSE)
v_proj <- cbind(v_proj, x)

d <- as_tibble(v_proj)

d %>% 
  ggplot(aes(x =Symbol, y = grass, fill = Symbol)) +
  geom_boxplot() +
  ylab("Grass probability")

## 6.8 Bonus track: use zonal() function ------------------------

zonal(r_proj, v_class, na.rm=T)

Script 2: Data preparation

#
# Digital Soil Mapping
# Soil Profile Data
# Cleaning and Processing
#
# GSP-Secretariat
# Contact: Isabel.Luotto@fao.org
#          Marcos.Angelini@fao.org
#________________________________________________________________

#Empty environment and cache 
rm(list = ls())
gc()

# Content of this script ========================================
# The goal of this script is to organise the soil data for 
# mapping, including:
# 
# 0 - User-defined variables 
# 1 - Set working directory and load necessary packages
# 2 - Import national data 
# 3 - Select useful columns
# 4 - Quality check
# 5 - Estimate BD using pedotransfer function
# 6 - Harmonize soil layers
# 7 - Add chemical properties from additional dataset
# 8 - Plot and save results
#________________________________________________________________

# 0 - User-defined variables ====================================

wd <- "C:/GIT/GSNmap-TM/Digital-Soil-Mapping"
#wd <- 'C:/Users/hp/Documents/GitHub/GSNmap-TM/Digital-Soil-Mapping'


# 1 - Set working directory and load necessary packages =========
setwd(wd) # change the path accordingly
#(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))

library(tidyverse) # for data management and reshaping
library(readxl) # for importing excel files
library(mapview) # for seeing the profiles in a map
library(sf) # to manage spatial data (shp vectors) 
library(aqp) # for soil profile data
library(mpspline2) # for horizon harmonization
library(plotly) # interactive plots

# 2 - Import national data ======================================
# Save your national soil dataset in the data folder 
# /01-Data as a .csv file or 
# as a .xlsx file

## 2.1 - for .xlsx files ----------------------------------------
# Import horizon data 
# hor <- read_excel("01-Data/soil_data.xlsx", sheet = 2)
# # Import site-level data
# site <- read_excel("01-Data/soil_data.xlsx", sheet = 1)
# chem <- read_excel("01-Data/soil_data.xlsx", sheet = 2)
# phys <- read_excel("01-Data/soil_data.xlsx", sheet = 3)


## 2.2 - for .csv files -----------------------------------------
# Import horizon data 
hor <- 
  read_csv(file = 
          "01-Data/soil_profile_data.csv",show_col_types = FALSE)
chem <- 
  read_csv(file = 
          "01-Data/soil_chem_data030.csv",show_col_types = FALSE)
phys <- read_csv(file = 
          "01-Data/soil_phys_data030.csv",show_col_types = FALSE)


site <- select(hor, id_prof, x, y) %>% unique()
hor <- select(hor, id_prof, id_hor, top:cec)

# change names of key columns
names(site)
names(site)[1] <- "ProfID"
names(hor)
names(hor)[1] <- "ProfID"
names(hor)[2] <- "HorID"
# scan the data
summary(site)
summary(hor)

# 3 - select useful columns =====================================
## 3.1 - select AND RENAME columns ------------------------------
hor <- select(hor, ProfID, HorID, top, bottom, ph=ph_h2o, 
              k, soc, bd, cec)

# 4 - Quality check =============================================

## 4.1 - Check locations ----------------------------------------
# Macedonian State Coordinate System EPSG:6204  
# https://epsg.io/6204 
site %>% 
  st_as_sf(coords = c("x", "y"), crs = 4326) %>% # convert to 
  #spatial object
  mapview(zcol = "ProfID", cex = 3, lwd = 0.1) # visualise in an 
#interactive map

site_sub <- site[site$ProfID=='2823',] 

site_sub %>% 
  st_as_sf(coords = c("x", "y"), crs = 4326) %>% # convert to 
  #spatial object
  mapview(zcol = "ProfID", cex = 3, lwd = 0.1) # visualise in an 
#interactive map
# profile 2823 is wrongly located, so let's remove it
site <- filter(site, ProfID != 2823)


## 4.2 - Convert data into a Soil Profile Collection ------------
aqp::depths(hor) <- ProfID ~ top + bottom 
hor@site$ProfID <- as.numeric(hor@site$ProfID) 
site(hor) <- left_join(site(hor), site)
profiles <- hor

profiles

## 4.3 - plot first 20 profiles using pH as color ---------------
plotSPC(x = profiles[1:20], name = "pH", color = "ph",
        name.style = "center-center")


## 4.4 - check data integrity -----------------------------------
# A valid profile is TRUE if all of the 
#following criteria are false:
#    + depthLogic : boolean, errors related to depth logic
#    + sameDepth : boolean, errors related to same top/bottom 
#depths
#    + missingDepth : boolean, NA in top / bottom depths
#    + overlapOrGap : boolean, gaps or overlap in adjacent
#horizons
aqp::checkHzDepthLogic(profiles)

# Identify non-valid profiles 
dl <- checkHzDepthLogic(profiles)
dl[dl$depthLogic==T | dl$sameDepth==T | dl$missingDepth==T | 
     dl$overlapOrGap==T,"ProfID"]

# visualize some of these profiles by the pid
subset(profiles, grepl(6566, ProfID, ignore.case = TRUE))
subset(profiles, grepl(7410 , ProfID, ignore.case = TRUE))
subset(profiles, grepl(8002, ProfID, ignore.case = TRUE))

## 4.5 - keep only valid profiles -------------------------------
clean_prof <- HzDepthLogicSubset(profiles)
metadata(clean_prof)$removed.profiles
# write_rds(clean_prof, "01-Data/soilProfileCollection.rds")

## 4.6 convert soilProfileCollection to a table -----------------
dat <- left_join(clean_prof@site, clean_prof@horizons)
dat <- select(dat, ProfID, HorID, x, y, top, bottom, ph:cec )


# 5 - Estimate BD using pedotransfer functions ==================

# create the function with all PTF
method_names <- c("Saini1996", "Drew1973", "Jeffrey1979", 
                  "Grigal1989", 
                  "Adams1973", "Honeyset_Ratkowsky1989") 

estimateBD <- function(SOC=NULL, method=NULL) {
  OM <- SOC * 1.724
  BD <- switch(method,
            "Saini1996" = 1.62 - 0.06 * OM,
            "Drew1973" = 1 / (0.6268 + 0.0361 * OM),
            "Jeffrey1979" = 1.482 - 0.6786 * (log(OM)),
            "Grigal1989" = 0.669 + 0.941 * exp(1)^(-0.06 * OM),
            "Adams1973" = 100 / (OM / 0.244 + (100 - OM) / 2.65),
            "Honeyset_Ratkowsky1989" = 1 / (0.564 + 0.0556 * OM),
            stop("Invalid method specified.")
  )
  return(BD)
}

# 5.1 - Select a pedotransfer function --------------------------


# Create a test dataset with BD and SOC data
BD_test <- data.frame(SOC = dat$soc, BD_observed = dat$bd)

# Remove missing values
BD_test <- BD_test[complete.cases(BD_test),]
BD_test <- na.omit(BD_test) 


# 5.2 - Estimate BLD for a subset using the 
# pedotransfer functions ------------
for (i in method_names) {
  BD_test[[i]] <- estimateBD(BD_test$SOC, method = i)
}

# Print the resulting data frame
BD_test


## 5.3 Compare results ------------------------------------------
# Observed values:
summary(BD_test)

# Compare data distributions for observed and predicted BLD
plot.bd <- BD_test %>%
  select(-SOC) %>% 
  pivot_longer(cols = c("BD_observed", "Saini1996", "Drew1973", 
                        "Jeffrey1979",
                        "Grigal1989", "Adams1973", 
                        "Honeyset_Ratkowsky1989"), 
               names_to = "Method", values_to = "BD") %>% 
  ggplot(aes(x = BD, color = Method)) + 
  geom_density()

plot.bd

# Dymanic plot with plotly 
ggplotly(plot.bd)

ggplotly(plot.bd) %>%
  layout(hovermode = "x")

# Plot the Selected function again
BD_test %>% 
  select(-SOC) %>% 
  pivot_longer(cols = c("BD_observed", "Honeyset_Ratkowsky1989"), 
               names_to = "Method", values_to = "BD") %>% 
  ggplot(aes(x = BD, color = Method)) + 
  geom_density() + xlim(c(0,2.5))

# Same dynamic plot 
ggplotly(BD_test %>% 
           select(-SOC) %>% 
           pivot_longer(cols = c("BD_observed", 
                                 "Honeyset_Ratkowsky1989"), 
                       names_to = "Method", values_to = "BD") %>% 
           ggplot(aes(x = BD, color = Method)) + 
           geom_density() + xlim(c(0,2.5))) %>%
  layout(hovermode = "x")

## 5.4 Estimate BD for the missing horizons ---------------------
dat$bd[is.na(dat$bd)] <-
  estimateBD(dat[is.na(dat$bd),]$soc,
             method="Honeyset_Ratkowsky1989")

# Explore the results
summary(dat$bd)

g <- BD_test %>% 
  select(-SOC) %>% 
  pivot_longer(cols = c("BD_observed"), 
               names_to = "Method", values_to = "BD") %>% 
  ggplot(aes(x = BD, color = Method)) + 
  geom_density() +
  xlim(c(0,2.5))
g + geom_density(data = dat, aes(x=bd, 
                              color = "Predicted +\n observed"))


## 5.5 - Explore outliers ---------------------------------------
# Outliers should be carefully explored and compared with 
#literature values.
# Only if it is clear that outliers represent 
#impossible or highly unlikely 
# values, they should be removed as errors.
# 
# Carbon content higher than 15% is only typical for organic soil
# (histosols)
# We will remove all atypically high SOC as outliers
summary(dat$soc)
na.omit(dat$ProfID[dat$soc > 10])
dat$ProfID[dat$soc > 10][!is.na(dat$ProfID[dat$soc > 10])] 

dat <- dat[dat$ProfID != 6915,]
dat <- dat[dat$ProfID != 7726,]

dat<- dat[!(dat$ProfID %in% dat$ProfID[dat$soc > 10]
            [!is.na(dat$ProfID[dat$soc > 10])]),]

# Explore bulk density data, identify outliers
# remove layers with Bulk Density < 1 g/cm^3
low_bd_profiles <- na.omit(dat$ProfID[dat$bd<1])
dat <- dat[!(dat$ProfID %in% low_bd_profiles),]

# Explore data, identify outliers
x <- pivot_longer(dat, cols = ph:cec, values_to = "value",
                  names_to = "soil_property")
x <- na.omit(x)
ggplot(x, aes(x = soil_property, y = value, 
              fill = soil_property)) +
  geom_boxplot() + 
  facet_wrap(~soil_property, scales = "free")


# 6 - Harmonize soil layers =====================================
source("03-Scripts/.spline_functions.R") 
## 6.1 - Set target soil properties and depths ------------------
names(dat)
dat <- select(dat, ProfID, HorID, x, y, top, bottom, 
              ph, k, soc, bd, cec)

target <- c("ph", "k", "soc",  "bd", "cec")
depths <- c(0,30)

## 6.2 - Create standard layers ---------------------------------
splines <- apply_mpspline_all(df = dat, properties = target,
                              depth_range = depths)
summary(splines)

# merge splines with x and y
d <- unique(select(dat, ProfID, x, y))
d <- left_join(d, splines)



# 7 - Harmonize units ===========================================
#Harmonize units if different from target units
# Mandatory Soil Propertes and corresponing units:
# Total N - ppm
# Available P - ppm
# Available K - ppm
# Cation exchange capacity cmolc/kg
# pH
# SOC - %
# Bulk density g/cm3
# Soil fractions (clay, silt and sand) - 

# Units soil profile data (dataframe d)
# 
head(d) # pH; K cmolc/kg; SOC %; BD g/cm3; CEC  cmolc/kg

# K => convert cmolc/kg to ppm (K *10 * 39.096)
d$k_0_30 <- d$k_0_30 * 10 * 39.096

head(chem)# P ppm; N %; K ppm
# N => convert % to ppm (N * 10000)
chem$tn <-chem$tn*10000

head(phys)# clay, sand, silt g/kg
# convert g/kg to % (/10)
phys$clay_0_30 <-phys$clay_0_30/10
phys$sand_0_30  <-phys$sand_0_30 /10
phys$silt_0_30 <-phys$silt_0_30/10

# Correct negative clay content
phys$clay_0_30[phys$clay_0_30<0] <- 0


# Add chemical and physical properties from additional datasets =


# Rename columns to match the main data set
names(d)
names(chem)[1] <- 'ProfID'
names(chem)[4] <- 'p_0_30'
names(chem)[5] <- 'k_0_30' 
names(chem)[6] <- 'n_0_30'


#The chem dataframe comes from and independent 
#dataset we need to create new unique ProfIDs 
#Create unique ProfID 
chem$ProfID <- seq(max(d$ProfID)+1,max(d$ProfID)+1+nrow(chem)-1)

# Add the new data as new rows using dplyr
# we can add empty rows
# automatically for the not measured
# properties in the chem dataset
d <- bind_rows(d, chem)

#The phys dataframe with the texture instead shares the 
# same ProfIDs (we can directly merge)
d <- left_join(d, phys, by=c('ProfID', 'x', 'y'))

# 8 - Plot  and save results ====================================
names(d)
x <- pivot_longer(d, cols = ph_0_30:silt_0_30, 
                  values_to = "value",
                  names_sep = "_", 
                  names_to = c("soil_property", 
                               "top", "bottom"))
x <- mutate(x, depth = paste(top, "-" , bottom))
#x <- na.omit(x)
ggplot(x, aes(x = depth, y = value, fill = soil_property)) +
  geom_boxplot() + 
  facet_wrap(~soil_property, scales = "free")


ggplotly(ggplot(x, aes(x = depth, y = value, 
                       fill = soil_property)) +
           geom_boxplot() + 
           facet_wrap(~soil_property, scales = "free"))

# save the data
write_csv(d, "02-Outputs/harmonized_soil_data.csv")

Scripts 3: Download environmental covariates

var assets = 
["projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio1",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio12",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio13",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio14",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio16",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio17",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio5",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio6",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/ngd10",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_min",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_range",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_range",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/snow_cover",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/swir_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/crops",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/flooded_vegetation",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/grass",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/shrub_and_scrub",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/trees",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_curvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_downslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm2_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_elevation_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_mrn_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_neg_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_pos_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_slope_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_tpi_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_twi_500m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_upslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_vbf_250m",
"projects/digital-soil-mapping-gsp-fao/assets/POP/hfp2013_merisINT",
"projects/digital-soil-mapping-gsp-fao/assets/POP/night_lights_stable_2013",
"projects/digital-soil-mapping-gsp-fao/assets/POP/population_density_2020"];

// Load borders 

/// Using UN 2020 (replace the countries to download)
var ISO = ['ITA'];
var aoi =
ee.FeatureCollection(
'projects/digital-soil-mapping-gsp-fao/assets/UN_BORDERS/BNDA_CTY'
)
  .filter(ee.Filter.inList('ISO3CD', ISO));
var region = aoi.geometry();

/// Using a shapefile
/// 1. Upload the borders of your countries as an asset
/// 2. Replace 'your_shapefile' with the path to your shapefile
// var shapefile = 
//ee.FeatureCollection('users/your_username/your_shapefile');
// var region = shapefile.geometry().bounds();


// Load assets as ImageCollection
var assetsCollection = ee.ImageCollection(assets);

// Clip each image in the collection to the region of interest
var clippedCollection = assetsCollection.map(function(img){
  return img.clip(region).toFloat();
});

// Function to replace masked values with zeroes for fpar bands
function replaceMaskedFpar(img) {
  var allBands = img.bandNames();
  var fparBands =
  allBands.filter(ee.Filter.stringStartsWith('item', 'fpar'));
  var nonFparBands = allBands.removeAll(fparBands);
  
  var fparImg = img.select(fparBands).unmask(0);
  var nonFparImg = img.select(nonFparBands);
  
  // If there are no fpar bands, return the original image
  var result = ee.Algorithms.If(fparBands.length().eq(0),
                                 img,
                                 nonFparImg.addBands(fparImg));
  
  return ee.Image(result);
}

// Clip each image in the collection to the region of 
//interest and replace masked values for fpar bands
var clippedCollection = assetsCollection.map(function(img){
  var clippedImg = img.clip(region).toFloat();
  return replaceMaskedFpar(clippedImg);
});

// Stack the layers and maintain the 
//layer names in the final file
var stacked = clippedCollection.toBands();

// Get the list of asset names
var assetNames = ee.List(assets).map(function(asset) {
  return ee.String(asset).split('/').get(-1);
});

// Rename the bands with asset names
var renamed = stacked.rename(assetNames);
print(renamed, 'Covariates to be exported')

// Visualize the result
// Set a visualization parameter 
// (you can adjust the colors as desired)
var visParams = {
  bands: 'bio1',
  min: 0,
  max: 1,
  palette: ['blue', 'green', 'yellow', 'red']
};

// Add the layer to the map
Map.centerObject(renamed, 6)
Map.addLayer(renamed, visParams, 'Covariates');
// Export the stacked image to Google Drive
Export.image.toDrive({
  image: renamed,
  description: 'covariates',
  folder: 'GEE',
  scale: 250,
  maxPixels: 1e13,
  region: region
});

/* Create mask for croplands ----------------------------*/

// Load the Copernicus Global Land Service image collection
var imageCollection = 
ee.Image("COPERNICUS/Landcover/100m/Proba-V-C3/Global/2019")
  .select("discrete_classification")
  .clip(region)

var crs = 'EPSG:4326'; // WGS84
var res = 250; // Resolution in decimal degrees

// Default resampling is nearest neighbor
var image1 = imageCollection.resample()
  .reproject({
    crs: crs, // Add your desired CRS here
    scale: res // Add your desired scale here
  });

// Reclassify the land cover classes
var inList = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100, 
               111, 112, 113, 114, 115, 116, 
              121, 122, 123, 124, 125, 126, 200];
var outList = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];

var FAO_lu = image1.remap(inList, outList)
  .toDouble()
  .clip(region);

// print(FAO_lu)

// Convert 0 to NA
var mask = FAO_lu.neq(0);
print(mask)
FAO_lu = FAO_lu.updateMask(mask);

print(FAO_lu, "Mask")

var visParams = {
  bands: 'remapped',
  min: 0,
  max: 1,
  palette: ['green', 'yellow']
};

// Add the layer to the map
Map.addLayer(FAO_lu,visParams ,'Mask');

// Export the land cover image as a raster to Google Drive
Export.image.toDrive({
  image: FAO_lu,
  folder: 'GEE',
  description: 'mask',
  scale: res, // Add your desired scale here
  region: region,
  crs: crs, // Add your desired CRS here
  maxPixels: 1e13 // Add a maximum number of pixels 
  //for export if needed
});

Script 4: Modelling, validation and prediction using soil data with coordinates

#________________________________________________________________
#
# Quantile Regression Forest
# Soil Property Mapping
#
# GSP-Secretariat
# Contact: Isabel.Luotto@fao.org
#          Marcos.Angelini@fao.org
#________________________________________________________________

#Empty environment and cache 
rm(list = ls())
gc()

# Content of this script ========================================
# 0 - Set working directory, soil attribute, and packages
# 1 - Merge soil data with environmental covariates 
# 2 - Covariate selection
# 3 - Model calibration
# 4 - Uncertainty assessment
# 5 - Prediction
# 6 - Export final maps
#________________________________________________________________


# 0 - Set working directory, soil attribute, and packages =======

# Working directory
wd <- 'C:/User/Documents/GitHub/GSNmap-TM/Digital-Soil-Mapping'
setwd(wd)

# Define country of interes throuhg 3-digit ISO code
ISO ='ISO'

# Load Area of interest (shp)
AOI <- '01-Data/AOI.shp'

# Terget soil attribute (Mandatory 10)
target_properties<- c("ph_0_30", "k_0_30" ,
                      "soc_0_30" ,"bd_0_30", "cec_0_30","p_0_30",   
                      "n_0_30","clay_0_30", "sand_0_30" 
                      ,"silt_0_30")

# Function for Uncertainty Assessment
load(file = "03-Scripts/eval.RData")

#load packages
library(tidyverse)
library(data.table)
library(caret)
library(quantregForest)
library(terra)
library(sf)
library(doParallel)


# 1 - Merge soil data with environmental covariates =============

## 1.1 - Load covariates ----------------------------------------
files <- list.files(path= '01-Data/covs/', pattern = '.tif$', 
                    full.names = T)
ncovs <- list.files(path= '01-Data/covs/', pattern = '.tif$',
                    full.names = F)
#In case of extent error, or if covariates other 
#than the default ones are added
# ref <- rast(files[1])
# covs <- list()
# for (i in seq_along(files)) {
#   r <- rast(files[i])
#   r <- project(r, ref)
#   covs[[i]] <- r
# }
# covs <- rast(covs)

covs<- rast(files)
ncovs <-  filename <- sub('.tif', '', ncovs)

ncovs[ncovs=="dtm_neg-openness_250m"] = 'dtm_neg'
ncovs[ncovs=="dtm_pos-openness_250m"] = 'dtm_pos'
names(covs) <- ncovs


## 1.2 - Load the soil data (Script 2) -------------------------
dat <- read_csv("02-Outputs/harmonized_soil_data.csv")

# Convert soil data into a spatial object 
#(check https://epsg.io/6204)
dat <- vect(dat, geom=c("x", "y"), crs = crs(covs))

# Reproject point coordinates to match coordinate 
#system of covariates
dat <- terra::project(dat, covs)
names(dat)

## 1.3 - Extract values from covariates to the soil points ------
pv <- terra::extract(x = covs, y = dat, xy=F)
dat <- cbind(dat,pv)
dat <- as.data.frame(dat)

summary(dat)



for(soilatt in unique(target_properties)){


## 1.4 - Target soil attribute + covariates ---------------------
d <- dplyr::select(dat, soilatt, names(covs))
d <- na.omit(d)

# 2 - Covariate selection with RFE ==============================
## 2.1 - Setting parameters -------------------------------------
# Repeatedcv = 3-times repeated 10-fold cross-validation
fitControl <- rfeControl(functions = rfFuncs,
                         method = "repeatedcv",
                         number = 10,         ## 10 -fold CV
                         repeats = 3,        ## repeated 3 times
                         verbose = TRUE,
                         saveDetails = TRUE, 
                         returnResamp = "all")

# Set the regression function
fm = as.formula(paste(soilatt," ~", paste0(ncovs,
                                             collapse = "+")))

# Calibrate the model using multiple cores
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)


## 2.2 - Calibrate a RFE model to select covariates -------------
covsel <- rfe(fm,
              data = d,  
              sizes = seq(from=10, to=length(ncovs)-1, by = 5),
              rfeControl = fitControl,
              verbose = TRUE,
              keep.inbag = T)
stopCluster(cl)
saveRDS(covsel, "02-Outputs/models/covsel.rda")

## 2.3 - Plot selection of covariates ---------------------------
trellis.par.set(caretTheme())
plot(covsel, type = c("g", "o"))

# Extract selection of covariates and subset covs
opt_covs <- predictors(covsel)

# 3 - QRF Model calibration =====================================
## 3.1 - Update formula with the selected covariates ------------
fm <- as.formula(paste(soilatt," ~",
                       paste0(opt_covs, collapse = "+")))

# parallel processing
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)

## 3.2 - Set training parameters --------------------------------
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,         ## 10 -fold CV
                           repeats = 3,        ## repeated 3 times
                           savePredictions = TRUE)

# Tune mtry hyperparameters
mtry <- round(length(opt_covs)/3)
tuneGrid <-  expand.grid(mtry = c(mtry-5, mtry, mtry+5))

## 3.3 - Calibrate the QRF model --------------------------------
model <- caret::train(fm,
                      data = d,
                      method = "qrf",
                      trControl = fitControl,
                      verbose = TRUE,
                      tuneGrid = tuneGrid,
                      keep.inbag = T,
                      importance = TRUE)
stopCluster(cl)
gc()


## 3.4 - Extract predictor importance as relative values (%)
x <- randomForest::importance(model$finalModel)
model$importance <- x
## 3.5 - Print and save model -----------------------------------
print(model)
saveRDS(model, file = paste0("02-Outputs/models/model_",
                             soilatt,".rds"))

# 4 - Uncertainty assessment ====================================
# extract observed and predicted values
o <- model$pred$obs
p <- model$pred$pred
df <- data.frame(o,p)

## 4.1 - Plot and save scatterplot ------------------------------
(g1 <- ggplot(df, aes(x = o, y = p)) + 
  geom_point(alpha = 0.1) + 
   geom_abline(slope = 1, intercept = 0, color = "red")+
  ylim(c(min(o), max(o))) + theme(aspect.ratio=1)+ 
  labs(title = soilatt) + 
  xlab("Observed") + ylab("Predicted"))
# ggsave(g1, filename = paste0("02-Outputs/residuals_",
soilatt,".png"), scale = 1, 
#        units = "cm", width = 12, height = 12)

## 4.2 - Print accuracy coeficients -----------------------------
# https://github.com/AlexandreWadoux/MapQualityEvaluation
eval(p,o)

## 4.3 - Plot Covariate importance ------------------------------
(g2 <- varImpPlot(model$finalModel, main = soilatt, type = 1))

# png(filename = paste0("02-Outputs/importance_",soilatt,".png"), 
#     width = 15, height = 15, units = "cm", res = 600)
# g2
# dev.off()

# 5 - Prediction ================================================
# Generation of maps (prediction of soil attributes) 
## 5.1 - Produce tiles ------------------------------------------
r <-covs[[1]]
t <- rast(nrows = 5, ncols = 5, extent = ext(r), crs = crs(r))
tile <- makeTiles(r, t,overwrite=TRUE,filename=
                    "02-Outputs/tiles/tiles.tif")

## 5.2 - Predict soil attributes per tiles ----------------------
# loop to predict on each tile

for (j in seq_along(tile)) {
  gc()
  t <- rast(tile[j])
  covst <- crop(covs, t)
  
  
  # plot(r)# 
  pred_mean <- terra::predict(covst, model = 
                                model$finalModel, na.rm=TRUE,  
                              cpkgs="quantregForest", what=mean)
  pred_sd <- terra::predict(covst, model = 
                              model$finalModel, na.rm=TRUE,  
                            cpkgs="quantregForest", what=sd)  
  
  
  
  # ###### Raster package solution (in case terra results 
  # in many NA pixels)
  # library(raster)
  # covst <- stack(covst)
  # class(final_mod$finalModel) <-"quantregForest"
  # # Estimate model uncertainty
  # pred_sd <- predict(covst,model=final_mod$finalModel,type=sd)
  # # OCSKGMlog prediction based in all available data
  # pred_mean <- predict(covst,model=final_mod)
  # 
  # 
  # ##################################  
  
  writeRaster(pred_mean, 
            filename = paste0("02-Outputs/tiles/soilatt_tiles/",
                                soilatt,"_tile_", j, ".tif"), 
              overwrite = TRUE)
  writeRaster(pred_sd, 
            filename = paste0("02-Outputs/tiles/soilatt_tiles/",
                                soilatt,"_tileSD_", j, ".tif"), 
              overwrite = TRUE)
  
  rm(pred_mean)
  rm(pred_sd)
  
  
  print(paste("tile",tile[j]))
}

## 5.3 - Merge tiles both prediction and st.Dev -----------------
f_mean <- list.files(path = "02-Outputs/tiles/soilatt_tiles/", 
                     pattern = paste0(soilatt,"_tile_"), 
                     full.names = TRUE)
f_sd <- list.files(path = "02-Outputs/tiles/soilatt_tiles/", 
                   pattern =  paste0(soilatt,"_tileSD_"), 
                   full.names = TRUE)
r_mean_l <- list()
r_sd_l <- list()

for (g in 1:length(f_mean)){
  r <- rast(f_mean[g])
  r_mean_l[g] <-r
  rm(r)
}

for (g in 1:length(f_sd)){
  
  r <- rast(f_sd[g])
  r_sd_l[g] <-r
  rm(r)
}
r_mean <-sprc(r_mean_l)
r_sd <-sprc(r_sd_l)
pred_mean <- mosaic(r_mean)
pred_sd <- mosaic(r_sd)

aoi <- vect(AOI)
pred_mean <- mask(pred_mean,aoi)
pred_sd <- mask(pred_sd,aoi)


plot(pred_mean)
plot(pred_sd)


# 6 - Export final maps =========================================
## 6.1 - Mask croplands -----------------------------------------
msk <- rast("01-Data/mask.tif")
plot(msk)
pred_mean <- mask(pred_mean, msk)
plot(pred_mean)
pred_sd <- mask(pred_sd, msk)
plot(pred_sd)
plot(pred_sd/pred_mean*100, main = paste("CV",soilatt))

## 6.2 - Save results -------------------------------------------

# Harmonized naming 
if (soilatt == 'ph_0_30'){
  name <-'_GSNmap_pH_Map030.tiff'
}else if (soilatt == 'k_0_30'){
  name <-'_GSNmap_Ktot_Map030.tiff'
}else if (soilatt == 'soc_0_30'){
  name <-'_GSNmap_SOC_Map030.tiff'
}else if (soilatt == 'clay_0_30'){
  name <-'_GSNmap_Clay_Map030.tiff'
}else if (soilatt == 'bd_0_30'){
  name <-'_GSNmap_BD_Map030.tiff'
}else if (soilatt == 'cec_0_30'){
  name <-'_GSNmap_CEC_Map030.tiff'
}else if (soilatt == 'p_0_30'){
  name <-'_GSNmap_Pav_Map030.tiff'
}else if (soilatt == 'n_0_30'){
  name <-'_GSNmap_Ntot_Map030.tiff'
}else if (soilatt == 'sand_0_30'){
  name <-'_GSNmap_Sand_Map030.tiff'
}else if (soilatt == 'silt_0_30'){
  name <-'_GSNmap_Silt_Map030.tiff'
}

writeRaster(pred_mean, 
            paste0("02-Outputs/maps/",ISO,name),
            overwrite=TRUE)
writeRaster(pred_sd, 
            paste0("02-Outputs/maps/",ISO, '_SD',name),
            overwrite=TRUE)



}