Annex I: Compendium of R scripts
This chapter contains the complete set of R scripts used to run the process of mapping soil nutrients and associated soil properties.
Script 0: Introduction to R
# Introduction to R
# 0. Playground =================================================
# learn important keyboard shortcuts:
# Ctrl + Enter to run the current line or selection
# Tab after typing the first characters of a function
# name (autocompletion)
# F1 to access the help
# explore the use of <-, $, [], ==, !=, c(), :,
# data.frame(), list(), as.factor()
a <- 10:15
a[2]
a[2:3]
b <- c("1", "a", a )
length(b)
df <- data.frame(column_a = 1:8, column_b = b)
df[,1]
df$column_b
as.numeric(df$column_b)
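# (added note) values that cannot be parsed as numbers ("a") become NA,
# with a "NAs introduced by coercion" warning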
plot(df)
df[1:3,]
df[,1]
as.factor(b)
d <- list(a, b, df)
d
names(d)
names(d) <- c("numeric_vector", "character_vector", "dataframe")
d
d[[1]]
d$numeric_vector
a == b
a != b
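# (added note) a (length 6) and b (length 8) differ in length, so a == b
# recycles with a warning; element-wise comparison is only well defined
# for equal lengths. To test membership regardless of length, use %in%:
a %in% b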
# 1. Set working directory ======================================
setwd("C:/GIT/Digital-Soil-Mapping/")
# 2. Install and load packages ==================================
# install and load the readxl, tidyverse, and data.table packages
install.packages("tidyverse")
install.packages("readxl")
install.packages("data.table")
library(tidyverse)
library(readxl)
library(data.table)
# 3. Import a spreadsheet =======================================
## 3.1 Read the MS Excel file -----------------------------------
# Read the soil_data.xlsx file, sheet 2, using read_excel()
read_excel(path = "01-Data/soil_data.xlsx", sheet = 2)
## 3.2 Read the csv file with the native function ---------------
# 01-Data/soil_profile_data.csv
read.csv("01-Data/soil_profile_data.csv")
## 3.3 Read the csv file with the tidyverse function ------------
read_csv("01-Data/soil_profile_data.csv")
## 3.4 Read the csv file with the data.table function -----------
fread("01-Data/soil_profile_data.csv")
## 3.5 Assign the dataframe to an object called dat -------------
dat <- read_csv("01-Data/soil_profile_data.csv")
# 4. Tidyverse functions ========================================
## 4.1 Select id_prof, id_hor, top, bottom, ph_h2o, cec from dat
dat_1 <- dat %>%
select(id_prof, id_hor, top, bottom, ph_h2o, cec)
## 4.2 Filter: pick observations by their values ----------------
# filter observations with cec > 30 cmolc/kg
dat_2 <- dat_1 %>%
filter(cec > 30)
dat_2
## 4.3 Mutate: create a new variable ----------------------------
# thickness = bottom - top
dat_3 <- dat_2 %>%
mutate(thickness = bottom - top)
## 4.4 Group_by and summarise -----------------------------------
# group by variable id_prof
# summarise taking the mean of pH and cec
dat_4 <- dat_3 %>%
group_by(id_prof) %>%
summarise(mean_ph = mean(ph_h2o),
mean_cec = mean(cec))
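# (added note) if ph_h2o or cec contained NAs, mean() would return NA;
# in that case add na.rm = TRUE, e.g. mean(ph_h2o, na.rm = TRUE)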
## 4.5 Reshape the table using pivot_longer ---------------------
# use dat_3
# put the names of the variables ph_h2o,
# cec and thickness in the column
# variable and keep the rest of the table. Save in dat_5
dat_5 <- dat_3 %>%
pivot_longer(ph_h2o:thickness, names_to = "soil_property",
values_to = "value")
## 4.6 Join the table soil_phys_data030.csv with dat_3 ---------
# Load soil_phys_data030.csv (in 01-Data folder)
# Join its columns with dat_3 keeping all the rows of dat_3
# save the result as dat_6
phys <- read_csv("01-Data/soil_phys_data030.csv")
phys <- phys %>% rename(id_prof = "ProfID")
dat_6 <- dat_3 %>%
left_join(phys)
# or
dat_6 <- phys %>%
right_join(dat_3)
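# (added note) without a by = argument, left_join()/right_join() match
# on all shared column names; here the key column is id_prof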
# 5. Data visualization with ggplot2 ============================
## 5.1 1D plot: histograms --------------------------------------
# histograms of cec and ph_h2o
ggplot(dat_3, aes(x=cec)) + geom_histogram()
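# (added) the exercise also asks for the ph_h2o histogram:
ggplot(dat_3, aes(x = ph_h2o)) + geom_histogram()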
## 5.2 2D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o
ggplot(dat_3, aes(x = bottom, y = ph_h2o)) +
geom_point()
# add a fitting line
ggplot(dat_3, aes(x = bottom, y = ph_h2o)) +
geom_point() +
geom_smooth(method = "lm" )
## 5.3 3D plot: scatterplot -------------------------------------
# Scatterplot bottom vs. ph_h2o; map cec to color and size
# inside the function aes()
ggplot(dat_3,
aes(x = bottom, y = ph_h2o, color = cec, size = cec)) +
geom_point()
# 6. Geospatial data with terra =================================
## Load packages (install them if needed)
library(terra)
## 6.1 Load a raster and a vector layer -------------------------
# Load 01-Data/Macedonia/grass.tif using the rast() function,
# then plot it
# Load 01-Data/Macedonia/SoilTypes.shp using the vect() function
# and plot it
# explore the attributes of these layers
r <- rast("01-Data/Macedonia/grass.tif")
plot(r)
v <- vect("01-Data/Macedonia/SoilTypes.shp")
plot(v)
## 6.2 Reproject a raster and a vector layer -------------------
# Check the current CRS (EPSG) of the raster and the vector.
# Find a *projected* CRS in http://epsg.io for Macedonia and
# copy the number
# Check the Arguments of function project (?project) that need to
# be defined
# Save the new object as r_proj and v_proj
# plot both objects
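# (added sketch) inspect the current CRS before projecting;
# describe = TRUE reports the authority and EPSG code
crs(r, describe = TRUE)
crs(v, describe = TRUE)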
r_proj <- project(x = r, y = "epsg:6204", method = "bilinear",
res = 250)
plot(r_proj)
v_proj <- project(x = v, y = "epsg:6204")
plot(v_proj, add = TRUE)
## 6.3 Cropping and masking a raster ----------------------------
# Compute the area of the polygons in v_proj
# (search for a function) and
# assign the values to a new column named area
# select the largest polygon using [], $, == and max() func.
# and save it as pol
# crop the raster with pol using the crop() function and save
#it as r_pol
# mask the raster r_pol with the polygon pol and save it
# with the same name
# plot each result
v_proj$area <- expanse(v_proj, unit = "ha")
pol <- v_proj[v_proj$area == max(v_proj$area)]
plot(pol)
r_pol <- crop(r_proj, pol)
plot(r_pol)
plot(pol, add = TRUE)
r_pol <- mask(r_pol, pol)
plot(r_pol)
## 6.4 Replace values in a raster by filtering their cells ------
# Explore the following link to understand how terra
# manages cell values
# https://rspatial.org/terra/pkg/4-algebra.html
# Replace values lower than 5 in r_pol by 0
r_pol[r_pol$grass < 5] <- 0
plot(r_pol)
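# (added alternative, assumed equivalent) the same replacement with
# classify(); right = FALSE makes the interval [-Inf, 5), i.e. values < 5
# r_pol <- classify(r_pol, rcl = cbind(-Inf, 5, 0), right = FALSE)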
## 6.5 Rasterize a vector layer ---------------------------------
# Use rasterize() function to convert v_proj to raster
# Use r_proj as reference raster
# Use field Symbol to assign cell values, and plot the new map
v_class <- rasterize(x = v_proj, y = r_proj, field = "Symbol" )
plot(v_class)
v_class
activeCat(v_class) <- 1
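# (added note) activeCat() selects which attribute of a categorical
# raster is used for plotting and extraction; plot again to see it
plot(v_class)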
## 6.6 Extracting raster values using points --------------------
# Convert dat_6 to spatial points using the vect() function
# (check help of vect())
# Note that the EPSG number is 6204
# Save the points as s
# Plot s and r_proj together in the same map (Argument add=TRUE)
# Extract the values of the raster using extract()
# function (check the help)
# Remove the ID column of the extracted values
# merge the extracted data with s using cbind() function
# Convert s as a dataframe
s <- vect(dat_6, geom=c("x", "y"), crs = "epsg:6204")
plot(r_proj)
plot(s, add=TRUE)
x <- extract(r_proj,s, ID=FALSE)
s <- cbind(s,x)
d <- as.data.frame(s)
d
#GGally::ggscatmat(d)
## 6.7 Zonal statistics using polygons and rasters --------------
# Use the extract() func. to estimate the mean value of
# r_proj at each polygon
# Use the fun= argument (check the help)
# Use the cbind() func. to merge v_proj and the extracted values
# convert v_proj to a dataframe
# Create a ggplot boxplot (geom_boxplot) with x=Symbol
# and y=grass
x <- extract(r_proj, v_proj, fun = mean, ID=FALSE)
v_proj <- cbind(v_proj, x)
d <- as_tibble(v_proj)
d %>%
ggplot(aes(x =Symbol, y = grass, fill = Symbol)) +
geom_boxplot() +
ylab("Grass probability")
## 6.8 Bonus track: use zonal() function ------------------------
zonal(r_proj, v_class, na.rm = TRUE)
Script 2: Data preparation
#
# Digital Soil Mapping
# Soil Profile Data
# Cleaning and Processing
#
# GSP-Secretariat
# Contact: Isabel.Luotto@fao.org
# Marcos.Angelini@fao.org
#________________________________________________________________
#Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# The goal of this script is to organise the soil data for
# mapping, including:
#
# 0 - User-defined variables
# 1 - Set working directory and load necessary packages
# 2 - Import national data
# 3 - Select useful columns
# 4 - Quality check
# 5 - Estimate BD using pedotransfer function
# 6 - Harmonize soil layers
# 7 - Add chemical properties from additional dataset
# 8 - Plot and save results
#________________________________________________________________
# 0 - User-defined variables ====================================
wd <- "C:/GIT/GSNmap-TM/Digital-Soil-Mapping"
#wd <- 'C:/Users/hp/Documents/GitHub/GSNmap-TM/Digital-Soil-Mapping'
# 1 - Set working directory and load necessary packages =========
setwd(wd) # change the path accordingly
#(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
library(tidyverse) # for data management and reshaping
library(readxl) # for importing excel files
library(mapview) # for seeing the profiles in a map
library(sf) # to manage spatial data (shp vectors)
library(aqp) # for soil profile data
library(mpspline2) # for horizon harmonization
library(plotly) # interactive plots
# 2 - Import national data ======================================
# Save your national soil dataset in the data folder
# /01-Data as a .csv file or
# as a .xlsx file
## 2.1 - for .xlsx files ----------------------------------------
# Import horizon data
# hor <- read_excel("01-Data/soil_data.xlsx", sheet = 2)
# # Import site-level data
# site <- read_excel("01-Data/soil_data.xlsx", sheet = 1)
# chem <- read_excel("01-Data/soil_data.xlsx", sheet = 2)
# phys <- read_excel("01-Data/soil_data.xlsx", sheet = 3)
## 2.2 - for .csv files -----------------------------------------
# Import horizon data
hor <-
read_csv(file =
"01-Data/soil_profile_data.csv",show_col_types = FALSE)
chem <-
read_csv(file =
"01-Data/soil_chem_data030.csv",show_col_types = FALSE)
phys <- read_csv(file =
"01-Data/soil_phys_data030.csv",show_col_types = FALSE)
site <- select(hor, id_prof, x, y) %>% unique()
hor <- select(hor, id_prof, id_hor, top:cec)
# change names of key columns
names(site)
names(site)[1] <- "ProfID"
names(hor)
names(hor)[1] <- "ProfID"
names(hor)[2] <- "HorID"
# scan the data
summary(site)
summary(hor)
# 3 - select useful columns =====================================
## 3.1 - select AND RENAME columns ------------------------------
hor <- select(hor, ProfID, HorID, top, bottom, ph=ph_h2o,
k, soc, bd, cec)
# 4 - Quality check =============================================
## 4.1 - Check locations ----------------------------------------
# Macedonian State Coordinate System EPSG:6204
# https://epsg.io/6204
site %>%
st_as_sf(coords = c("x", "y"), crs = 4326) %>% # convert to
#spatial object
mapview(zcol = "ProfID", cex = 3, lwd = 0.1) # visualise in an
#interactive map
site_sub <- site[site$ProfID=='2823',]
site_sub %>%
st_as_sf(coords = c("x", "y"), crs = 4326) %>% # convert to
#spatial object
mapview(zcol = "ProfID", cex = 3, lwd = 0.1) # visualise in an
#interactive map
# profile 2823 is wrongly located, so let's remove it
site <- filter(site, ProfID != 2823)
## 4.2 - Convert data into a Soil Profile Collection ------------
aqp::depths(hor) <- ProfID ~ top + bottom
hor@site$ProfID <- as.numeric(hor@site$ProfID)
site(hor) <- left_join(site(hor), site)
profiles <- hor
profiles
## 4.3 - plot first 20 profiles using pH as color ---------------
plotSPC(x = profiles[1:20], name = "pH", color = "ph",
name.style = "center-center")
## 4.4 - check data integrity -----------------------------------
# A valid profile is TRUE if all of the
#following criteria are false:
# + depthLogic : boolean, errors related to depth logic
# + sameDepth : boolean, errors related to same top/bottom
#depths
# + missingDepth : boolean, NA in top / bottom depths
# + overlapOrGap : boolean, gaps or overlap in adjacent
#horizons
aqp::checkHzDepthLogic(profiles)
# Identify non-valid profiles
dl <- checkHzDepthLogic(profiles)
dl[dl$depthLogic==T | dl$sameDepth==T | dl$missingDepth==T |
dl$overlapOrGap==T,"ProfID"]
# visualize some of these profiles by the pid
subset(profiles, grepl(6566, ProfID, ignore.case = TRUE))
subset(profiles, grepl(7410 , ProfID, ignore.case = TRUE))
subset(profiles, grepl(8002, ProfID, ignore.case = TRUE))
## 4.5 - keep only valid profiles -------------------------------
clean_prof <- HzDepthLogicSubset(profiles)
metadata(clean_prof)$removed.profiles
# write_rds(clean_prof, "01-Data/soilProfileCollection.rds")
## 4.6 convert soilProfileCollection to a table -----------------
dat <- left_join(clean_prof@site, clean_prof@horizons)
dat <- select(dat, ProfID, HorID, x, y, top, bottom, ph:cec )
# 5 - Estimate BD using pedotransfer functions ==================
# create the function with all PTF
method_names <- c("Saini1996", "Drew1973", "Jeffrey1979",
"Grigal1989",
"Adams1973", "Honeyset_Ratkowsky1989")
estimateBD <- function(SOC=NULL, method=NULL) {
OM <- SOC * 1.724
BD <- switch(method,
"Saini1996" = 1.62 - 0.06 * OM,
"Drew1973" = 1 / (0.6268 + 0.0361 * OM),
"Jeffrey1979" = 1.482 - 0.6786 * (log(OM)),
"Grigal1989" = 0.669 + 0.941 * exp(1)^(-0.06 * OM),
"Adams1973" = 100 / (OM / 0.244 + (100 - OM) / 2.65),
"Honeyset_Ratkowsky1989" = 1 / (0.564 + 0.0556 * OM),
stop("Invalid method specified.")
)
return(BD)
}
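# (added example) quick sanity check of the PTFs with a single SOC value (%)
estimateBD(SOC = 1.5, method = "Honeyset_Ratkowsky1989")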
# 5.1 - Select a pedotransfer function --------------------------
# Create a test dataset with BD and SOC data
BD_test <- data.frame(SOC = dat$soc, BD_observed = dat$bd)
# Remove rows with missing values
BD_test <- BD_test[complete.cases(BD_test), ]
# 5.2 - Estimate BD for a subset using the
# pedotransfer functions ------------
for (i in method_names) {
BD_test[[i]] <- estimateBD(BD_test$SOC, method = i)
}
# Print the resulting data frame
BD_test
## 5.3 Compare results ------------------------------------------
# Observed values:
summary(BD_test)
# Compare data distributions for observed and predicted BD
plot.bd <- BD_test %>%
select(-SOC) %>%
pivot_longer(cols = c("BD_observed", "Saini1996", "Drew1973",
"Jeffrey1979",
"Grigal1989", "Adams1973",
"Honeyset_Ratkowsky1989"),
names_to = "Method", values_to = "BD") %>%
ggplot(aes(x = BD, color = Method)) +
geom_density()
plot.bd
# Dynamic plot with plotly
ggplotly(plot.bd)
ggplotly(plot.bd) %>%
layout(hovermode = "x")
# Plot the Selected function again
BD_test %>%
select(-SOC) %>%
pivot_longer(cols = c("BD_observed", "Honeyset_Ratkowsky1989"),
names_to = "Method", values_to = "BD") %>%
ggplot(aes(x = BD, color = Method)) +
geom_density() + xlim(c(0,2.5))
# Same dynamic plot
ggplotly(BD_test %>%
select(-SOC) %>%
pivot_longer(cols = c("BD_observed",
"Honeyset_Ratkowsky1989"),
names_to = "Method", values_to = "BD") %>%
ggplot(aes(x = BD, color = Method)) +
geom_density() + xlim(c(0,2.5))) %>%
layout(hovermode = "x")
## 5.4 Estimate BD for the missing horizons ---------------------
dat$bd[is.na(dat$bd)] <-
estimateBD(dat[is.na(dat$bd),]$soc,
method="Honeyset_Ratkowsky1989")
# Explore the results
summary(dat$bd)
g <- BD_test %>%
select(-SOC) %>%
pivot_longer(cols = c("BD_observed"),
names_to = "Method", values_to = "BD") %>%
ggplot(aes(x = BD, color = Method)) +
geom_density() +
xlim(c(0,2.5))
g + geom_density(data = dat, aes(x=bd,
color = "Predicted +\n observed"))
## 5.5 - Explore outliers ---------------------------------------
# Outliers should be carefully explored and compared with
# literature values.
# They should be removed as errors only when they clearly
# represent impossible or highly unlikely values.
#
# Carbon content higher than 15% is typical only of organic
# soils (histosols)
# We will remove profiles with atypically high SOC (> 10%)
# as outliers
summary(dat$soc)
# Identify profiles with SOC > 10%, dropping NAs
high_soc <- na.omit(dat$ProfID[dat$soc > 10])
high_soc
# Remove the flagged profiles
dat <- dat[dat$ProfID != 6915, ]
dat <- dat[dat$ProfID != 7726, ]
dat <- dat[!(dat$ProfID %in% high_soc), ]
# Explore bulk density data, identify outliers
# remove layers with Bulk Density < 1 g/cm^3
low_bd_profiles <- na.omit(dat$ProfID[dat$bd<1])
dat <- dat[!(dat$ProfID %in% low_bd_profiles),]
# Explore data, identify outliers
x <- pivot_longer(dat, cols = ph:cec, values_to = "value",
names_to = "soil_property")
x <- na.omit(x)
ggplot(x, aes(x = soil_property, y = value,
fill = soil_property)) +
geom_boxplot() +
facet_wrap(~soil_property, scales = "free")
# 6 - Harmonize soil layers =====================================
source("03-Scripts/.spline_functions.R")
## 6.1 - Set target soil properties and depths ------------------
names(dat)
dat <- select(dat, ProfID, HorID, x, y, top, bottom,
ph, k, soc, bd, cec)
target <- c("ph", "k", "soc", "bd", "cec")
depths <- c(0,30)
## 6.2 - Create standard layers ---------------------------------
splines <- apply_mpspline_all(df = dat, properties = target,
depth_range = depths)
summary(splines)
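# (added note) apply_mpspline_all() is a wrapper sourced from
# .spline_functions.R; a single property could also be splined directly
# with mpspline2, e.g. (assumed column order: id, top, bottom, value):
# mpspline2::mpspline(obj = dat[, c("ProfID", "top", "bottom", "ph")],
#                     var_name = "ph", d = depths)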
# merge splines with x and y
d <- unique(select(dat, ProfID, x, y))
d <- left_join(d, splines)
# 7 - Harmonize units ===========================================
# Harmonize units if different from target units
# Mandatory soil properties and corresponding units:
# Total N - ppm
# Available P - ppm
# Available K - ppm
# Cation exchange capacity - cmolc/kg
# pH - unitless
# SOC - %
# Bulk density - g/cm3
# Soil fractions (clay, silt and sand) - %
# Units of the soil profile data (dataframe d):
head(d) # pH; K cmolc/kg; SOC %; BD g/cm3; CEC cmolc/kg
# K => convert cmolc/kg to ppm:
# 1 cmolc/kg K = 0.01 mol/kg * 39.096 g/mol = 390.96 mg/kg (ppm)
d$k_0_30 <- d$k_0_30 * 10 * 39.096
head(chem)# P ppm; N %; K ppm
# N => convert % to ppm (N * 10000)
chem$tn <-chem$tn*10000
head(phys)# clay, sand, silt g/kg
# convert g/kg to % (/10)
phys$clay_0_30 <-phys$clay_0_30/10
phys$sand_0_30 <-phys$sand_0_30 /10
phys$silt_0_30 <-phys$silt_0_30/10
# Correct negative clay content
phys$clay_0_30[phys$clay_0_30<0] <- 0
# Add chemical and physical properties from additional datasets =
# Rename columns to match the main data set
names(d)
names(chem)[1] <- 'ProfID'
names(chem)[4] <- 'p_0_30'
names(chem)[5] <- 'k_0_30'
names(chem)[6] <- 'n_0_30'
# The chem dataframe comes from an independent dataset,
# so we need to create new unique ProfIDs
# Create unique ProfIDs
chem$ProfID <- seq(max(d$ProfID)+1,max(d$ProfID)+1+nrow(chem)-1)
# Add the new data as new rows using dplyr
# we can add empty rows
# automatically for the not measured
# properties in the chem dataset
d <- bind_rows(d, chem)
# The phys dataframe (texture data) shares the same ProfIDs,
# so we can join it directly
d <- left_join(d, phys, by=c('ProfID', 'x', 'y'))
# 8 - Plot and save results ====================================
names(d)
x <- pivot_longer(d, cols = ph_0_30:silt_0_30,
values_to = "value",
names_sep = "_",
names_to = c("soil_property",
"top", "bottom"))
x <- mutate(x, depth = paste(top, "-" , bottom))
#x <- na.omit(x)
ggplot(x, aes(x = depth, y = value, fill = soil_property)) +
geom_boxplot() +
facet_wrap(~soil_property, scales = "free")
ggplotly(ggplot(x, aes(x = depth, y = value,
fill = soil_property)) +
geom_boxplot() +
facet_wrap(~soil_property, scales = "free"))
# save the data
write_csv(d, "02-Outputs/harmonized_soil_data.csv")
Script 3: Download environmental covariates (Google Earth Engine)
var assets =
["projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio1",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio12",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio13",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio14",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio16",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio17",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio5",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/bio6",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/ngd10",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_min",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/pet_penman_range",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_max",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_mean",
"projects/digital-soil-mapping-gsp-fao/assets/CHELSA/sfcWind_range",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_030405_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_060708_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_091011_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/fpar_120102_500m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/lstd_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_030405_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_060708_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_091011_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndlst_120102_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_030405_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_060708_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_091011_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/ndvi_120102_250m_sd",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/snow_cover",
"projects/digital-soil-mapping-gsp-fao/assets/MODIS/swir_060708_500m_mean",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/crops",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/flooded_vegetation",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/grass",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/shrub_and_scrub",
"projects/digital-soil-mapping-gsp-fao/assets/LANDCOVER/trees",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_curvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_downslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm2_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_dvm_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_elevation_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_mrn_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_neg_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_pos_openness_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_slope_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_tpi_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_twi_500m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_upslopecurvature_250m",
"projects/digital-soil-mapping-gsp-fao/assets/OPENLANDMAP/dtm_vbf_250m",
"projects/digital-soil-mapping-gsp-fao/assets/POP/hfp2013_merisINT",
"projects/digital-soil-mapping-gsp-fao/assets/POP/night_lights_stable_2013",
"projects/digital-soil-mapping-gsp-fao/assets/POP/population_density_2020"];
// Load borders
/// Using UN 2020 (replace the countries to download)
var ISO = ['ITA'];
var aoi =
ee.FeatureCollection(
'projects/digital-soil-mapping-gsp-fao/assets/UN_BORDERS/BNDA_CTY'
)
.filter(ee.Filter.inList('ISO3CD', ISO));
var region = aoi.geometry();
/// Using a shapefile
/// 1. Upload the borders of your countries as an asset
/// 2. Replace 'your_shapefile' with the path to your shapefile
// var shapefile =
//ee.FeatureCollection('users/your_username/your_shapefile');
// var region = shapefile.geometry().bounds();
// Load assets as ImageCollection
var assetsCollection = ee.ImageCollection(assets);
// Clip each image in the collection to the region of interest
var clippedCollection = assetsCollection.map(function(img){
return img.clip(region).toFloat();
});
// Function to replace masked values with zeroes for fpar bands
function replaceMaskedFpar(img) {
var allBands = img.bandNames();
var fparBands =
allBands.filter(ee.Filter.stringStartsWith('item', 'fpar'));
var nonFparBands = allBands.removeAll(fparBands);
var fparImg = img.select(fparBands).unmask(0);
var nonFparImg = img.select(nonFparBands);
// If there are no fpar bands, return the original image
var result = ee.Algorithms.If(fparBands.length().eq(0),
img,
nonFparImg.addBands(fparImg));
return ee.Image(result);
}
// Clip each image in the collection to the region of
//interest and replace masked values for fpar bands
var clippedCollection = assetsCollection.map(function(img){
var clippedImg = img.clip(region).toFloat();
return replaceMaskedFpar(clippedImg);
});
// Stack the layers and maintain the
//layer names in the final file
var stacked = clippedCollection.toBands();
// Get the list of asset names
var assetNames = ee.List(assets).map(function(asset) {
return ee.String(asset).split('/').get(-1);
});
// Rename the bands with asset names
var renamed = stacked.rename(assetNames);
print(renamed, 'Covariates to be exported')
// Visualize the result
// Set a visualization parameter
// (you can adjust the colors as desired)
var visParams = {
bands: 'bio1',
min: 0,
max: 1,
palette: ['blue', 'green', 'yellow', 'red']
};
// Add the layer to the map
Map.centerObject(renamed, 6)
Map.addLayer(renamed, visParams, 'Covariates');
// Export the stacked image to Google Drive
Export.image.toDrive({
image: renamed,
description: 'covariates',
folder: 'GEE',
scale: 250,
maxPixels: 1e13,
region: region
});
/* Create mask for croplands ----------------------------*/
// Load the Copernicus Global Land Service image collection
var imageCollection =
ee.Image("COPERNICUS/Landcover/100m/Proba-V-C3/Global/2019")
.select("discrete_classification")
.clip(region)
var crs = 'EPSG:4326'; // WGS84
var res = 250; // Resolution in metres
// Default resampling is nearest neighbor
var image1 = imageCollection.resample()
.reproject({
crs: crs, // Add your desired CRS here
scale: res // Add your desired scale here
});
// Reclassify the land cover classes
var inList = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100,
111, 112, 113, 114, 115, 116,
121, 122, 123, 124, 125, 126, 200];
var outList = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
var FAO_lu = image1.remap(inList, outList)
.toDouble()
.clip(region);
// print(FAO_lu)
// Convert 0 to NA
var mask = FAO_lu.neq(0);
print(mask)
FAO_lu = FAO_lu.updateMask(mask);
print(FAO_lu, "Mask")
var visParams = {
bands: 'remapped',
min: 0,
max: 1,
palette: ['green', 'yellow']
};
// Add the layer to the map
Map.addLayer(FAO_lu,visParams ,'Mask');
// Export the land cover image as a raster to Google Drive
Export.image.toDrive({
image: FAO_lu,
folder: 'GEE',
description: 'mask',
scale: res, // Add your desired scale here
region: region,
crs: crs, // Add your desired CRS here
maxPixels: 1e13 // Add a maximum number of pixels
//for export if needed
});
Script 4: Modelling, validation and prediction using soil data with coordinates
#________________________________________________________________
#
# Quantile Regression Forest
# Soil Property Mapping
#
# GSP-Secretariat
# Contact: Isabel.Luotto@fao.org
# Marcos.Angelini@fao.org
#________________________________________________________________
#Empty environment and cache
rm(list = ls())
gc()
# Content of this script ========================================
# 0 - Set working directory, soil attribute, and packages
# 1 - Merge soil data with environmental covariates
# 2 - Covariate selection
# 3 - Model calibration
# 4 - Uncertainty assessment
# 5 - Prediction
# 6 - Export final maps
#________________________________________________________________
# 0 - Set working directory, soil attribute, and packages =======
# Working directory
wd <- 'C:/User/Documents/GitHub/GSNmap-TM/Digital-Soil-Mapping'
setwd(wd)
# Define the country of interest through its 3-letter ISO code
ISO ='ISO'
# Load Area of interest (shp)
AOI <- '01-Data/AOI.shp'
# Target soil attributes (the 10 mandatory ones)
target_properties<- c("ph_0_30", "k_0_30" ,
"soc_0_30" ,"bd_0_30", "cec_0_30","p_0_30",
"n_0_30","clay_0_30", "sand_0_30"
,"silt_0_30")
# Function for Uncertainty Assessment
load(file = "03-Scripts/eval.RData")
#load packages
library(tidyverse)
library(data.table)
library(caret)
library(quantregForest)
library(terra)
library(sf)
library(doParallel)
# 1 - Merge soil data with environmental covariates =============
## 1.1 - Load covariates ----------------------------------------
files <- list.files(path= '01-Data/covs/', pattern = '.tif$',
full.names = T)
ncovs <- list.files(path= '01-Data/covs/', pattern = '.tif$',
full.names = F)
# In case of an extent error, or if covariates other
# than the default ones are added:
# ref <- rast(files[1])
# covs <- list()
# for (i in seq_along(files)) {
# r <- rast(files[i])
# r <- project(r, ref)
# covs[[i]] <- r
# }
# covs <- rast(covs)
covs<- rast(files)
ncovs <- sub('.tif', '', ncovs)
ncovs[ncovs=="dtm_neg-openness_250m"] = 'dtm_neg'
ncovs[ncovs=="dtm_pos-openness_250m"] = 'dtm_pos'
names(covs) <- ncovs
## 1.2 - Load the soil data (Script 2) -------------------------
dat <- read_csv("02-Outputs/harmonized_soil_data.csv")
# Convert soil data into a spatial object
#(check https://epsg.io/6204)
dat <- vect(dat, geom=c("x", "y"), crs = crs(covs))
# Reproject point coordinates to match coordinate
#system of covariates
dat <- terra::project(dat, covs)
names(dat)
## 1.3 - Extract values from covariates to the soil points ------
pv <- terra::extract(x = covs, y = dat, xy=F)
dat <- cbind(dat,pv)
dat <- as.data.frame(dat)
summary(dat)
for(soilatt in unique(target_properties)){
## 1.4 - Target soil attribute + covariates ---------------------
d <- dplyr::select(dat, all_of(soilatt), all_of(names(covs)))
d <- na.omit(d)
# 2 - Covariate selection with RFE ==============================
## 2.1 - Setting parameters -------------------------------------
# Repeatedcv = 3-times repeated 10-fold cross-validation
fitControl <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
number = 10, ## 10 -fold CV
repeats = 3, ## repeated 3 times
verbose = TRUE,
saveDetails = TRUE,
returnResamp = "all")
# Set the regression function
fm = as.formula(paste(soilatt," ~", paste0(ncovs,
collapse = "+")))
# Calibrate the model using multiple cores
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
## 2.2 - Calibrate a RFE model to select covariates -------------
covsel <- rfe(fm,
data = d,
sizes = seq(from=10, to=length(ncovs)-1, by = 5),
rfeControl = fitControl,
verbose = TRUE,
keep.inbag = T)
stopCluster(cl)
saveRDS(covsel, "02-Outputs/models/covsel.rda")
## 2.3 - Plot selection of covariates ---------------------------
trellis.par.set(caretTheme())
plot(covsel, type = c("g", "o"))
# Extract selection of covariates and subset covs
opt_covs <- predictors(covsel)
# 3 - QRF Model calibration =====================================
## 3.1 - Update formula with the selected covariates ------------
fm <- as.formula(paste(soilatt," ~",
paste0(opt_covs, collapse = "+")))
# parallel processing
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
## 3.2 - Set training parameters --------------------------------
fitControl <- trainControl(method = "repeatedcv",
number = 10, ## 10 -fold CV
repeats = 3, ## repeated 3 times
savePredictions = TRUE)
# Tune mtry hyperparameters
mtry <- round(length(opt_covs)/3)
tuneGrid <- expand.grid(mtry = c(mtry-5, mtry, mtry+5))
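# (added guard, assuming few covariates may survive RFE) mtry must be >= 1:
# tuneGrid <- expand.grid(mtry = pmax(c(mtry - 5, mtry, mtry + 5), 1))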
## 3.3 - Calibrate the QRF model --------------------------------
model <- caret::train(fm,
data = d,
method = "qrf",
trControl = fitControl,
verbose = TRUE,
tuneGrid = tuneGrid,
keep.inbag = T,
importance = TRUE)
stopCluster(cl)
gc()
## 3.4 - Extract predictor importance as relative values (%)
x <- randomForest::importance(model$finalModel)
model$importance <- x
## 3.5 - Print and save model -----------------------------------
print(model)
saveRDS(model, file = paste0("02-Outputs/models/model_",
soilatt,".rds"))
# 4 - Uncertainty assessment ====================================
# extract observed and predicted values
o <- model$pred$obs
p <- model$pred$pred
df <- data.frame(o,p)
## 4.1 - Plot and save scatterplot ------------------------------
(g1 <- ggplot(df, aes(x = o, y = p)) +
geom_point(alpha = 0.1) +
geom_abline(slope = 1, intercept = 0, color = "red")+
ylim(c(min(o), max(o))) + theme(aspect.ratio=1)+
labs(title = soilatt) +
xlab("Observed") + ylab("Predicted"))
# ggsave(g1, filename = paste0("02-Outputs/residuals_",
#        soilatt, ".png"), scale = 1,
#        units = "cm", width = 12, height = 12)
## 4.2 - Print accuracy coefficients ----------------------------
# https://github.com/AlexandreWadoux/MapQualityEvaluation
# note: eval() is the custom function loaded from 03-Scripts/eval.RData
eval(p, o)
## 4.3 - Plot Covariate importance ------------------------------
(g2 <- varImpPlot(model$finalModel, main = soilatt, type = 1))
# png(filename = paste0("02-Outputs/importance_",soilatt,".png"),
# width = 15, height = 15, units = "cm", res = 600)
# g2
# dev.off()
# 5 - Prediction ================================================
# Generation of maps (prediction of soil attributes)
## 5.1 - Produce tiles ------------------------------------------
r <-covs[[1]]
t <- rast(nrows = 5, ncols = 5, extent = ext(r), crs = crs(r))
tile <- makeTiles(r, t,overwrite=TRUE,filename=
"02-Outputs/tiles/tiles.tif")
## 5.2 - Predict soil attributes per tiles ----------------------
# loop to predict on each tile
for (j in seq_along(tile)) {
gc()
t <- rast(tile[j])
covst <- crop(covs, t)
# plot(r)#
pred_mean <- terra::predict(covst, model =
model$finalModel, na.rm=TRUE,
cpkgs="quantregForest", what=mean)
pred_sd <- terra::predict(covst, model =
model$finalModel, na.rm=TRUE,
cpkgs="quantregForest", what=sd)
# ###### Raster package solution (in case terra results
# in many NA pixels)
# library(raster)
# covst <- stack(covst)
# class(final_mod$finalModel) <-"quantregForest"
# # Estimate model uncertainty
# pred_sd <- predict(covst,model=final_mod$finalModel,type=sd)
# # OCSKGMlog prediction based in all available data
# pred_mean <- predict(covst,model=final_mod)
#
#
# ##################################
writeRaster(pred_mean,
filename = paste0("02-Outputs/tiles/soilatt_tiles/",
soilatt,"_tile_", j, ".tif"),
overwrite = TRUE)
writeRaster(pred_sd,
filename = paste0("02-Outputs/tiles/soilatt_tiles/",
soilatt,"_tileSD_", j, ".tif"),
overwrite = TRUE)
rm(pred_mean)
rm(pred_sd)
print(paste("tile",tile[j]))
}
## 5.3 - Merge tiles both prediction and st.Dev -----------------
f_mean <- list.files(path = "02-Outputs/tiles/soilatt_tiles/",
pattern = paste0(soilatt,"_tile_"),
full.names = TRUE)
f_sd <- list.files(path = "02-Outputs/tiles/soilatt_tiles/",
pattern = paste0(soilatt,"_tileSD_"),
full.names = TRUE)
r_mean_l <- list()
r_sd_l <- list()
for (g in seq_along(f_mean)) {
  r <- rast(f_mean[g])
  r_mean_l[[g]] <- r
  rm(r)
}
for (g in seq_along(f_sd)) {
  r <- rast(f_sd[g])
  r_sd_l[[g]] <- r
  rm(r)
}
r_mean <-sprc(r_mean_l)
r_sd <-sprc(r_sd_l)
pred_mean <- mosaic(r_mean)
pred_sd <- mosaic(r_sd)
aoi <- vect(AOI)
pred_mean <- mask(pred_mean,aoi)
pred_sd <- mask(pred_sd,aoi)
plot(pred_mean)
plot(pred_sd)
# 6 - Export final maps =========================================
## 6.1 - Mask croplands -----------------------------------------
msk <- rast("01-Data/mask.tif")
plot(msk)
pred_mean <- mask(pred_mean, msk)
plot(pred_mean)
pred_sd <- mask(pred_sd, msk)
plot(pred_sd)
plot(pred_sd/pred_mean*100, main = paste("CV",soilatt))
## 6.2 - Save results -------------------------------------------
# Harmonized naming
if (soilatt == 'ph_0_30'){
name <-'_GSNmap_pH_Map030.tiff'
}else if (soilatt == 'k_0_30'){
name <-'_GSNmap_Ktot_Map030.tiff'
}else if (soilatt == 'soc_0_30'){
name <-'_GSNmap_SOC_Map030.tiff'
}else if (soilatt == 'clay_0_30'){
name <-'_GSNmap_Clay_Map030.tiff'
}else if (soilatt == 'bd_0_30'){
name <-'_GSNmap_BD_Map030.tiff'
}else if (soilatt == 'cec_0_30'){
name <-'_GSNmap_CEC_Map030.tiff'
}else if (soilatt == 'p_0_30'){
name <-'_GSNmap_Pav_Map030.tiff'
}else if (soilatt == 'n_0_30'){
name <-'_GSNmap_Ntot_Map030.tiff'
}else if (soilatt == 'sand_0_30'){
name <-'_GSNmap_Sand_Map030.tiff'
}else if (soilatt == 'silt_0_30'){
name <-'_GSNmap_Silt_Map030.tiff'
}
writeRaster(pred_mean,
paste0("02-Outputs/maps/",ISO,name),
overwrite=TRUE)
writeRaster(pred_sd,
paste0("02-Outputs/maps/",ISO, '_SD',name),
overwrite=TRUE)
}