diff --git a/pipelines/SDM/SDM_maxEnt.json b/pipelines/SDM/SDM_maxEnt.json
index 855b00fb..9340d945 100644
--- a/pipelines/SDM/SDM_maxEnt.json
+++ b/pipelines/SDM/SDM_maxEnt.json
@@ -366,7 +366,6 @@
       "data": {
         "type": "text[]",
         "value": [
-          "L",
           "LQ",
           "LQHP"
         ]
@@ -384,8 +383,7 @@
         "type": "float[]",
         "value": [
           0.5,
-          1,
-          2
+          1
         ]
       }
     },
@@ -763,7 +761,8 @@
         "random",
         "inclusion_buffer",
         "weighted_raster",
-        "unweighted_raster"
+        "unweighted_raster",
+        "thickening"
       ],
       "example": "random"
     },
@@ -866,14 +865,18 @@
       "type": "text[]",
       "example": [
         "chelsa-clim|bio1",
-        "chelsa-clim|bio2"
+        "chelsa-clim|bio7",
+        "chelsa-clim|bio12",
+        "earthenv_topography|elevation_median",
+        "earthenv_landcover|class_3",
+        "earthenv_landcover|class_12"
       ]
     },
     "data>loadFromStac.yml@119|spatial_res": {
       "description": "Integer, spatial resolution of the rasters",
       "label": "spatial resolution",
       "type": "float",
-      "example": 1000.0
+      "example": 5000.0
     },
     "data>loadFromStac.yml@119|mask": {
       "description": "Shapefile, used to mask the output rasters",
@@ -896,5 +899,17 @@
         1
       ]
     }
+  },
+  "metadata": {
+    "name": "Maxent SDM pipeline",
+    "description": "A pipeline to produce species distribution models using Maxent",
+    "author": [
+      {
+        "name": "Sarah Valentin",
+        "identifier": "https://orcid.org/0000-0002-9028-681X"
+      }
+    ],
+    "license": null,
+    "external_link": null
+  }
 }
\ No newline at end of file
diff --git a/scripts/SDM/runMaxent.R b/scripts/SDM/runMaxent.R
index aa24adbf..ef16bb20 100644
--- a/scripts/SDM/runMaxent.R
+++ b/scripts/SDM/runMaxent.R
@@ -31,7 +31,7 @@ print("Inputs : ")
 print(input)
 
-presence_background <- read.table(file = input$presence_background, sep = '\t', header = TRUE)
+presence_background <- read.table(file = input$presence_background, sep = '\t', header = TRUE, check.names = FALSE)
 predictors <- terra::rast(unlist(input$predictors))
 
 mod_tuning <- run_maxent(presence_background,
                         with_raster = F, # can be set to F to speed up
@@ -52,7 +52,7 @@ mod_tuning <- run_maxent(presence_background,
 res_tuning <- mod_tuning@results
 tuned_param <- select_param(res_tuning, method = input$method_select_params, list = T)
 
-predictors <- raster::stack(predictors)
+#predictors <- raster::stack(predictors)
 
 sdms <- predict_maxent(presence_background,
                        algorithm = "maxent.jar",
@@ -73,17 +73,17 @@ names(sdm_pred) <- "prediction"
 sdm_runs <- sdms[["pred_runs"]]
 
 pred.output <- file.path(outputFolder, "sdm_pred.tif")
-runs.output <- paste0(outputFolder,"/sdm_runs_", 1:nlayers(sdm_runs), ".tif")
+runs.output <- paste0(outputFolder,"/sdm_runs_", 1:nlyr(sdm_runs), ".tif")
 #runs.output <- file.path(outputFolder, "sdm_runs.tif")
-sdm_pred<-project(rast(sdm_pred),crs(input$proj)) ##Temporary fix while maxent transitions to terra
+sdm_pred<-project(sdm_pred,crs(input$proj)) ##Temporary fix while maxent transitions to terra
 terra::writeRaster(x = sdm_pred,
                    filename = pred.output,
                    filetype = "COG",
                    wopt= list(gdal=c("COMPRESS=DEFLATE")),
                    overwrite = TRUE)
 
-for (i in 1:nlayers(sdm_runs)){
-  thisrun<-project(rast(sdm_runs[[i]]),crs(input$proj)) ##Temporary fix while maxent transitions to terra
+for (i in 1:nlyr(sdm_runs)){
+  thisrun<-project(sdm_runs[[i]],crs(input$proj)) ##Temporary fix while maxent transitions to terra
   terra::writeRaster(x = thisrun,
                      filename = file.path(outputFolder, paste0("/sdm_runs_", i, ".tif")),
                      filetype = "COG",
diff --git a/scripts/SDM/runMaxentFunc.R b/scripts/SDM/runMaxentFunc.R
index 5b35e5a9..70a89b56 100644
--- a/scripts/SDM/runMaxentFunc.R
+++ b/scripts/SDM/runMaxentFunc.R
@@ -22,8 +22,8 @@ run_maxent <-
   function(presence.bg,
            with_raster = F,
            parallelType = "doParallel"
   ) {
-  presence <- presence.bg |> dplyr::filter(pa == 1) |> data.frame()
-  background <- presence.bg |> dplyr::filter(pa == 0) |> data.frame()
+  presence <- presence.bg |> dplyr::filter(pa == 1) |> as.data.frame()
+  background <- presence.bg |> dplyr::filter(pa == 0) |> as.data.frame()
 
   if (with_raster) {
     ENMmodel <- ENMeval::ENMevaluate(occs = presence[, c("lon", "lat")],
@@ -259,8 +259,8 @@ predict_maxent <- function(presence_background,
 
   # We calculate the prediction with the whole dataset
 
-  presence <- presence_background |> dplyr::filter(pa == 1) |> data.frame()
-  background <- presence_background |> dplyr::filter(pa == 0) |> data.frame()
+  presence <- presence_background |> dplyr::filter(pa == 1) |> as.data.frame()
+  background <- presence_background |> dplyr::filter(pa == 0) |> as.data.frame()
 
   mod_tuning <- ENMeval::ENMevaluate(occs = presence[, c("lon", "lat", layers)],
@@ -304,8 +304,8 @@ runs <- c("run_1")
     backg_test <- backgr[bg.grp == g, ]
 
     presence_bg_train <- group.all[which(group.all[,1] == g),]
 
-    presence <- presence_bg_train |> dplyr::filter(pa == 1) |> data.frame()
-    background <- presence_bg_train |> dplyr::filter(pa == 0) |> data.frame()
+    presence <- presence_bg_train |> dplyr::filter(pa == 1) |> as.data.frame()
+    background <- presence_bg_train |> dplyr::filter(pa == 0) |> as.data.frame()
 
     mod_tuning <- ENMeval::ENMevaluate(occs = presence[, c("lon", "lat", layers)],
@@ -322,12 +322,18 @@ runs <- c("run_1")
       pred_pres <- dismo::predict(mod_tuning@models[[tuned_param]], predictors,
                                   args = sprintf("outputformat=%s", type))
 
-      if (inherits(pred_runs, "RasterLayer") || inherits(pred_runs, "RasterStack")) {
-        pred_runs <- raster::stack(pred_runs, pred_pres)
-
-      } else {
-        pred_runs <- pred_pres
-      }
+      #if (inherits(pred_runs, "RasterLayer") || inherits(pred_runs, "RasterStack")) {
+      #  pred_runs <- raster::stack(pred_runs, pred_pres)
+      #} else {
+      #  pred_runs <- pred_pres
+      #}
+      if (is.null(pred_runs)) {
+        pred_runs <- pred_pres
+      } else {
+        pred_runs <- c(pred_runs, pred_pres)
+      }
+
+
     }
diff --git a/scripts/SDM/selectBackground.yml b/scripts/SDM/selectBackground.yml
index 1ebc31b5..9bd0e30c 100644
--- a/scripts/SDM/selectBackground.yml
+++ b/scripts/SDM/selectBackground.yml
@@ -26,6 +26,7 @@ inputs:
       - inclusion_buffer
      - weighted_raster
       - unweighted_raster
+      - thickening
     example: random
   n_background:
     label: number of background points
diff --git a/scripts/SDM/selectBackgroundFunc.R b/scripts/SDM/selectBackgroundFunc.R
index f0dc9cf4..940dc3bb 100644
--- a/scripts/SDM/selectBackgroundFunc.R
+++ b/scripts/SDM/selectBackgroundFunc.R
@@ -1,15 +1,21 @@
-
+#'
 #' @title Create background points
+#'
+#' @description
+#' Generates background points using any of the six available methods.
+#'
+#' @details
+#' When `method = "random"`, background points are randomly sampled throughout the whole study extent. When `method = "weighted_raster"`, background points are sampled in proportion to the number of observations of a target group in an observation density raster. When `method = "unweighted_raster"`, background points are sampled only in cells where there are observations from a target group. With `method = "inclusion_buffer"`, background points are sampled within a buffer around observations (to be confirmed...). With `method = "thickening"`, background points are sampled in proportion to the local density of observations by sampling in a buffer around each observation (to be confirmed...). Finally, when `method = "biased"`, a `density_bias` raster representing the effort is given and background points are sampled in proportion to this raster (to be confirmed...).
 #'
 #' @name create_background
-#' @param predictors spat raster, containing the predictor variables
-#' @param mask, spat vector, mask to apply to the predictors.
-#' @param method string, random, inclusion_buffer (thickening or biased), or raster (unweighted or weighted)
-#' @param n integer, number of background points to select.
-#' @param obs data.frame, containing the observations. Used with method == "thickening" or "inclusion_buffer"
-#' @param width_buffer int, buffer width around observations. Used with method == "inclusion_buffer"
-#' @param species string, species name.
+#' @param predictors SpatRaster, containing the predictor variables
+#' @param mask SpatVector, mask to apply to the predictors.
+#' @param method one of "random","weighted_raster","unweighted_raster","inclusion_buffer","biased","thickening".
+#' @param n integer, number of background points to sample.
+#' @param obs data.frame, containing the observations. Used with "thickening" or "inclusion_buffer".
+#' @param density_bias SpatRaster giving an effort/bias surface from which background points are sampled
+#' @param width_buffer int, buffer width around observations. Used with "thickening" or "inclusion_buffer".
 #' @param raster SpatRaster, raster heatmap used for weighted or unweighted sampling, default NULL when not using those methods
 #' @return spatial points
 #' @export
@@ -17,184 +23,111 @@
 create_background <-
   function(
     predictors,
     mask = NULL,
-    method = "random",
+    method = c("random","weighted_raster","unweighted_raster","inclusion_buffer","biased","thickening"),
     n = 10000,
     obs = NULL,
     density_bias = NULL,
     width_buffer = NULL,
     raster = NULL) {
-    proj <- terra::crs(predictors, proj = T)
+    method <- match.arg(method)
+
+    proj <- terra::crs(predictors)
 
     ## New method: If we use raster, we re-project our raster and add it as an additional layer
     if (grepl("raster", method)) {
       if (is.null(raster)) stop(paste("No raster included with method:", method))
-
-      # Bilinear interpolation when projecting in this manner
-      # Exclude 0s, since we don't want to sample areas with no sightings
-      raster[raster <= 0] <- NA
-
-      add(predictors) <- raster
-
+    }
+
+    if (method == "biased") {
+      if (is.null(density_bias)) stop(paste("No density_bias included with method:", method))
     }
 
     if (!is.null(mask)) {
       predictors <- fast_crop(predictors, mask)
     }
 
-    sum_na_layer <- terra::tapp(predictors, index = rep(1, terra::nlyr(predictors)), fun = sum, na.rm = F)
-
-    ncellr <- terra::global(!is.na(sum_na_layer), sum)
-
-    if (ncellr < n) {
-      message(
-        "Number of background-points exceeds number of cell. ",
-        ncellr,
-        " background-points will be sampled"
-      )
-      backgr <- terra::as.data.frame(predictors, na.rm = TRUE, xy = TRUE) |> dplyr::select(x, y)
+    if (method %in% c("inclusion_buffer","thickening")){
+      ### Not sure why the following buffer width method was originally used. package FNN could be faster for searching nearest neighbours.
+      #message("Argument width_buffer not provided. Using the 95% quantile
+      # of the nearest neighbour distance that is not in the same location.")
+      #
+      #if(nrow(obs) > 1000){
+      #  sampled_obs <- sample(1:nrow(obs), min(1000, nrow(obs)))
+      #  message("Only 1000 randomly sampled locations are used to determine the width_buffer.")
+      #}else{
+      #  sampled_obs <- 1:nrow(obs)
+      #}
+      #width_buffer<-quantile(st_distance(obs[sampled_obs,]),0.95)
+      #
+      #width_buffer<-obs[sampled_obs,] |>
+      #  st_distance() |>
+      #  apply(2,function(x){min(x[x>0])}) |>
+      #  quantile(0.95)
+      #
+      #message(sprintf("width-buffer used is %s (in the units of the crs)", width_buffer))
 
-    } else {
+      if (is.null(obs)){
+        stop(paste("No obs included with method:", method))
+      }
 
-      if (any(method == "random" | grepl("^unweighted", method))) {
-
-        # all the cells have the same probability to be selected
-        message(sprintf("Selecting %i background point based on %s method.", n, method ))
-
-        # Apparently this breaks if there are a lot of NA values.. using a different version for now
-        if (0) {
-          # This version has issues sampling rasters with a lot (or mostly) NA value cells
-          backgr <- terra::spatSample(sum_na_layer,
-                                      size = n,
-                                      method="random",
-                                      replace = FALSE,
-                                      na.rm = T,
-                                      xy = TRUE,
-                                      as.points = FALSE,
-                                      values = F)
-        }else{
-          # cells only retrieves non-NA cells
-          backgr <- sample(cells(sum_na_layer), n)
-          backgr <- xyFromCell(sum_na_layer,backgr)
-
-        }
-      } else if (any(method == "thickening")) {
-
-        message(sprintf("Selecting %i background point based on %s method.", n, method ))
-
-        obs_vec <- terra::vect(obs, geom = c("lon", "lat"), crs = crs(predictors))
-
-        if (is.null(width_buffer)) {
-          width_buffer <- mean(terra::distance(obs_vec))
-        } else {
-          width_buffer <- as.numeric(width_buffer)
-        }
-        buf <- terra::buffer(obs_vec, width_buffer, quadsegs = 10)
-        buf_r <- !is.na(terra::rasterize(buf[1], sum_na_layer))
-        for (i in 2:nrow(buf)) {
-          buf_r <- buf_r + !is.na(terra::rasterize(buf[i], sum_na_layer))
-        }
-        buf_r <- terra::mask(buf_r, sum_na_layer)
-        backgr <- terra::spatSample(buf_r,
-                                    size = n,
-                                    method = "weighted",
-                                    replace = FALSE,
-                                    na.rm = T,
-                                    xy = TRUE,
-                                    as.points = FALSE,
-                                    values = F)
-
-      } else if (any(method == "biased")) {
-
-        if (!is.null(mask)) density_bias <- fast_crop(density_bias, mask)
-
-        backgr <- terra::spatSample(density_bias,
-                                    size = n,
-                                    method = "weighted",
-                                    replace = FALSE,
-                                    na.rm = T,
-                                    xy = TRUE,
-                                    as.points = FALSE,
-                                    values = F)
-
-      } else if (method == "inclusion_buffer") {
-
-        obs_vec <- terra::vect(obs, geom = c("lon", "lat"), crs = crs(predictors))
-
-        if (is.null(dist_buffer)) {
-
-          message("Buffer distance not provided. Using the 95% quantile
-                  of the minimum distance between each point.")
-          dist_buffer <- calculate_dist_buffer(obs |> dplyr::select(lon, lat))
-          message(sprintf("Buffer distance: %s (unit of projection)", dist_buffer))
-
-        }
-
-        # Creating the buffer
-
-        buffer_shape <- rgeos::gBuffer(spgeom = as(obs_vec, "Spatial"),
-                                       byid = FALSE, width = dist_buffer)
-
-        # crops the predictors to that shape to rasterize
-        sum_na_layer <- fast_crop(sum_na_layer, buffer_shape)
-
-        message(sprintf("Trying selecting %i background point based on %s method.", n ,method ))
-        backgr <- terra::spatSample(sum_na_layer,
-                                    size = n,
-                                    method="random",
-                                    replace=FALSE,
-                                    na.rm=T,
-                                    xy=TRUE,
-                                    as.points=FALSE,
-                                    values=F)
-
-      } else if (grepl("^weighted", method)) {
-        ## Set heatmap cells to NA if they are NA in sum_na_layer
-        ## Assumes the last layer of predictors is the heatmap, and that it is counts!
-        values(predictors[[dim(predictors)[3]]])[which(is.na(values(sum_na_layer)))] <- NA
-        tgb_weights <- predictors[[dim(predictors)[3]]]
-        tgb_weights <- tgb_weights/sum(values(tgb_weights), na.rm = T)
-
-        #backgr <- sample(cells(tgb_weights), n, prob = unlist(extract(tgb_weights, cells(tgb_weights))))
-        #backgr <- xyFromCell(tgb_weights, backgr)
-
-        # Sampling using density as probabilities
-        # We still run into NA values in the weights, which we should exclude
-        weightsVector <- unlist(extract(tgb_weights, cells(tgb_weights)))
-        names(weightsVector) <- cells(tgb_weights)
-        weightsVector <- na.omit(weightsVector)
-
-        if(length(weightsVector) < n){
-          message(
-            "Number of background-points exceeds number of cell. ",
-            ncellr,
-            " background-points will be sampled"
-          )
-          backgr <- names(weightsVector)
+      obs <- st_as_sf(obs, coords=c("lon", "lat"), crs = proj)
 
-        }else{
-          backgr <- sample(names(weightsVector), n, prob = weightsVector)
-
-        }
-
-        backgr <- xyFromCell(tgb_weights, as.numeric(backgr))
+      if(is.null(width_buffer)){
+        width_buffer <- 0.1 * max(diff(st_bbox(obs)[c(1, 3)]),diff(st_bbox(obs)[c(2, 4)]))
+        message(sprintf("width_buffer used is 10%% of the largest bounding box dimension (%s in the units of the crs)", round(width_buffer)))
+      }
     }
 
-    }
-
-    message(sprintf("%s selected", nrow(backgr)))
-    #species <- unique(obs$scientific_name) This didn't work if there are more than one equivalent species names
-    species <- obs$scientific_name[1]
-    backgr <- dplyr::bind_cols(id = 1:nrow(backgr),
-                               scientific_name = species,
-                               backgr |> data.frame()) |>
-      setNames(c("id", "scientific_name", "lon", "lat"))
+    backgr<-switch(method,
+      random = {
+        terra::spatSample(predictors[[1]], size = n, method = "random", replace = TRUE, values = FALSE, as.points = TRUE)
+      },
+      weighted_raster = {
+        terra::spatSample(raster, size = n, method = "weights", replace = TRUE, values = FALSE, as.points = TRUE)
+      },
+      unweighted_raster = {
+        terra::spatSample(raster > 0, size = n, method = "weights", replace = TRUE, values = FALSE, as.points = TRUE)
+      },
+      inclusion_buffer = { # this thing can take a while if there are many observations
+        obs |>
+          st_buffer(dist = width_buffer, nQuadSegs = 10) |>
+          st_union() |>
+          st_sample(n) |>
+          st_as_sf()
+      },
+      biased = { # not sure how this method differs from weighted_raster, ignored at the moment as an option
+        terra::spatSample(density_bias, size = n, method = "weights", replace = TRUE, values = FALSE, as.points = TRUE)
+      },
+      thickening = {
+        # There is probably a faster way using a radius and angle from each observation. Right now, rasterizing the buffers and sampling from the raster seems much faster than sampling within the buffers using sf (commented version)
+        #backgr<-obs |>
+        #  st_buffer(dist = width_buffer, nQuadSegs = 10) |>
+        #  st_sample(size = rep(ceiling(n / nrow(obs)),nrow(obs)), by_polygon = FALSE) |>
+        #  st_as_sf()
+        #backgr[sample(1:nrow(backgr),n),]
+        terra::spatSample(
+          rasterize(st_buffer(obs, dist = width_buffer, nQuadSegs = 10), rast(predictors[[1]]), fun = sum),
+          size = n,
+          method = "weights",
+          replace = TRUE,
+          values = FALSE,
+          as.points = TRUE
+        )
+      }
+    )
+    backgr <- backgr[,-1] # temporary fix for https://github.com/rspatial/terra/issues/1275
+    backgr <- st_as_sf(backgr) |>
+      st_coordinates() |>
+      data.frame(id = 1:nrow(backgr), scientific_name = obs$scientific_name[1], lon = _) |>
+      setNames(c("id","scientific_name","lon","lat"))
+    backgr
 
-    return(backgr)
   }
 
-
 calculate_dist_buffer <- function(obs, n = 1000) {
-  #Uses the first 1000 points (randomly sampled) to create buffers and distances
+  # uses the first 1000 points (randomly sampled) to create buffers and distances
   if (nrow(obs) > n) {
     nb_buffer_point <- n
   } else {
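Usage note (review sketch, not part of the patch): a minimal example of how the revised create_background() might be called with the new "thickening" method. The predictor file, species name and coordinates below are hypothetical placeholders; the argument names, the default width_buffer of 10% of the largest bounding-box dimension, and the id/scientific_name/lon/lat output columns come from the diff above.

# Sketch only; "predictors.tif" and the obs data.frame are made-up inputs.
library(terra)
library(sf)

predictors <- terra::rast("predictors.tif")              # hypothetical predictor stack
obs <- data.frame(scientific_name = "Acer saccharum",    # hypothetical observations
                  lon = c(-73.5, -71.2, -72.8),
                  lat = c(45.5, 46.1, 45.9))

# Sample 10,000 background points near observations ("thickening"),
# letting width_buffer default to 10% of the largest bounding-box dimension.
bg <- create_background(predictors = predictors,
                        method = "thickening",
                        n = 10000,
                        obs = obs)
head(bg)  # columns: id, scientific_name, lon, lat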
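Review sketch (not part of the patch): the predict_maxent() and runMaxent.R changes drop raster::stack()/nlayers() in favour of the terra idiom of growing a SpatRaster with c() and counting layers with nlyr(). A small self-contained illustration with made-up layers, assuming only the terra package.

# Sketch of the SpatRaster accumulation idiom used in the patch (made-up data).
library(terra)

pred_runs <- NULL
for (i in 1:3) {
  pred_pres <- terra::rast(nrows = 10, ncols = 10, vals = runif(100))  # stand-in for one run's prediction
  if (is.null(pred_runs)) {
    pred_runs <- pred_pres                # first run starts the stack
  } else {
    pred_runs <- c(pred_runs, pred_pres)  # c() appends a layer to a SpatRaster
  }
}
terra::nlyr(pred_runs)  # 3; the terra equivalent of raster::nlayers()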