Merge pull request #27 from EvolEcolGroup/cran_sub

Cran 0.9.3
EvolEcolGroup · Jan 18, 2024 · b367aeb · b367aeb
2 parents fbc7095 + d7a9c5f
commit b367aeb
Show file tree

Hide file tree

Showing 39 changed files with 606 additions and 71 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidysdm
 Title: Species Distribution Models with Tidymodels
-Version: 0.9.2
+Version: 0.9.3
 Authors@R: c(
     person("Michela", "Leonardi", role = "aut"),
     person("Margherita", "Colucci", role = "aut"),
@@ -48,14 +48,14 @@ Imports:
     yardstick
 Suggests:
     blockCV,
+    data.table,
     doParallel,
     earth,
     kernlab,
     knitr,
     overlapping,
     pastclim (>= 2.0.0),
     ranger,
-    readr,
     rmarkdown,
     spelling,
     stacks,

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,8 @@ S3method(boyce_cont,data.frame)
 S3method(boyce_cont,sf)
 S3method(brier_class,sf)
 S3method(classification_cost,sf)
+S3method(collect_metrics,repeat_ensemble)
+S3method(collect_metrics,simple_ensemble)
 S3method(explain_tidysdm,default)
 S3method(explain_tidysdm,repeat_ensemble)
 S3method(explain_tidysdm,simple_ensemble)
@@ -106,3 +108,4 @@ importFrom(ggplot2,autoplot)
 importFrom(magrittr,"%>%")
 importFrom(rlang,":=")
 importFrom(rlang,.data)
+importFrom(tune,collect_metrics)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# `tidysdm` 0.9.3
+
+* fix bug in `filter_high_cor` due to changes in `terra` 1.6.75
+* implement `collect_metrics` for ensembles.
+
 # `tidysdm` 0.9.2
 
 * Release on CRAN

diff --git a/R/autoplot_simple_ensemble.R b/R/autoplot_simple_ensemble.R
@@ -8,7 +8,8 @@
 #' the results. If none is given, the first metric in the metric set is used
 #' (after filtering by the `metric` option).
 #' @param metric A character vector for which metrics (apart from `rank_metric`)
-#' to be included in the visualization.
+#' to be included in the visualization. If NULL (the default), all available 
+#' metrics will be plotted
 #' @param std_errs The number of standard errors to plot (if the standard error
 #' exists).
 #' @param ... Other options to pass to `autoplot()`. Currently unused.
@@ -28,11 +29,12 @@
 #' (95% confidence, by default).
 #' @returns A ggplot object.
 #' @examples
-#' # we use the two_class_example from `workflowsets`
+#' \donttest{
+#' #' # we use the two_class_example from `workflowsets`
 #' two_class_ens <- simple_ensemble() %>%
 #'   add_member(two_class_res, metric = "roc_auc")
 #' autoplot(two_class_ens)
-#'
+#' }
 #' @export
 autoplot.simple_ensemble <- function(object, rank_metric = NULL, metric = NULL,
                                      std_errs = stats::qnorm(0.95), ...) {

diff --git a/R/blockcv2rsample.R b/R/blockcv2rsample.R
@@ -1,7 +1,14 @@
 #' Convert an object created with `blockCV` to an `rsample` object
 #'
 #' This function creates objects created with `blockCV` to `rsample` objects
-#' that can be used by `tidysdm`
+#' that can be used by `tidysdm`. BlockCV provides more sophisticated sampling
+#' options than the `spatialsample` library. For example, it is possible to
+#' stratify the sampling to ensure that presences and absences are evenly
+#' distributed among the folds (see the example below).
+#' 
+#' Note that currently only objects of type `cv_spatial` and `cv_cluster` are
+#' supported.
+#' 
 #' @param x a object created with a `blockCV` function
 #' @param data the `sf` object used to create `x`
 #' @returns an `rsample` object
@@ -13,22 +20,30 @@
 #' pa_data <- sf::st_as_sf(points, coords = c("x", "y"), crs = 7845)
 #' sb1 <- cv_spatial(
 #'   x = pa_data,
-#'   column = "occ", # the response column (binary or multi-class)
+#'   column = "occ", # the response column to balance the folds
 #'   k = 5, # number of folds
 #'   size = 350000, # size of the blocks in metres
 #'   selection = "random", # random blocks-to-fold
 #'   iteration = 10
 #' ) # find evenly dispersed folds
 #' sb1_rsample <- blockcv2rsample(sb1, pa_data)
 #' class(sb1_rsample)
+#' autoplot(sb1_rsample)
 #' }
 blockcv2rsample <- function(x, data) {
+  if(!(any(inherits(x,"cv_spatial"),
+           inherits(x,"cv_cluster")))){
+    stop("this function does support this object type\n",
+         "only objects of class cv_spatial or cv_cluster are supported.")
+  }
   splits <- lapply(
     x$folds_list,
     function(this_fold) {
       names(this_fold) <- c("analysis", "assessment")
-      rsample::make_splits(this_fold, data = data)
+      rsample::make_splits(this_fold, data = data, class = "spatial_rsplit")
     }
   )
-  rsample::manual_rset(splits, ids = paste0("Fold", seq(1:length(splits))))
+  rsample::new_rset(splits, ids = paste0("Fold", seq(1:length(splits))),
+                    attrib = NULL, subclass = c("spatial_rset", "rset"))
+
 }
diff --git a/R/check_split_balance.R b/R/check_split_balance.R
@@ -1,9 +1,11 @@
 #' Check the balance of presences vs pseudoabsences among splits
 #'
-#' @param splits the data splits (an `rset` object), generated by a function such as
+#' @param splits the data splits (an `rset` or `split` object), generated by a function such as
 #' [spatialsample::spatial_block_cv()]
 #' @param .col the column containing the presences
-#' @returns a table of number of presences and pseudoabsences
+#' @returns a tibble of the number of presences and pseudoabsences in the assessment
+#' and analysis
+#' set of each split (or training and testing in an initial split)
 #' @export
 #' @examples
 #' lacerta_thin <- readRDS(system.file("extdata/lacerta_climate_sf.RDS",
@@ -18,10 +20,41 @@ check_splits_balance <- function(
   .col <- rlang::enquo(.col) %>%
     rlang::quo_get_expr() %>%
     rlang::as_string()
-  balance_list <- lapply(splits$splits, function(x) {
-    table(rsample::training(x) %>%
-      sf::st_drop_geometry() %>% dplyr::pull(.col))
-  })
-  balance_df <- do.call("rbind", balance_list)
+
+  if (inherits(splits,"rset")){
+    if (!(.col %in% names(splits$splits[[1]]$data))){
+      stop(".col should be a column in the data used to generate the splits")
+    }
+    training_list <- lapply(splits$splits, function(x) {
+      table(rsample::training(x) %>%
+            sf::st_drop_geometry() %>% dplyr::pull(.col))
+    })
+    training_df <- do.call("rbind", training_list)
+    testing_list <- lapply(splits$splits, function(x) {
+      table(rsample::testing(x) %>%
+            sf::st_drop_geometry() %>% dplyr::pull(.col))
+    })
+    testing_df <- do.call("rbind", testing_list)
+    dimnames(testing_df)[[2]] <- paste0(dimnames(testing_df)[[2]],"_analysis")  
+    dimnames(training_df)[[2]] <- paste0(dimnames(training_df)[[2]],"_assessment")
+
+    balance_df <- dplyr::bind_cols(training_df, testing_df) 
+  } else if (inherits(splits,"rsplit"))  {
+    if (!(.col %in% names(splits$data))){
+      stop(".col should be a column in the data used to generate the splits")
+    }
+    training_df <- table(rsample::training(splits) %>%
+                           sf::st_drop_geometry() %>% dplyr::pull(.col))
+    testing_df <- table(rsample::testing(splits) %>%
+              sf::st_drop_geometry() %>% dplyr::pull(.col))
+    # coerce to a df with only one row
+    balance_df <- as.data.frame(matrix(c(training_df, testing_df),nrow=1))
+    names(balance_df) <- c(paste0(names(testing_df),"_test"),
+                           paste0(names(training_df),"_train"))
+    balance_df <- tibble::as_tibble(balance_df)
+  } else {
+    stop ("splits should be either a spatial_rset or a spatial_rsplit")
+  }
+
   return(balance_df)
 }
diff --git a/R/collect_metrics.R b/R/collect_metrics.R
@@ -0,0 +1,35 @@
+#' Obtain and format results produced by tuning functions for ensemble objects
+#'
+#' Return a tibble of performance metrics for all models.
+#'
+#' @param x A [`simple_ensemble`] or [`repeat_ensemble`] object
+#' @param ... Not currently used.
+#' @return A tibble.
+#' @details
+#'
+#' When applied to a ensemble, the metrics that are returned
+#' do not contain the actual tuning parameter columns and values (unlike when
+#' these collect functions are run on other objects). The reason is that ensembles
+#' contain different types of models or models with different tuning
+#' parameters.
+#'
+#' @seealso [tune::collect_metrics()]
+#'
+#' @examples
+#' collect_metrics(lacerta_ensemble)
+#' collect_metrics(lacerta_rep_ens)
+#' @importFrom tune collect_metrics
+#' @export
+
+collect_metrics.simple_ensemble <- function(x, ...) {
+  dplyr::bind_rows(x$metrics)
+}
+
+#' @export
+#' @rdname collect_metrics.simple_ensemble
+collect_metrics.repeat_ensemble <- function(x, ...) {
+  metric_table <- dplyr::bind_rows(x$metrics, .id = "rep_id")
+  metric_table$rep_id <- x$rep_id[as.numeric(metric_table$rep_id)]
+  return(metric_table)
+}
+
diff --git a/R/datasets_docs.R b/R/datasets_docs.R
@@ -25,7 +25,14 @@
 
 #' A simple ensemble for the lacerta data
 #'
-#' Ensemble SDM for *Lacerta schreiberi*.
+#' Ensemble SDM for *Lacerta schreiberi*, as generated in the vignette.
 #'
 #' @format A [`simple_ensemble`] object
 "lacerta_ensemble"
+
+#' A repeat ensemble for the lacerta data
+#'
+#' Ensemble SDM for *Lacerta schreiberi*, as generated in the vignette.
+#'
+#' @format A [`repeat_ensemble`] object
+"lacerta_rep_ens"
diff --git a/R/out_of_range_warning.R b/R/out_of_range_warning.R
@@ -0,0 +1,22 @@
+#' Warn if some times are outside the range of time steps from a raster
+#'
+#' This function helps making sure that, when we assign times to time_step
+#' layers of a raster, we do not have values which are badly out of range
+#' @param times the times of the locations
+#' @param time_steps the time steps from the raster
+#' @returns NULL return
+#' @keywords internal
+
+out_of_range_warning <- function(times, time_steps){
+  time_steps_ordered<-sort(time_steps)
+  range_minmax <- c(utils::head(time_steps_ordered,n=1)[1]-
+                      (abs(utils::head(time_steps_ordered,n=2)[1]-utils::head(time_steps_ordered,n=2)[2])/2),
+                    utils::tail(time_steps_ordered,n=1)[1] + 
+                      (abs(utils::tail(time_steps_ordered,n=2)[1]-utils::tail(time_steps_ordered,n=2)[2])/2))
+  if(any(times<range_minmax[1]) | any(times>range_minmax[2])) {
+    warning("Some dates are out of the range of the available time series.\n",
+            "They will be assigned to the most extreme time point available, but this\n",
+            "might not make sense. The potentially problematic dates are:\n",
+            times[which((times<range_minmax[1]) | (times>range_minmax[2]))])
+  } 
+}
diff --git a/R/sample_pseudoabs.R b/R/sample_pseudoabs.R
@@ -61,11 +61,20 @@ sample_pseudoabs <- function(data, raster, n, coords = NULL,
   coords <- check_coords_names(data, coords)
   dist_min <- dist_max <- NULL
   if (method[1] == "dist_disc") {
+    if (length(method)!=3){
+      stop("method 'dist_disc' should have two thresholds, e.g. c('dist_disc',10,20)")
+    }
     dist_min <- as.numeric(method[2])
     dist_max <- as.numeric(method[3])
   } else if (method[1] == "dist_min") {
+    if (length(method)!=2){
+      stop("method 'dist_min' should have one threshold, e.g. c('dist_min',10)")
+    }
     dist_min <- as.numeric(method[2])
   } else if (method[1] == "dist_max") {
+    if (length(method)!=2){
+      stop("method 'dist_max' should have one threshold, e.g. c('dist_max',50)")
+    }
     dist_max <- as.numeric(method[2])
   } else if (!method[1] %in% "random") {
     stop("method has to be one of 'random', 'dist_min', 'dist_max', or 'dist_disc'")