diff --git a/.Rbuildignore b/.Rbuildignore index ebdc8b742..b84863208 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,6 +12,7 @@ ^Meta$ ^_pkgdown\.yml$ ^inst/manuscript/manuscript_cache$ +^inst/manuscript/.trackdown$ ^\.lintr$ ^docs$ ^\.devcontainer$ diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..90e05c40d --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "github-actions" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/render_readme.yaml b/.github/workflows/render_readme.yaml index d2053f89e..d629c74c7 100644 --- a/.github/workflows/render_readme.yaml +++ b/.github/workflows/render_readme.yaml @@ -9,6 +9,7 @@ on: push: paths: - 'README.Rmd' + - DESCRIPTION jobs: render-readme: diff --git a/.gitignore b/.gitignore index 0884bab49..6afe7eefa 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,12 @@ doc Meta inst/manuscript/manuscript_cache/ inst/manuscript/manuscript.log +inst/manuscript/manuscript.aux +inst/manuscript/manuscript.blg inst/manuscript/manuscript.pdf inst/manuscript/manuscript.tex inst/manuscript/manuscript_files/ +inst/manuscript/.trackdown docs ..bfg-report/ .DS_Store diff --git a/.lintr b/.lintr index 7ddfe22a1..5f2c4aacf 100644 --- a/.lintr +++ b/.lintr @@ -11,6 +11,6 @@ linters: linters_with_tags( exclusions: c( list.files("tests", recursive = TRUE, full.names = TRUE), list.files("inst", recursive = TRUE, full.names = TRUE), - "vignettes/metric-details.Rmd" + list.files("vignettes", pattern = ".R$", full.names = TRUE) ) exclude: "# nolint" diff --git a/DESCRIPTION b/DESCRIPTION index cbf8e4e1c..abe92fc61 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: scoringutils Title: Utilities for Scoring and Assessing Predictions -Version: 1.2.2 +Version: 1.2.2.9000 Language: en-GB Authors@R: c( person(given = "Nikos", @@ -47,11 +47,13 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Imports: + checkmate, data.table, ggdist (>= 3.2.0), ggplot2 (>= 3.4.0), lifecycle, methods, + Metrics, rlang, scoringRules, stats diff --git a/NAMESPACE b/NAMESPACE index 21ef9654c..02d7122b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,22 +1,37 @@ # Generated by roxygen2: do not edit by hand +S3method(as_forecast,default) S3method(print,scoringutils_check) -export(abs_error) +S3method(quantile_to_interval,data.frame) +S3method(quantile_to_interval,numeric) +S3method(score,default) +S3method(score,forecast_binary) +S3method(score,forecast_point) +S3method(score,forecast_quantile) +S3method(score,forecast_sample) +S3method(validate_forecast,forecast_binary) +S3method(validate_forecast,forecast_point) +S3method(validate_forecast,forecast_quantile) +S3method(validate_forecast,forecast_sample) export(add_coverage) +export(add_pairwise_comparison) export(ae_median_quantile) export(ae_median_sample) -export(avail_forecasts) +export(as_forecast) export(available_metrics) export(bias_quantile) -export(bias_range) export(bias_sample) export(brier_score) -export(check_forecasts) export(correlation) export(crps_sample) +export(dispersion) 
export(dss_sample) -export(find_duplicates) -export(interval_score) +export(get_duplicate_forecasts) +export(get_forecast_counts) +export(get_forecast_type) +export(get_forecast_unit) +export(interval_coverage) +export(interval_coverage_deviation) export(log_shift) export(logs_binary) export(logs_sample) @@ -24,11 +39,13 @@ export(mad_sample) export(make_NA) export(make_na) export(merge_pred_and_obs) +export(new_forecast) +export(overprediction) export(pairwise_comparison) export(pit) export(pit_sample) -export(plot_avail_forecasts) export(plot_correlation) +export(plot_forecast_counts) export(plot_heatmap) export(plot_interval_coverage) export(plot_pairwise_comparison) @@ -39,15 +56,49 @@ export(plot_ranges) export(plot_score_table) export(plot_wis) export(quantile_score) +export(quantile_to_interval) +export(rules_binary) +export(rules_point) +export(rules_quantile) +export(rules_sample) +export(run_safely) export(sample_to_quantile) export(score) export(se_mean_sample) +export(select_rules) export(set_forecast_unit) -export(squared_error) export(summarise_scores) export(summarize_scores) export(theme_scoringutils) export(transform_forecasts) +export(underprediction) +export(validate_forecast) +export(validate_general) +export(wis) +importFrom(Metrics,ae) +importFrom(Metrics,ape) +importFrom(Metrics,se) +importFrom(checkmate,assert) +importFrom(checkmate,assert_character) +importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_data_table) +importFrom(checkmate,assert_factor) +importFrom(checkmate,assert_list) +importFrom(checkmate,assert_logical) +importFrom(checkmate,assert_number) +importFrom(checkmate,assert_numeric) +importFrom(checkmate,assert_string) +importFrom(checkmate,assert_subset) +importFrom(checkmate,assert_vector) +importFrom(checkmate,check_atomic_vector) +importFrom(checkmate,check_data_frame) +importFrom(checkmate,check_function) +importFrom(checkmate,check_matrix) +importFrom(checkmate,check_numeric) +importFrom(checkmate,check_vector) +importFrom(checkmate,test_factor) +importFrom(checkmate,test_list) +importFrom(checkmate,test_numeric) importFrom(data.table,"%like%") importFrom(data.table,':=') importFrom(data.table,.I) @@ -59,8 +110,11 @@ importFrom(data.table,data.table) importFrom(data.table,dcast) importFrom(data.table,is.data.table) importFrom(data.table,melt) +importFrom(data.table,nafill) importFrom(data.table,rbindlist) importFrom(data.table,setDT) +importFrom(data.table,setattr) +importFrom(data.table,setcolorder) importFrom(data.table,setnames) importFrom(ggdist,geom_lineribbon) importFrom(ggplot2,.data) @@ -103,7 +157,6 @@ importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) importFrom(lifecycle,deprecated) importFrom(methods,hasArg) -importFrom(methods,is) importFrom(rlang,enexprs) importFrom(rlang,warn) importFrom(scoringRules,crps_sample) @@ -121,5 +174,6 @@ importFrom(stats,rbinom) importFrom(stats,reorder) importFrom(stats,runif) importFrom(stats,sd) +importFrom(stats,weighted.mean) importFrom(stats,wilcox.test) importFrom(utils,combn) diff --git a/NEWS.md b/NEWS.md index 208129252..d71f93f1d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,48 @@ +# scoringutils 1.2.2.9000 + +This major update addresses a variety of comments made by reviewers from the Journal of Statistical Software (see the preprint of the manuscript [here](https://arxiv.org/abs/2205.07090)). + +The update introduces breaking changes. If you want to keep using the older version, you can install it using `remotes::install_github("epiforecasts/scoringutils@v1.2")`.
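As a rough sketch of the updated workflow summarised under "Package updates" below (the grouping column passed to `summarise_scores()` is illustrative, and `add_pairwise_comparison()` may accept further arguments not shown here):

library(scoringutils)

# Validate the data and assign a forecast class (replaces check_forecasts())
forecast <- as_forecast(example_quantile)
print(forecast)  # the print method covers what check_forecasts() used to report

# score() dispatches on the forecast class (here: forecast_quantile);
# rules_quantile() returns the default list of scoring rules it applies
scores <- score(forecast)

# Summarising scores and computing pairwise comparisons are now separate steps
summarised <- summarise_scores(scores, by = "model")
summarised <- add_pairwise_comparison(summarised)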
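The consistent interface for individual scoring rules, sketched with simulated data (the numbers are purely illustrative):

# Sample-based forecasts: observed is a vector, predicted a matrix with one
# row per observation and one column per predictive sample
observed <- rnorm(5, mean = 1:5)
predicted <- replicate(100, rnorm(5, mean = 1:5))
crps_sample(observed, predicted)

# Quantile-based forecasts: predicted holds one column per quantile level
quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
predicted_quantile <- t(sapply(1:5, function(mu) qnorm(quantile, mean = mu)))
wis(observed, predicted_quantile, quantile)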
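A short illustration of the reworked `add_coverage()` described further below, which adds coverage columns to the raw quantile-based forecast data (the aggregation shown afterwards is just one way to summarise those columns; the variable names are not part of the package):

library(data.table)
coverage <- add_coverage(example_quantile)

# Proportion of observations falling inside each central prediction interval,
# by model and interval range
coverage[, .(interval_coverage = mean(interval_coverage, na.rm = TRUE)),
         by = .(model, range)]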
+ +## Package updates +- In `score()`, the required columns "true_value" and "prediction" were renamed and replaced by the required columns "observed" and "predicted". Scoring functions now also use the function arguments "observed" and "predicted" consistently throughout. +- The overall scoring workflow was updated. `score()` is now a generic function that dispatches the correct method based on the forecast type. Forecast types currently supported are "binary", "point", "sample" and "quantile", with corresponding classes "forecast_binary", "forecast_point", "forecast_sample" and "forecast_quantile". An object of class `forecast_*` can be created using the function `as_forecast()`, which also replaces the previous function `check_forecasts()` (see more information below). +- Scoring rules (functions used for scoring) received a consistent interface and input checks: + - Scoring rules for binary forecasts: + - `observed`: factor with exactly 2 levels + - `predicted`: numeric, vector with probabilities + - Scoring rules for point forecasts: + - `observed`: numeric vector + - `predicted`: numeric vector + - Scoring rules for sample-based forecasts: + - `observed`: numeric, either a scalar or a vector + - `predicted`: numeric, a vector (if `observed` is a scalar) or a matrix (if `observed` is a vector) + - Scoring rules for quantile-based forecasts: + - `observed`: numeric, either a scalar or a vector + - `predicted`: numeric, a vector (if `observed` is a scalar) or a matrix (if `observed` is a vector) + - `quantile`: numeric, a vector with quantile levels. Can alternatively be a matrix of the same shape as `predicted`. +- Users can now supply their own scoring rules to `score()` as a list of functions. Default scoring rules can be accessed using the functions `rules_point()`, `rules_sample()`, `rules_quantile()` and `rules_binary()`, which return a list of scoring rules suitable for the respective forecast type. +- `check_forecasts()` was replaced by a different workflow. There is now a function, `as_forecast()`, that determines the forecast type of the data, constructs a forecast object and validates it using the function `validate_forecast()` (a generic that dispatches the correct method based on the forecast type). Objects of class `forecast_binary`, `forecast_point`, `forecast_sample` and `forecast_quantile` have print methods that fulfil the functionality of `check_forecasts()`. +- The functionality for computing pairwise comparisons has been split out of `summarise_scores()`. Instead of doing pairwise comparisons as part of summarising scores, a new function, `add_pairwise_comparison()`, was introduced that takes summarised scores as input and adds pairwise comparisons to them. +- `add_coverage()` was reworked completely. Its new purpose is to add coverage information to the raw forecast data (essentially fulfilling some of the functionality that was previously covered by `score_quantile()`). +- Support for the interval format was mostly dropped (see PR #525 by @nikosbosse and reviewed by @seabbs): + - The function `bias_range()` was removed (users should now use `bias_quantile()` instead). + - The function `interval_score()` was made an internal function rather than being exported to users. We recommend using `wis()` instead. +- The function `find_duplicates()` was renamed to `get_duplicate_forecasts()`. +- Changes to `avail_forecasts()` and `plot_avail_forecasts()`: + - The function `avail_forecasts()` was renamed to `get_forecast_counts()`.
This represents a change in the naming convention, whereby functions that provide the user with additional useful information about the data get the prefix "get_". See Issues #403 and #521 and PR #511 by @nikosbosse and reviewed by @seabbs for details. + - For clarity, the output column in `get_forecast_counts()` was renamed from "Number forecasts" to "count". + - `get_forecast_counts()` now also displays combinations where there are 0 forecasts, instead of silently dropping the corresponding rows. + - `plot_avail_forecasts()` was renamed to `plot_forecast_counts()` in line with the change in the function name. The `x` argument no longer has a default value, as the value will depend on the data provided by the user. +- The deprecated `..density..` was replaced with `after_stat(density)` in ggplot calls. +- Files ending in ".Rda" were renamed to ".rds" where appropriate when used together with `saveRDS()` or `readRDS()`. +- `score()` now calls `na.omit()` on the data, instead of only removing rows with missing values in the columns `observed` and `predicted`. This is because `NA` values in other columns can also interfere with, for example, the grouping of forecasts according to the unit of a single forecast. +- Added documentation for the return value of `summarise_scores()`. +- Removed `abs_error()` and `squared_error()` from the package in favour of `Metrics::ae` and `Metrics::se`. +- Renamed `interval_coverage_quantile()` and `interval_coverage_dev_quantile()` to `interval_coverage()` and `interval_coverage_deviation()`, respectively. Removed `interval_coverage_sample()` as users are now expected to convert forecasts to a quantile format before scoring. +- Added unit tests for `interval_coverage_quantile()` and `interval_coverage_dev_quantile()` to make sure that the functions provide the correct warnings when too few quantiles are provided. +- pkgdown documentation pages are now created for both the stable and development versions. + # scoringutils 1.2.2 ## Package updates @@ -6,7 +51,7 @@ ## Bug fixes - Fixes a bug with `set_forecast_unit()` where the function only workded with a data.table, but not a data.frame as an input. -- The metrics table in the vignette [Details on the metrics implemented in `scoringutils`](https://epiforecasts.io/scoringutils/articles/metric-details.html) had duplicated entries. This was fixed by removing the duplicated rows. +- The metrics table in the vignette [Details on the metrics implemented in `scoringutils`](https://epiforecasts.io/scoringutils/articles/metric-details.html) had duplicated entries. This was fixed by removing the duplicated rows. # scoringutils 1.2.1 diff --git a/R/add_coverage.R b/R/add_coverage.R new file mode 100644 index 000000000..398d06564 --- /dev/null +++ b/R/add_coverage.R @@ -0,0 +1,83 @@ +#' @title Add Coverage Values to Quantile-Based Forecasts +#' +#' @description Adds interval coverage of central prediction intervals, +#' quantile coverage for predictive quantiles, as well as the deviation between +#' desired and actual coverage to a data.table. Forecasts should be in a +#' quantile format (following the input requirements of `score()`). +#' +#' **Interval coverage** +#' +#' Interval coverage for a given interval range is defined as the proportion of +#' observations that fall within the corresponding central prediction intervals. +#' Central prediction intervals are symmetric around the median and are formed +#' by two quantiles that denote the lower and upper bound.
For example, the 50% +#' central prediction interval is the interval between the 0.25 and 0.75 +#' quantiles of the predictive distribution. +#' +#' The function `add_coverage()` computes the coverage per central prediction +#' interval, so the interval coverage will always be either `TRUE` +#' (observed value falls within the interval) or `FALSE` (observed value falls +#' outside the interval). You can summarise the interval coverage values to get +#' the proportion of observations that fall within the central prediction +#' intervals. +#' +#' **Quantile coverage** +#' +#' Quantile coverage for a given quantile is defined as the proportion of +#' observed values that are smaller than the corresponding predictive quantile. +#' For example, the 0.5 quantile coverage is the proportion of observed values +#' that are smaller than the 0.5 quantile of the predictive distribution. +#' Just as above, for a single observation and the quantile of a single +#' predictive distribution, the value will either be `TRUE` or `FALSE`. +#' +#' **Coverage deviation** +#' +#' The coverage deviation is the difference between the desired coverage +#' (can be either interval or quantile coverage) and the +#' actual coverage. For example, if the desired coverage is 90% and the actual +#' coverage is 80%, the coverage deviation is -0.1. +#' +#' @inheritParams score +#' @return a data.table with the input and columns "interval_coverage", +#' "interval_coverage_deviation", "quantile_coverage", +#' "quantile_coverage_deviation" added. +#' @importFrom data.table setcolorder +#' @examples +#' library(magrittr) # pipe operator +#' example_quantile %>% +#' add_coverage() +#' @export +#' @keywords scoring +#' @export +add_coverage <- function(data) { + stored_attributes <- get_scoringutils_attributes(data) + data <- as_forecast(data) + forecast_unit <- get_forecast_unit(data) + data_cols <- colnames(data) # store so we can reset column order later + + interval_data <- quantile_to_interval(data, format = "wide") + interval_data[, + interval_coverage := (observed <= upper) & (observed >= lower) + ][, c("lower", "upper", "observed") := NULL] + + data[, range := get_range_from_quantile(quantile)] + + data <- merge(interval_data, data, by = unique(c(forecast_unit, "range"))) + data[, interval_coverage_deviation := interval_coverage - range / 100] + data[, quantile_coverage := observed <= predicted] + data[, quantile_coverage_deviation := quantile_coverage - quantile] + + # reset column order + new_metrics <- c("interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation") + setcolorder(data, unique(c(data_cols, "range", new_metrics))) + + # add coverage "metrics" to list of stored metrics + # this makes it possible to use `summarise_scores()` later on + stored_attributes[["metric_names"]] <- c( + stored_attributes[["metric_names"]], + new_metrics + ) + data <- assign_attributes(data, stored_attributes) + return(data[]) +} diff --git a/R/avail_forecasts.R b/R/available_forecasts.R similarity index 53% rename from R/avail_forecasts.R rename to R/available_forecasts.R index 5fd6160f8..9c84142ba 100644 --- a/R/avail_forecasts.R +++ b/R/available_forecasts.R @@ -1,4 +1,4 @@ -#' @title Display Number of Forecasts Available +#' @title Count Number of Available Forecasts #' #' @description #' @@ -11,19 +11,21 @@ #' categories over which the number of forecasts should be counted. #' By default (`by = NULL`) this will be the unit of a single forecast (i.e. 
#' all available columns (apart from a few "protected" columns such as -#' 'prediction' and 'true value') plus "quantile" or "sample" where present). +#' 'predicted' and 'observed') plus "quantile" or "sample_id" where present). #' -#' @param collapse character vector (default is `c("quantile", "sample"`) with -#' names of categories for which the number of rows should be collapsed to one -#' when counting. For example, a single forecast is usually represented by a +#' @param collapse character vector (default is `c("quantile", "sample_id"`) +#' with names of categories for which the number of rows should be collapsed to +#' one when counting. For example, a single forecast is usually represented by a #' set of several quantiles or samples and collapsing these to one makes sure -#' that a single forecast only gets counted once. +#' that a single forecast only gets counted once. Setting `collapse = c()` +#' would mean that all quantiles / samples would be counted as individual +#' forecasts. #' #' @return A data.table with columns as specified in `by` and an additional -#' column with the number of forecasts. +#' column "count" with the number of forecasts. #' #' @inheritParams score -#' @importFrom data.table .I .N +#' @importFrom data.table .I .N nafill #' @export #' @keywords check-forecasts #' @examples @@ -31,19 +33,16 @@ #' data.table::setDTthreads(2) # restricts number of cores used on CRAN #' } #' -#' avail_forecasts(example_quantile, -#' collapse = c("quantile"), +#' get_forecast_counts(example_quantile, #' by = c("model", "target_type") #' ) -avail_forecasts <- function(data, - by = NULL, - collapse = c("quantile", "sample")) { +get_forecast_counts <- function(data, + by = NULL, + collapse = c("quantile", "sample_id")) { - check_data <- check_forecasts(data) - - - data <- check_data$cleaned_data - forecast_unit <- check_data$forecast_unit + data <- as_forecast(data) + forecast_unit <- get_forecast_unit(data) + data <- na.omit(data) if (is.null(by)) { by <- forecast_unit @@ -52,7 +51,7 @@ avail_forecasts <- function(data, # collapse several rows to 1, e.g. treat a set of 10 quantiles as one, # because they all belong to one single forecast that should be counted once collapse_by <- setdiff( - c(forecast_unit, "quantile", "sample"), + c(forecast_unit, "quantile", "sample_id"), collapse ) # filter out "quantile" or "sample" if present in collapse_by, but not data @@ -61,7 +60,17 @@ avail_forecasts <- function(data, data <- data[data[, .I[1], by = collapse_by]$V1] # count number of rows = number of forecasts - out <- data[, .(`Number forecasts` = .N), by = by] + out <- data[, .(count = .N), by = by] + + # make sure that all combinations in "by" are included in the output (with + # count = 0). To achieve that, take the unique values in data and expand grid + col_vecs <- unclass(out) + col_vecs$count <- NULL + col_vecs <- lapply(col_vecs, unique) + out_empty <- expand.grid(col_vecs, stringsAsFactors = FALSE) + + out <- merge(out, out_empty, by = by, all.y = TRUE) + out[, count := nafill(count, fill = 0)] return(out[]) } diff --git a/R/bias.R b/R/bias.R deleted file mode 100644 index 25347b209..000000000 --- a/R/bias.R +++ /dev/null @@ -1,318 +0,0 @@ -#' @title Determines bias of forecasts -#' -#' @description -#' Determines bias from predictive Monte-Carlo samples. The function -#' automatically recognises, whether forecasts are continuous or -#' integer valued and adapts the Bias function accordingly. 
-#' -#' @details -#' For continuous forecasts, Bias is measured as -#' -#' \deqn{ -#' B_t (P_t, x_t) = 1 - 2 * (P_t (x_t)) -#' } -#' -#' where \eqn{P_t} is the empirical cumulative distribution function of the -#' prediction for the true value \eqn{x_t}. Computationally, \eqn{P_t (x_t)} is -#' just calculated as the fraction of predictive samples for \eqn{x_t} -#' that are smaller than \eqn{x_t}. -#' -#' For integer valued forecasts, Bias is measured as -#' -#' \deqn{ -#' B_t (P_t, x_t) = 1 - (P_t (x_t) + P_t (x_t + 1)) -#' } -#' -#' to adjust for the integer nature of the forecasts. -#' -#' In both cases, Bias can assume values between -#' -1 and 1 and is 0 ideally. -#' -#' @return vector of length n with the biases of the predictive samples with -#' respect to the true values. -#' @inheritParams ae_median_sample -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @examples -#' -#' ## integer valued forecasts -#' true_values <- rpois(30, lambda = 1:30) -#' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' bias_sample(true_values, predictions) -#' -#' ## continuous forecasts -#' true_values <- rnorm(30, mean = 1:30) -#' predictions <- replicate(200, rnorm(30, mean = 1:30)) -#' bias_sample(true_values, predictions) -#' @export -#' @references -#' The integer valued Bias function is discussed in -#' Assessing the performance of real-time epidemic forecasts: A case study of -#' Ebola in the Western Area region of Sierra Leone, 2014-15 Funk S, Camacho A, -#' Kucharski AJ, Lowe R, Eggo RM, et al. (2019) Assessing the performance of -#' real-time epidemic forecasts: A case study of Ebola in the Western Area -#' region of Sierra Leone, 2014-15. PLOS Computational Biology 15(2): e1006785. -#' \doi{10.1371/journal.pcbi.1006785} -#' @keywords metric - -bias_sample <- function(true_values, predictions) { - - # check inputs - check_true_values(true_values) - check_predictions(predictions, true_values, class = "matrix") - prediction_type <- get_prediction_type(predictions) - - # empirical cdf - n_pred <- ncol(predictions) - p_x <- rowSums(predictions <= true_values) / n_pred - - if (prediction_type == "continuous") { - res <- 1 - 2 * p_x - return(res) - } else { - # for integer case also calculate empirical cdf for (y-1) - p_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred - - res <- 1 - (p_x + p_xm1) - return(res) - } -} - -#' @title Determines Bias of Quantile Forecasts -#' -#' @description -#' Determines bias from quantile forecasts. For an increasing number of -#' quantiles this measure converges against the sample based bias version -#' for integer and continuous forecasts. -#' -#' @details -#' For quantile forecasts, bias is measured as -#' -#' \deqn{ -#' B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) -#' \mathbf{1}( x_t \leq q_{t, 0.5}) \\ -#' + (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) -#' 1( x_t \geq q_{t, 0.5}),} -#' -#' where \eqn{Q_t} is the set of quantiles that form the predictive -#' distribution at time \eqn{t}. They represent our -#' belief about what the true value $x_t$ will be. For consistency, we define -#' \eqn{Q_t} such that it always includes the element -#' \eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. -#' \eqn{1()} is the indicator function that is \eqn{1} if the -#' condition is satisfied and $0$ otherwise. 
In clearer terms, \eqn{B_t} is -#' defined as the maximum percentile rank for which the corresponding quantile -#' is still below the true value, if the true value is smaller than the -#' median of the predictive distribution. If the true value is above the -#' median of the predictive distribution, then $B_t$ is the minimum percentile -#' rank for which the corresponding quantile is still larger than the true -#' value. If the true value is exactly the median, both terms cancel out and -#' \eqn{B_t} is zero. For a large enough number of quantiles, the -#' percentile rank will equal the proportion of predictive samples below the -#' observed true value, and this metric coincides with the one for -#' continuous forecasts. -#' -#' Bias can assume values between -#' -1 and 1 and is 0 ideally (i.e. unbiased). -#' @param predictions vector of length corresponding to the number of quantiles -#' that holds predictions -#' @param quantiles vector of corresponding size with the quantiles for which -#' predictions were made. If this does not contain the median (0.5) then the -#' median is imputed as being the mean of the two innermost quantiles. -#' @inheritParams bias_range -#' @return scalar with the quantile bias for a single quantile prediction -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @examples -#' -#' predictions <- c( -#' 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, -#' 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, -#' 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, -#' 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 -#' ) -#' -#' quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) -#' -#' true_value <- 8062 -#' -#' bias_quantile(predictions, quantiles, true_value = true_value) -#' @export -#' @keywords metric - -bias_quantile <- function(predictions, quantiles, true_value) { - # check that predictions and quantiles have the same length - if (!length(predictions) == length(quantiles)) { - stop("predictions and quantiles must have the same length") - } - - if (anyNA(predictions)) { - quantiles <- quantiles[!is.na(predictions)] - predictions <- predictions[!is.na(predictions)] - } - - if (anyNA(quantiles)) { - quantiles <- quantiles[!is.na(quantiles)] - predictions <- predictions[!is.na(quantiles)] - } - - # if there is no input, return NA - if (length(quantiles) == 0 || length(predictions) == 0) { - return(NA_real_) - } - - check_quantiles(quantiles) - - if (!all(diff(predictions) >= 0)) { - stop("predictions must be increasing with quantiles") - } - - if (0.5 %in% quantiles) { - median_prediction <- predictions[quantiles == 0.5] - } else { - # if median is not available, compute as mean of two innermost quantiles - message( - "Median not available, computing as mean of two innermost quantiles", - " in order to compute bias." 
- ) - median_prediction <- - 0.5 * predictions[quantiles == max(quantiles[quantiles < 0.5])] + - 0.5 * predictions[quantiles == min(quantiles[quantiles > 0.5])] - } - - if (true_value == median_prediction) { - bias <- 0 - return(bias) - } else if (true_value < median_prediction) { - if (true_value < min(predictions)) { - bias <- 1 - } else { - q <- max(quantiles[predictions <= true_value]) - bias <- 1 - 2 * q - } - } else if (true_value > median_prediction) { - if (true_value > max(predictions)) { - bias <- -1 - } else { - q <- min(quantiles[predictions >= true_value]) - bias <- 1 - 2 * q - } - } - return(bias) -} - -#' @title Determines Bias of Quantile Forecasts based on the range of the -#' prediction intervals -#' -#' @description -#' Determines bias from quantile forecasts based on the range of the -#' prediction intervals. For an increasing number of quantiles this measure -#' converges against the sample based bias version for integer and continuous -#' forecasts. -#' -#' @details -#' For quantile forecasts, bias is measured as -#' -#' \deqn{ -#' B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) -#' \mathbf{1}( x_t \leq q_{t, 0.5}) \\ -#' + (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) -#' \mathbf{1}( x_t \geq q_{t, 0.5}), -#' }{ -#' B_t = (1 - 2 * max(i | q_{t,i} in Q_t and q_{t,i} <= x_t\)) -#' 1( x_t <= q_{t, 0.5}) + (1 - 2 * min(i | q_{t,i} in Q_t and q_{t,i} >= x_t)) -#' 1( x_t >= q_{t, 0.5}), -#' } -#' -#' where \eqn{Q_t} is the set of quantiles that form the predictive -#' distribution at time \eqn{t}. They represent our -#' belief about what the true value \eqn{x_t} will be. For consistency, we -#' define -#' \eqn{Q_t} such that it always includes the element -#' \eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. -#' \eqn{\mathbf{1}()}{1()} is the indicator function that is \eqn{1} if the -#' condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is -#' defined as the maximum percentile rank for which the corresponding quantile -#' is still below the true value, if the true value is smaller than the -#' median of the predictive distribution. If the true value is above the -#' median of the predictive distribution, then $B_t$ is the minimum percentile -#' rank for which the corresponding quantile is still larger than the true -#' value. If the true value is exactly the median, both terms cancel out and -#' \eqn{B_t} is zero. For a large enough number of quantiles, the -#' percentile rank will equal the proportion of predictive samples below the -#' observed true value, and this metric coincides with the one for -#' continuous forecasts. -#' -#' Bias can assume values between -#' -1 and 1 and is 0 ideally. 
-#' @param lower vector of length corresponding to the number of central -#' prediction intervals that holds predictions for the lower bounds of a -#' prediction interval -#' @param upper vector of length corresponding to the number of central -#' prediction intervals that holds predictions for the upper bounds of a -#' prediction interval -#' @param range vector of corresponding size with information about the width -#' of the central prediction interval -#' @param true_value a single true value -#' @return scalar with the quantile bias for a single quantile prediction -#' @seealso bias_quantile bias_sample -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @examples -#' -#' lower <- c( -#' 6341.000, 6329.500, 6087.014, 5703.500, -#' 5451.000, 5340.500, 4821.996, 4709.000, -#' 4341.500, 4006.250, 1127.000, 705.500 -#' ) -#' -#' upper <- c( -#' 6341.000, 6352.500, 6594.986, 6978.500, -#' 7231.000, 7341.500, 7860.004, 7973.000, -#' 8340.500, 8675.750, 11555.000, 11976.500 -#' ) -#' -#' range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) -#' -#' true_value <- 8062 -#' -#' bias_range( -#' lower = lower, upper = upper, -#' range = range, true_value = true_value -#' ) -#' @export -#' @keywords metric - -bias_range <- function(lower, upper, range, true_value) { - - if (anyNA(range)) { - if (is.na(range[1]) && !any(range[-1] == 0)) { - range[1] <- 0 - } - range <- range[!is.na(range)] - lower <- lower[!is.na(range)] - upper <- upper[!is.na(range)] - } - - if (length(range) > 1 && !all(diff(range) > 0)) { - stop("Range must be increasing") - } - - if (length(lower) != length(upper) || length(range) != length(lower)) { - stop("Inputs must have same length") - } - - check_quantiles(range, name = "range", range = c(0, 100)) - - # Convert range to quantiles - quantiles <- c( - rev(abs(100 - range) / (2 * 100)), - abs(100 + range[range != 0]) / (2 * 100) - ) - - # Combine predictions - upper_without_median <- upper[range != 0] - predictions <- c(rev(lower), upper_without_median) - - # Call bias_quantile - bias <- bias_quantile(predictions, quantiles, true_value) - - return(bias) -} diff --git a/R/brier_score.R b/R/brier_score.R deleted file mode 100644 index c1005e4ab..000000000 --- a/R/brier_score.R +++ /dev/null @@ -1,42 +0,0 @@ -#' Brier Score -#' -#' @description -#' Computes the Brier Score for probabilistic forecasts of binary outcomes. -#' -#' @details -#' The Brier score is a proper score rule that assesses the accuracy of -#' probabilistic binary predictions. The outcomes can be either 0 or 1, -#' the predictions must be a probability that the true outcome will be 1. -#' -#' The Brier Score is then computed as the mean squared error between the -#' probabilistic prediction and the true outcome. -#' -#' \deqn{ -#' \textrm{Brier\_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\textrm{prediction}_t - -#' \textrm{outcome}_t)^2 -#' }{ -#' Brier_Score = 1/N sum_{t = 1}^{n} (prediction_t - outcome_t)² -#' } -#' -#' @param true_values A vector with the true observed values of size n with -#' all values equal to either 0 or 1 -#' @param predictions A vector with a predicted probability -#' that true_value = 1. -#' @return A numeric value with the Brier Score, i.e. 
the mean squared -#' error of the given probability forecasts -#' @export -#' -#' @examples -#' true_values <- sample(c(0, 1), size = 30, replace = TRUE) -#' predictions <- runif(n = 30, min = 0, max = 1) -#' -#' brier_score(true_values, predictions) -#' @keywords metric - -brier_score <- function(true_values, predictions) { - check_true_values(true_values, type = "binary") - check_predictions(predictions, true_values, type = "binary") - - brierscore <- (true_values - predictions)^2 - return(brierscore) -} diff --git a/R/check-input-helpers.R b/R/check-input-helpers.R new file mode 100644 index 000000000..0470829d8 --- /dev/null +++ b/R/check-input-helpers.R @@ -0,0 +1,381 @@ +#' @title Check whether an input is an atomic vector of mode 'numeric' +#' +#' @description Helper function +#' @param x input to check +#' @inheritDotParams checkmate::check_numeric +#' @importFrom checkmate check_atomic_vector check_numeric +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_numeric_vector <- function(x, ...) { + # check functions must return TRUE on success + # and a custom error message otherwise + numeric <- check_numeric(x, ...) + vector <- check_atomic_vector(x) + if (!isTRUE(numeric)) { + return(numeric) + } else if (!isTRUE(vector)) { + return(vector) + } + return(TRUE) +} + + +#' Check that quantiles are valid +#' +#' @description +#' Helper function to check that input quantiles are valid. +#' Quantiles must be in the range specified, increase monotonically, +#' and contain no duplicates. +#' +#' This is used in [bias_range()]() and [bias_quantile()]() to +#' provide informative errors to users. +#' +#' @param quantiles Numeric vector of quantiles to check +#' @param name Character name to use in error messages +#' @param range Numeric vector giving allowed range +#' +#' @return None. Function errors if quantiles are invalid. +#' +#' @keywords internal_input_check +check_quantiles <- function(quantiles, name = "quantiles", range = c(0, 1)) { + if (any(quantiles < range[1]) || any(quantiles > range[2])) { + stop(name, " must be between ", range[1], " and ", range[2]) + } + + if (!all(diff(quantiles) > 0)) { + stop(name, " must be increasing") + } +} + + +#' @title Helper function to convert assert statements into checks +#' +#' @description Tries to execute an expression. Internally, this is used to +#' see whether assertions fail when checking inputs (i.e. to convert an +#' `assert_*()` statement into a check). If the expression fails, the error +#' message is returned. If the expression succeeds, `TRUE` is returned. +#' @param expr an expression to be evaluated +#' @importFrom checkmate assert assert_numeric check_matrix +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_try <- function(expr) { + result <- try(expr, silent = TRUE) + if (is.null(result)) { + return(TRUE) + } + msg <- conditionMessage(attr(result, "condition")) + return(msg) +} + + +#' @title Check Variable is not NULL +#' +#' @description +#' Check whether a certain variable is not `NULL` and return the name of that +#' variable and the function call where the variable is missing. This function +#' is a helper function that should only be called within other functions +#' @param ... The variables to check +#' @inherit document_assert_functions return +#' @return The function returns `NULL`, but throws an error if the variable is +#' missing. +#' +#' @keywords internal_input_check +assert_not_null <- function(...) { + vars <- list(...) 
+ varnames <- names(vars) + + calling_function <- deparse(sys.calls()[[sys.nframe() - 1]]) + + for (i in seq_along(vars)) { + varname <- varnames[i] + if (is.null(vars[[i]])) { + stop( + "variable '", varname, + "' is `NULL` in the following function call: '", + calling_function, "'" + ) + } + } + return(invisible(NULL)) +} + + +#' @title Check Length of Two Vectors is Equal +#' +#' @description +#' Check whether variables all have the same length +#' @param ... The variables to check +#' @param one_allowed logical, allow arguments of length one that can be +#' recycled +#' @param call_levels_up How many levels to go up when including the function +#' call in the error message. This is useful when calling `assert_equal_length()` +#' within another checking function. +#' @inherit document_assert_functions return +#' +#' @keywords internal_input_check +assert_equal_length <- function(..., + one_allowed = TRUE, + call_levels_up = 2) { + vars <- list(...) + lengths <- lengths(vars) + + lengths <- unique(lengths) + + if (one_allowed) { + # check passes if all have length 1 + if (all(lengths == 1)) { + return(invisible(NULL)) + } + # ignore those where length is one for later checks, as we allow length 1 + lengths <- lengths[lengths != 1] + } + + if (length(unique(lengths)) != 1) { + calling_function <- deparse(sys.calls()[[sys.nframe() - call_levels_up]]) + + lengths_message <- ifelse( + one_allowed, + "' should have the same length (or length one). Actual lengths: ", + "' should have the same length. Actual lengths: " + ) + + stop( + "Arguments to the following function call: '", + calling_function, + lengths_message, + toString(lengths) + ) + } + return(invisible(NULL)) +} + + +#' @title Check Whether There Is a Conflict Between Data and Attributes +#' @description +#' Check whether there is a conflict between a stored attribute and the +#' same value as inferred from the data. For example, this could be if +#' an attribute `forecast_unit` is stored, but is different from the +#' `forecast_unit` inferred from the data. The check is successful if +#' the stored and the inferred value are the same. +#' @param object The object to check +#' @param attribute The name of the attribute to check +#' @param expected The expected value of the attribute +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_attribute_conflict <- function(object, attribute, expected) { + existing <- attr(object, attribute) + if (is.vector(existing) && is.vector(expected)) { + existing <- sort(existing) + expected <- sort(expected) + } + + if (!is.null(existing) && !identical(existing, expected)) { + msg <- paste0( + "Object has an attribute `", attribute, "`, but it looks different ", + "from what's expected based on the data.\n", + "Existing: ", toString(existing), "\n", + "Expected: ", toString(expected), "\n", + "Running `as_forecast()` again might solve the problem" + ) + return(msg) + } + return(TRUE) +} + + +#' @title Assure that Data Has a `model` Column +#' +#' @description +#' Check whether the data.table has a column called `model`. +#' If not, a column called `model` is added with the value `Unspecified model`. 
+#' @inheritParams score +#' @return The data.table with a column called `model` +#' @keywords internal_input_check +assure_model_column <- function(data) { + if (!("model" %in% colnames(data))) { + message( + "There is no column called `model` in the data.", + "scoringutils assumes that all forecasts come from the same model" # nolint + ) + data[, model := "Unspecified model"] + } + return(data[]) +} + + +#' Check that all forecasts have the same number of quantiles or samples +#' @description Function checks the number of quantiles or samples per forecast. +#' If the number of quantiles or samples is the same for all forecasts, it +#' returns TRUE and a string with an error message otherwise. +#' @param forecast_unit Character vector denoting the unit of a single forecast. +#' @inherit document_check_functions params return +#' @keywords internal_input_check +check_number_per_forecast <- function(data, forecast_unit) { + data <- na.omit(data) + # check whether there are the same number of quantiles, samples -------------- + data[, scoringutils_InternalNumCheck := length(predicted), by = forecast_unit] + n <- unique(data$scoringutils_InternalNumCheck) + data[, scoringutils_InternalNumCheck := NULL] + if (length(n) > 1) { + msg <- paste0( + "Some forecasts have different numbers of rows ", + "(e.g. quantiles or samples). ", + "scoringutils found: ", toString(n), + ". This may be a problem (it can potentially distort scores, ", + "making it more difficult to compare them), ", + "so make sure this is intended." + ) + return(msg) + } + return(TRUE) +} + + +#' Check columns in data.frame don't have NA values +#' @description Function checks whether any of the columns in a data.frame, +#' as specified in `columns`, have NA values. If so, it returns a string with +#' an error message, otherwise it returns TRUE. +#' @inherit document_check_functions params return +#' +#' @keywords internal_input_check +check_no_NA_present <- function(data, columns) { + for (x in columns){ + if (anyNA(data[[x]])) { + msg <- paste0( + "Checking `data`: ", + sum(is.na(data[[x]])), + " values in column `", + x, + "`` are NA and corresponding rows will be removed. This is fine if not unexpected." # nolint + ) + return(msg) + } + } + return(TRUE) +} + + +#' Check that there are no duplicate forecasts +#' +#' @description +#' Runs [get_duplicate_forecasts()] and returns a message if an issue is encountered +#' @inheritParams get_duplicate_forecasts +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_duplicates <- function(data, forecast_unit = NULL) { + check_duplicates <- get_duplicate_forecasts(data, forecast_unit = forecast_unit) + + if (nrow(check_duplicates) > 0) { + msg <- paste0( + "There are instances with more than one forecast for the same target. ", + "This can't be right and needs to be resolved. Maybe you need to ", + "check the unit of a single forecast and add missing columns? Use ", + "the function get_duplicate_forecasts() to identify duplicate rows" + ) + return(msg) + } + return(TRUE) +} + + +#' Check column names are present in a data.frame +#' @description +#' The functions loops over the column names and checks whether they are +#' present. If an issue is encountered, the function immediately stops +#' and returns a message with the first issue encountered. 
+#' @inherit document_check_functions params return +#' @importFrom checkmate assert_character +#' @keywords internal_input_check +check_columns_present <- function(data, columns) { + if (is.null(columns)) { + return(TRUE) + } + assert_character(columns, min.len = 1) + colnames <- colnames(data) + missing <- list() + for (x in columns){ + if (!(x %in% colnames)) { + missing[[x]] <- x + } + } + missing <- unlist(missing) + if (length(missing) > 1) { + msg <- paste0( + "Columns '", paste(missing, collapse = "', '"), "' not found in data" + ) + return(msg) + } else if (length(missing) == 1) { + msg <- paste0("Column '", missing, "' not found in data") + return(msg) + } + return(TRUE) +} + +#' Test whether all column names are present in a data.frame +#' @description The function checks whether all column names are present. If +#' one or more columns are missing, the function returns FALSE. If all columns +#' are present, the function returns TRUE. +#' @inheritParams document_check_functions +#' @return Returns TRUE if all columns are present and FALSE otherwise +#' @keywords internal_input_check +test_columns_present <- function(data, columns) { + check <- check_columns_present(data, columns) + return(is.logical(check)) +} + +#' Test whether column names are NOT present in a data.frame +#' @description The function checks whether all column names are NOT present. +#' If none of the columns are present, the function returns TRUE. If one or +#' more columns are present, the function returns FALSE. +#' @inheritParams document_check_functions +#' @return Returns TRUE if none of the columns are present and FALSE otherwise +#' @keywords internal_input_check +test_columns_not_present <- function(data, columns) { + if (any(columns %in% colnames(data))) { + return(FALSE) + } else { + return(TRUE) + } +} + +#' Check whether data is data.frame with correct columns +#' @description Checks whether data is a data.frame, whether columns +#' "observed" and "predicted" are present, and checks that only one of +#' "quantile" and "sample_id" is present. +#' @inherit document_check_functions params return +#' @importFrom checkmate check_data_frame +#' @keywords internal_input_check +check_data_columns <- function(data) { + is_data <- check_data_frame(data, min.rows = 1) + if (!is.logical(is_data)) { + return(is_data) + } + needed <- test_columns_present(data, c("observed", "predicted")) + if (!needed) { + return("Both columns `observed` and predicted` are needed") + } + problem <- test_columns_present(data, c("sample_id", "quantile")) + if (problem) { + return( + "Found columns `quantile` and `sample_id`. 
Only one of these is allowed" + ) + } + return(TRUE) +} + + +#' Check whether an attribute is present +#' @description Checks whether an object has an attribute +#' @param object An object to be checked +#' @param attribute name of an attribute to be checked +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_has_attribute <- function(object, attribute) { + if (is.null(attr(object, attribute))) { + return( + paste0("Found no attribute `", attribute, "`") + ) + } else { + return(TRUE) + } +} diff --git a/R/check-inputs-scoring-functions.R b/R/check-inputs-scoring-functions.R new file mode 100644 index 000000000..e0c784ff2 --- /dev/null +++ b/R/check-inputs-scoring-functions.R @@ -0,0 +1,257 @@ +#' @title Assert that inputs are correct for sample-based forecast +#' @description Function assesses whether the inputs correspond to the +#' requirements for scoring sample-based forecasts. +#' @param observed Input to be checked. Should be a numeric vector with the +#' observed values of size n +#' @param predicted Input to be checked. Should be a numeric nxN matrix of +#' predictive samples, n (number of rows) being the number of data points and N +#' (number of columns) the number of samples per forecast. +#' If `observed` is just a single number, then predicted values can just be a +#' vector of size N. +#' @importFrom checkmate assert assert_numeric check_matrix +#' @inherit document_assert_functions return +#' @keywords internal_input_check +assert_input_sample <- function(observed, predicted) { + assert_numeric(observed, min.len = 1) + n_obs <- length(observed) + + if (n_obs == 1) { + assert( + # allow one of two options + check_numeric_vector(predicted, min.len = 1), + check_matrix(predicted, mode = "numeric", nrows = n_obs) + ) + } else { + assert(check_matrix(predicted, mode = "numeric", nrows = n_obs)) + } + return(invisible(NULL)) +} + +#' @title Check that inputs are correct for sample-based forecast +#' @inherit assert_input_sample params description +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_input_sample <- function(observed, predicted) { + result <- check_try(assert_input_sample(observed, predicted)) + return(result) +} + + +#' @title Assert that inputs are correct for quantile-based forecast +#' @description Function assesses whether the inputs correspond to the +#' requirements for scoring quantile-based forecasts. +#' @param observed Input to be checked. Should be a numeric vector with the +#' observed values of size n +#' @param predicted Input to be checked. Should be nxN matrix of predictive +#' quantiles, n (number of rows) being the number of data points and N +#' (number of columns) the number of quantiles per forecast. +#' If `observed` is just a single number, then predicted can just be a +#' vector of size N. +#' @param quantile Input to be checked. Should be a vector of size N that +#' denotes the quantile levels corresponding to the columns of the prediction +#' matrix. +#' @param unique_quantiles Input to be checked. Should be TRUE (default) or +#' FALSE. Whether the quantile levels are required to be unique or not. 
+#' @importFrom checkmate assert assert_numeric check_matrix check_vector +#' @inherit document_assert_functions return +#' @keywords internal_input_check +assert_input_quantile <- function(observed, predicted, quantile, + unique_quantiles = TRUE) { + assert_numeric(observed, min.len = 1) + n_obs <- length(observed) + + assert_numeric( + quantile, min.len = 1, lower = 0, upper = 1, + unique = unique_quantiles + ) + n_quantiles <- length(quantile) + if (n_obs == 1) { + assert( + # allow one of two options + check_numeric_vector(predicted, min.len = 1), + check_matrix(predicted, mode = "numeric", + nrows = n_obs, ncols = n_quantiles) + ) + assert(check_vector(quantile, len = length(predicted))) + } else { + assert( + check_matrix(predicted, mode = "numeric", + nrows = n_obs, ncols = n_quantiles) + ) + } + return(invisible(NULL)) +} + +#' @title Check that inputs are correct for quantile-based forecast +#' @inherit assert_input_quantile params description +#' @inherit check_input_sample return description +#' @keywords internal_input_check +check_input_quantile <- function(observed, predicted, quantile) { + result <- check_try(assert_input_quantile(observed, predicted, quantile)) + return(result) +} + + +#' @title Assert that inputs are correct for interval-based forecast +#' @description Function assesses whether the inputs correspond to the +#' requirements for scoring interval-based forecasts. +#' @param observed Input to be checked. Should be a numeric vector with the +#' observed values of size n +#' @param lower Input to be checked. Should be a numeric vector of size n that +#' holds the predicted value for the lower bounds of the prediction intervals. +#' @param upper Input to be checked. Should be a numeric vector of size n that +#' holds the predicted value for the upper bounds of the prediction intervals. +#' @param range Input to be checked. Should be a vector of size n that +#' denotes the interval range in percent. E.g. a value of 50 denotes a +#' (25%, 75%) prediction interval. +#' @importFrom rlang warn +#' @inherit document_assert_functions return +#' @keywords internal_input_check +assert_input_interval <- function(observed, lower, upper, range) { + + assert(check_numeric_vector(observed, min.len = 1)) + n <- length(observed) + assert(check_numeric_vector(lower, len = n)) + assert(check_numeric_vector(upper, len = n)) + assert( + check_numeric_vector(range, len = 1, lower = 0, upper = 100), + check_numeric_vector(range, len = n, lower = 0, upper = 100) + ) + + diff <- upper - lower + diff <- diff[!is.na(diff)] + if (any(diff < 0)) { + stop( + "All values in `upper` need to be greater than or equal to ", + "the corresponding values in `lower`" + ) + } + if (any(range > 0 & range < 1, na.rm = TRUE)) { + msg <- paste( + "Found interval ranges between 0 and 1. Are you sure that's right? An", + "interval range of 0.5 e.g. implies a (49.75%, 50.25%) prediction", + "interval. If you want to score a (25%, 75%) prediction interval, set", + "`interval_range = 50`." 
+ ) + rlang::warn( + message = msg, .frequency = "once", + .frequency_id = "small_interval_range" + ) + } + return(invisible(NULL)) +} + + +#' @title Check that inputs are correct for interval-based forecast +#' @inherit assert_input_interval params description +#' @inherit check_input_sample return description +#' @keywords internal_input_check +check_input_interval <- function(observed, lower, upper, range) { + result <- check_try(assert_input_quantile(observed, lower, upper, range)) + return(result) +} + + +#' @title Assert that inputs are correct for binary forecast +#' @description Function assesses whether the inputs correspond to the +#' requirements for scoring binary forecasts. +#' @param observed Input to be checked. Should be a factor of length n with +#' exactly two levels, holding the observed values. +#' The highest factor level is assumed to be the reference level. This means +#' that `predicted` represents the probability that the observed value is equal +#' to the highest factor level. +#' @param predicted Input to be checked. `predicted` should be a vector of +#' length n, holding probabilities. Alternatively, `predicted` can be a matrix +#' of size n x 1. Values represent the probability that +#' the corresponding value in `observed` will be equal to the highest +#' available factor level. +#' @importFrom checkmate assert assert_factor +#' @inherit document_assert_functions return +#' @keywords internal_input_check +assert_input_binary <- function(observed, predicted) { + assert_factor(observed, n.levels = 2, min.len = 1) + assert_numeric(predicted, lower = 0, upper = 1) + assert_dims_ok_point(observed, predicted) + return(invisible(NULL)) +} + + +#' @title Check that inputs are correct for binary forecast +#' @inherit assert_input_binary params description +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_input_binary <- function(observed, predicted) { + result <- check_try(assert_input_binary(observed, predicted)) + return(result) +} + + +#' @title Assert that inputs are correct for point forecast +#' @description Function assesses whether the inputs correspond to the +#' requirements for scoring point forecasts. +#' @param observed Input to be checked. Should be a numeric vector with the +#' observed values of size n +#' @param predicted Input to be checked. Should be a numeric vector with the +#' predicted values of size n +#' @inherit document_assert_functions return +#' @keywords internal_input_check +assert_input_point <- function(observed, predicted) { + assert(check_numeric(observed)) + assert(check_numeric(predicted)) + assert(check_dims_ok_point(observed, predicted)) + return(invisible(NULL)) +} + +#' @title Check that inputs are correct for point forecast +#' @inherit assert_input_point params description +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_input_point <- function(observed, predicted) { + result <- check_try(assert_input_point(observed, predicted)) + return(result) +} + + +#' @title Assert Inputs Have Matching Dimensions +#' @description Function assesses whether input dimensions match. In the +#' following, n is the number of observations / forecasts. Scalar values may +#' be repeated to match the length of the other input. 
+#' Allowed options are therefore +#' - `observed` is vector of length 1 or length n +#' - `predicted` is +#' - a vector of of length 1 or length n +#' - a matrix with n rows and 1 column +#' @inherit assert_input_binary +#' @inherit document_assert_functions return +#' @importFrom checkmate assert_vector check_matrix check_vector assert +#' @keywords internal_input_check +assert_dims_ok_point <- function(observed, predicted) { + assert_vector(observed, min.len = 1) + n_obs <- length(observed) + assert( + check_vector(predicted, min.len = 1, strict = TRUE), + check_matrix(predicted, ncols = 1, nrows = n_obs) + ) + dim_p <- dim(predicted) + if (!is.null(dim_p) && (length(dim_p) > 1) && (dim_p[2] > 1)) { + stop("`predicted` must be a vector or a matrix with one column. Found ", + dim(predicted)[2], " columns") + } + n_pred <- length(as.vector(predicted)) + # check that both are either of length 1 or of equal length + if ((n_obs != 1) && (n_pred != 1) && (n_obs != n_pred)) { + stop("`observed` and `predicted` must either be of length 1 or ", + "of equal length. Found ", n_obs, " and ", n_pred) + } + return(invisible(NULL)) +} + + +#' @title Check Inputs Have Matching Dimensions +#' @inherit assert_dims_ok_point params description +#' @inherit document_check_functions return +#' @keywords internal_input_check +check_dims_ok_point <- function(observed, predicted) { + result <- check_try(assert_dims_ok_point(observed, predicted)) + return(result) +} diff --git a/R/check_forecasts.R b/R/check_forecasts.R deleted file mode 100644 index 6d9ddb4bd..000000000 --- a/R/check_forecasts.R +++ /dev/null @@ -1,299 +0,0 @@ -#' @title Check forecasts -#' -#' @description Function to check the input data before running -#' [score()]. -#' -#' The data should come in one of three different formats: -#' - A format for binary predictions (see [example_binary]) -#' - A sample-based format for discrete or continuous predictions -#' (see [example_continuous] and [example_integer]) -#' - A quantile-based format (see [example_quantile]) -#' -#' @seealso Function to move from sample-based to quantile format: -#' [sample_to_quantile()] -#' @inheritParams avail_forecasts -#' @return A list with elements that give information about what `scoringutils` -#' thinks you are trying to do and potential issues. -#' -#' - `target_type` the type of the prediction target as inferred from the -#' input: 'binary', if all values in `true_value` are either 0 or 1 and values -#' in `prediction` are between 0 and 1, 'discrete' if all true values are -#' integers. -#' and 'continuous' if not. -#' - `prediction_type` inferred type of the prediction. 'quantile', if there is -#' a column called 'quantile', else 'discrete' if all values in `prediction` -#' are integer, else 'continuous. -#' - `forecast_unit` unit of a single forecast, i.e. the grouping that uniquely -#' defines a single forecast. This is assumed to be all -#' present columns apart from the following protected columns: -#' `c("prediction", "true_value", "sample", "quantile","range", "boundary")`. -#' It is important that you remove all unnecessary columns before scoring. -#' - `rows_per_forecast` a data.frame that shows how many rows (usually -#' quantiles or samples there are available per forecast. If a forecast model -#' has several entries, then there a forecasts with differing numbers of -#' quantiles / samples. -#' - `unique_values` A data.frame that shows how many unique values there are -#' present per model and column in the data. 
This doesn't directly show missing -#' values, but rather the maximum number of unique values across the whole data. -#' - `warnings` A vector with warnings. These can be ignored if you know what -#' you are doing. -#' - `errors` A vector with issues that will cause an error when running -#' [score()]. -#' - `messages` A verbal explanation of the information provided above. -#' -#' @importFrom data.table ':=' is.data.table -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @export -#' @keywords check-forecasts -#' @examples -#' check <- check_forecasts(example_quantile) -#' print(check) -#' check_forecasts(example_binary) -check_forecasts <- function(data) { - - # create lists to store results ---------------------------------------------- - out <- list() - warnings <- list() - errors <- list() - messages <- list() - - - # check data columns --------------------------------------------------------- - if (!is.data.frame(data)) { - stop("Input should be a data.frame or similar") - } - data <- data.table::as.data.table(data) - - # make sure true_value and prediction are present - if (!all(c("true_value", "prediction") %in% colnames(data))) { - stop("Data needs to have columns called `true_value` and `prediction`") - } - - # check whether any column name is a scoringutils metric - clashing_colnames <- intersect(colnames(data), available_metrics()) - if (length(clashing_colnames) > 0) { - clashing_colnames <- paste0('"', clashing_colnames, '"') - warnings <- c( - warnings, - paste0( - "At least one column in the data ", - "(", toString(clashing_colnames), ") ", - "corresponds to the name of a metric that will be computed by ", - "scoringutils. Please check `available_metrics()`" - ) - ) - } - - # check whether there is a model column present - if (!("model" %in% colnames(data))) { - messages <- c( - messages, - paste( - "There is no column called `model` in the data.", - "scoringutils therefore thinks that all forecasts come from the same model" # nolint - ) - ) - data[, model := "Unspecified model"] - } - - - # remove rows where prediction or true value are NA -------------------------- - if (anyNA(data$true_value)) { - messages <- c( - messages, - paste( - sum(is.na(data$true_value)), - "values for `true_value` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected." # nolint - ) - ) - } - if (anyNA(data$prediction)) { - messages <- c( - messages, - paste( - sum(is.na(data$prediction)), - "values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected." # nolint - ) - ) - } - data <- data[!is.na(true_value) & !is.na(prediction)] - - if (nrow(data) == 0) { - stop("After removing all NA true values and predictions, there were no observations left") - } - - - # get information about the forecasts ---------------------------------------- - prediction_type <- get_prediction_type(data) - forecast_unit <- get_forecast_unit(data) - target_type <- get_target_type(data) - - # check whether a column called 'quantile' or 'sample' is present ------------ - if (!any(c("quantile", "sample") %in% colnames(data)) && - !target_type == "binary") { - errors <- c( - errors, - "This forecast does not seem to be for a binary prediction target, so we need a column called quantile or sample" # nolint - ) - } - - # check duplicate forecasts -------------------------------------------------- - # check whether there is more than one prediction for the same target, i.e. 
- # the length of prediction is greater 1 for a sample / quantile for - # a single forecast - - check_duplicates <- find_duplicates(data, forecast_unit = forecast_unit) - - if (nrow(check_duplicates) > 0) { - errors <- c( - errors, - paste0( - "There are instances with more than one forecast for the same target. ", - "This can't be right and needs to be resolved. Maybe you need to ", - "check the unit of a single forecast and add missing columns? Use ", - "the function find_duplicates() to identify duplicate rows." - ) - ) - } - - # check whether there are the same number of quantiles, samples -------------- - data[, InternalNumCheck := length(prediction), by = forecast_unit] - n <- unique(data$InternalNumCheck) - if (length(n) > 1) { - warnings <- c( - warnings, - "Some forecasts have different numbers of rows ", - "(e.g. quantiles or samples). ", - "scoringutils found: ", toString(n), - ". This may be a problem (it can potentially distort scores, ", - "making it more difficult to compare them), ", - "so make sure this is intended." - ) - } - data[, InternalNumCheck := NULL] - - - # store info so it can be accessed by the user ------------------------------- - out[["cleaned_data"]] <- data - - # available unique values per model for the different columns - out[["unique_values"]] <- - data[, lapply(.SD, FUN = function(x) length(unique(x))), by = "model"] - - # forecast infos - out[["forecast_unit"]] <- forecast_unit - out[["target_type"]] <- target_type - out[["prediction_type"]] <- prediction_type - - out[["messages"]] <- unlist(messages) - out[["warnings"]] <- unlist(warnings) - out[["errors"]] <- unlist(errors) - - - # generate messages, warnings, errors ---------------------------------------- - if (length(messages) > 0) { - msg <- collapse_messages(type = "messages", messages) - message(msg) - } - if (length(warnings) > 0) { - msg <- collapse_messages(type = "warnings", warnings) - warning(msg) - } - if (length(errors) > 0) { - msg <- collapse_messages(type = "errors", errors) - stop(msg) - } - - # return check results - class(out) <- c("scoringutils_check", "list") - return(out) -} - - -#' @title Collapse several messages to one -#' -#' @description Internal helper function to facilitate generating messages -#' and warnings in [check_forecasts()] -#' -#' @param type character, should be either "messages", "warnings" or "errors" -#' @param messages the messages or warnings to collapse -#' -#' @return string with the message or warning -#' @keywords internal -collapse_messages <- function(type = "messages", messages) { - paste0( - "The following ", type, " were produced when checking inputs:\n", - paste(paste0(seq_along(messages), ". "), messages, collapse = "\n") - ) -} - - -#' @title Print output from `check_forecasts()` -#' -#' @description Helper function that prints the output generated by -#' [check_forecasts()] -#' -#' @param x An object of class 'scoringutils_check' as produced by -#' [check_forecasts()] -#' @param ... additional arguments (not used here) -#' -#' @return NULL -#' @export -#' @keywords check-forecasts -#' @examples -#' check <- check_forecasts(example_quantile) -#' print(check) -print.scoringutils_check <- function(x, ...) 
{ - cat("Your forecasts seem to be for a target of the following type:\n") - print(x["target_type"]) - cat("and in the following format:\n") - print(x["prediction_type"]) - - cat("The unit of a single forecast is defined by:\n") - print(x["forecast_unit"]) - - cat("Cleaned data, rows with NA values in prediction or true_value removed:\n") - print.default(x["cleaned_data"]) - - cat("Number of unique values per column per model:\n") - print.default(x["unique_values"]) - - colnames <- names(x)[names(x) %in% c("messages", "warnings", "errors")] - if (length(colnames) > 0) { - print.default(x[colnames]) - } - - return(invisible(x)) -} - - -#' @title Find duplicate forecasts -#' -#' @description Helper function to identify duplicate forecasts, i.e. -#' instances where there is more than one forecast for the same prediction -#' target. -#' -#' @param data A data.frame as used for [score()] -#' -#' @param forecast_unit A character vector with the column names that define -#' the unit of a single forecast. If missing the function tries to infer the -#' unit of a single forecast. -#' -#' @return A data.frame with all rows for which a duplicate forecast was found -#' @export -#' @keywords check-forecasts -#' @examples -#' example <- rbind(example_quantile, example_quantile[1000:1010]) -#' find_duplicates(example) - -find_duplicates <- function(data, forecast_unit) { - type <- c("sample", "quantile")[c("sample", "quantile") %in% colnames(data)] - if (missing(forecast_unit)) { - forecast_unit <- get_forecast_unit(data) - } - data <- as.data.table(data) - data[, InternalDuplicateCheck := .N, by = c(forecast_unit, type)] - out <- data[InternalDuplicateCheck > 1] - out[, InternalDuplicateCheck := NULL] - return(out[]) -} diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 41592d7d4..32822b96d 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -1,6 +1,7 @@ #' @title Transform forecasts and observed values #' -#' @description Function to transform forecasts and true values before scoring. +#' @description Function to transform forecasts and observed values before +#' scoring. #' #' @details There are a few reasons, depending on the circumstances, for #' why this might be desirable (check out the linked reference for more info). @@ -20,7 +21,7 @@ #' #' @inheritParams score #' -#' @param fun A function used to transform both true values and predictions. +#' @param fun A function used to transform both observed values and predictions. #' The default function is [log_shift()], a custom function that is essentially #' the same as [log()], but has an additional arguments (`offset`) #' that allows you add an offset before applying the logarithm. This is often @@ -67,7 +68,7 @@ #' # transform forecasts using the natural logarithm #' # negative values need to be handled (here by replacing them with 0) #' example_quantile %>% -#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' .[, observed := ifelse(observed < 0, 0, observed)] %>% #' # Here we use the default function log_shift() which is essentially the same #' # as log(), but has an additional arguments (offset) that allows you add an #' # offset before applying the logarithm. 
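## Editor's note: an illustrative sketch (not part of the diff) of the renamed
## workflow shown in the examples above. It assumes the development version of
## scoringutils from this PR, where the example data uses `observed`/`predicted`.
## log_shift() adds an offset before taking the log, so zero counts do not
## produce -Inf.
library(scoringutils)
library(data.table)
dt <- as.data.table(example_quantile)
dt[observed < 0, observed := 0]                       # handle negative values
transformed <- transform_forecasts(dt, fun = log_shift, offset = 1)
head(transformed)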
@@ -84,20 +85,20 @@ #' # specifying an offset for the log transformation removes the #' # warning caused by zeros in the data #' example_quantile %>% -#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' .[, observed := ifelse(observed < 0, 0, observed)] %>% #' transform_forecasts(offset = 1, append = FALSE) %>% #' head() #' #' # adding square root transformed forecasts to the original ones #' example_quantile %>% -#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' .[, observed := ifelse(observed < 0, 0, observed)] %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' score() %>% #' summarise_scores(by = c("model", "scale")) #' #' # adding multiple transformations #' example_quantile %>% -#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' .[, observed := ifelse(observed < 0, 0, observed)] %>% #' transform_forecasts(fun = log_shift, offset = 1) %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' head() @@ -114,7 +115,8 @@ transform_forecasts <- function(data, if (scale_col_present) { if (!("natural" %in% original_data$scale)) { stop( - "If a column 'scale' is present, entries with scale =='natural' are required for the transformation" + "If a column 'scale' is present, entries with scale =='natural' ", + "are required for the transformation" ) } if (append && (label %in% original_data$scale)) { @@ -135,8 +137,8 @@ transform_forecasts <- function(data, transformed_data <- data.table::copy(original_data) original_data[, scale := "natural"] } - transformed_data[, prediction := fun(prediction, ...)] - transformed_data[, true_value := fun(true_value, ...)] + transformed_data[, predicted := fun(predicted, ...)] + transformed_data[, observed := fun(observed, ...)] transformed_data[, scale := label] out <- rbind(original_data, transformed_data) return(out[]) @@ -145,12 +147,12 @@ transform_forecasts <- function(data, # check if a column called "scale" is already present and if so, only # restrict to transformations of the original data if (scale_col_present) { - original_data[scale == "natural", prediction := fun(prediction, ...)] - original_data[scale == "natural", true_value := fun(true_value, ...)] + original_data[scale == "natural", predicted := fun(predicted, ...)] + original_data[scale == "natural", observed := fun(observed, ...)] original_data[scale == "natural", scale := label] } else { - original_data[, prediction := fun(prediction, ...)] - original_data[, true_value := fun(true_value, ...)] + original_data[, predicted := fun(predicted, ...)] + original_data[, observed := fun(observed, ...)] } return(original_data[]) } @@ -183,7 +185,7 @@ transform_forecasts <- function(data, #' log_shift(0:9, offset = 1) #' #' transform_forecasts( -#' example_quantile[true_value > 0, ], +#' example_quantile[observed > 0, ], #' fun = log_shift, #' offset = 1 #' ) @@ -209,14 +211,14 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' @description Helper function to set the unit of a single forecast (i.e. the #' combination of columns that uniquely define a single forecast) manually. #' This simple function keeps the columns specified in `forecast_unit` (plus -#' additional protected columns, e.g. for true values, predictions or quantile -#' levels) and removes duplicate rows. +#' additional protected columns, e.g. for observed values, predictions or +#' quantile levels) and removes duplicate rows. 
#' If not done manually, `scoringutils` attempts to determine the unit #' of a single forecast automatically by simply assuming that all column names #' are relevant to determine the forecast unit. This may lead to unexpected #' behaviour, so setting the forecast unit explicitly can help make the code #' easier to debug and easier to read. When used as part of a workflow, -#' `set_forecast_unit()` can be directly piped into `check_forecasts()` to +#' `set_forecast_unit()` can be directly piped into `as_forecast()` to #' check everything is in order. #' #' @inheritParams score @@ -233,21 +235,13 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' example_quantile, #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) - set_forecast_unit <- function(data, forecast_unit) { - data <- as.data.table(data) - datacols <- colnames(data) - missing <- forecast_unit[!(forecast_unit %in% datacols)] - - if (length(missing) > 0) { - warning( - "Column(s) '", - missing, - "' are not columns of the data and will be ignored." - ) - forecast_unit <- intersect(forecast_unit, datacols) + data <- ensure_data.table(data) + missing <- check_columns_present(data, forecast_unit) + if (!is.logical(missing)) { + warning(missing) + forecast_unit <- intersect(forecast_unit, colnames(data)) } - keep_cols <- c(get_protected_columns(data), forecast_unit) out <- unique(data[, .SD, .SDcols = keep_cols])[] return(out) diff --git a/R/correlations.R b/R/correlations.R index 5b8233576..0a553e8a7 100644 --- a/R/correlations.R +++ b/R/correlations.R @@ -7,6 +7,8 @@ #' @param metrics A character vector with the metrics to show. If set to #' `NULL` (default), all metrics present in `scores` will #' be shown +#' @param digits A number indicating how many decimal places the result should +#' be rounded to. By default (`digits = NULL`) no rounding takes place. 
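## Editor's note: illustrative only. With the new `digits` argument, rounding
## of the correlation matrix is opt-in instead of being hard-coded to two
## decimal places (assumes the development version from this PR).
library(scoringutils)
scores <- score(example_quantile)
correlation(scores, digits = 2)   # rounded, as in the updated example
correlation(scores)               # unrounded; digits = NULL is the default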
#' @inheritParams pairwise_comparison #' @return A data.table with correlations for the different metrics #' @importFrom data.table setDT @@ -15,13 +17,11 @@ #' @keywords scoring #' @examples #' scores <- score(example_quantile) -#' correlation(scores) +#' correlation(scores, digits = 2) correlation <- function(scores, - metrics = NULL) { - metrics <- check_metrics(metrics) - - # check metrics are present - metrics <- names(scores)[names(scores) %in% metrics] + metrics = NULL, + digits = NULL) { + metrics <- get_metrics(scores) # if quantile column is present, throw a warning if ("quantile" %in% names(scores)) { @@ -43,7 +43,11 @@ correlation <- function(scores, df <- df[, .SD, .SDcols = names(df) %in% metrics] # define correlation matrix - cor_mat <- round(cor(as.matrix(df)), 2) + cor_mat <- cor(as.matrix(df)) + + if (!is.null(digits)) { + cor_mat <- round(cor_mat, digits) + } correlations <- setDT(as.data.frame((cor_mat)), keep.rownames = TRUE diff --git a/R/data.R b/R/data.R index 056581ada..0b67543b9 100644 --- a/R/data.R +++ b/R/data.R @@ -11,15 +11,17 @@ #' \item{location}{the country for which a prediction was made} #' \item{target_end_date}{the date for which a prediction was made} #' \item{target_type}{the target to be predicted (cases or deaths)} -#' \item{true_value}{true observed values} +#' \item{observed}{Numeric: observed values} #' \item{location_name}{name of the country for which a prediction was made} #' \item{forecast_date}{the date on which a prediction was made} #' \item{quantile}{quantile of the corresponding prediction} -#' \item{prediction}{predicted value} +#' \item{predicted}{predicted value} #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_quantile" @@ -37,15 +39,16 @@ #' \item{location}{the country for which a prediction was made} #' \item{target_end_date}{the date for which a prediction was made} #' \item{target_type}{the target to be predicted (cases or deaths)} -#' \item{true_value}{true observed values} +#' \item{observed}{observed values} #' \item{location_name}{name of the country for which a prediction was made} #' \item{forecast_date}{the date on which a prediction was made} -#' \item{quantile}{quantile of the corresponding prediction} -#' \item{prediction}{predicted value} +#' \item{predicted}{predicted value} #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_point" @@ -62,15 +65,17 @@ #' \item{location}{the country for which a prediction was made} #' \item{target_end_date}{the date for which a prediction was made} #' \item{target_type}{the target to be predicted (cases or deaths)} -#' \item{true_value}{true observed values} +#' \item{observed}{observed values} #' \item{location_name}{name of the country for which a prediction was made} #' \item{forecast_date}{the date on which a prediction 
was made} #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} -#' \item{prediction}{predicted value} -#' \item{sample}{id for the corresponding sample} +#' \item{predicted}{predicted value} +#' \item{sample_id}{id for the corresponding sample} #' } -#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_continuous" @@ -87,14 +92,17 @@ #' \item{location}{the country for which a prediction was made} #' \item{target_end_date}{the date for which a prediction was made} #' \item{target_type}{the target to be predicted (cases or deaths)} -#' \item{true_value}{true observed values} +#' \item{observed}{observed values} #' \item{location_name}{name of the country for which a prediction was made} #' \item{forecast_date}{the date on which a prediction was made} #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} -#' \item{prediction}{predicted value} -#' \item{sample}{id for the corresponding sample} +#' \item{predicted}{predicted value} +#' \item{sample_id}{id for the corresponding sample} #' } +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_integer" @@ -119,13 +127,15 @@ #' \item{location_name}{name of the country for which a prediction was made} #' \item{target_end_date}{the date for which a prediction was made} #' \item{target_type}{the target to be predicted (cases or deaths)} -#' \item{true_value}{true observed values} +#' \item{observed}{A factor with observed values} #' \item{forecast_date}{the date on which a prediction was made} #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} -#' \item{prediction}{predicted value} +#' \item{predicted}{predicted value} #' } -#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_binary" @@ -144,11 +154,13 @@ #' \item{target_type}{the target to be predicted (cases or deaths)} #' \item{forecast_date}{the date on which a prediction was made} #' \item{quantile}{quantile of the corresponding prediction} -#' \item{prediction}{predicted value} +#' \item{predicted}{predicted value} #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_quantile_forecasts_only" @@ -165,10 +177,12 @@ #' \item{location}{the country for which a prediction was made} #' \item{target_end_date}{the date for which a prediction was made} #' \item{target_type}{the target to be predicted (cases or deaths)} -#' \item{true_value}{true observed values} +#' \item{observed}{observed values} #' 
\item{location_name}{name of the country for which a prediction was made} #' } -#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +# nolint start +#' @source \url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +# nolint end "example_truth_only" #' Summary information for selected metrics diff --git a/R/default-scoring-rules.R b/R/default-scoring-rules.R new file mode 100644 index 000000000..422d63c3f --- /dev/null +++ b/R/default-scoring-rules.R @@ -0,0 +1,169 @@ +#' @title Select Scoring Rules From A List of Possible Scoring Rules +#' @description Helper function to return only the scoring rules selected by +#' the user from a list of possible scoring rules. +#' @param rules A list of scoring rules. +#' @param select A character vector of scoring rules to select from the list. +#' If `select` is `NULL` (the default), all possible scoring rules are returned. +#' @param exclude A character vector of scoring rules to exclude from the list. +#' If `select` is not `NULL`, this argument is ignored. +#' @return A list of scoring rules. +#' @keywords metric +#' @importFrom checkmate assert_subset assert_list +#' @export +#' @examples +#' select_rules( +#' rules = rules_binary(), +#' select = "brier_score" +#' ) +#' select_rules( +#' rules = rules_binary(), +#' exclude = "log_score" +#' ) +select_rules <- function(rules, select = NULL, exclude = NULL) { + assert_character(x = c(select, exclude), null.ok = TRUE) + assert_list(rules, names = "named") + allowed <- names(rules) + + if (is.null(select) && is.null(exclude)) { + return(rules) + } else if (is.null(select)) { + assert_subset(exclude, allowed) + select <- allowed[!allowed %in% exclude] + return(rules[select]) + } else { + assert_subset(select, allowed) + return(rules[select]) + } +} + + +#' @title Scoring Rules for Binary Forecasts +#' @description Helper function that returns a named list of default +#' scoring rules suitable for binary forecasts. +#' +#' The default scoring rules are: +#' - "brier_score" = [brier_score()] +#' - "log_score" = [logs_binary()] +#' @inherit select_rules params return +#' @export +#' @keywords metric +#' @examples +#' rules_binary() +#' rules_binary(select = "brier_score") +#' rules_binary(exclude = "log_score") +rules_binary <- function(select = NULL, exclude = NULL) { + all <- list( + brier_score = brier_score, + log_score = logs_binary + ) + selected <- select_rules(all, select, exclude) + return(selected) +} + + +#' @title Scoring Rules for Point Forecasts +#' @description Helper function that returns a named list of default +#' scoring rules suitable for point forecasts. 
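## Editor's note: an illustrative sketch (not part of the diff) of the new
## scoring-rule helpers defined above. rules_binary() returns a named list of
## functions; `select` and `exclude` filter that list via select_rules().
library(scoringutils)
rules_binary()                        # list(brier_score = ..., log_score = ...)
rules_binary(select = "brier_score")  # keep only the Brier score
rules_binary(exclude = "log_score")   # equivalent here: drop the log score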
+#'
+#' The default scoring rules are:
+#' - "ae_point" = [ae()][Metrics::ae()]
+#' - "se_point" = [se()][Metrics::se()]
+#' - "ape" = [ape()][Metrics::ape()]
+#' @inherit select_rules params return
+#' @export
+#' @keywords metric
+#' @examples
+#' rules_point()
+#' rules_point(select = "ape")
+rules_point <- function(select = NULL, exclude = NULL) {
+  all <- list(
+    ae_point = Metrics::ae,
+    se_point = Metrics::se,
+    ape = Metrics::ape
+  )
+  selected <- select_rules(all, select, exclude)
+  return(selected)
+}
+
+
+#' @title Scoring Rules for Sample-Based Forecasts
+#' @description Helper function that returns a named list of default
+#' scoring rules suitable for forecasts in a sample-based format.
+#'
+#' The default scoring rules are:
+#' - "bias" = [bias_sample()]
+#' - "dss" = [dss_sample()]
+#' - "crps" = [crps_sample()]
+#' - "log_score" = [logs_sample()]
+#' - "mad" = [mad_sample()]
+#' - "ae_median" = [ae_median_sample()]
+#' - "se_mean" = [se_mean_sample()]
+#' @inherit select_rules params return
+#' @export
+#' @keywords metric
+#' @examples
+#' rules_sample()
+#' rules_sample(select = "mad")
+rules_sample <- function(select = NULL, exclude = NULL) {
+  all <- list(
+    bias = bias_sample,
+    dss = dss_sample,
+    crps = crps_sample,
+    log_score = logs_sample,
+    mad = mad_sample,
+    ae_median = ae_median_sample,
+    se_mean = se_mean_sample
+  )
+  selected <- select_rules(all, select, exclude)
+  return(selected)
+}
+
+
+#' @title Scoring Rules for Quantile-Based Forecasts
+#' @description Helper function that returns a named list of default
+#' scoring rules suitable for forecasts in a quantile-based format.
+#'
+#' The default scoring rules are:
+#' - "wis" = [wis]
+#' - "overprediction" = [overprediction()]
+#' - "underprediction" = [underprediction()]
+#' - "dispersion" = [dispersion()]
+#' - "bias" = [bias_quantile()]
+#' - "interval_coverage_50" = [interval_coverage()]
+#' - "interval_coverage_90" = function(...) \{
+#'      run_safely(..., range = 90, fun = [interval_coverage])
+#'   \}
+#' - "interval_coverage_deviation" = [interval_coverage_deviation()]
+#' - "ae_median" = [ae_median_quantile()]
+#'
+#' Note: The `interval_coverage_90` scoring rule is created as a wrapper around
+#' [interval_coverage()], making use of the function [run_safely()].
+#' This construct allows the function to deal with arbitrary arguments in `...`,
+#' while making sure that only those that [interval_coverage()] can
+#' accept get passed on to it. `range = 90` is set in the function definition,
+#' as passing an argument `range = 90` to [score()] would mean it would also
+#' get passed to `interval_coverage_50`.
+#' @inherit select_rules params return
+#' @export
+#' @keywords metric
+#' @examples
+#' rules_quantile()
+#' rules_quantile(select = "wis")
+rules_quantile <- function(select = NULL, exclude = NULL) {
+  all <- list(
+    wis = wis,
+    overprediction = overprediction,
+    underprediction = underprediction,
+    dispersion = dispersion,
+    bias = bias_quantile,
+    interval_coverage_50 = interval_coverage,
+    interval_coverage_90 = function(...)
{
+      run_safely(..., range = 90, fun = interval_coverage)
+    },
+    interval_coverage_deviation = interval_coverage_deviation,
+    ae_median = ae_median_quantile
+  )
+  selected <- select_rules(all, select, exclude)
+  return(selected)
+}
diff --git a/R/documentation-templates.R b/R/documentation-templates.R
new file mode 100644
index 000000000..e89648f6a
--- /dev/null
+++ b/R/documentation-templates.R
@@ -0,0 +1,94 @@
+#' @title Documentation template for forecast types
+#'
+#' @details # Forecast types and input format
+#'
+#' Various different forecast types / forecast formats are supported. At the
+#' moment, those are
+#' - point forecasts
+#' - binary forecasts ("soft binary classification")
+#' - Probabilistic forecasts in a quantile-based format (a forecast is
+#' represented as a set of predictive quantiles)
+#' - Probabilistic forecasts in a sample-based format (a forecast is represented
+#' as a set of predictive samples)
+#'
+#' Forecast types are determined based on the columns present in the input data.
+#'
+#' *Point forecasts* require a column `observed` of type numeric and a column
+#' `predicted` of type numeric.
+#'
+#' *Binary forecasts* require a column `observed` of type factor with exactly
+#' two levels and a column `predicted` of type numeric with probabilities,
+#' corresponding to the probability that `observed` is equal to the second
+#' factor level. See details [here][brier_score()] for more information.
+#'
+#' *Quantile-based forecasts* require a column `observed` of type numeric,
+#' a column `predicted` of type numeric, and a column `quantile` of type numeric
+#' with quantile-levels (between 0 and 1).
+#'
+#' *Sample-based forecasts* require a column `observed` of type numeric,
+#' a column `predicted` of type numeric, and a column `sample_id` of type
+#' numeric with sample indices.
+#'
+#' For more information see the vignettes and the example data
+#' ([example_quantile], [example_continuous], [example_integer],
+#' [example_point], and [example_binary]).
+#'
+#' @details # Forecast unit
+#'
+#' In order to score forecasts, `scoringutils` needs to know which of the rows
+#' of the data belong together and jointly form a single forecast. This is
+#' easy e.g. for point forecasts, where there is one row per forecast. For
+#' quantile or sample-based forecasts, however, there are multiple rows that
+#' belong to a single forecast.
+#'
+#' The *forecast unit* or *unit of a single forecast* is then described by the
+#' combination of columns that uniquely identify a single forecast.
+#' For example, we could have forecasts made by different models in various
+#' locations at different time points, each for several weeks into the future.
+#' The forecast unit could then be described as
+#' `forecast_unit = c("model", "location", "forecast_date", "forecast_horizon")`.
+#' `scoringutils` automatically tries to determine the unit of a single
+#' forecast. It uses all existing columns for this, which means that no columns
+#' should be present that are unrelated to the forecast unit. As a simplistic
+#' example, if you had an additional column, "even", that is one if the row number
+#' is even and zero otherwise, then this would mess up scoring as `scoringutils`
+#' would then think that this column was relevant in defining the forecast unit.
+#'
+#' In order to avoid issues, we recommend using the function
+#' [set_forecast_unit()] to determine the forecast unit manually.
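## Editor's note: an illustrative sketch (not part of the diff) of setting the
## forecast unit explicitly, as recommended above; it mirrors the
## set_forecast_unit() example earlier in this PR. Columns unrelated to the
## forecast unit are dropped so they cannot distort the automatic grouping.
library(scoringutils)
set_forecast_unit(
  example_quantile,
  c("location", "target_end_date", "target_type", "horizon", "model")
)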
+#' The function simply drops unneeded columns, while making sure that all +#' necessary, 'protected columns' like "predicted" or "observed" are retained. +#' +#' @name forecast_types +#' @keywords internal +NULL + +#' Documentation template for check functions +#' @param data A data.frame or similar to be checked +#' @param columns A character vector of column names to check +#' @return Returns TRUE if the check was successful and a string with an +#' error message otherwise +#' @name document_check_functions +#' @keywords internal +NULL + +#' Documentation template for check functions +#' @returns Returns NULL invisibly if the assertion was successful and throws an +#' error otherwise. +#' @name document_assert_functions +#' @keywords internal +NULL + +#' Documentation template for test functions +#' @returns Returns TRUE if the check was successful and FALSE otherwise +#' @name document_test_functions +#' @keywords internal +NULL + +#' Documentation template for scoring input data +#' @param data A data frame (or similar) of forecasts following the +#' specifications detailed in [score()]. +#' @param scores A data.table of scores as produced by [score()]. +#' @name document_score_data +#' @keywords internal +NULL diff --git a/R/get_-functions.R b/R/get_-functions.R new file mode 100644 index 000000000..08bf20593 --- /dev/null +++ b/R/get_-functions.R @@ -0,0 +1,259 @@ +# Functions that help to obtain information about the data + +#' @title Infer Forecast Type +#' @description Helper function to infer the forecast type based on a +#' data.frame or similar with predictions. Please check the vignettes to +#' learn more about forecast types. +#' +#' Possible forecast types are +#' - "sample-based" +#' - "quantile-based" +#' - "binary" +#' - "point" forecast. +#' +#' The function runs additional checks to make sure the data satisfies the +#' requirements of the respective forecast type and throws an +#' informative error if any issues are found. +#' @inheritParams score +#' @return Character vector of length one with either "binary", "quantile", +#' "sample" or "point". +#' @export +#' @keywords check-forecasts +get_forecast_type <- function(data) { + assert_data_frame(data) + assert(check_columns_present(data, c("observed", "predicted"))) + if (test_forecast_type_is_binary(data)) { + forecast_type <- "binary" + } else if (test_forecast_type_is_quantile(data)) { + forecast_type <- "quantile" + } else if (test_forecast_type_is_sample(data)) { + forecast_type <- "sample" + } else if (test_forecast_type_is_point(data)) { + forecast_type <- "point" + } else { + stop( + "Checking `data`: input doesn't satisfy criteria for any forecast type. ", + "Are you missing a column `quantile` or `sample_id`? ", + "Please check the vignette for additional info." + ) + } + conflict <- check_attribute_conflict(data, "forecast_type", forecast_type) + if (!is.logical(conflict)) { + warning(conflict) + } + return(forecast_type) +} + + +#' Test whether data could be a binary forecast. +#' @description Checks type of the necessary columns. +#' @inheritParams document_check_functions +#' @importFrom checkmate test_factor test_numeric +#' @return Returns TRUE if basic requirements are satisfied and FALSE otherwise +#' @keywords internal_input_check +test_forecast_type_is_binary <- function(data) { + observed_correct <- test_factor(x = data$observed) + predicted_correct <- test_numeric(x = data$predicted) + return(observed_correct && predicted_correct) +} + +#' Test whether data could be a sample-based forecast. 
+#' @description Checks type of the necessary columns. +#' @inheritParams document_check_functions +#' @return Returns TRUE if basic requirements are satisfied and FALSE otherwise +#' @keywords internal_input_check +test_forecast_type_is_sample <- function(data) { + observed_correct <- test_numeric(x = data$observed) + predicted_correct <- test_numeric(x = data$predicted) + columns_correct <- test_columns_present(data, "sample_id") + return(observed_correct && predicted_correct && columns_correct) +} + +#' Test whether data could be a point forecast. +#' @description Checks type of the necessary columns. +#' @inheritParams document_check_functions +#' @return Returns TRUE if basic requirements are satisfied and FALSE otherwise +#' @keywords internal_input_check +test_forecast_type_is_point <- function(data) { + observed_correct <- test_numeric(x = data$observed) + predicted_correct <- test_numeric(x = data$predicted) + columns_correct <- test_columns_not_present(data, c("sample_id", "quantile")) + return(observed_correct && predicted_correct && columns_correct) +} + +#' Test whether data could be a quantile forecast. +#' @description Checks type of the necessary columns. +#' @inheritParams document_check_functions +#' @return Returns TRUE if basic requirements are satisfied and FALSE otherwise +#' @keywords internal_input_check +test_forecast_type_is_quantile <- function(data) { + observed_correct <- test_numeric(x = data$observed) + predicted_correct <- test_numeric(x = data$predicted) + columns_correct <- test_columns_present(data, "quantile") + return(observed_correct && predicted_correct && columns_correct) +} + + +#' @title Get type of a vector or matrix of observed values or predictions +#' +#' @description Internal helper function to get the type of a vector (usually +#' of observed or predicted values). The function checks whether the input is +#' a factor, or else whether it is integer (or can be coerced to integer) or +#' whether it's continuous. +#' @param x Input used to get the type. +#' @return Character vector of length one with either "classification", +#' "integer", or "continuous" +#' @keywords internal_input_check +get_type <- function(x) { + if (is.factor(x)) { + return("classification") + } + assert_numeric(as.vector(x)) + if (all(is.na(as.vector(x)))) { + stop("Can't get type: all values of are NA") + } + if (is.integer(x)) { + return("integer") + } + if ( + isTRUE(all.equal(as.vector(x), as.integer(x))) && !all(is.na(as.integer(x))) + ) { + return("integer") + } else { + return("continuous") + } +} + + +#' @title Get metrics that were used for scoring +#' @description Internal helper function to get the metrics that were used +#' to score forecasts. +#' @param scores A data.table with an attribute `metric_names` +#' @return Character vector with the metrics that were used for scoring. +#' @keywords internal_input_check +get_metrics <- function(scores) { + metric_names <- attr(scores, "metric_names") + if (is.null(metric_names)) { + stop("The data needs to have an attribute `metric_names` with the names ", + " of the metrics that were used for scoring. This should be the case ", + "if the data was produced using `score()`. Either run `score()` ", + "again, or set the attribute manually using ", + "`attr(data, 'metric_names') <- names_of_the_scoring_metrics") + } + return(metric_names) +} + + +#' @title Get unit of a single forecast +#' @description Helper function to get the unit of a single forecast, i.e. 
+#' the column names that define where a single forecast was made for. +#' This just takes all columns that are available in the data and subtracts +#' the columns that are protected, i.e. those returned by +#' [get_protected_columns()] as well as the names of the metrics that were +#' specified during scoring, if any. +#' @inheritParams validate_forecast +#' @return A character vector with the column names that define the unit of +#' a single forecast +#' @export +#' @keywords check-forecasts +get_forecast_unit <- function(data) { + # check whether there is a conflict in the forecast_unit and if so warn + protected_columns <- get_protected_columns(data) + protected_columns <- c(protected_columns, attr(data, "metric_names")) + forecast_unit <- setdiff(colnames(data), unique(protected_columns)) + return(forecast_unit) +} + + +#' @title Get protected columns from a data frame +#' +#' @description Helper function to get the names of all columns in a data frame +#' that are protected columns. +#' +#' @inheritParams validate_forecast +#' +#' @return A character vector with the names of protected columns in the data. +#' If data is `NULL` (default) then it returns a list of all columns that are +#' protected in scoringutils. +#' +#' @keywords internal +get_protected_columns <- function(data = NULL) { + + protected_columns <- c( + "predicted", "observed", "sample_id", "quantile", "upper", "lower", + "pit_value", "range", "boundary", "relative_skill", "scaled_rel_skill", + "interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation", + available_metrics(), + grep("coverage_", names(data), fixed = TRUE, value = TRUE) + ) + + if (is.null(data)) { + return(protected_columns) + } + + # only return protected columns that are present + datacols <- colnames(data) + protected_columns <- intersect( + datacols, + protected_columns + ) + + return(protected_columns) +} + + +#' @title Find duplicate forecasts +#' +#' @description Helper function to identify duplicate forecasts, i.e. +#' instances where there is more than one forecast for the same prediction +#' target. +#' +#' @param data A data.frame as used for [score()] +#' +#' @param forecast_unit A character vector with the column names that define +#' the unit of a single forecast. If `NULL` (the default) the function tries +#' to infer the unit of a single forecast. +#' +#' @return A data.frame with all rows for which a duplicate forecast was found +#' @export +#' @keywords check-forecasts +#' @examples +#' example <- rbind(example_quantile, example_quantile[1000:1010]) +#' get_duplicate_forecasts(example) + +get_duplicate_forecasts <- function(data, forecast_unit = NULL) { + type <- c("sample_id", "quantile")[c("sample_id", "quantile") %in% colnames(data)] + if (is.null(forecast_unit)) { + forecast_unit <- get_forecast_unit(data) + } + data <- as.data.table(data) + data[, scoringutils_InternalDuplicateCheck := .N, by = c(forecast_unit, type)] + out <- data[scoringutils_InternalDuplicateCheck > 1] + out[, scoringutils_InternalDuplicateCheck := NULL] + return(out[]) +} + + +#' @title Get a list of all attributes of a scoringutils object +#' +#' @param object A object of class `forecast_` +#' +#' @return A named list with the attributes of that object. 
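## Editor's note: illustrative sketch (mirrors the example above): duplicating
## a few rows lets get_duplicate_forecasts() report every forecast target that
## occurs more than once (assumes the development version from this PR).
library(scoringutils)
example <- rbind(example_quantile, example_quantile[1000:1010])
get_duplicate_forecasts(example)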
+#' @keywords internal +get_scoringutils_attributes <- function(object) { + possible_attributes <- c( + "scoringutils_by", + "forecast_unit", + "forecast_type", + "metric_names", + "messages", + "warnings" + ) + + attr_list <- list() + for (attr_name in possible_attributes) { + attr_list[[attr_name]] <- attr(object, attr_name) + } + return(attr_list) +} diff --git a/R/input-check-helpers.R b/R/input-check-helpers.R deleted file mode 100644 index 2d7e0e51b..000000000 --- a/R/input-check-helpers.R +++ /dev/null @@ -1,241 +0,0 @@ -#' @title Check Prediction Input For Lower-level Scoring Functions -#' -#' @description -#' Helper function to check inputs for lower-level score functions. -#' @param predictions an object with predictions. Depending on whether -#' `class = vector` or `class = "matrix"` this can be either a vector of length -#' n (corresponding to the length of the true_values) or a nxN matrix of -#' predictive samples, n (number of rows) being the number of data points and -#' N (number of columns) the number of Monte Carlo samples -#' @param type character, one of "continuous" (default), "integer" or "binary" that -#' defines the type of the forecast -#' @param class character, either "vector" (default) or "matrix" that determines the -#' class the input has to correspond to -#' @inheritParams ae_median_sample -#' @return NULL -#' @keywords internal - -check_predictions <- function(predictions, - true_values = NULL, - type = c("continuous", "integer", "binary"), - class = c("vector", "matrix")) { - type <- match.arg(type) - class <- match.arg(class) - - if (missing(predictions)) { - stop("argument 'predictions' missing") - } - - if (class == "vector") { - if (!is.vector(predictions)) { - msg <- sprintf( - "'predictions' should be a vector. Instead `%s` was found", - class(predictions)[1] - ) - stop(msg) - } - if (!is.null(true_values) && length(predictions) != length(true_values)) { - msg <- sprintf( - "Mismatch: 'true_values' has length `%s`, but 'predictions' has length `%s`.", # nolint - length(true_values), length(predictions) - ) - stop(msg) - } - } - - if (class == "matrix") { - if (!is.matrix(predictions)) { - msg <- sprintf( - "'predictions' should be a matrix. Instead `%s` was found", - class(predictions[1]) - ) - stop(msg) - } - if (!is.null(true_values) && nrow(predictions) != length(true_values)) { - msg <- sprintf( - "Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", - length(true_values), nrow(predictions) - ) - stop(msg) - } - } - - if (type == "integer" && - isFALSE(all.equal(as.vector(predictions), as.integer(predictions))) - ) { - warning( - "Prediction type should be 'integer', but some of the predictions are", " not integers" - ) - } - - if (type == "binary" && - isFALSE(all(predictions >= 0) && all(predictions <= 1)) - ) { - stop( - "For a binary forecast, all predictions should be probabilities between", - " 0 or 1." - ) - } - - return(NULL) -} - - -#' @title Check Observed Value Input For Lower-level Scoring Functions -#' -#' @description -#' Helper function to check inputs for lower-level score functions. 
-#' @inheritParams check_predictions -#' @return NULL -#' @keywords internal - -check_true_values <- function(true_values, - type = c("continuous", "integer", "binary")) { - type <- match.arg(type) - if (missing(true_values)) { - stop("true_values argument is missing") - } - - if (type == "integer" && - isFALSE(all.equal(true_values, as.integer(true_values))) - ) { - stop("Some of the true_values are not integers") - } - - if (type == "binary" && - isFALSE(all(true_values %in% c(0, 1))) - ) { - stop("For a binary forecast, all true_values should be either 0 or 1.") - } -} - -#' @title Check Variable is not NULL -#' -#' @description -#' Check whether a certain variable is not `NULL` and return the name of that -#' variable and the function call where the variable is missing. This function -#' is a helper function that should only be called within other functions -#' @param ... The variables to check -#' @return The function returns `NULL`, but throws an error if the variable is -#' missing. -#' -#' @keywords internal -check_not_null <- function(...) { - vars <- list(...) - varnames <- names(vars) - - calling_function <- deparse(sys.calls()[[sys.nframe() - 1]]) - - for (i in seq_along(vars)) { - varname <- varnames[i] - if (is.null(vars[[i]])) { - stop( - "variable '", varname, - "' is `NULL` in the following function call: '", - calling_function, "'" - ) - } - } - return(invisible(NULL)) -} - - -#' @title Check Length -#' -#' @description -#' Check whether variables all have the same length -#' @param ... The variables to check -#' @param one_allowed logical, allow arguments of length one that can be -#' recycled -#' -#' @return The function returns `NULL`, but throws an error if variable lengths -#' differ -#' -#' @keywords internal -check_equal_length <- function(..., - one_allowed = TRUE) { - vars <- list(...) - lengths <- lengths(vars) - - lengths <- unique(lengths) - - if (one_allowed) { - # check passes if all have length 1 - if (all(lengths == 1)) { - return(invisible(NULL)) - } - # ignore those where length is one for later checks, as we allow length 1 - lengths <- lengths[lengths != 1] - } - - if (length(unique(lengths)) != 1) { - calling_function <- deparse(sys.calls()[[sys.nframe() - 1]]) - stop( - "Arguments passed to the following function call: '", - calling_function, - "' should have the same length (or length one). Arguments have the following lengths: ", - toString(lengths) - ) - } - return(invisible(NULL)) -} - - - -#' @title Check whether the desired metrics are available in scoringutils -#' -#' @description Helper function to check whether desired metrics are -#' available. If the input is `NULL`, all metrics will be returned. -#' -#' @param metrics character vector with desired metrics -#' -#' @return A character vector with metrics that can be used for downstream -#' computation -#' -#' @keywords internal - -check_metrics <- function(metrics) { - # use all available metrics if none are given - if (is.null(metrics)) { - metrics <- available_metrics() - } - - # check desired metrics are actually available in scoringutils - available_metrics <- available_metrics() - if (!all(metrics %in% available_metrics)) { - msg <- paste( - "The following metrics are not available:", - toString(setdiff(metrics, available_metrics)) - ) - warning(msg) - } - return(metrics) -} - -#' Check that quantiles are valid -#' -#' @description -#' Helper function to check that input quantiles are valid. 
-#' Quantiles must be in the range specified, increase monotonically, -#' and contain no duplicates. -#' -#' This is used in [bias_range()] and [bias_quantile()] to -#' provide informative errors to users. -#' -#' @param quantiles Numeric vector of quantiles to check -#' @param name Character name to use in error messages -#' @param range Numeric vector giving allowed range -#' -#' @return None. Function errors if quantiles are invalid. -#' -#' @keywords internal - -check_quantiles <- function(quantiles, name = "quantiles", range = c(0, 1)) { - if (any(quantiles < range[1]) || any(quantiles > range[2])) { - stop(name, " must be between ", range[1], " and ", range[2]) - } - - if (!all(diff(quantiles) > 0)) { - stop(name, " must be increasing") - } -} diff --git a/R/interval_score.R b/R/interval_score.R deleted file mode 100644 index 39cfe532d..000000000 --- a/R/interval_score.R +++ /dev/null @@ -1,227 +0,0 @@ -#' @title Interval Score -#' -#' @description -#' Proper Scoring Rule to score quantile predictions, following Gneiting -#' and Raftery (2007). Smaller values are better. -#' -#' The score is computed as -#' -#' \deqn{ -#' \textrm{score} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower} -#' - \textrm{true\_value}) * -#' \mathbf{1}(\textrm{true\_value} < \textrm{lower}) + -#' \frac{2}{\alpha}(\textrm{true\_value} - \textrm{upper}) * -#' \mathbf{1}(\textrm{true\_value} > \textrm{upper}) -#' }{ -#' score = (upper - lower) + 2/alpha * (lower - true_value) * -#' 1(true_values < lower) + 2/alpha * (true_value - upper) * -#' 1(true_value > upper) -#' } -#' where \eqn{\mathbf{1}()}{1()} is the indicator function and -#' indicates how much is outside the prediction interval. -#' \eqn{\alpha}{alpha} is the decimal value that indicates how much is outside -#' the prediction interval. -#' -#' To improve usability, the user is asked to provide an interval range in -#' percentage terms, i.e. interval_range = 90 (percent) for a 90 percent -#' prediction interval. Correspondingly, the user would have to provide the -#' 5% and 95% quantiles (the corresponding alpha would then be 0.1). -#' No specific distribution is assumed, -#' but the range has to be symmetric (i.e you can't use the 0.1 quantile -#' as the lower bound and the 0.7 quantile as the upper). -#' Non-symmetric quantiles can be scored using the function [quantile_score()]. -#' -#' @param lower vector of size n with the prediction for the lower quantile -#' of the given range -#' @param upper vector of size n with the prediction for the upper quantile -#' of the given range -#' @param interval_range the range of the prediction intervals. i.e. if you're -#' forecasting the 0.05 and 0.95 quantile, the interval_range would be 90. -#' Can be either a single number or a vector of size n, if the range changes -#' for different forecasts to be scored. This corresponds to (100-alpha)/100 -#' in Gneiting and Raftery (2007). Internally, the range will be transformed -#' to alpha. -#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged -#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the -#' decimal value that represents how much is outside a central prediction -#' interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) -#' Default: `TRUE`. -#' @param separate_results if `TRUE` (default is `FALSE`), then the separate -#' parts of the interval score (dispersion penalty, penalties for over- and -#' under-prediction get returned as separate elements of a list). 
If you want a -#' `data.frame` instead, simply call [as.data.frame()] on the output. -#' @return vector with the scoring values, or a list with separate entries if -#' `separate_results` is `TRUE`. -#' @importFrom rlang warn -#' @inheritParams ae_median_sample -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' interval_range <- rep(90, 30) -#' alpha <- (100 - interval_range) / 100 -#' lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) -#' upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) -#' -#' interval_score( -#' true_values = true_values, -#' lower = lower, -#' upper = upper, -#' interval_range = interval_range -#' ) -#' -#' # gives a warning, as the interval_range should likely be 50 instead of 0.5 -#' interval_score(true_value = 4, upper = 2, lower = 8, interval_range = 0.5) -#' -#' # example with missing values and separate results -#' interval_score( -#' true_values = c(true_values, NA), -#' lower = c(lower, NA), -#' upper = c(NA, upper), -#' separate_results = TRUE, -#' interval_range = 90 -#' ) -#' @export -#' @keywords metric -#' @references Strictly Proper Scoring Rules, Prediction,and Estimation, -#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American -#' Statistical Association, Volume 102, 2007 - Issue 477 -#' -#' Evaluating epidemic forecasts in an interval format, -#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, -#' # nolint -#' - -interval_score <- function(true_values, - lower, - upper, - interval_range, - weigh = TRUE, - separate_results = FALSE) { - - # error handling - not sure how I can make this better - present <- c( - methods::hasArg("true_values"), methods::hasArg("lower"), - methods::hasArg("upper"), methods::hasArg("interval_range") - ) - if (!all(present)) { - stop( - "need all arguments 'true_values', 'lower', 'upper' and 'interval_range' in function 'interval_score()'" # nolint - ) - } - check_not_null( - true_values = true_values, lower = lower, upper = upper, - interval_range = interval_range - ) - check_equal_length(true_values, lower, interval_range, upper) - - if (any(interval_range < 0, na.rm = TRUE)) { - stop("interval ranges must be positive") - } - if (any(interval_range > 0 & interval_range < 1, na.rm = TRUE)) { - msg <- paste("Found interval ranges between 0 and 1. Are you sure that's right?", - "An interval range of 0.5 e.g. implies a (49.75%, 50.25%) prediction interval.", - "If you want to score a (25%, 75%) prediction interval, set interval_range = 50.") - rlang::warn(message = msg, .frequency = "once", .frequency_id = "small_interval_range") - } - - # calculate alpha from the interval range - alpha <- (100 - interval_range) / 100 - - # calculate three components of WIS - dispersion <- (upper - lower) - overprediction <- - 2 / alpha * (lower - true_values) * as.numeric(true_values < lower) - underprediction <- - 2 / alpha * (true_values - upper) * as.numeric(true_values > upper) - - if (weigh) { - dispersion <- dispersion * alpha / 2 - underprediction <- underprediction * alpha / 2 - overprediction <- overprediction * alpha / 2 - } - - score <- dispersion + underprediction + overprediction - - if (separate_results) { - return(list( - interval_score = score, - dispersion = dispersion, - underprediction = underprediction, - overprediction = overprediction - )) - } else { - return(score) - } -} - -#' @title Quantile Score -#' -#' @description -#' Proper Scoring Rule to score quantile predictions. Smaller values are better. 
-#' The quantile score is -#' closely related to the Interval score (see [interval_score()]) and is -#' the quantile equivalent that works with single quantiles instead of -#' central prediction intervals. -#' -#' @param quantiles vector of size n with the quantile values of the -#' corresponding predictions. -#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged -#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the -#' value that corresponds to the (alpha/2) or (1 - alpha/2) quantiles provided -#' and will be computed from the quantile. Alpha is the decimal value that -#' represents how much is outside a central prediction interval (E.g. for a -#' 90 percent central prediction interval, alpha is 0.1). Default: `TRUE`. -#' @return vector with the scoring values -#' @inheritParams interval_score -#' @inheritParams ae_median_sample -#' @examples -#' true_values <- rnorm(10, mean = 1:10) -#' alpha <- 0.5 -#' -#' lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) -#' upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) -#' -#' qs_lower <- quantile_score(true_values, -#' predictions = lower, -#' quantiles = alpha / 2 -#' ) -#' qs_upper <- quantile_score(true_values, -#' predictions = upper, -#' quantiles = 1 - alpha / 2 -#' ) -#' interval_score <- (qs_lower + qs_upper) / 2 -#' @export -#' @keywords metric -#' @references Strictly Proper Scoring Rules, Prediction,and Estimation, -#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American -#' Statistical Association, Volume 102, 2007 - Issue 477 -#' -#' Evaluating epidemic forecasts in an interval format, -#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, -#' -# - -quantile_score <- function(true_values, - predictions, - quantiles, - weigh = TRUE) { - - # get central prediction interval which corresponds to given quantiles - central_interval <- abs(0.5 - quantiles) * 2 - alpha <- 1 - central_interval - - # compute score - this is the version explained in the SI of Bracher et. al. - error <- abs(predictions - true_values) - score <- 2 * ifelse( - true_values <= predictions, 1 - quantiles, quantiles - ) * error - - # adapt score such that mean of unweighted quantile scores corresponds to - # unweighted interval score of the corresponding prediction interval - score <- 2 * score / alpha - - if (weigh) { - score <- score * alpha / 2 - } - - return(score) -} diff --git a/R/log_score.R b/R/log_score.R deleted file mode 100644 index 517623fb3..000000000 --- a/R/log_score.R +++ /dev/null @@ -1,33 +0,0 @@ -#' Log Score for Binary outcomes -#' -#' @description -#' Computes the Log Score for probabilistic forecasts of binary outcomes. -#' -#' @details -#' The Log Score is a proper score rule suited to assessing the accuracy of -#' probabilistic binary predictions. The outcomes can be either 0 or 1, -#' the predictions must be a probability that the true outcome will be 1. -#' -#' The Log Score is then computed as the negative logarithm of the probability -#' assigned to the true outcome. Reporting the negative logarithm means that -#' smaller values are better. -#' -#' @inheritParams brier_score -#' @return A numeric value with the Log Score, i.e. 
 the mean squared
-#' error of the given probability forecasts
-#' @importFrom methods hasArg
-#' @export
-#' @keywords metric
-#'
-#' @examples
-#' true_values <- sample(c(0, 1), size = 30, replace = TRUE)
-#' predictions <- runif(n = 30, min = 0, max = 1)
-
#' logs_binary(true_values, predictions)
-logs_binary <- function(true_values, predictions) {
-  check_true_values(true_values, type = "binary")
-  check_predictions(predictions, true_values, type = "binary")
-
-  logs <- -log(ifelse(true_values == 1, predictions, 1 - predictions))
-  return(logs)
-}
diff --git a/R/metrics-binary.R b/R/metrics-binary.R
new file mode 100644
index 000000000..104a8bcbe
--- /dev/null
+++ b/R/metrics-binary.R
@@ -0,0 +1,88 @@
+#' Metrics for Binary Outcomes
+#'
+#' @details
+#' The functions require users to provide observed values as a factor in order
+#' to distinguish its input from the input format required for scoring point
+#' forecasts. Internally, however, factors will be converted to numeric values.
+#' A factor `observed = factor(c(0, 1, 1, 0, 1))` with two levels (`0` and `1`)
+#' would internally be coerced to a numeric vector (in this case this would
+#' result in the numeric vector c(1, 2, 2, 1, 2)). After subtracting 1, the
+#' resulting vector (`c(0, 1, 1, 0, 1)` in this case) is used for internal
+#' calculations. All predictions are assumed to represent the probability that
+#' the outcome is equal to the highest factor level (in this case that the
+#' outcome is equal to 1).
+#'
+#' You could alternatively provide a vector like
+#' `observed = factor(c("a", "b", "b", "a", "b"))` (with two levels, `a` and `b`),
+#' which would result in exactly the same internal representation. Probabilities
+#' then represent the probability that the outcome is equal to "b".
+#' If you want your predictions to be probabilities that the outcome is "a",
+#' then you could of course make `observed` a factor with levels swapped, i.e.
+#' `observed = factor(c("a", "b", "b", "a", "b"), levels = c("b", "a"))`
+#'
+#' @param observed A factor of length n with exactly two levels, holding
+#' the observed values.
+#' The highest factor level is assumed to be the reference level. This means
+#' that `predicted` represents the probability that the observed value is
+#' equal to the highest factor level.
+#' @param predicted A numeric vector of length n, holding probabilities.
+#' Values represent the probability that the corresponding outcome is equal to
+#' the highest level of the factor `observed`.
+#' @examples
+#' observed <- factor(sample(c(0, 1), size = 30, replace = TRUE))
+#' predicted <- runif(n = 30, min = 0, max = 1)
+#'
+#' brier_score(observed, predicted)
+#' logs_binary(observed, predicted)
+#' @name scoring-functions-binary
+NULL
+
+
+#' @description
+#' **Brier score**
+#'
+#' The Brier Score is the mean squared error between the probabilistic
+#' prediction and the observed outcome. The Brier score is a proper scoring
+#' rule. Small values are better (best is 0, the worst is 1).
+#'
+#' \deqn{
+#' \textrm{Brier\_Score} = \frac{1}{n} \sum_{t = 1}^{n} (\textrm{prediction}_t -
+#' \textrm{outcome}_t)^2,
+#' }{
+#' Brier_Score = 1/n sum_{t = 1}^{n} (prediction_t - outcome_t)²,
+#' } where \eqn{\textrm{outcome}_t \in \{0, 1\}}{outcome_t in {0, 1}}, and
+#' \eqn{\textrm{prediction}_t \in [0, 1]}{prediction_t in [0, 1]} represents
+#' the probability that the outcome is equal to 1.
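# Editor's note: illustrative sketch only, not part of the package diff. It
# spells out the factor coercion described above with made-up probabilities.
observed <- factor(c(0, 1, 1, 0, 1))                 # two levels: "0" < "1"
obs01 <- as.numeric(observed) - 1                    # 0 1 1 0 1, used internally
predicted <- c(0.2, 0.9, 0.6, 0.1, 0.8)              # P(outcome equals "1")
(obs01 - predicted)^2                                # per-forecast Brier scores
logs <- -log(ifelse(obs01 == 1, predicted, 1 - predicted))  # per-forecast log scores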
+#' @return A numeric vector of size n with the Brier scores +#' @keywords metric +#' @export +#' @rdname scoring-functions-binary +brier_score <- function(observed, predicted) { + assert_input_binary(observed, predicted) + + observed <- as.numeric(observed) - 1 + brierscore <- (observed - predicted)^2 + return(brierscore) +} + + +#' Log Score for Binary outcomes +#' +#' @description +#' **Log score for binary outcomes** +#' +#' The Log Score is the negative logarithm of the probability +#' assigned to the observed value. It is a proper scoring rule. Small values +#' are better (best is zero, worst is infinity). +#' +#' @return A numeric vector of size n with log scores +#' @importFrom methods hasArg +#' @export +#' @keywords metric +#' @rdname scoring-functions-binary +logs_binary <- function(observed, predicted) { + assert_input_binary(observed, predicted) + observed <- as.numeric(observed) - 1 + logs <- -log(1 - abs(observed - predicted)) + return(logs) +} diff --git a/R/metrics-quantile.R b/R/metrics-quantile.R new file mode 100644 index 000000000..39cc9ba1a --- /dev/null +++ b/R/metrics-quantile.R @@ -0,0 +1,693 @@ +################################################################################ +# Metrics with a many-to-one relationship between input and score +################################################################################ + +#' Weighted Interval Score (WIS) +#' @description +#' The WIS is a proper scoring rule used to evaluate forecasts in an interval- / +#' quantile-based format. See Bracher et al. (2021). Smaller values are better. +#' +#' As the name suggest the score assumes that a forecast comes in the form of +#' one or multiple central prediction intervals. A prediction interval is +#' characterised by a lower and an upper bound formed by a pair of predictive +#' quantiles. For example, a 50% central prediction interval is formed by the +#' 0.25 and 0.75 quantiles of the predictive distribution. +#' +#' **Interval score** +#' +#' The interval score (IS) is the sum of three components: +#' overprediction, underprediction and dispersion. For a single prediction +#' interval only one of the components is non-zero. If for a single prediction +#' interval the observed value is below the lower bound, then the interval +#' score is equal to the absolute difference between the lower bound and the +#' observed value ("underprediction"). "Overprediction" is defined analogously. +#' If the observed value falls within the bounds of the prediction interval, +#' then the interval score is equal to the width of the prediction interval, +#' i.e. the difference between the upper and lower bound. For a single interval, +#' we therefore have: +#' +#' \deqn{ +#' \textrm{IS} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower} +#' - \textrm{observed}) * +#' \mathbf{1}(\textrm{observed} < \textrm{lower}) + +#' \frac{2}{\alpha}(\textrm{observed} - \textrm{upper}) * +#' \mathbf{1}(\textrm{observed} > \textrm{upper}) +#' }{ +#' score = (upper - lower) + 2/alpha * (lower - observed) * +#' 1(observed < lower) + 2/alpha * (observed - upper) * +#' 1(observed > upper) +#' } +#' where \eqn{\mathbf{1}()}{1()} is the indicator function and +#' indicates how much is outside the prediction interval. +#' \eqn{\alpha}{alpha} is the decimal value that indicates how much is outside +#' the prediction interval. For a 90% prediction interval, for example, +#' \eqn{\alpha}{alpha} is equal to 0.1. 
No specific distribution is assumed,
+#' but the range has to be symmetric (i.e. you can't use the 0.1 quantile
+#' as the lower bound and the 0.7 quantile as the upper).
+#' Non-symmetric quantiles can be scored using the function [quantile_score()].
+#'
+#' Usually the interval score is weighted by a factor that makes sure that the
+#' average score across an increasing number of equally spaced
+#' quantiles converges to the continuous ranked probability score (CRPS). This
+#' weighted score is called the weighted interval score (WIS).
+#' The weight commonly used is \eqn{\alpha / 2}{alpha / 2}.
+#'
+#' **Quantile score**
+#'
+#' In addition to the interval score, there also exists a quantile score (QS)
+#' (see [quantile_score()]), which is equal to the so-called pinball loss.
+#' The quantile score can be computed for a single quantile (whereas the
+#' interval score requires two quantiles that form an interval). However,
+#' the intuitive decomposition into overprediction, underprediction and
+#' dispersion does not exist for the quantile score.
+#'
+#' **Two versions of the weighted interval score**
+#'
+#' There are two ways to conceptualise the weighted interval score across
+#' several quantiles / prediction intervals and the median.
+#'
+#' In one view, you would treat the WIS as the average of quantile scores (and
+#' the median as 0.5-quantile) (this is the default for `wis()`). In another
+#' view, you would treat the WIS as the average of several interval scores +
+#' the difference between observed value and median forecast. The effect of
+#' that is that in contrast to the first view, the median has twice as much
+#' weight (because it is weighted like a prediction interval, rather than like
+#' a single quantile). Both are valid ways to conceptualise the WIS and you
+#' can control the behaviour with the `count_median_twice` argument.
+#'
+#' **WIS components**:
+#' WIS components can be computed individually using the functions
+#' `overprediction()`, `underprediction()`, and `dispersion()`.
+#'
+#' @inheritParams interval_score
+#' @param observed numeric vector of size n with the observed values
+#' @param predicted numeric nxN matrix of predictive
+#' quantiles, n (number of rows) being the number of forecasts (corresponding
+#' to the number of observed values) and N
+#' (number of columns) the number of quantiles per forecast.
+#' If `observed` is just a single number, then `predicted` can just be a
+#' vector of size N.
+#' @param quantile vector with quantile levels of size N
+#' @param count_median_twice if TRUE, count the median twice in the score
+#' @param na.rm if TRUE, ignore NA values when computing the score
+#' @importFrom stats weighted.mean
+#' @importFrom checkmate assert_logical
+#' @return
+#' `wis()`: a numeric vector with WIS values of size n (one per observation),
+#' or a list with separate entries if `separate_results` is `TRUE`.
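# Editor's note: illustrative sketch only, not part of the package diff. It
# shows the "average of quantile scores" view of the WIS for a single forecast
# with made-up values; the result should agree with wis() under its defaults.
observed <- 4
quantile <- c(0.25, 0.5, 0.75)
predicted <- c(2, 3, 5)
pinball <- ifelse(observed <= predicted, 1 - quantile, quantile) *
  abs(predicted - observed)
mean(2 * pinball)  # WIS when the median counts like any other quantile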
+#' @export +#' @keywords metric +#' @examples +#' observed <- c(1, -15, 22) +#' predicted <- rbind( +#' c(-1, 0, 1, 2, 3), +#' c(-2, 1, 2, 2, 4), +#' c(-2, 0, 3, 3, 4) +#' ) +#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +#' wis(observed, predicted, quantile) +wis <- function(observed, + predicted, + quantile, + separate_results = FALSE, + weigh = TRUE, + count_median_twice = FALSE, + na.rm = TRUE) { + assert_input_quantile(observed, predicted, quantile) + reformatted <- quantile_to_interval(observed, predicted, quantile) + + assert_logical(separate_results, len = 1) + assert_logical(weigh, len = 1) + assert_logical(count_median_twice, len = 1) + assert_logical(na.rm, len = 1) + + if (separate_results) { + cols <- c("wis", "dispersion", "underprediction", "overprediction") + } else { + cols <- "wis" + } + + reformatted[, eval(cols) := do.call( + interval_score, + list( + observed = observed, + lower = lower, + upper = upper, + interval_range = range, + weigh = weigh, + separate_results = separate_results + ) + )] + + if (count_median_twice) { + reformatted[, weight := 1] + } else { + reformatted[, weight := ifelse(range == 0, 0.5, 1)] + } + + # summarise results by forecast_id + reformatted <- reformatted[ + , lapply(.SD, weighted.mean, na.rm = na.rm, w = weight), + by = "forecast_id", + .SDcols = colnames(reformatted) %like% paste(cols, collapse = "|") + ] + + if (separate_results) { + return(list( + wis = reformatted$wis, + dispersion = reformatted$dispersion, + underprediction = reformatted$underprediction, + overprediction = reformatted$overprediction + )) + } else { + return(reformatted$wis) + } +} + + +#' @return +#' `dispersion()`: a numeric vector with dispersion values (one per observation) +#' @param ... Additional arguments passed on to `wis()` from functions +#' `overprediction()`, `underprediction()` and `dispersion()` +#' @export +#' @rdname wis +#' @keywords metric +dispersion <- function(observed, predicted, quantile, ...) { + args <- list(...) + args$separate_results <- TRUE + assert_input_quantile(observed, predicted, quantile) + do.call(wis, c(list(observed), list(predicted), list(quantile), args))$dispersion +} + + +#' @return +#' `overprediction()`: a numeric vector with overprediction values (one per +#' observation) +#' @export +#' @rdname wis +#' @keywords metric +overprediction <- function(observed, predicted, quantile, ...) { + args <- list(...) + args$separate_results <- TRUE + assert_input_quantile(observed, predicted, quantile) + do.call(wis, c(list(observed), list(predicted), list(quantile), args))$overprediction +} + + +#' @return +#' `underprediction()`: a numeric vector with underprediction values (one per +#' observation) +#' @export +#' @rdname wis +#' @keywords metric +underprediction <- function(observed, predicted, quantile, ...) { + args <- list(...) + args$separate_results <- TRUE + assert_input_quantile(observed, predicted, quantile) + do.call(wis, c(list(observed), list(predicted), list(quantile), args))$underprediction +} + + +#' @title Interval Coverage (For Quantile-Based Forecasts) +#' @description Check whether the observed value is within a given central +#' prediction interval. The prediction interval is defined by a lower and an +#' upper bound formed by a pair of predictive quantiles. For example, a 50% +#' prediction interval is formed by the 0.25 and 0.75 quantiles of the +#' predictive distribution. +#' @inheritParams wis +#' @param range A single number with the range of the prediction interval in +#' percent (e.g. 
50 for a 50% prediction interval) for which you want to compute
+#' interval coverage.
+#' @importFrom checkmate assert_number
+#' @return A logical vector of length n that is `TRUE` if the observed value
+#' falls within the corresponding prediction interval and `FALSE` otherwise.
+#' @name interval_coverage
+#' @export
+#' @keywords metric
+#' @examples
+#' observed <- c(1, -15, 22)
+#' predicted <- rbind(
+#'   c(-1, 0, 1, 2, 3),
+#'   c(-2, 1, 2, 2, 4),
+#'   c(-2, 0, 3, 3, 4)
+#' )
+#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+#' interval_coverage(observed, predicted, quantile)
+interval_coverage <- function(observed, predicted, quantile, range = 50) {
+  assert_input_quantile(observed, predicted, quantile)
+  assert_number(range)
+  necessary_quantiles <- c((100 - range) / 2, 100 - (100 - range) / 2) / 100
+  if (!all(necessary_quantiles %in% quantile)) {
+    warning(
+      "To compute the interval coverage for a range of ", range,
+      "%, the quantiles `", toString(necessary_quantiles),
+      "` are required. Returning `NA`."
+    )
+    return(NA)
+  }
+  r <- range
+  reformatted <- quantile_to_interval(observed, predicted, quantile)
+  reformatted <- reformatted[range %in% r]
+  reformatted[, interval_coverage := (observed >= lower) & (observed <= upper)]
+  return(reformatted$interval_coverage)
+}
+
+
+#' @title Interval Coverage Deviation (For Quantile-Based Forecasts)
+#' @description Check the agreement between desired and actual interval coverage
+#' of a forecast.
+#'
+#' The function is similar to [interval_coverage()],
+#' but takes all provided prediction intervals into account and
+#' compares nominal interval coverage (i.e. the desired interval coverage) with
+#' the actual observed interval coverage.
+#'
+#' A central symmetric prediction interval is defined by a lower and an
+#' upper bound formed by a pair of predictive quantiles. For example, a 50%
+#' prediction interval is formed by the 0.25 and 0.75 quantiles of the
+#' predictive distribution. Ideally, a forecaster should aim to cover about
+#' 50% of all observed values with their 50% prediction intervals, 90% of all
+#' observed values with their 90% prediction intervals, and so on.
+#'
+#' For every prediction interval, the deviation is computed as the difference
+#' between the observed interval coverage and the nominal interval coverage.
+#' For a single observed value and a single prediction interval, coverage is
+#' always either 0 or 1 (`FALSE` or `TRUE`). This is not the case for a single
+#' observed value and multiple prediction intervals,
+#' but it still makes little
+#' sense to compare nominal (desired) coverage and actual coverage for a single
+#' observation. Coverage deviation therefore only really starts to make
+#' sense as a metric when averaged across multiple observations.
+#'
+#' Positive values of interval coverage deviation are an indication of
+#' underconfidence, i.e. the forecaster could likely have issued a narrower
+#' forecast. Negative values are an indication of overconfidence, i.e. the
+#' forecasts were too narrow.
+#'
+#' \deqn{
+#' \textrm{interval coverage deviation} =
+#' \mathbf{1}(\textrm{observed value falls within interval}) -
+#' \textrm{nominal interval coverage}
+#' }{
+#' interval coverage deviation =
+#' 1(observed value falls within interval) - nominal interval coverage
+#' }
+#' The interval coverage deviation is then averaged across all prediction
+#' intervals. The median is ignored when computing coverage deviation.
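# Editor's note: illustrative sketch only, not part of the package diff. It
# works out the coverage deviation by hand for one forecast with a 50% and a
# 90% central prediction interval (made-up numbers).
# observation is 6: the 50% interval (2, 5) misses it, the 90% interval (1, 9) covers it
covered <- c(`50` = FALSE, `90` = TRUE)
nominal <- c(`50` = 0.5, `90` = 0.9)
mean(covered - nominal)  # (0 - 0.5 + 1 - 0.9) / 2 = -0.2, i.e. slightly overconfident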
+#' @inheritParams wis
+#' @return A numeric vector of length n with the interval coverage deviation
+#' for each forecast (comprising one or multiple prediction intervals).
+#' @export
+#' @keywords metric
+#' @examples
+#' observed <- c(1, -15, 22)
+#' predicted <- rbind(
+#'   c(-1, 0, 1, 2, 3),
+#'   c(-2, 1, 2, 2, 4),
+#'   c(-2, 0, 3, 3, 4)
+#' )
+#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+#' interval_coverage_deviation(observed, predicted, quantile)
+interval_coverage_deviation <- function(observed, predicted, quantile) {
+  assert_input_quantile(observed, predicted, quantile)
+
+  # transform available quantiles into central interval ranges
+  available_ranges <- unique(get_range_from_quantile(quantile))
+
+  # check if all necessary quantiles are available
+  necessary_quantiles <- unique(
+    c((100 - available_ranges) / 2, 100 - (100 - available_ranges) / 2) / 100
+  )
+  if (!all(necessary_quantiles %in% quantile)) {
+    missing <- necessary_quantiles[!necessary_quantiles %in% quantile]
+    warning(
+      "To compute interval coverage deviation, all quantiles must form central ",
+      "symmetric prediction intervals. Missing quantiles: ",
+      toString(missing), ". Returning `NA`."
+    )
+    return(NA)
+  }
+
+  reformatted <- quantile_to_interval(observed, predicted, quantile)[range != 0]
+  reformatted[, interval_coverage := (observed >= lower) & (observed <= upper)]
+  reformatted[, interval_coverage_deviation := interval_coverage - range / 100]
+  out <- reformatted[, .(
+    interval_coverage_deviation = mean(interval_coverage_deviation)
+  ), by = "forecast_id"]
+  return(out$interval_coverage_deviation)
+}
+
+
+#' @title Determines Bias of Quantile Forecasts
+#'
+#' @description
+#' Determines bias from quantile forecasts. For an increasing number of
+#' quantiles this measure converges to the sample-based bias version
+#' for integer and continuous forecasts.
+#'
+#' @details
+#' For quantile forecasts, bias is measured as
+#'
+#' \deqn{
+#' B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\})
+#' \mathbf{1}( x_t \leq q_{t, 0.5}) \\
+#' + (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\})
+#' \mathbf{1}( x_t \geq q_{t, 0.5}),}
+#'
+#' where \eqn{Q_t} is the set of quantiles that form the predictive
+#' distribution at time \eqn{t}. They represent our
+#' belief about what the observed value \eqn{x_t} will be. For consistency, we define
+#' \eqn{Q_t} such that it always includes the elements
+#' \eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}.
+#' \eqn{\mathbf{1}()}{1()} is the indicator function that is \eqn{1} if the
+#' condition is satisfied and \eqn{0} otherwise. In clearer terms, \eqn{B_t} is
+#' defined as the maximum percentile rank for which the corresponding quantile
+#' is still below the observed value, if the observed value is smaller than the
+#' median of the predictive distribution. If the observed value is above the
+#' median of the predictive distribution, then \eqn{B_t} is the minimum percentile
+#' rank for which the corresponding quantile is still larger than the observed
+#' value. If the observed value is exactly the median, both terms cancel out and
+#' \eqn{B_t} is zero. For a large enough number of quantiles, the
+#' percentile rank will equal the proportion of predictive samples below the
+#' observed value, and this metric coincides with the one for
+#' continuous forecasts.
+#'
+#' Bias can assume values between
+#' -1 and 1 and is 0 ideally (i.e. unbiased).
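# Editor's note: illustrative sketch only, not part of the package diff. It
# traces the definition above for a single forecast with made-up values.
quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
predicted <- c(1, 3, 5, 7, 10)
observed <- 2
# the observation lies below the median (5); 0.1 is the largest quantile level
# whose predicted quantile is still <= 2, so the bias is 1 - 2 * 0.1 = 0.8,
# which is what bias_quantile(observed, predicted, quantile) should return
1 - 2 * max(quantile[predicted <= observed])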
+#' @param observed a numeric vector of size n with the observed values
+#' @param predicted numeric nxN matrix of predictive quantiles, n (number of
+#' rows) being the number of forecasts and N (number of columns) the number of
+#' quantiles per forecast. If `observed` is just a single number, `predicted`
+#' can also be a vector of size N.
+#' @param quantile vector of size N with the quantile levels for
+#' which predictions were made. If this does not contain the median (0.5) then
+#' the median is imputed as being the mean of the two innermost quantiles.
+#' @param na.rm logical. Should missing values be removed?
+#' @return a numeric vector of length n with the quantile bias for each forecast
+#' @export
+#' @keywords metric
+#' @examples
+#' predicted <- c(
+#'   705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996,
+#'   5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000,
+#'   6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004,
+#'   7973.000, 8340.500, 8675.750, 11555.000, 11976.500
+#' )
+#'
+#' quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+#'
+#' observed <- 8062
+#'
+#' bias_quantile(observed, predicted, quantile)
+bias_quantile <- function(observed, predicted, quantile, na.rm = TRUE) {
+  assert_input_quantile(observed, predicted, quantile)
+  n <- length(observed)
+  N <- length(quantile)
+  if (is.null(dim(predicted))) {
+    dim(predicted) <- c(n, N)
+  }
+  if (!(0.5 %in% quantile)) {
+    message(
+      "Median not available, computing bias as the mean of the two ",
+      "innermost quantiles."
+    )
+  }
+  bias <- sapply(1:n, function(i) {
+    bias_quantile_single_vector(observed[i], predicted[i, ], quantile, na.rm)
+  })
+  return(bias)
+}
+
+
+#' Compute Bias for a Single Vector of Quantile Predictions
+#' @description Internal function to compute bias for a single observed value,
+#' a vector of predicted values and a vector of quantiles.
+#' @param observed scalar with the observed value
+#' @param predicted vector of length N corresponding to the number of quantiles
+#' that holds predictions
+#' @param quantile vector of corresponding size N with the quantile levels for
+#' which predictions were made. If this does not contain the median (0.5) then
+#' the median is imputed as being the mean of the two innermost quantiles.
+#' @inheritParams bias_quantile
+#' @return scalar with the quantile bias for a single forecast
+#' @keywords internal
+bias_quantile_single_vector <- function(observed, predicted, quantile, na.rm) {
+
+  assert_number(observed)
+  # other checks should have happened before
+
+  predicted_has_NAs <- anyNA(predicted)
+  quantile_has_NAs <- anyNA(quantile)
+
+  if (any(predicted_has_NAs, quantile_has_NAs)) {
+    if (na.rm) {
+      quantile <- quantile[!is.na(predicted)]
+      predicted <- predicted[!is.na(predicted)]
+      predicted <- predicted[!is.na(quantile)]
+      quantile <- quantile[!is.na(quantile)]
+    } else {
+      return(NA_real_)
+    }
+  }
+
+  order <- order(quantile)
+  quantile <- quantile[order]
+  predicted <- predicted[order]
+  if (!all(diff(predicted) >= 0)) {
+    stop("Predictions must not be decreasing with increasing quantile level")
+  }
+
+  if (0.5 %in% quantile) {
+    median_prediction <- predicted[quantile == 0.5]
+  } else {
+    # if the median is not available, compute it as the mean of the two
+    # innermost quantiles
+    median_prediction <-
+      0.5 * predicted[quantile == max(quantile[quantile < 0.5])] +
+      0.5 * predicted[quantile == min(quantile[quantile > 0.5])]
+  }
+
+  if (observed == median_prediction) {
+    bias <- 0
+    return(bias)
+  } else if (observed < median_prediction) {
+    if (observed < min(predicted)) {
+      bias <- 1
+    } else {
+      q <- max(quantile[predicted <= observed])
+      bias <- 1 - 2 * q
+    }
+  } else if (observed > median_prediction) {
+    if (observed > max(predicted)) {
+      bias <- -1
+    } else {
+      q <- min(quantile[predicted >= observed])
+      bias <- 1 - 2 * q
+    }
+  }
+  return(bias)
+}
+
+
+#' @title Absolute Error of the Median (Quantile-based Version)
+#' @description
+#' Compute the absolute error of the median calculated as
+#' \deqn{
+#' \textrm{abs}(\textrm{observed} - \textrm{median prediction})
+#' }{
+#' abs(observed - median_prediction)
+#' }
+#' The median prediction is the predicted value for which `quantile == 0.5`;
+#' the function therefore requires 0.5 to be among the quantile levels in
+#' `quantile`.
+#' @inheritParams wis
+#' @return numeric vector of length n with the absolute error of the median
+#' @seealso [ae_median_sample()]
+#' @importFrom stats median
+#' @examples
+#' observed <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
+#' ae_median_quantile(observed, predicted_values, quantile = 0.5)
+#' @export
+#' @keywords metric
+ae_median_quantile <- function(observed, predicted, quantile) {
+  assert_input_quantile(observed, predicted, quantile)
+  if (!any(quantile == 0.5)) {
+    warning(
+      "In order to compute the absolute error of the median, `0.5` must be ",
+      "among the quantiles given. Returning `NA`."
+    )
+    return(NA_real_)
+  }
+  if (is.null(dim(predicted))) {
+    predicted <- matrix(predicted, nrow = 1)
+  }
+  predicted <- predicted[, quantile == 0.5]
+  abs_error_median <- abs(observed - predicted)
+  return(abs_error_median)
+}
+
+
+################################################################################
+# Metrics with a one-to-one relationship between input and score
+################################################################################
+
+
+#' @title Quantile Score
+#'
+#' @description
+#' Proper Scoring Rule to score quantile predictions. Smaller values are better.
+#' The quantile score is
+#' closely related to the interval score (see [wis()]) and is
+#' the quantile equivalent that works with single quantiles instead of
+#' central prediction intervals.
+#'
+#' @param quantile vector of size n with the quantile levels of the
+#' corresponding predictions.
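# Editor's note: illustrative sketch only, not part of the package diff. It
# shows the asymmetry of the quantile score (pinball loss) at a single level:
# at the 0.9 quantile, an error above the prediction is penalised nine times
# more heavily than an error of the same size below it (made-up values).
quantile <- 0.9
predicted <- 10
observed <- c(12, 8)  # both observations miss the prediction by 2
2 * ifelse(observed <= predicted, 1 - quantile, quantile) * abs(predicted - observed)
# 3.6 for observed = 12, 0.4 for observed = 8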
+#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged +#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the +#' value that corresponds to the (alpha/2) or (1 - alpha/2) quantiles provided +#' and will be computed from the quantile. Alpha is the decimal value that +#' represents how much is outside a central prediction interval (E.g. for a +#' 90 percent central prediction interval, alpha is 0.1). Default: `TRUE`. +#' @return vector with the scoring values +#' @inheritParams interval_score +#' @inheritParams ae_median_sample +#' @examples +#' observed <- rnorm(10, mean = 1:10) +#' alpha <- 0.5 +#' +#' lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) +#' upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) +#' +#' qs_lower <- quantile_score(observed, +#' predicted = lower, +#' quantile = alpha / 2 +#' ) +#' qs_upper <- quantile_score(observed, +#' predicted = upper, +#' quantile = 1 - alpha / 2 +#' ) +#' interval_score <- (qs_lower + qs_upper) / 2 +#' @export +#' @keywords metric +#' @references Strictly Proper Scoring Rules, Prediction,and Estimation, +#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American +#' Statistical Association, Volume 102, 2007 - Issue 477 +#' +#' Evaluating epidemic forecasts in an interval format, +#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, +#' +quantile_score <- function(observed, + predicted, + quantile, + weigh = TRUE) { + + # compute score - this is the version explained in the SI of Bracher et. al. + error <- abs(predicted - observed) + score <- 2 * ifelse( + observed <= predicted, 1 - quantile, quantile + ) * error + + # adapt score such that mean of unweighted quantile scores corresponds to + # unweighted interval score of the corresponding prediction interval + # --> needs central prediction interval which corresponds to given quantiles + central_interval <- abs(0.5 - quantile) * 2 + alpha <- 1 - central_interval + score <- 2 * score / alpha + + # if weigh, then reverse last operation + if (weigh) { + score <- score * alpha / 2 + } + + return(score) +} + + +# Weighted Interval Score, But With One-to-One Relationship +wis_one_to_one <- function(observed, + predicted, + quantile, + separate_results = FALSE, + output = c("matrix", "data.frame", "vector"), + weigh = TRUE) { + + # input checks + assert_input_quantile(observed, predicted, quantile) + + # store original data + n <- length(observed) + N <- length(quantile) + original_data <- data.table( + forecast_id = rep(1:n, each = N), + observed = rep(observed, each = N), + predicted = as.vector(t(predicted)), + quantile = quantile + ) + + # define output columns + if (separate_results) { + cols <- c("wis", "dispersion", "underprediction", "overprediction") + } else { + cols <- "wis" + } + + # reformat input to interval format and calculate interval score + reformatted <- quantile_to_interval(observed, predicted, quantile) + reformatted[, eval(cols) := do.call( + interval_score, + list( + observed = observed, + lower = lower, + upper = upper, + interval_range = range, + weigh = weigh, + separate_results = separate_results + ) + )] + + # melt data to long format, calclate quantiles, and merge back to original + long <- melt(reformatted, + measure.vars = c("lower", "upper"), + variable.name = "boundary", + value.name = "predicted", + id.vars = c("forecast_id", "observed", "range", cols)) + # calculate quantiles + long[, quantile := (100 - range) / 200] # lower quantiles + long[boundary == "upper", quantile := 1 
- quantile] # upper quantiles + # remove boundary, range, take unique value to get rid of duplicated median + long[, c("boundary", "range") := NULL] + long <- unique(long) # should maybe check for count_median_twice? + out <- merge( + original_data, long, all.x = TRUE, + by = c("forecast_id", "observed", "predicted", "quantile") + )[, forecast_id := NULL] + + # handle returns depending on the output format + if (output == "data.frame") { + return(out) + } + + wis <- out$wis + if (separate_results) { + components <- list( + underprediction = out$underprediction, + overprediction = out$overprediction, + dispersion = out$dispersion + ) + } + + if (output == "vector" && separate_results) { + return(c(wis = wis, components)) + } else if (output == "vector") { + return(wis) + } + + if (output == "matrix") { + wis <- matrix(wis, nrow = n, ncol = N) + if (separate_results) { + components <- lapply(components, matrix, nrow = n, ncol = N) + return(c(wis, components)) + } else { + return(wis) + } + } +} diff --git a/R/metrics-range.R b/R/metrics-range.R new file mode 100644 index 000000000..172380306 --- /dev/null +++ b/R/metrics-range.R @@ -0,0 +1,135 @@ +################################################################################ +# Metrics with a one-to-one relationship between input and score +################################################################################ + +#' @title Interval Score +#' +#' @description +#' Proper Scoring Rule to score quantile predictions, following Gneiting +#' and Raftery (2007). Smaller values are better. +#' +#' The score is computed as +#' +#' \deqn{ +#' \textrm{score} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower} +#' - \textrm{observed}) * +#' \mathbf{1}(\textrm{observed} < \textrm{lower}) + +#' \frac{2}{\alpha}(\textrm{observed} - \textrm{upper}) * +#' \mathbf{1}(\textrm{observed} > \textrm{upper}) +#' }{ +#' score = (upper - lower) + 2/alpha * (lower - observed) * +#' 1(observed < lower) + 2/alpha * (observed - upper) * +#' 1(observed > upper) +#' } +#' where \eqn{\mathbf{1}()}{1()} is the indicator function and +#' indicates how much is outside the prediction interval. +#' \eqn{\alpha}{alpha} is the decimal value that indicates how much is outside +#' the prediction interval. +#' +#' To improve usability, the user is asked to provide an interval range in +#' percentage terms, i.e. interval_range = 90 (percent) for a 90 percent +#' prediction interval. Correspondingly, the user would have to provide the +#' 5% and 95% quantiles (the corresponding alpha would then be 0.1). +#' No specific distribution is assumed, +#' but the range has to be symmetric (i.e you can't use the 0.1 quantile +#' as the lower bound and the 0.7 quantile as the upper). +#' Non-symmetric quantiles can be scored using the function [quantile_score()]. +#' +#' @param lower vector of size n with the prediction for the lower quantile +#' of the given range +#' @param upper vector of size n with the prediction for the upper quantile +#' of the given range +#' @param interval_range the range of the prediction intervals. i.e. if you're +#' forecasting the 0.05 and 0.95 quantile, the interval_range would be 90. +#' Can be either a single number or a vector of size n, if the range changes +#' for different forecasts to be scored. This corresponds to (100-alpha)/100 +#' in Gneiting and Raftery (2007). Internally, the range will be transformed +#' to alpha. 
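# Editor's note: illustrative sketch only, not part of the package diff. It
# works through the range-to-alpha transformation and the unweighted interval
# score formula above for a single 90% interval (made-up numbers).
interval_range <- 90
alpha <- (100 - interval_range) / 100        # 0.1
lower <- 2; upper <- 11; observed <- 14      # observation above the interval
(upper - lower) + 2 / alpha * (observed - upper)   # 9 + 20 * 3 = 69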
+#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged +#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the +#' decimal value that represents how much is outside a central prediction +#' interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) +#' Default: `TRUE`. +#' @param separate_results if `TRUE` (default is `FALSE`), then the separate +#' parts of the interval score (dispersion penalty, penalties for over- and +#' under-prediction get returned as separate elements of a list). If you want a +#' `data.frame` instead, simply call [as.data.frame()] on the output. +#' @return vector with the scoring values, or a list with separate entries if +#' `separate_results` is `TRUE`. +#' @importFrom rlang warn +#' @inheritParams ae_median_sample +#' @examples +#' observed <- rnorm(30, mean = 1:30) +#' interval_range <- rep(90, 30) +#' alpha <- (100 - interval_range) / 100 +#' lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) +#' upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 11:40)) +#' +#' scoringutils:::interval_score( +#' observed = observed, +#' lower = lower, +#' upper = upper, +#' interval_range = interval_range +#' ) +#' +#' # gives a warning, as the interval_range should likely be 50 instead of 0.5 +#' scoringutils:::interval_score( +#' observed = 4, upper = 8, lower = 2, interval_range = 0.5 +#' ) +#' +#' # example with missing values and separate results +#' scoringutils:::interval_score( +#' observed = c(observed, NA), +#' lower = c(lower, NA), +#' upper = c(NA, upper), +#' separate_results = TRUE, +#' interval_range = 90 +#' ) +#' @keywords metric +#' @references Strictly Proper Scoring Rules, Prediction,and Estimation, +#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American +#' Statistical Association, Volume 102, 2007 - Issue 477 +#' +#' Evaluating epidemic forecasts in an interval format, +#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, +#' # nolint +#' + +interval_score <- function(observed, + lower, + upper, + interval_range, + weigh = TRUE, + separate_results = FALSE) { + + assert_input_interval(observed, lower, upper, interval_range) + + # calculate alpha from the interval range + alpha <- (100 - interval_range) / 100 + + # calculate three components of WIS + dispersion <- (upper - lower) + overprediction <- + 2 / alpha * (lower - observed) * as.numeric(observed < lower) + underprediction <- + 2 / alpha * (observed - upper) * as.numeric(observed > upper) + + if (weigh) { + dispersion <- dispersion * alpha / 2 + underprediction <- underprediction * alpha / 2 + overprediction <- overprediction * alpha / 2 + } + + score <- dispersion + underprediction + overprediction + + if (separate_results) { + return(list( + interval_score = score, + dispersion = dispersion, + underprediction = underprediction, + overprediction = overprediction + )) + } else { + return(score) + } +} diff --git a/R/metrics-sample.R b/R/metrics-sample.R new file mode 100644 index 000000000..044ca8ee0 --- /dev/null +++ b/R/metrics-sample.R @@ -0,0 +1,283 @@ +#' @title Determines bias of forecasts +#' +#' @description +#' Determines bias from predictive Monte-Carlo samples. The function +#' automatically recognises, whether forecasts are continuous or +#' integer valued and adapts the Bias function accordingly. 
+#'
+#' @details
+#' For continuous forecasts, Bias is measured as
+#'
+#' \deqn{
+#' B_t (P_t, x_t) = 1 - 2 * (P_t (x_t))
+#' }
+#'
+#' where \eqn{P_t} is the empirical cumulative distribution function of the
+#' prediction for the observed value \eqn{x_t}. Computationally, \eqn{P_t (x_t)} is
+#' just calculated as the fraction of predictive samples for \eqn{x_t}
+#' that are smaller than or equal to \eqn{x_t}.
+#'
+#' For integer valued forecasts, Bias is measured as
+#'
+#' \deqn{
+#' B_t (P_t, x_t) = 1 - (P_t (x_t) + P_t (x_t + 1))
+#' }
+#'
+#' to adjust for the integer nature of the forecasts.
+#'
+#' In both cases, Bias can assume values between
+#' -1 and 1 and is 0 ideally.
+#'
+#' @return vector of length n with the biases of the predictive samples with
+#' respect to the observed values.
+#' @inheritParams ae_median_sample
+#' @author Nikos Bosse \email{nikosbosse@@gmail.com}
+#' @examples
+#'
+#' ## integer valued forecasts
+#' observed <- rpois(30, lambda = 1:30)
+#' predicted <- replicate(200, rpois(n = 30, lambda = 1:30))
+#' bias_sample(observed, predicted)
+#'
+#' ## continuous forecasts
+#' observed <- rnorm(30, mean = 1:30)
+#' predicted <- replicate(200, rnorm(30, mean = 1:30))
+#' bias_sample(observed, predicted)
+#' @export
+#' @references
+#' The integer valued Bias function is discussed in
+#' Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, et al. (2019)
+#' Assessing the performance of real-time epidemic forecasts: A case study of
+#' Ebola in the Western Area region of Sierra Leone, 2014-15.
+#' PLOS Computational Biology 15(2): e1006785.
+#' \doi{10.1371/journal.pcbi.1006785}
+#' @keywords metric
+
+bias_sample <- function(observed, predicted) {
+
+  assert_input_sample(observed, predicted)
+  prediction_type <- get_type(predicted)
+
+  # empirical cdf
+  n_pred <- ncol(predicted)
+  p_x <- rowSums(predicted <= observed) / n_pred
+
+  if (prediction_type == "continuous") {
+    res <- 1 - 2 * p_x
+    return(res)
+  } else {
+    # for integer case also calculate empirical cdf for (y-1)
+    p_xm1 <- rowSums(predicted <= (observed - 1)) / n_pred
+
+    res <- 1 - (p_x + p_xm1)
+    return(res)
+  }
+}
+
+
+#' @title Absolute Error of the Median (Sample-based Version)
+#'
+#' @description
+#' Absolute error of the median calculated as
+#'
+#' \deqn{%
+#' \textrm{abs}(\textrm{observed} - \textrm{median\_prediction})
+#' }{%
+#' abs(observed - median_prediction)
+#' }
+#'
+#' @param observed A vector with observed values of size n
+#' @param predicted nxN matrix of predictive samples, n (number of rows) being
+#' the number of data points and N (number of columns) the number of Monte
+#' Carlo samples. Alternatively, `predicted` can just be a vector of size n.
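# Editor's note: illustrative sketch only, not part of the package diff. It
# shows the empirical-CDF calculation behind the continuous sample-based bias
# for a single forecast, using simulated samples.
set.seed(123)
observed <- 10
predicted <- rnorm(1000, mean = 15, sd = 2)  # predictive samples sit well above 10
p_x <- mean(predicted <= observed)           # empirical CDF evaluated at the observation
1 - 2 * p_x                                  # close to 1: almost all mass lies above the observation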
+#' @return vector with the scoring values +#' @seealso [ae_median_quantile()] +#' @importFrom stats median +#' @examples +#' observed <- rnorm(30, mean = 1:30) +#' predicted_values <- matrix(rnorm(30, mean = 1:30)) +#' ae_median_sample(observed, predicted_values) +#' @export +#' @keywords metric + +ae_median_sample <- function(observed, predicted) { + assert_input_sample(observed, predicted) + median_predictions <- apply( + as.matrix(predicted), MARGIN = 1, FUN = median # this is row-wise + ) + ae_median <- abs(observed - median_predictions) + return(ae_median) +} + + +#' @title Squared Error of the Mean (Sample-based Version) +#' +#' @description +#' Squared error of the mean calculated as +#' +#' \deqn{ +#' \textrm{mean}(\textrm{observed} - \textrm{mean prediction})^2 +#' }{ +#' mean(observed - mean prediction)^2 +#' } +#' The mean prediction is calculated as the mean of the predictive samples. +#' @param observed A vector with observed values of size n +#' @param predicted nxN matrix of predictive samples, n (number of rows) being +#' the number of data points and N (number of columns) the number of Monte +#' Carlo samples. Alternatively, `predicted` can just be a vector of size n. +#' @return vector with the scoring values +#' @examples +#' observed <- rnorm(30, mean = 1:30) +#' predicted_values <- matrix(rnorm(30, mean = 1:30)) +#' se_mean_sample(observed, predicted_values) +#' @export +#' @keywords metric + +se_mean_sample <- function(observed, predicted) { + assert_input_sample(observed, predicted) + mean_predictions <- rowMeans(as.matrix(predicted)) + se_mean <- (observed - mean_predictions)^2 + + return(se_mean) +} + + +#' @title Logarithmic score +#' +#' @description +#' Wrapper around the [`logs_sample()`][scoringRules::scores_sample_univ] +#' function from the +#' \pkg{scoringRules} package. Used to score continuous predictions. +#' While the Log Score is in theory also applicable +#' to integer forecasts, the problem lies in the implementation: The Log Score +#' needs a kernel density estimation, which is not well defined with +#' integer-valued Monte Carlo Samples. The Log Score can be used for specific +#' integer valued probability distributions. See the scoringRules package for +#' more details. +#' @inheritParams ae_median_sample +#' @param ... additional arguments passed to +#' [logs_sample()][scoringRules::logs_sample()] from the scoringRules package. +#' @return vector with the scoring values +#' @importFrom scoringRules logs_sample +#' @examples +#' observed <- rpois(30, lambda = 1:30) +#' predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +#' logs_sample(observed, predicted) +#' @export +#' @references +#' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic +#' Forecasts with scoringRules, +#' @keywords metric + +logs_sample <- function(observed, predicted, ...) { + assert_input_sample(observed, predicted) + scoringRules::logs_sample( + y = observed, + dat = predicted, + ... + ) +} + +#' @title Dawid-Sebastiani Score +#' +#' @description +#' Wrapper around the [`dss_sample()`][scoringRules::scores_sample_univ] +#' function from the +#' \pkg{scoringRules} package. +#' @inheritParams logs_sample +#' @param ... additional arguments passed to +#' [dss_sample()][scoringRules::dss_sample()] from the scoringRules package. 
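# Editor's note: illustrative sketch only, not part of the package diff. It
# spells out the squared error of the mean for two targets with three
# predictive samples each (made-up numbers).
observed <- c(3, 7)
predicted <- matrix(c(2, 4, 6,
                      9, 10, 11), nrow = 2, byrow = TRUE)
mean_prediction <- rowMeans(predicted)   # 4 and 10
(observed - mean_prediction)^2           # 1 and 9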
+#' @return vector with scoring values +#' @importFrom scoringRules dss_sample +#' @examples +#' observed <- rpois(30, lambda = 1:30) +#' predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +#' dss_sample(observed, predicted) +#' @export +#' @references +#' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic +#' Forecasts with scoringRules, +#' @keywords metric + +dss_sample <- function(observed, predicted, ...) { + assert_input_sample(observed, predicted) + + scoringRules::dss_sample( + y = observed, + dat = predicted, + ... + ) +} + +#' @title Ranked Probability Score +#' +#' @description +#' Wrapper around the [`crps_sample()`][scoringRules::scores_sample_univ] +#' function from the +#' \pkg{scoringRules} package. Can be used for continuous as well as integer +#' valued forecasts +#' @inheritParams logs_sample +#' @param ... additional arguments passed to +#' [crps_sample()][scoringRules::crps_sample()] from the scoringRules package. +#' @return vector with the scoring values +#' @importFrom scoringRules crps_sample +#' @examples +#' observed <- rpois(30, lambda = 1:30) +#' predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +#' crps_sample(observed, predicted) +#' @export +#' @references +#' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic +#' Forecasts with scoringRules, +#' @keywords metric + +crps_sample <- function(observed, predicted, ...) { + assert_input_sample(observed, predicted) + + scoringRules::crps_sample( + y = observed, + dat = predicted, + ... + ) +} + + +#' @title Determine dispersion of a probabilistic forecast +#' @details +#' Sharpness is the ability of the model to generate predictions within a +#' narrow range and dispersion is the lack thereof. +#' It is a data-independent measure, and is purely a feature +#' of the forecasts themselves. +#' +#' Dispersion of predictive samples corresponding to one single observed value is +#' measured as the normalised median of the absolute deviation from +#' the median of the predictive samples. For details, see [mad()][stats::mad()] +#' and the explanations given in Funk et al. (2019) +#' +#' @inheritParams ae_median_sample +#' @param observed place holder, argument will be ignored and exists only for +#' consistency with other scoring functions. The output does not depend on +#' any observed values. +#' @param ... additional arguments passed to [mad()][stats::mad()]. +#' @importFrom stats mad +#' @return vector with dispersion values +#' +#' @references +#' Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ (2019) +#' Assessing the performance of real-time epidemic forecasts: A case study of +#' Ebola in the Western Area region of Sierra Leone, 2014-15. +#' PLoS Comput Biol 15(2): e1006785. \doi{10.1371/journal.pcbi.1006785} +#' +#' @export +#' @examples +#' predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +#' mad_sample(predicted = predicted) +#' @keywords metric +mad_sample <- function(observed = NULL, predicted, ...) { + + assert_input_sample(rep(NA_real_, nrow(predicted)), predicted) + + sharpness <- apply(predicted, MARGIN = 1, mad, ...) 
+ return(sharpness) +} diff --git a/R/metrics_point_forecasts.R b/R/metrics_point_forecasts.R deleted file mode 100644 index 66d55b449..000000000 --- a/R/metrics_point_forecasts.R +++ /dev/null @@ -1,167 +0,0 @@ -#' @title Absolute Error of the Median (Sample-based Version) -#' -#' @description -#' Absolute error of the median calculated as -#' -#' \deqn{% -#' \textrm{abs}(\textrm{true\_value} - \textrm{median\_prediction}) -#' }{% -#' abs(true_value - median_prediction) -#' } -#' -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the number of Monte -#' Carlo samples. Alternatively, predictions can just be a vector of size n. -#' @return vector with the scoring values -#' @seealso [ae_median_quantile()], [abs_error()] -#' @importFrom stats median -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' ae_median_sample(true_values, predicted_values) -#' @export -#' @keywords metric - -ae_median_sample <- function(true_values, predictions) { - median_predictions <- apply( - as.matrix(predictions), MARGIN = 1, FUN = median # this is rowwise - ) - - ae_median <- abs(true_values - median_predictions) - - return(ae_median) -} - -#' @title Squared Error of the Mean (Sample-based Version) -#' -#' @description -#' Squared error of the mean calculated as -#' -#' \deqn{ -#' \textrm{mean}(\textrm{true\_value} - \textrm{prediction})^2 -#' }{ -#' mean(true_value - mean_prediction)^2 -#' } -#' -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the number of Monte -#' Carlo samples. Alternatively, predictions can just be a vector of size n. -#' @return vector with the scoring values -#' @seealso [squared_error()] -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' se_mean_sample(true_values, predicted_values) -#' @export -#' @keywords metric - -se_mean_sample <- function(true_values, predictions) { - mean_predictions <- rowMeans(as.matrix(predictions)) - se_mean <- (true_values - mean_predictions)^2 - - return(se_mean) -} - - -#' @title Absolute Error of the Median (Quantile-based Version) -#' -#' @description -#' Absolute error of the median calculated as -#' -#' \deqn{ -#' \textrm{abs}(\textrm{true\_value} - \textrm{prediction}) -#' }{ -#' abs(true_value - median_prediction) -#' } -#' -#' The function was created for internal use within [score()], but can also -#' used as a standalone function. -#' -#' @param predictions numeric vector with predictions, corresponding to the -#' quantiles in a second vector, `quantiles`. -#' @param quantiles numeric vector that denotes the quantile for the values -#' in `predictions`. Only those predictions where `quantiles == 0.5` will -#' be kept. 
If `quantiles` is `NULL`, then all `predictions` and -#' `true_values` will be used (this is then the same as [abs_error()]) -#' @return vector with the scoring values -#' @seealso [ae_median_sample()], [abs_error()] -#' @importFrom stats median -#' @inheritParams ae_median_sample -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' ae_median_quantile(true_values, predicted_values, quantiles = 0.5) -#' @export -#' @keywords metric - -ae_median_quantile <- function(true_values, predictions, quantiles = NULL) { - if (!is.null(quantiles)) { - if (!any(quantiles == 0.5) && !anyNA(quantiles)) { - return(NA_real_) - warning( - "in order to compute the absolute error of the median, `0.5` must be among the quantiles given. Maybe you want to use `abs_error()`?" # nolint - ) - } - true_values <- true_values[quantiles == 0.5] - predictions <- predictions[quantiles == 0.5] - } - abs_error_median <- abs(true_values - predictions) - return(abs_error_median) -} - - - - -#' @title Absolute Error -#' -#' @description -#' Calculate absolute error as -#' -#' \deqn{ -#' \textrm{abs}(\textrm{true\_value} - \textrm{median\_prediction}) -#' }{ -#' abs(true_value - prediction) -#' } -#' -#' @return vector with the absolute error -#' @inheritParams ae_median_quantile -#' @seealso [ae_median_sample()], [ae_median_quantile()] -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' abs_error(true_values, predicted_values) -#' @export -#' @keywords metric - -abs_error <- function(true_values, predictions) { - return(abs(true_values - predictions)) -} - - -#' @title Squared Error -#' -#' @description -#' Squared Error SE calculated as -#' -#' \deqn{ -#' (\textrm{true\_values} - \textrm{predicted\_values})^2 -#' }{ -#' (true_values - predicted_values)^2 -#' } -#' -#' @param predictions A vector with predicted values of size n -#' @return vector with the scoring values -#' @inheritParams ae_median_sample -#' @export -#' @keywords metric -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' squared_error(true_values, predicted_values) - -squared_error <- function(true_values, predictions) { - se <- (true_values - predictions)^2 - return(se) -} diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index f18be4831..86e17c95b 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -28,9 +28,8 @@ #' #' @param scores A data.table of scores as produced by [score()]. #' @param metric A character vector of length one with the metric to do the -#' comparison on. The default is "auto", meaning that either "interval_score", +#' comparison on. The default is "auto", meaning that either "wis", #' "crps", or "brier_score" will be selected where available. -#' See [available_metrics()] for available metrics. #' @param by character vector with names of columns present in the input #' data.frame. `by` determines how pairwise comparisons will be computed. #' You will get a relative skill score for every grouping level determined in @@ -68,10 +67,12 @@ pairwise_comparison <- function(scores, metric = "auto", baseline = NULL, ...) 
{ - metric <- match.arg(metric, c("auto", available_metrics())) - - scores <- data.table::as.data.table(scores) + if (is.data.table(scores)) { + scores <- copy(scores) + } else { + scores <- as.data.table(scores) + } # determine metric automatically if (metric == "auto") { @@ -236,7 +237,7 @@ pairwise_comparison_one_group <- function(scores, # calculate relative skill as geometric mean # small theta is again better (assuming that the score is negatively oriented) result[, `:=`( - theta = geom_mean_helper(ratio), + theta = geometric_mean(ratio), rel_to_baseline = NA_real_ ), by = "model" @@ -361,12 +362,14 @@ compare_two_models <- function(scores, #' Helper function to infer the metric for which pairwise comparisons shall #' be made. The function simply checks the names of the available columns and #' chooses the most widely used metric. +#' Used in [pairwise_comparison()]. +#' #' @inheritParams pairwise_comparison #' @keywords internal infer_rel_skill_metric <- function(scores) { - if ("interval_score" %in% colnames(scores)) { - rel_skill_metric <- "interval_score" + if ("wis" %in% colnames(scores)) { + rel_skill_metric <- "wis" } else if ("crps" %in% colnames(scores)) { rel_skill_metric <- "crps" } else if ("brier_score" %in% colnames(scores)) { @@ -380,3 +383,81 @@ infer_rel_skill_metric <- function(scores) { return(rel_skill_metric) } + + +#' @title Calculate Geometric Mean +#' +#' @details +#' Used in [pairwise_comparison()]. +#' +#' @param x numeric vector of values for which to calculate the geometric mean +#' @return the geometric mean of the values in `x`. `NA` values are ignored. +#' +#' @keywords internal +geometric_mean <- function(x) { + geometric_mean <- exp(mean(log(x[!is.na(x)]))) + return(geometric_mean) +} + +#' @title Simple permutation test +#' +#' @description The implementation of the permutation test follows the +#' function +#' `permutationTest` from the `surveillance` package by Michael Höhle, +#' Andrea Riebler and Michaela Paul. +#' The function compares two vectors of scores. It computes the mean of each +#' vector independently and then takes either the difference or the ratio of +#' the two. This observed difference or ratio is compared against the same +#' test statistic based on permutations of the original data. +#' +#' Used in [pairwise_comparison()]. +#' +#' @param scores1 vector of scores to compare against another vector of scores +#' @param scores2 A second vector of scores to compare against the first +#' @param n_permutation The number of replications to use for a permutation +#' test. More replications yield more exact results, but require more +#' computation. +#' @param one_sided Whether or not to compute a one-sided test. Default is +#' `FALSE`, +#' @param comparison_mode How to compute the test statistic for the comparison +#' of the two scores. Should be either "difference" or "ratio". 
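# Editor's note: illustrative sketch only, not part of the package diff. It
# mirrors the permutation test described above ("difference" mode, two-sided)
# on simulated per-target scores for two models.
set.seed(1)
scores1 <- rnorm(30, mean = 1.0)   # e.g. mean WIS of model 1 per forecast target
scores2 <- rnorm(30, mean = 1.3)   # e.g. mean WIS of model 2 per forecast target
obs_stat <- abs(mean(scores1) - mean(scores2))
perm_stat <- replicate(999, {
  swap <- rbinom(30, size = 1, prob = 0.5) == 1   # randomly swap paired scores
  s1 <- ifelse(swap, scores2, scores1)
  s2 <- ifelse(swap, scores1, scores2)
  abs(mean(s1) - mean(s2))
})
(1 + sum(perm_stat >= obs_stat)) / (999 + 1)      # permutation p-value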
+#' +#' @return p-value of the permutation test +#' @keywords internal +permutation_test <- function(scores1, + scores2, + n_permutation = 999, + one_sided = FALSE, + comparison_mode = c("difference", "ratio")) { + nTime <- length(scores1) + meanscores1 <- mean(scores1) + meanscores2 <- mean(scores2) + comparison_mode <- match.arg(comparison_mode) + if (comparison_mode == "ratio") { + # distinguish between on-sided and two-sided: + test_stat_observed <- ifelse( + one_sided, + meanscores1 / meanscores2, + max(meanscores1 / meanscores2, meanscores2 / meanscores1) + ) + } else { + test_stat_observed <- ifelse( + one_sided, + meanscores1 - meanscores2, + abs(meanscores1 - meanscores2) + ) + } + test_stat_permuted <- replicate(n_permutation, { + sel <- rbinom(nTime, size = 1, prob = 0.5) + g1 <- (sum(scores1[sel == 0]) + sum(scores2[sel == 1])) / nTime + g2 <- (sum(scores1[sel == 1]) + sum(scores2[sel == 0])) / nTime + if (comparison_mode == "ratio") { + ifelse(one_sided, g1 / g2, max(g1 / g2, g2 / g1)) + } else { + ifelse(one_sided, g1 - g2, abs(g1 - g2)) + } + }) + pVal <- (1 + sum(test_stat_permuted >= test_stat_observed)) / (n_permutation + 1) + # plus ones to make sure p-val is never 0? + return(pVal) +} diff --git a/R/pit.R b/R/pit.R index de5361bfa..ab5a580d1 100644 --- a/R/pit.R +++ b/R/pit.R @@ -56,9 +56,9 @@ #' integer predictions. #' @inheritParams ae_median_sample #' @return A vector with PIT-values. For continuous forecasts, the vector will -#' correspond to the length of `true_values`. For integer forecasts, a +#' correspond to the length of `observed`. For integer forecasts, a #' randomised PIT will be returned of length -#' `length(true_values) * n_replicates` +#' `length(observed) * n_replicates` #' @seealso [pit()] #' @importFrom stats runif #' @examples @@ -67,15 +67,15 @@ #' } #' #' ## continuous predictions -#' true_values <- rnorm(20, mean = 1:20) -#' predictions <- replicate(100, rnorm(n = 20, mean = 1:20)) -#' pit <- pit_sample(true_values, predictions) +#' observed <- rnorm(20, mean = 1:20) +#' predicted <- replicate(100, rnorm(n = 20, mean = 1:20)) +#' pit <- pit_sample(observed, predicted) #' plot_pit(pit) #' #' ## integer predictions -#' true_values <- rpois(50, lambda = 1:50) -#' predictions <- replicate(2000, rpois(n = 50, lambda = 1:50)) -#' pit <- pit_sample(true_values, predictions, n_replicates = 30) +#' observed <- rpois(50, lambda = 1:50) +#' predicted <- replicate(2000, rpois(n = 50, lambda = 1:50)) +#' pit <- pit_sample(observed, predicted, n_replicates = 30) #' plot_pit(pit) #' @export #' @references @@ -85,20 +85,20 @@ #' region of Sierra Leone, 2014-15, \doi{10.1371/journal.pcbi.1006785} #' @keywords metric -pit_sample <- function(true_values, - predictions, +pit_sample <- function(observed, + predicted, n_replicates = 100) { # error handling-------------------------------------------------------------- # check al arguments are provided - # this could be integrated into check_not_null - if (missing("true_values") || missing("predictions")) { - stop("`true_values` or `predictions` missing in function 'pit_sample()'") + # this could be integrated into assert_not_null + if (missing("observed") || missing("predicted")) { + stop("`observed` or `predicted` missing in function 'pit_sample()'") } - check_not_null(true_values = true_values, predictions = predictions) + assert_not_null(observed = observed, predicted = predicted) # check if there is more than one observation - n <- length(true_values) + n <- length(observed) if (n == 1) { message( "you need more 
than one observation to assess uniformity of the PIT" @@ -107,44 +107,44 @@ pit_sample <- function(true_values, } # check and handle format of predictions - if (is.data.frame(predictions)) { - predictions <- as.matrix(predictions) + if (is.data.frame(predicted)) { + predicted <- as.matrix(predicted) } - if (!is.matrix(predictions)) { + if (!is.matrix(predicted)) { msg <- sprintf( - "'predictions' should be a matrix. Instead `%s` was found", - class(predictions)[1] + "'predicted' should be a matrix. Instead `%s` was found", + class(predicted)[1] ) stop(msg) } - if (nrow(predictions) != n) { + if (nrow(predicted) != n) { msg <- sprintf( - "Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", - n, nrow(predictions) + "Mismatch: 'observed' has length `%s`, but 'predicted' has `%s` rows.", + n, nrow(predicted) ) stop(msg) } # check data type ------------------------------------------------------------ # check whether continuous or integer - if (isTRUE(all.equal(as.vector(predictions), as.integer(predictions)))) { + if (isTRUE(all.equal(as.vector(predicted), as.integer(predicted)))) { continuous_predictions <- FALSE } else { continuous_predictions <- TRUE } # calculate PIT-values ------------------------------------------------------- - n_pred <- ncol(predictions) + n_pred <- ncol(predicted) # calculate emipirical cumulative distribution function as - # Portion of (y_true <= y_predicted) - p_x <- rowSums(predictions <= true_values) / n_pred + # Portion of (y_observed <= y_predicted) + p_x <- rowSums(predicted <= observed) / n_pred # calculate PIT for continuous predictions case if (continuous_predictions) { pit_values <- p_x } else { - p_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred + p_xm1 <- rowSums(predicted <= (observed - 1)) / n_pred pit_values <- as.vector( replicate(n_replicates, p_xm1 + runif(1) * (p_x - p_xm1)) ) @@ -159,8 +159,8 @@ pit_sample <- function(true_values, #' @details #' see [pit()] #' -#' @param data a data.frame with the following columns: `true_value`, -#' `prediction`, `sample`. +#' @param data a data.frame with the following columns: `observed`, +#' `predicted`, `sample_id`. #' @param by Character vector with the columns according to which the #' PIT values shall be grouped. If you e.g. 
have the columns 'model' and #' 'location' in the data and want to have a PIT histogram for @@ -187,41 +187,33 @@ pit <- function(data, by, n_replicates = 100) { - check_data <- check_forecasts(data) + data <- as_forecast(data) + data <- na.omit(data) + forecast_type <- get_forecast_type(data) - data <- check_data$cleaned_data - prediction_type <- check_data$prediction_type - - # if prediction type is quantile, simply extract coverage values from - # score and returned a list with named vectors - if (prediction_type == "quantile") { - coverage <- - score(data, metrics = "quantile_coverage") - - coverage <- summarise_scores(coverage, - by = unique(c(by, "quantile")) - ) - - coverage <- coverage[order(quantile), + if (forecast_type == "quantile") { + data[, quantile_coverage := (observed <= predicted)] + quantile_coverage <- data[, .(quantile_coverage = mean(quantile_coverage)), + by = c(unique(c(by, "quantile")))] + quantile_coverage <- quantile_coverage[order(quantile), .( quantile = c(quantile, 1), pit_value = diff(c(0, quantile_coverage, 1)) ), - by = c(get_forecast_unit(coverage)) + by = c(get_forecast_unit(quantile_coverage)) ] - - return(coverage[]) + return(quantile_coverage[]) } # if prediction type is not quantile, calculate PIT values based on samples data_wide <- data.table::dcast(data, - ... ~ paste0("InternalSampl_", sample), - value.var = "prediction" + ... ~ paste0("InternalSampl_", sample_id), + value.var = "predicted" ) pit <- data_wide[, .(pit_value = pit_sample( - true_values = true_value, - predictions = as.matrix(.SD) + observed = observed, + predicted = as.matrix(.SD) )), by = by, .SDcols = grepl("InternalSampl_", names(data_wide), fixed = TRUE) diff --git a/R/plot.R b/R/plot.R index 0aa7d4185..928ab7760 100644 --- a/R/plot.R +++ b/R/plot.R @@ -47,14 +47,10 @@ plot_score_table <- function(scores, # identify metrics ----------------------------------------------------------- id_vars <- get_forecast_unit(scores) - if (is.null(metrics)) { - metrics <- names(scores)[names(scores) %in% available_metrics()] - } + metrics <- get_metrics(scores) - scores <- delete_columns( - scores, - names(scores)[!(names(scores) %in% c(metrics, id_vars))] - ) + cols_to_delete <- names(scores)[!(names(scores) %in% c(metrics, id_vars))] + suppressWarnings(scores[, eval(cols_to_delete) := NULL]) # compute scaled values ------------------------------------------------------ # scaling is done in order to colour the different scores @@ -63,7 +59,7 @@ plot_score_table <- function(scores, # define which metrics are scaled using min (larger is worse) and # which not (metrics like bias where deviations in both directions are bad) - metrics_zero_good <- c("bias", "coverage_deviation") + metrics_zero_good <- c("bias", "interval_coverage_deviation") metrics_no_color <- "coverage" metrics_min_good <- setdiff(metrics, c( @@ -225,7 +221,7 @@ plot_wis <- function(scores, #' produced by [score()] or [summarise_scores()]. Note that "range" must be included #' in the `by` argument when running [summarise_scores()] #' @param y The variable from the scores you want to show on the y-Axis. -#' This could be something like "interval_score" (the default) or "dispersion" +#' This could be something like "wis" (the default) or "dispersion" #' @param x The variable from the scores you want to show on the x-Axis. 
#' Usually this will be "model" #' @param colour Character vector of length one used to determine a variable @@ -237,18 +233,19 @@ plot_wis <- function(scores, #' @export #' @examples #' library(ggplot2) -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) -#' -#' plot_ranges(scores, x = "model") + +#' ex <- example_quantile +#' ex$interval_range <- scoringutils:::get_range_from_quantile(ex$quantile) +#' scores <- score(ex, metrics = list("wis" = wis)) +#' scores$range <- scores$interval_range +#' summarised <- summarise_scores( +#' scores, +#' by = c("model", "target_type", "range") +#' ) +#' plot_ranges(summarised, x = "model") + #' facet_wrap(~target_type, scales = "free") -#' -#' # visualise dispersion instead of interval score -#' plot_ranges(scores, y = "dispersion", x = "model") + -#' facet_wrap(~target_type) plot_ranges <- function(scores, - y = "interval_score", + y = "wis", x = "model", colour = "range") { plot <- ggplot( @@ -300,7 +297,7 @@ plot_ranges <- function(scores, #' @export #' @examples #' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +#' scores <- summarise_scores(scores, by = c("model", "target_type")) #' #' plot_heatmap(scores, x = "target_type", metric = "bias") @@ -401,28 +398,26 @@ plot_predictions <- function(data, range = c(0, 50, 90)) { # split truth data and forecasts in order to apply different filtering - truth_data <- data.table::as.data.table(data)[!is.na(true_value)] - forecasts <- data.table::as.data.table(data)[!is.na(prediction)] + truth_data <- data.table::as.data.table(data)[!is.na(observed)] + forecasts <- data.table::as.data.table(data)[!is.na(predicted)] del_cols <- - colnames(truth_data)[!(colnames(truth_data) %in% c(by, "true_value", x))] + colnames(truth_data)[!(colnames(truth_data) %in% c(by, "observed", x))] - truth_data <- delete_columns( - truth_data, - del_cols, - make_unique = TRUE - ) + truth_data <- unique(suppressWarnings(truth_data[, eval(del_cols) := NULL])) # find out what type of predictions we have. convert sample based to # range data - prediction_type <- get_prediction_type(data) - if (prediction_type %in% c("integer", "continuous")) { - forecasts <- sample_to_range_long(forecasts, - range = range, + + if (test_forecast_type_is_quantile(data)) { + forecasts <- quantile_to_interval( + forecasts, keep_quantile_col = FALSE ) - } else if (prediction_type == "quantile") { - forecasts <- quantile_to_range_long(forecasts, + } else if (test_forecast_type_is_sample(data)) { + forecasts <- sample_to_range_long( + forecasts, + range = range, keep_quantile_col = FALSE ) } @@ -444,7 +439,7 @@ plot_predictions <- function(data, if (nrow(intervals) != 0) { # pivot wider and convert range to a factor intervals <- data.table::dcast(intervals, ... 
~ boundary, - value.var = "prediction") + value.var = "predicted") # only plot ranges if there are ranges to plot plot <- plot + @@ -479,20 +474,20 @@ plot_predictions <- function(data, plot <- plot + geom_line( data = median, - mapping = aes(y = prediction), + mapping = aes(y = predicted), lwd = 0.4 ) } } - # add true_values + # add observed values if (nrow(truth_data) > 0) { plot <- plot + geom_point( data = truth_data, show.legend = FALSE, inherit.aes = FALSE, - aes(x = .data[[x]], y = true_value), + aes(x = .data[[x]], y = observed), color = "black", size = 0.5 ) + @@ -500,7 +495,7 @@ plot_predictions <- function(data, data = truth_data, inherit.aes = FALSE, show.legend = FALSE, - aes(x = .data[[x]], y = true_value), + aes(x = .data[[x]], y = observed), linetype = 1, color = "grey40", lwd = 0.2 @@ -520,7 +515,7 @@ plot_predictions <- function(data, #' #' @inheritParams score #' @param what character vector that determines which values should be turned -#' into `NA`. If `what = "truth"`, values in the column 'true_value' will be +#' into `NA`. If `what = "truth"`, values in the column 'observed' will be #' turned into `NA`. If `what = "forecast"`, values in the column 'prediction' #' will be turned into `NA`. If `what = "both"`, values in both column will be #' turned into `NA`. @@ -542,7 +537,7 @@ make_NA <- function(data = NULL, what = c("truth", "forecast", "both"), ...) { - check_not_null(data = data) + assert_not_null(data = data) data <- data.table::copy(data) what <- match.arg(what) @@ -552,10 +547,10 @@ make_NA <- function(data = NULL, vars <- NULL if (what %in% c("forecast", "both")) { - vars <- c(vars, "prediction") + vars <- c(vars, "predicted") } if (what %in% c("truth", "both")) { - vars <- c(vars, "true_value") + vars <- c(vars, "observed") } for (expr in args) { data <- data[eval(expr), eval(vars) := NA_real_] @@ -587,10 +582,9 @@ make_na <- make_NA #' \dontshow{ #' data.table::setDTthreads(2) # restricts number of cores used on CRAN #' } -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "range")) -#' plot_interval_coverage(scores) - +#' data_coverage <- add_coverage(example_quantile) +#' summarised <- summarise_scores(data_coverage, by = c("model", "range")) +#' plot_interval_coverage(summarised) plot_interval_coverage <- function(scores, colour = "model") { ## overall model calibration - empirical interval coverage @@ -617,7 +611,7 @@ plot_interval_coverage <- function(scores, colour = "grey", linetype = "dashed" ) + - geom_line(aes(y = coverage * 100)) + + geom_line(aes(y = interval_coverage * 100)) + theme_scoringutils() + ylab("% Obs inside interval") + xlab("Nominal interval coverage") + @@ -642,9 +636,9 @@ plot_interval_coverage <- function(scores, #' @importFrom data.table dcast #' @export #' @examples -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "quantile")) -#' plot_quantile_coverage(scores) +#' data_coverage <- add_coverage(example_quantile) +#' summarised <- summarise_scores(data_coverage, by = c("model", "quantile")) +#' plot_quantile_coverage(summarised) plot_quantile_coverage <- function(scores, colour = "model") { @@ -768,8 +762,8 @@ plot_pairwise_comparison <- function(comparison_result, plot <- ggplot( comparison_result, aes( - y = reorder(model, 1 / mean_scores_ratio, FUN = geom_mean_helper), - x = reorder(compare_against, mean_scores_ratio, FUN = geom_mean_helper), + y = reorder(model, 1 / mean_scores_ratio, FUN = geometric_mean), + x = reorder(compare_against, 
mean_scores_ratio, FUN = geometric_mean), fill = fill_col ) ) + @@ -844,9 +838,9 @@ plot_pairwise_comparison <- function(comparison_result, #' } #' #' # PIT histogram in vector based format -#' true_values <- rnorm(30, mean = 1:30) -#' predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -#' pit <- pit_sample(true_values, predictions) +#' observed <- rnorm(30, mean = 1:30) +#' predicted <- replicate(200, rnorm(n = 30, mean = 1:30)) +#' pit <- pit_sample(observed, predicted) #' plot_pit(pit) #' #' # quantile-based pit @@ -947,51 +941,58 @@ plot_pit <- function(pit, #' #' @description #' Visualise Where Forecasts Are Available -#' -#' @param avail_forecasts data.frame with a column called `Number forecasts` as -#' produced by [avail_forecasts()] +#' @param forecast_counts a data.table (or similar) with a column `count` +#' holding forecast counts, as produced by [get_forecast_counts()] +#' @param x character vector of length one that denotes the name of the column +#' to appear on the x-axis of the plot. #' @param y character vector of length one that denotes the name of the column #' to appear on the y-axis of the plot. Default is "model". -#' @param x character vector of length one that denotes the name of the column -#' to appear on the x-axis of the plot. Default is "forecast_date". -#' @param make_x_factor logical (default is TRUE). Whether or not to convert +#' @param x_as_factor logical (default is TRUE). Whether or not to convert #' the variable on the x-axis to a factor. This has an effect e.g. if dates #' are shown on the x-axis. -#' @param show_numbers logical (default is `TRUE`) that indicates whether +#' @param show_counts logical (default is `TRUE`) that indicates whether #' or not to show the actual count numbers on the plot #' @return ggplot object with a plot of interval coverage #' @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual #' geom_tile scale_fill_gradient .data #' @importFrom data.table dcast .I .N +#' @importFrom checkmate assert_string assert_logical assert #' @export #' @examples #' library(ggplot2) -#' avail_forecasts <- avail_forecasts( +#' forecast_counts <- get_forecast_counts( #' example_quantile, by = c("model", "target_type", "target_end_date") #' ) -#' plot_avail_forecasts( -#' avail_forecasts, x = "target_end_date", show_numbers = FALSE +#' plot_forecast_counts( +#' forecast_counts, x = "target_end_date", show_counts = FALSE #' ) + #' facet_wrap("target_type") -plot_avail_forecasts <- function(avail_forecasts, +plot_forecast_counts <- function(forecast_counts, + x, y = "model", - x = "forecast_date", - make_x_factor = TRUE, - show_numbers = TRUE) { - avail_forecasts <- as.data.table(avail_forecasts) - - if (make_x_factor) { - avail_forecasts[, eval(x) := as.factor(get(x))] + x_as_factor = TRUE, + show_counts = TRUE) { + + forecast_counts <- ensure_data.table(forecast_counts) + assert_string(y) + assert_string(x) + assert(check_columns_present(forecast_counts, c(y, x))) + assert_logical(x_as_factor) + assert_logical(show_counts) + + if (x_as_factor) { + forecast_counts[, eval(x) := as.factor(get(x))] } + setnames(forecast_counts, old = "count", new = "Count") + plot <- ggplot( - avail_forecasts, + forecast_counts, aes(y = .data[[y]], x = .data[[x]]) ) + - geom_tile(aes(fill = `Number forecasts`), - width = 0.97, height = 0.97 - ) + + geom_tile(aes(fill = `Count`), + width = 0.97, height = 0.97) + scale_fill_gradient( low = "grey95", high = "steelblue", na.value = "lightgrey" @@ -1004,15 +1005,14 @@ plot_avail_forecasts <- 
function(avail_forecasts, ) ) + theme(panel.spacing = unit(2, "lines")) - - if (show_numbers) { + if (show_counts) { plot <- plot + - geom_text(aes(label = `Number forecasts`)) + geom_text(aes(label = `Count`)) } - return(plot) } + #' @title Plot Correlation Between Metrics #' #' @description @@ -1029,7 +1029,8 @@ plot_avail_forecasts <- function(avail_forecasts, #' @examples #' scores <- score(example_quantile) #' correlations <- correlation( -#' summarise_scores(scores) +#' summarise_scores(scores), +#' digits = 2 #' ) #' plot_correlation(correlations) diff --git a/R/score.R b/R/score.R index 0b653a8f4..f490a1d75 100644 --- a/R/score.R +++ b/R/score.R @@ -1,87 +1,43 @@ -#' @title Evaluate forecasts -#' -#' @description This function allows automatic scoring of forecasts using a -#' range of metrics. For most users it will be the workhorse for -#' scoring forecasts as it wraps the lower level functions package functions. -#' However, these functions are also available if you wish to make use of them -#' independently. -#' -#' A range of forecasts formats are supported, including quantile-based, -#' sample-based, binary forecasts. Prior to scoring, users may wish to make use -#' of [check_forecasts()] to ensure that the input data is in a supported -#' format though this will also be run internally by [score()]. Examples for -#' each format are also provided (see the documentation for `data` below or in -#' [check_forecasts()]). -#' -#' Each format has a set of required columns (see below). Additional columns may -#' be present to indicate a grouping of forecasts. For example, we could have -#' forecasts made by different models in various locations at different time -#' points, each for several weeks into the future. It is important, that there -#' are only columns present which are relevant in order to group forecasts. -#' A combination of different columns should uniquely define the -#' *unit of a single forecast*, meaning that a single forecast is defined by the -#' values in the other columns. Adding additional unrelated columns may alter -#' results. -#' -#' To obtain a quick overview of the currently supported evaluation metrics, -#' have a look at the [metrics] data included in the package. The column -#' `metrics$Name` gives an overview of all available metric names that can be -#' computed. If interested in an unsupported metric please open a [feature -#' request](https://github.com/epiforecasts/scoringutils/issues) or consider -#' contributing a pull request. -#' +#' @title Evaluate forecasts in a data.frame format +#' @description `score()` applies a selection of scoring metrics to a data.frame +#' of forecasts. It is the workhorse of the `scoringutils` package. +#' `score()` is a generic that dispatches to different methods depending on the +#' class of the input data. +#' +#' We recommend that users call [as_forecast()] prior to calling `score()` to +#' validate the input data and convert it to a forecast object (though +#' `score.default()` will do this if it hasn't happened before). +#' See below for more information on forecast types and input formats. #' For additional help and examples, check out the [Getting Started -#' Vignette](https://epiforecasts.io/scoringutils/articles/scoringutils.html) -#' as well as the paper [Evaluating Forecasts with scoringutils in +#' Vignette](https://epiforecasts.io/scoringutils/articles/scoringutils.html) as +#' well as the paper [Evaluating Forecasts with scoringutils in #' R](https://arxiv.org/abs/2205.07090). 
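To make the recommended validation step concrete, here is a minimal usage sketch. It assumes the development version introduced by this diff (with `as_forecast()` exported and `example_quantile` shipped in the package); `score()` then dispatches on the class of the validated object.

```r
library(scoringutils)

# validate the data once up front; for example_quantile the resulting object
# should carry the class "forecast_quantile", which score() dispatches on
forecast <- as_forecast(example_quantile)
class(forecast)
scores <- score(forecast)
```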
-#' -#' @param data A data.frame or data.table with the predictions and observations. -#' For scoring using [score()], the following columns need to be present: -#' \itemize{ -#' \item `true_value` - the true observed values -#' \item `prediction` - predictions or predictive samples for one -#' true value. (You only don't need to provide a prediction column if -#' you want to score quantile forecasts in a wide range format.)} -#' For scoring integer and continuous forecasts a `sample` column is needed: -#' \itemize{ -#' \item `sample` - an index to identify the predictive samples in the -#' prediction column generated by one model for one true value. Only -#' necessary for continuous and integer forecasts, not for -#' binary predictions.} -#' For scoring predictions in a quantile-format forecast you should provide -#' a column called `quantile`: -#' - `quantile`: quantile to which the prediction corresponds -#' -#' In addition a `model` column is suggested and if not present this will be -#' flagged and added to the input data with all forecasts assigned as an -#' "unspecified model"). -#' -#' You can check the format of your data using [check_forecasts()] and there -#' are examples for each format ([example_quantile], [example_continuous], -#' [example_integer], and [example_binary]). -#' -#' @param metrics the metrics you want to have in the output. If `NULL` (the -#' default), all available metrics will be computed. For a list of available -#' metrics see [available_metrics()], or check the [metrics] data set. -#' -#' @param ... additional parameters passed down to [score_quantile()] (internal -#' function used for scoring forecasts in a quantile-based format). -#' -#' @return A data.table with unsummarised scores. There will be one score per -#' quantile or sample, which is usually not desired, so you should almost -#' always run [summarise_scores()] on the unsummarised scores. -#' +#' @inheritSection forecast_types Forecast types and input format +#' @inheritSection forecast_types Forecast unit +#' @param data A data.frame or data.table with predicted and observed values. +#' @param metrics A named list of scoring functions. Names will be used as +#' column names in the output. See [rules_point()], [rules_binary()], +#' [rules_quantile()], and [rules_sample()] for more information on the +#' default metrics used. +#' @param ... additional arguments +#' @return A data.table with unsummarised scores. This will generally be +#' one score per forecast (as defined by the unit of a single forecast). +#' +#' For quantile-based forecasts, one score per quantile will be returned +#' instead. This is done as scores can be computed and may be of interest +#' for individual quantiles. You can call [summarise_scores()]) on the +#' unsummarised scores to obtain one score per forecast unit for quantile-based +#' forecasts. 
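As the return-value description notes, quantile-based forecasts produce one score row per quantile, so a call to `summarise_scores()` typically follows. A short sketch of that pattern, mirroring the examples elsewhere in this diff:

```r
library(scoringutils)
library(magrittr)  # pipe operator

# one score per forecast unit and quantile; summarising by model and target
# type collapses these into a single value per metric and group
scores <- score(example_quantile)
summarise_scores(scores, by = c("model", "target_type"))
```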
#' @importFrom data.table ':=' as.data.table -#' +#' @importFrom stats na.omit #' @examples #' library(magrittr) # pipe operator #' \dontshow{ #' data.table::setDTthreads(2) # restricts number of cores used on CRAN #' } #' -#' check_forecasts(example_quantile) -#' score(example_quantile) %>% -#' add_coverage(by = c("model", "target_type")) %>% +#' validated <- as_forecast(example_quantile) +#' score(validated) %>% #' summarise_scores(by = c("model", "target_type")) #' #' # set forecast unit manually (to avoid issues with scoringutils trying to @@ -90,75 +46,182 @@ #' set_forecast_unit( #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) %>% -#' check_forecasts() %>% +#' as_forecast() %>% #' score() #' #' # forecast formats with different metrics #' \dontrun{ #' score(example_binary) #' score(example_quantile) +#' score(example_point) #' score(example_integer) #' score(example_continuous) #' } -#' -#' # score point forecasts (marked by 'NA' in the quantile column) -#' score(example_point) %>% -#' summarise_scores(by = "model", na.rm = TRUE) -#' #' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @references Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ -#' (2019) Assessing the performance of real-time epidemic forecasts: A -#' case study of Ebola in the Western Area region of Sierra Leone, 2014-15. -#' PLoS Comput Biol 15(2): e1006785. \doi{10.1371/journal.pcbi.1006785} +#' @references +#' Bosse NI, Gruson H, Cori A, van Leeuwen E, Funk S, Abbott S +#' (2022) Evaluating Forecasts with scoringutils in R. +#' \doi{10.48550/arXiv.2205.07090} #' @export -score <- function(data, - metrics = NULL, - ...) { - - # preparations --------------------------------------------------------------- - if (is_scoringutils_check(data)) { - check_data <- data - } else { - check_data <- check_forecasts(data) - } - - data <- check_data$cleaned_data - prediction_type <- check_data$prediction_type - forecast_unit <- check_data$forecast_unit - target_type <- check_data$target_type - - # check metrics are available or set to all metrics -------------------------- - metrics <- check_metrics(metrics) - - # Score binary predictions --------------------------------------------------- - if (target_type == "binary") { - scores <- score_binary( - data = data, - forecast_unit = forecast_unit, - metrics = metrics - ) - } - - # Score quantile predictions ------------------------------------------------- - if (prediction_type == "quantile") { - scores <- score_quantile( - data = data, - forecast_unit = forecast_unit, - metrics = metrics, - ... +score <- function(data, ...) { + UseMethod("score") +} + +#' @rdname score +#' @export +score.default <- function(data, ...) { + assert(check_data_columns(data)) + forecast_type <- get_forecast_type(data) + data <- new_forecast(data, paste0("forecast_", forecast_type)) + score(data, ...) +} + +#' @importFrom stats na.omit +#' @importFrom data.table setattr +#' @rdname score +#' @export +score.forecast_binary <- function(data, metrics = rules_binary(), ...) { + data <- validate_forecast(data) + data <- na.omit(data) + metrics <- validate_metrics(metrics) + + data <- apply_rules( + data, metrics, + data$observed, data$predicted, ... + ) + + setattr(data, "metric_names", names(metrics)) + + return(data[]) + +} + + +#' @importFrom Metrics se ae ape +#' @importFrom stats na.omit +#' @importFrom data.table setattr +#' @rdname score +#' @export +score.forecast_point <- function(data, metrics = rules_point(), ...) 
{ + data <- validate_forecast(data) + data <- na.omit(data) + metrics <- validate_metrics(metrics) + + data <- apply_rules( + data, metrics, + data$observed, data$predicted, ... + ) + + setattr(data, "metric_names", names(metrics)) + + return(data[]) +} + +#' @importFrom stats na.omit +#' @importFrom data.table setattr +#' @rdname score +#' @export +score.forecast_sample <- function(data, metrics = rules_sample(), ...) { + data <- validate_forecast(data) + data <- na.omit(data) + forecast_unit <- get_forecast_unit(data) + metrics <- validate_metrics(metrics) + + # transpose the forecasts that belong to the same forecast unit + d_transposed <- data[, .(predicted = list(predicted), + observed = unique(observed), + scoringutils_N = length(list(sample_id))), + by = forecast_unit] + + # split according to number of samples and do calculations for different + # sample lengths separately + d_split <- split(d_transposed, d_transposed$scoringutils_N) + + split_result <- lapply(d_split, function(data) { + # create a matrix + observed <- data$observed + predicted <- do.call(rbind, data$predicted) + data[, c("observed", "predicted", "scoringutils_N") := NULL] + + data <- apply_rules( + data, metrics, + observed, predicted, ... ) - } - - # Score integer or continuous predictions ------------------------------------ - if (prediction_type %in% c("integer", "continuous") && (target_type != "binary")) { - scores <- score_sample( - data = data, - forecast_unit = forecast_unit, - metrics = metrics, - prediction_type = prediction_type + return(data) + }) + data <- rbindlist(split_result) + setattr(data, "metric_names", names(metrics)) + + return(data[]) +} + + +#' @importFrom stats na.omit +#' @importFrom data.table `:=` as.data.table rbindlist %like% setattr +#' @rdname score +#' @export +score.forecast_quantile <- function(data, metrics = rules_quantile(), ...) { + data <- validate_forecast(data) + data <- na.omit(data) + forecast_unit <- get_forecast_unit(data) + metrics <- validate_metrics(metrics) + + # transpose the forecasts that belong to the same forecast unit + # make sure the quantiles and predictions are ordered in the same way + d_transposed <- data[, .( + predicted = list(predicted[order(quantile)]), + observed = unique(observed), + quantile = list(sort(quantile, na.last = TRUE)), + scoringutils_quantile = toString(sort(quantile, na.last = TRUE)) + ), by = forecast_unit] + + # split according to quantile lengths and do calculations for different + # quantile lengths separately. The function `wis()` assumes that all + # forecasts have the same quantiles + d_split <- split(d_transposed, d_transposed$scoringutils_quantile) + + split_result <- lapply(d_split, function(data) { + # create a matrix out of the list of predicted values and quantiles + observed <- data$observed + predicted <- do.call(rbind, data$predicted) + quantile <- unlist(unique(data$quantile)) + data[, c( + "observed", "predicted", "quantile", "scoringutils_quantile" + ) := NULL] + + data <- apply_rules( + data, metrics, + observed, predicted, quantile, ... ) - } + return(data) + }) + + data <- rbindlist(split_result) + setattr(data, "metric_names", names(metrics)) + + return(data[]) +} + - return(scores[]) +#' @title Apply A List Of Functions To A Data Table Of Forecasts +#' @description This helper function applies scoring rules (stored as a list of +#' functions) to a data table of forecasts. `apply_rules` is used within +#' `score()` to apply all scoring rules to the data. 
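The pattern that `apply_rules()` implements can be illustrated in isolation. The sketch below uses hypothetical data and metric functions (not package code); the real helper additionally wraps each call in `run_safely()` to drop unsupported arguments and catch errors, as described next.

```r
library(data.table)

# every element of a named list of scoring functions becomes one score column
dt <- data.table(model = "m1", id = 1:3)
observed <- c(1, 3, 2)
predicted <- c(1.5, 2.0, 2.5)

metrics <- list(
  ae = function(observed, predicted) abs(observed - predicted),
  se = function(observed, predicted) (observed - predicted)^2
)

for (metric_name in names(metrics)) {
  dt[, (metric_name) := metrics[[metric_name]](observed, predicted)]
}
dt
```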
+#' Scoring rules are wrapped in [run_safely()] to catch errors and to make +#' sure that only arguments are passed to the scoring rule that are actually +#' accepted by it. +#' @inheritParams score +#' @return A data table with the forecasts and the calculated metrics +#' @keywords internal +apply_rules <- function(data, metrics, ...) { + expr <- expression( + data[, (metric_name) := do.call(run_safely, list(..., fun = fun))] + ) + lapply(seq_along(metrics), function(i, data, ...) { + metric_name <- names(metrics[i]) # nolint + fun <- metrics[[i]] # nolint + eval(expr) + }, data, ...) + return(data) } diff --git a/R/score_binary.R b/R/score_binary.R deleted file mode 100644 index fee34f308..000000000 --- a/R/score_binary.R +++ /dev/null @@ -1,32 +0,0 @@ -#' @title Evaluate forecasts in a Binary Format -#' -#' @inheritParams score -#' @param forecast_unit A character vector with the column names that define -#' the unit of a single forecast, i.e. a forecast was made for a combination -#' of the values in `forecast_unit`. -#' -#' @return A data.table with appropriate scores. For more information see -#' [score()]. -#' -#' @importFrom data.table ':=' -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @keywords internal - -score_binary <- function(data, - forecast_unit, - metrics) { - if ("brier_score" %in% metrics) { - data[, "brier_score" := brier_score(true_value, prediction), - by = forecast_unit - ] - } - - if ("log_score" %in% metrics) { - data[, "log_score" := logs_binary(true_value, prediction), - by = forecast_unit - ] - } - - return(data[]) -} diff --git a/R/score_continuous_integer.R b/R/score_continuous_integer.R deleted file mode 100644 index 6aeb35b76..000000000 --- a/R/score_continuous_integer.R +++ /dev/null @@ -1,86 +0,0 @@ -#' @title Evaluate forecasts in a Sample-Based Format (Integer or Continuous) -#' -#' @inheritParams score -#' @param prediction_type character, should be either "continuous" or "integer" -#' @param forecast_unit A character vector with the column names that define -#' the unit of a single forecast, i.e. a forecast was made for a combination -#' of the values in `forecast_unit` -#' -#' @return A data.table with appropriate scores. 
For more information see -#' [score()] -#' -#' @importFrom data.table ':=' as.data.table rbindlist %like% -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @inherit score references -#' @keywords internal - -score_sample <- function(data, - forecast_unit, - metrics, - prediction_type) { - if (missing(prediction_type)) { - if (isTRUE(all.equal(data$prediction, as.integer(data$prediction)))) { - prediction_type <- "integer" - } else { - prediction_type <- "continuous" - } - } - - # calculate scores ----------------------------------------------------------- - # sharpness - if (any(c("sharpness", "mad") %in% metrics)) { - data[, mad := mad_sample(t(prediction)), by = forecast_unit] - } - # bias - if ("bias" %in% metrics) { - data[, bias := bias_sample( - unique(true_value), - t(prediction) - ), by = forecast_unit] - } - # DSS - if ("dss" %in% metrics) { - data[, dss := scoringutils::dss_sample( - unique(true_value), - t(prediction) - ), by = forecast_unit] - } - # CRPS - if ("crps" %in% metrics) { - data[, crps := scoringutils::crps_sample( - unique(true_value), - t(prediction) - ), by = forecast_unit] - } - # Log Score - if ("log_score" %in% metrics) { - # only compute if prediction type is continuous - if (prediction_type == "continuous") { - data[, log_score := scoringutils::logs_sample( - unique(true_value), - t(prediction) - ), by = forecast_unit] - } - } - # absolute error - if (any(c("ae_median", "abs_error", "ae_point") %in% metrics)) { - data[, ae_median := abs(unique(true_value) - median(prediction)), - by = forecast_unit] - } - # squared error - if (any(c("se_mean", "squared_error", "se_point") %in% metrics)) { - data[, se_mean := (unique(true_value) - mean(prediction))^2, - by = forecast_unit] - } - - res <- data.table::copy(data) - - # make scores unique to avoid redundancy. - res <- res[, lapply(.SD, unique), - .SDcols = colnames(res) %like% paste(metrics, collapse = "|"), - by = forecast_unit - ] - - return(res[]) -} diff --git a/R/score_quantile.R b/R/score_quantile.R deleted file mode 100644 index bebbdda6d..000000000 --- a/R/score_quantile.R +++ /dev/null @@ -1,160 +0,0 @@ -#' @title Evaluate forecasts in a Quantile-Based Format -#' -#' @inheritParams score -#' @inheritParams interval_score -#' @param count_median_twice logical that controls whether or not to count the -#' median twice when summarising (default is \code{FALSE}). Counting the -#' median twice would conceptually treat it as a 0\% prediction interval, where -#' the median is the lower as well as the upper bound. The alternative is to -#' treat the median as a single quantile forecast instead of an interval. The -#' interval score would then be better understood as an average of quantile -#' scores. -#' @param forecast_unit A character vector with the column names that define -#' the unit of a single forecast, i.e. a forecast was made for a combination -#' of the values in `forecast_unit` -#' -#' @return A data.table with appropriate scores. 
For more information see -#' [score()] -#' -#' @importFrom data.table ':=' as.data.table rbindlist %like% -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @inherit score references -#' @keywords internal - -score_quantile <- function(data, - forecast_unit, - metrics, - weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE) { - - # make sure to have both quantile as well as range format -------------------- - range_data <- quantile_to_range_long(data, - keep_quantile_col = FALSE - ) - # adds the range column to the quantile data set - quantile_data <- range_long_to_quantile(range_data, - keep_range_col = TRUE - ) - - # to deal with point forecasts in a quantile format. This in effect adds - # a third column next to lower and upper after pivoting - range_data[is.na(range), boundary := "point"] - - range_data <- data.table::dcast(range_data, ... ~ boundary, - value.var = "prediction" - ) - - # if we only score point forecasts, it may be true that there are no columns - # upper and lower in the data.frame. If so, these need to be added - if (!all(c("upper", "lower") %in% colnames(range_data))) { - range_data[, c("upper", "lower") := NA] - } - - # set up results data.table that will then be modified throughout ------------ - res <- data.table::copy(range_data) - - # calculate scores on range format ------------------------------------------- - if ("interval_score" %in% metrics) { - # compute separate results if desired - if (separate_results) { - outcols <- c( - "interval_score", "dispersion", - "underprediction", "overprediction" - ) - } else { - outcols <- "interval_score" - } - res <- res[, eval(outcols) := do.call( - scoringutils::interval_score, - list(true_value, lower, - upper, range, - weigh, - separate_results = separate_results - ) - )] - } - - # compute coverage for every single observation - if ("coverage" %in% metrics) { - res[, coverage := ifelse(true_value <= upper & true_value >= lower, 1, 0)] # nolint - res[, coverage_deviation := coverage - range / 100] - } - - # compute bias - if ("bias" %in% metrics) { - res[, bias := bias_range( - range = range, lower = lower, upper = upper, - true_value = unique(true_value) - ), - by = forecast_unit - ] - } - - # compute absolute and squared error for point forecasts - # these are marked by an NA in range, and a numeric value for point - if (any(c("se_point, se_mean, ae_point", "ae_median", "absolute_error") %in% metrics)) { - if ("point" %in% colnames(res)) { - res[ - is.na(range) & is.numeric(point), - `:=`(ae_point = abs_error(predictions = point, true_value), - se_point = squared_error(predictions = point, true_value)) - ] - } - } - - # calculate scores on quantile format ---------------------------------------- - # compute absolute error of the median - if ("ae_median" %in% metrics) { - quantile_data[, ae_median := ae_median_quantile( - true_value, - prediction, - quantile - ), - by = forecast_unit - ] - } - - # compute quantile coverage based on quantile version - if ("quantile_coverage" %in% metrics) { - quantile_data[, quantile_coverage := (true_value <= prediction)] - } - - # merge metrics computed on quantile data (i.e. ae_median, quantile_coverage) back - # into metrics computed on range data. One important side effect of this is - # that it controls whether we count the median twice for the interval score - # (row is then duplicated) or only once. 
However, merge only needs to happen - # if we computed either the interval score or the ae_median or quantile coverage - if (any(c("ae_median", "interval_score", "quantile_coverage") %in% metrics)) { - # delete unnecessary columns before merging back - keep_cols <- unique(c( - forecast_unit, "quantile", "ae_median", "quantile_coverage", - "boundary", "range" - )) - delete_cols <- names(quantile_data)[!(names(quantile_data) %in% keep_cols)] - quantile_data[, eval(delete_cols) := NULL] - - # duplicate median column before merging if median is to be counted twice - # if this is false, then the res will have one entry for every quantile, - # which translates to two rows for every interval, but only one for the median - if (count_median_twice) { - median <- quantile_data[quantile == 0.5, ][, boundary := "upper"] - quantile_data <- data.table::rbindlist(list(quantile_data, median)) - } - - # merge back with other metrics - merge_cols <- setdiff(keep_cols, c( - "ae_median", "quantile_coverage", "quantile", - "boundary" - )) - # specify all.x = TRUE as the point forecasts got deleted when - # going from range to quantile above - res <- merge(res, quantile_data, by = merge_cols, all.x = TRUE) - } - - # delete internal columns before returning result - res <- delete_columns(res, c("upper", "lower", "boundary", "point", "true_value")) - - return(res[]) -} diff --git a/R/scoringRules_wrappers.R b/R/scoringRules_wrappers.R deleted file mode 100644 index 817dc88be..000000000 --- a/R/scoringRules_wrappers.R +++ /dev/null @@ -1,101 +0,0 @@ -#' @title Logarithmic score -#' -#' @description -#' Wrapper around the [`logs_sample()`][scoringRules::scores_sample_univ] -#' function from the -#' \pkg{scoringRules} package. Used to score continuous predictions. -#' While the Log Score is in theory also applicable -#' to integer forecasts, the problem lies in the implementation: The Log Score -#' needs a kernel density estimation, which is not well defined with -#' integer-valued Monte Carlo Samples. The Log Score can be used for specific -#' integer valued probability distributions. See the scoringRules package for -#' more details. -#' @inheritParams ae_median_sample -#' @return vector with the scoring values -#' @importFrom scoringRules logs_sample -#' @examples -#' true_values <- rpois(30, lambda = 1:30) -#' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' logs_sample(true_values, predictions) -#' @export -#' @references -#' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic -#' Forecasts with scoringRules, -#' @keywords metric - -logs_sample <- function(true_values, predictions) { - check_true_values(true_values) - check_predictions(predictions, true_values, - class = "matrix" - ) - - scoringRules::logs_sample( - y = true_values, - dat = predictions - ) -} - -#' @title Dawid-Sebastiani Score -#' -#' @description -#' Wrapper around the [`dss_sample()`][scoringRules::scores_sample_univ] -#' function from the -#' \pkg{scoringRules} package. 
-#' @inheritParams logs_sample -#' @return vector with scoring values -#' @importFrom scoringRules dss_sample -#' @examples -#' true_values <- rpois(30, lambda = 1:30) -#' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' dss_sample(true_values, predictions) -#' @export -#' @references -#' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic -#' Forecasts with scoringRules, -#' @keywords metric - -dss_sample <- function(true_values, predictions) { - check_true_values(true_values) - check_predictions(predictions, true_values, - class = "matrix" - ) - - scoringRules::dss_sample( - y = true_values, - dat = predictions - ) -} - -#' @title Ranked Probability Score -#' -#' @description -#' Wrapper around the [`crps_sample()`][scoringRules::scores_sample_univ] -#' function from the -#' \pkg{scoringRules} package. Can be used for continuous as well as integer -#' valued forecasts -#' @inheritParams logs_sample -#' @return vector with the scoring values -#' @importFrom scoringRules crps_sample -#' @examples -#' true_values <- rpois(30, lambda = 1:30) -#' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' crps_sample(true_values, predictions) -#' @export -#' @references -#' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic -#' Forecasts with scoringRules, -#' @keywords metric - -crps_sample <- function(true_values, predictions) { - - # check inputs - check_true_values(true_values) - check_predictions(predictions, true_values, - class = "matrix" - ) - - scoringRules::crps_sample( - y = true_values, - dat = predictions - ) -} diff --git a/R/sharpness.R b/R/sharpness.R deleted file mode 100644 index 335ea2508..000000000 --- a/R/sharpness.R +++ /dev/null @@ -1,34 +0,0 @@ -#' @title Determine dispersion of a probabilistic forecast -#' @details -#' Sharpness is the ability of the model to generate predictions within a -#' narrow range and dispersion is the lack thereof. -#' It is a data-independent measure, and is purely a feature -#' of the forecasts themselves. -#' -#' Dispersion of predictive samples corresponding to one single true value is -#' measured as the normalised median of the absolute deviation from -#' the median of the predictive samples. For details, see [mad()][stats::mad()] -#' and the explanations given in Funk et al. (2019) -#' -#' @inheritParams ae_median_sample -#' @importFrom stats mad -#' @return vector with dispersion values -#' -#' @references -#' Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ (2019) -#' Assessing the performance of real-time epidemic forecasts: A case study of -#' Ebola in the Western Area region of Sierra Leone, 2014-15. -#' PLoS Comput Biol 15(2): e1006785. \doi{10.1371/journal.pcbi.1006785} -#' -#' @export -#' @examples -#' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' mad_sample(predictions) -#' @keywords metric - -mad_sample <- function(predictions) { - check_predictions(predictions, class = "matrix") - - sharpness <- apply(predictions, MARGIN = 1, mad) - return(sharpness) -} diff --git a/R/summarise_scores.R b/R/summarise_scores.R index 11fda06a6..7c7d526b4 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -21,35 +21,21 @@ #' be used or inferred internally if also not specified. Only one of `across` #' and `by` may be used at a time. #' @param fun a function used for summarising scores. Default is `mean`. -#' @param relative_skill logical, whether or not to compute relative -#' performance between models based on pairwise comparisons. 
-#' If `TRUE` (default is `FALSE`), then a column called -#' 'model' must be present in the input data. For more information on -#' the computation of relative skill, see [pairwise_comparison()]. -#' Relative skill will be calculated for the aggregation level specified in -#' `by`. -#' @param relative_skill_metric character with the name of the metric for which -#' a relative skill shall be computed. If equal to 'auto' (the default), then -#' this will be either interval score, CRPS or Brier score (depending on which -#' of these is available in the input data) -#' @param metric `r lifecycle::badge("deprecated")` Deprecated in 1.1.0. Use -#' `relative_skill_metric` instead. -#' @param baseline character string with the name of a model. If a baseline is -#' given, then a scaled relative skill with respect to the baseline will be -#' returned. By default (`NULL`), relative skill will not be scaled with -#' respect to a baseline model. #' @param ... additional parameters that can be passed to the summary function #' provided to `fun`. For more information see the documentation of the #' respective function. +#' @return a data.table with summarised scores. Scores are summarised according +#' to the names of the columns of the original data specified in `by` or +#' `across` using the `fun` passed to `summarise_scores()`. #' @examples #' \dontshow{ #' data.table::setDTthreads(2) # restricts number of cores used on CRAN #' } #' library(magrittr) # pipe operator -#' +#' \dontrun{ #' scores <- score(example_continuous) #' summarise_scores(scores) -#' +#' } #' #' # summarise over samples or quantiles to get one score per forecast #' scores <- score(example_quantile) @@ -89,22 +75,17 @@ summarise_scores <- function(scores, by = NULL, across = NULL, fun = mean, - relative_skill = FALSE, - relative_skill_metric = "auto", - metric = deprecated(), - baseline = NULL, ...) { - if (lifecycle::is_present(metric)) { - lifecycle::deprecate_warn( - "1.1.0", "summarise_scores(metric)", - "summarise_scores(relative_skill_metric)" - ) - } - if (!is.null(across) && !is.null(by)) { stop("You cannot specify both 'across' and 'by'. Please choose one.") } + metric_names <- attr(scores, "metric_names") + if (is.null(metric_names)) { + stop("`scores` needs to have an attribute `metric_names` with the names of + the metrics that were used for scoring.") + } + # preparations --------------------------------------------------------------- # get unit of a single forecast forecast_unit <- get_forecast_unit(scores) @@ -116,37 +97,123 @@ summarise_scores <- function(scores, # if across is provided, remove from by if (!is.null(across)) { - if (!all(across %in% by)) { + if (!all(across %in% forecast_unit)) { stop( "The columns specified in 'across' must be a subset of the columns ", "that define the forecast unit (possible options are ", - toString(by), + toString(forecast_unit), "). Please check your input and try again." 
) } - by <- setdiff(by, across) + by <- setdiff(forecast_unit, across) } # check input arguments and check whether relative skill can be computed - relative_skill <- check_summary_params( - scores = scores, - by = by, - relative_skill = relative_skill, - baseline = baseline, - metric = relative_skill_metric - ) + assert(check_columns_present(scores, by)) - # get all available metrics to determine names of columns to summarise over - cols_to_summarise <- paste0(available_metrics(), collapse = "|") + # store attributes as they may be dropped in data.table operations + stored_attributes <- c( + get_scoringutils_attributes(scores), + list( + scoringutils_by = by, + unsummarised_scores = scores + ) + ) # takes the mean over ranges and quantiles first, if neither range nor # quantile are in `by`. Reason to do this is that summaries may be # inaccurate if we treat individual quantiles as independent forecasts scores <- scores[, lapply(.SD, base::mean, ...), by = c(unique(c(forecast_unit, by))), - .SDcols = colnames(scores) %like% cols_to_summarise + .SDcols = colnames(scores) %like% paste(metric_names, collapse = "|") + ] + + # summarise scores ----------------------------------------------------------- + scores <- scores[, lapply(.SD, fun, ...), + by = c(by), + .SDcols = colnames(scores) %like% paste(metric_names, collapse = "|") ] + # remove unnecessary columns ------------------------------------------------- + # if neither quantile nor range are in by, remove coverage and + # quantile_coverage because averaging does not make sense + if (!("range" %in% by) && ("coverage" %in% colnames(scores))) { + scores[, "coverage" := NULL] + } + if (!("quantile" %in% by) && "quantile_coverage" %in% names(scores)) { + scores[, "quantile_coverage" := NULL] + } + + scores <- assign_attributes(scores, stored_attributes) + return(scores[]) +} + +#' @rdname summarise_scores +#' @keywords scoring +#' @export +summarize_scores <- summarise_scores + + + +#' @title Add pairwise comparisons +#' @description Adds a columns with relative skills computed by running +#' pairwise comparisons on the scores. +#' +#' a column called +#' 'model' must be present in the input data. For more information on +#' the computation of relative skill, see [pairwise_comparison()]. +#' Relative skill will be calculated for the aggregation level specified in +#' `by`. +#' WRITE MORE INFO HERE. +#' +#' +#' @param scores MORE INFO HERE. +#' @param by character vector with column names to summarise scores by. Default +#' is `NULL`, meaning that the only summary that takes is place is summarising +#' over samples or quantiles (in case of quantile-based forecasts), such that +#' there is one score per forecast as defined by the *unit of a single forecast* +#' (rather than one score for every sample or quantile). +#' @param relative_skill_metric character with the name of the metric for which +#' a relative skill shall be computed. If equal to 'auto' (the default), then +#' this will be either interval score, CRPS or Brier score (depending on which +#' of these is available in the input data) +#' @param baseline character string with the name of a model. If a baseline is +#' given, then a scaled relative skill with respect to the baseline will be +#' returned. By default (`NULL`), relative skill will not be scaled with +#' respect to a baseline model. 
+#' @export +#' @keywords keyword scoring +add_pairwise_comparison <- function(scores, + by = NULL, + relative_skill_metric = "auto", + baseline = NULL) { + + stored_attributes <- get_scoringutils_attributes(scores) + + if (is.null(stored_attributes[["metric_names"]])) { + stop("`scores` needs to have an attribute `metric_names` with the names of + the metrics that were used for scoring.") + } + + if (!is.null(attr(scores, "unsummarised_scores"))) { + scores <- attr(scores, "unsummarised_scores") + } + + if (is.null(by) && !is.null(stored_attributes[["scoringutils_by"]])) { + by <- stored_attributes[["scoringutils_by"]] + } else if (is.null(by)) { + by <- get_forecast_unit(scores) + } + + # check input arguments and check whether relative skill can be computed + relative_skill <- check_summary_params( + scores = scores, + by = by, + relative_skill = TRUE, + baseline = baseline, + metric = relative_skill_metric + ) + # do pairwise comparisons ---------------------------------------------------- if (relative_skill) { pairwise <- pairwise_comparison( @@ -171,29 +238,17 @@ summarise_scores <- function(scores, } } - # summarise scores ----------------------------------------------------------- - scores <- scores[, lapply(.SD, fun, ...), - by = c(by), - .SDcols = colnames(scores) %like% cols_to_summarise - ] - - # remove unnecessary columns ------------------------------------------------- - # if neither quantile nor range are in by, remove coverage and - # quantile_coverage because averaging does not make sense - if (!("range" %in% by) && ("coverage" %in% colnames(scores))) { - scores[, "coverage" := NULL] - } - if (!("quantile" %in% by) && "quantile_coverage" %in% names(scores)) { - scores[, "quantile_coverage" := NULL] - } - - return(scores[]) + # add relative skill to list of metric names + stored_attributes[["metric_names"]] <- c( + stored_attributes[["metric_names"]], + "relative_skill", "scaled_rel_skill" + ) + scores <- assign_attributes(scores, stored_attributes) + scores <- summarise_scores(scores, by = by) + return(scores) } -#' @rdname summarise_scores -#' @keywords scoring -#' @export -summarize_scores <- summarise_scores + #' @title Check input parameters for [summarise_scores()] @@ -256,64 +311,3 @@ check_summary_params <- function(scores, } return(relative_skill) } - - - -#' @title Add coverage of central prediction intervals -#' -#' @description Adds a column with the coverage of central prediction intervals -#' to unsummarised scores as produced by [score()] -#' -#' @details -#' The coverage values that are added are computed according to the values -#' specified in `by`. If, for example, `by = "model"`, then there will be one -#' coverage value for every model and [add_coverage()] will compute the coverage -#' for every model across the values present in all other columns which define -#' the unit of a single forecast. -#' -#' @inheritParams summarise_scores -#' @param by character vector with column names to add the coverage for. -#' @param ranges numeric vector of the ranges of the central prediction intervals -#' for which coverage values shall be added. -#' @return a data.table with unsummarised scores with columns added for the -#' coverage of the central prediction intervals. While the overall data.table -#' is still unsummarised, note that for the coverage columns some level of -#' summary is present according to the value specified in `by`. 
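A sketch of the intended workflow for the new function, based on the attribute handling visible above: `summarise_scores()` stores the unsummarised scores and the grouping as attributes, which `add_pairwise_comparison()` then uses to compute relative skill at the same aggregation level. A baseline model name could additionally be supplied via the `baseline` argument.

```r
library(scoringutils)
library(magrittr)

# relative skill is added at the aggregation level used for summarising
score(example_quantile) %>%
  summarise_scores(by = c("model", "target_type")) %>%
  add_pairwise_comparison()
```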
-#' @examples -#' library(magrittr) # pipe operator -#' \dontshow{ -#' data.table::setDTthreads(2) # restricts number of cores used on CRAN -#' } -#' score(example_quantile) %>% -#' add_coverage(by = c("model", "target_type")) %>% -#' summarise_scores(by = c("model", "target_type")) %>% -#' summarise_scores(fun = signif, digits = 2) -#' @export -#' @keywords scoring - -add_coverage <- function(scores, - by, - ranges = c(50, 90)) { - summarised_scores <- summarise_scores( - scores, - by = c(by, "range") - )[range %in% ranges] - - - # create cast formula - cast_formula <- - paste( - paste(by, collapse = "+"), - "~", - "paste0('coverage_', range)" - ) - - coverages <- dcast( - summarised_scores, - value.var = "coverage", - formula = cast_formula - ) - - scores_with_coverage <- merge(scores, coverages, by = by) - return(scores_with_coverage[]) -} diff --git a/R/utils.R b/R/utils.R index e8c97632c..bcf234e0b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,333 +1,202 @@ -#' @title Calculate Geometric Mean -#' -#' @param x numeric vector of values for which to calculate the geometric mean -#' @return the geometric mean of the values in `x` -#' -#' @keywords internal -geom_mean_helper <- function(x) { - geom_mean <- exp(mean(log(x[!is.na(x)]))) - return(geom_mean) -} - - -globalVariables(c( - "..index", - "..quantiles", - "..type", - ".", - ".SD", - "adj_pval", - "ae_point", - "ae_median", - "boundary", - "bias", - "brier_score", - "component_value", - "..colnames_x", - "..colnames_y", - "..samplecols", - "compare_against", - "count", - "coverage_deviation", - "CRPS", - "crps", - "DSS", - "dss", - "fill_col", - "identifCol", - "Interval_Score", - "overprediction", - "underprediction", - "quantile_coverage", - "LogS", - "calibration", - "coverage", - "hist", - "InternalDuplicateCheck", - "InternalNumCheck", - "log_score", - "lower", - "mad", - "mean_scores_ratio", - "metric", - "metrics_select", - "metrics", - "model", - "n_obs", - "n_obs wis_component_name", - "Number forecasts", - "pit_p_val", - "pit_value", - "point", - "prediction", - "pval", - "quantile", - "ratio", - "rel_to_baseline", - "relative_skill", - "rn", - "se_mean", - "sharpness", - "theta", - "true_value", - "type", - "upper", - "value", - "value_scaled", - "var_of_interest", - "variable", - "wis_component_name", - "x", - "y", - "g" -)) - - #' @title Available metrics in scoringutils #' #' @return A vector with the name of all available metrics #' @export #' @keywords info - available_metrics <- function() { - return(unique(scoringutils::metrics$Name)) + return(unique(c(scoringutils::metrics$Name, + "wis", "interval_coverage_50", "interval_coverage_90", + "interval_coverage_deviation"))) } -#' @title Simple permutation test + +#' @title Collapse several messages to one +#' +#' @description Internal helper function to facilitate generating messages +#' and warnings. #' -#' @description #' The implementation of the permutation test follows the -#' function -#' `permutationTest` from the `surveillance` package by Michael Höhle, -#' Andrea Riebler and Michaela Paul. 
+#' @param type character, should be either "messages", "warnings" or "errors" +#' @param messages the messages or warnings to collapse #' -#' @return p-value of the permutation test +#' @return string with the message or warning #' @keywords internal -permutation_test <- function(scores1, - scores2, - n_permutation = 999, - one_sided = FALSE, - comparison_mode = c("difference", "ratio")) { - nTime <- length(scores1) - meanscores1 <- mean(scores1) - meanscores2 <- mean(scores2) - comparison_mode <- match.arg(comparison_mode) - if (comparison_mode == "ratio") { - # distinguish between on-sided and two-sided: - testStat_observed <- ifelse(one_sided, - meanscores1 / meanscores2, - max(meanscores1 / meanscores2, meanscores2 / meanscores1) - ) - } else { - testStat_observed <- ifelse(one_sided, meanscores1 - meanscores2, abs(meanscores1 - meanscores2)) - } - testStat_permuted <- replicate(n_permutation, { - sel <- rbinom(nTime, size = 1, prob = 0.5) - g1 <- (sum(scores1[sel == 0]) + sum(scores2[sel == 1])) / nTime - g2 <- (sum(scores1[sel == 1]) + sum(scores2[sel == 0])) / nTime - if (comparison_mode == "ratio") { - ifelse(one_sided, g1 / g2, max(g1 / g2, g2 / g1)) - } else { - ifelse(one_sided, g1 - g2, abs(g1 - g2)) - } - }) - pVal <- (1 + sum(testStat_permuted >= testStat_observed)) / (n_permutation + 1) - # plus ones to make sure p-val is never 0? - return(pVal) +collapse_messages <- function(type = "messages", messages) { + paste0( + "The following ", type, " were produced when checking inputs:\n", + paste(paste0(seq_along(messages), ". "), messages, collapse = "\n") + ) } -#' Delete Columns From a Data.table +#' @title Print output from `check_forecasts()` #' -#' @description take a vector of column names and delete the columns if they -#' are present in the data.table -#' @param df A data.table or data.frame from which columns shall be deleted -#' @param cols_to_delete character vector with names of columns to be deleted -#' @param make_unique whether to make the data set unique after removing columns -#' @importFrom data.table as.data.table -#' @return A data.table +#' @description Helper function that prints the output generated by +#' `check_forecasts()` #' -#' @keywords internal +#' @param x An object of class 'scoringutils_check' as produced b y +#' `check_forecasts()` +#' @param ... additional arguments (not used here) #' -delete_columns <- function(df, cols_to_delete, make_unique = FALSE) { - df <- data.table::as.data.table(df) - delete_columns <- names(df)[names(df) %in% cols_to_delete] - if (length(delete_columns) > 0) { - if (make_unique) { - df <- unique(df[, eval(delete_columns) := NULL]) - } else { - df <- df[, eval(delete_columns) := NULL] - } +#' @return NULL +#' @export +#' @keywords check-forecasts +#' @examples +#' check <- as_forecast(example_quantile) +#' print(check) +print.scoringutils_check <- function(x, ...) 
{ + cat("Your forecasts seem to be for a target of the following type:\n") + print(x["target_type"]) + cat("and in the following format:\n") + print(x["prediction_type"]) + + cat("The unit of a single forecast is defined by:\n") + print(x["forecast_unit"]) + + cat("Cleaned data, rows with NA values in predicted or observed removed:\n") + print.default(x["cleaned_data"]) + + cat("Number of unique values per column per model:\n") + print.default(x["unique_values"]) + + colnames <- names(x)[names(x) %in% c("messages", "warnings", "errors")] + if (length(colnames) > 0) { + print.default(x[colnames]) } - return(df) + + return(invisible(x)) } -#' @title Check if predictions are quantile forecasts +#' @title Filter function arguments #' -#' @description Internal helper function to check if a data frame contains -#' quantile forecast predictions. This is determined by checking if the -#' "quantile" column is present. +#' @description This function compares a list of arguments with the arguments +#' that a function can accept. It only returns those arguments that can be +#' passed to the function. #' -#' @param data Data frame containing forecast predictions +#' The function is used in [score()] to handle additional arguments passed to +#' [score()] that get then passed along to the different scoring functions. #' -#' @return Logical indicating whether predictions are quantile forecasts +#' @param fun A function to which arguments shall be passed +#' @param args A list of arguments that shall be passed to fun #' +#' @return A list of function arguments (a subset of `args`) that `fun` can +#' accept. #' @keywords internal - -prediction_is_quantile <- function(data) { - if (is.data.frame(data)) { - if ("quantile" %in% names(data)) { - return(TRUE) - } - return(FALSE) +filter_function_args <- function(fun, args) { + # Check if the function accepts ... as an argument + if ("..." %in% names(formals(fun))) { + # If it does, return all arguments + return(args) + } else { + # Identify the arguments that fun() accepts and only keep valid ones + valid_args <- names(formals(fun)) + return(args[names(args) %in% valid_args]) } - stop("Input is not a data.frame") } - -#' @title Get prediction type of a forecast -#' -#' @description Internal helper function to get the prediction type of a -#' forecast. That is inferred based on the properties of the values in the -#' `prediction` column. -#' -#' @inheritParams check_forecasts -#' -#' @return Character vector of length one with either "quantile", "integer", or -#' "continuous". +#' @title Assign attributes to an object from a named list +#' @description +#' Every list item will be made an attribute of the object. +#' @param object An object to assign attributes to +#' @param attribute_list A named list of attributes to assign to the object. 
#' +#' @return The object with new attributes according to the contents of +#' `attribute_list` #' @keywords internal - -get_prediction_type <- function(data) { - if (is.data.frame(data)) { - if ("quantile" %in% names(data)) { - return("quantile") - } else { - if ("prediction" %in% names(data)) { - data <- data$prediction - }else { - stop("Input does not contain a column named 'prediction'") - } - } +assign_attributes <- function(object, attribute_list) { + if (is.null(object)) { + return(NULL) } - - if (isTRUE(all.equal(as.vector(data), as.integer(data))) && - !all(is.na(as.integer(data)))) { - return("integer") - } else if (suppressWarnings(!all(is.na(as.numeric(data))))) { - return("continuous") - } else { - stop("Input is not numeric and cannot be coerced to numeric") + for (i in seq_along(attribute_list)) { + setattr(object, names(attribute_list)[i], attribute_list[[i]]) } + return(object) } -#' @title Get type of the target true values of a forecast -#' -#' @description Internal helper function to get the type of the target -#' true values of a forecast. That is inferred based on the which columns -#' are present in the data. -#' -#' @inheritParams check_forecasts -#' -#' @return Character vector of length one with either "binary", "integer", or -#' "continuous" -#' +#' Strip attributes from an object +#' @description This function removes all attributes from an object that are +#' specified in the `attributes` argument. +#' @param object An object to remove attributes from +#' @param attributes A character vector of attribute names to remove from the +#' object +#' @return The object with attributes removed #' @keywords internal - -get_target_type <- function(data) { - if (isTRUE(all.equal(data$true_value, as.integer(data$true_value)))) { - if (all(data$true_value %in% c(0, 1)) && - all(data$prediction >= 0) && all(data$prediction <= 1)) { - return("binary") - } else { - return("integer") - } - } else { - return("continuous") +strip_attributes <- function(object, attributes) { + if (is.null(object)) { + return(NULL) } -} - - - -#' @title Get unit of a single forecast -#' -#' @description Helper function to get the unit of a single forecast, i.e. -#' the column names that define where a single forecast was made for -#' -#' @inheritParams check_forecasts -#' -#' @return A character vector with the column names that define the unit of -#' a single forecast -#' -#' @keywords internal - -get_forecast_unit <- function(data) { - - protected_columns <- get_protected_columns(data) - if (prediction_is_quantile(data)) { - protected_columns <- setdiff(protected_columns, "sample") + for (i in seq_along(attributes)) { + setattr(object, attributes[i], NULL) } - forecast_unit <- setdiff(colnames(data), protected_columns) - return(forecast_unit) + return(object) } -#' @title Get protected columns from a data frame -#' -#' @description Helper function to get the names of all columns in a data frame -#' that are protected columns. +#' @title Run a function safely +#' @description This is a wrapper function designed to run a function safely +#' when it is not completely clear what arguments could be passed to the +#' function. #' -#' @inheritParams check_forecasts +#' All named arguments in `...` that are not accepted by `fun` are removed. +#' All unnamed arguments are passed on to the function. In case `fun` errors, +#' the error will be converted to a warning and `run_safely` returns `NULL`. 
#' -#' @return A character vector with the names of protected columns in the data +#' `run_safely` can be useful when constructing functions to be used as +#' metrics in [score()]. #' -#' @keywords internal - -get_protected_columns <- function(data) { - - datacols <- colnames(data) - protected_columns <- c( - "prediction", "true_value", "sample", "quantile", "upper", "lower", - "pit_value", "range", "boundary", available_metrics(), - grep("coverage_", names(data), fixed = TRUE, value = TRUE) - ) +#' @param ... Arguments to pass to `fun` +#' @param fun A function to execute +#' @return The result of `fun` or `NULL` if `fun` errors +#' @export +#' @keywords scoring +#' @examples +#' f <- function(x) {x} +#' run_safely(2, fun = f) +#' run_safely(2, y = 3, fun = f) +#' run_safely(fun = f) +#' run_safely(y = 3, fun = f) +run_safely <- function(..., fun) { + args <- list(...) + # Check if the function accepts ... as an argument + if ("..." %in% names(formals(fun))) { + valid_args <- args + } else if (is.null(names(args))) { + # if no arguments are named, just pass all arguments on + valid_args <- args + } else { + # Identify the arguments that fun() accepts + possible_args <- names(formals(fun)) + # keep valid arguments as well as unnamed arguments + valid_args <- args[names(args) == "" | names(args) %in% possible_args] + } - # only return protected columns that are present - protected_columns <- intersect( - datacols, - protected_columns - ) + result <- try(do.call(fun, valid_args), silent = TRUE) - return(protected_columns) + if (inherits(result, "try-error")) { + msg <- conditionMessage(attr(result, "condition")) + warning( + "Function execution failed, returning NULL. Error: ", + msg + ) + return(NULL) + } + return(result) } -#' @title Check whether object has been checked with check_forecasts() -#' -#' @description Helper function to determine whether an object has been checked -#' by and passed [check_forecasts()]. -#' -#' @param data An object of class `scoringutils_check()` as produced by -#' [check_forecasts()]. -#' -#' @importFrom methods is -#' -#' @return Logical, either TRUE or FALSE -#' +#' Ensure That an Object is a Data Table +#' @description This function ensures that an object is a data table. +#' If the object is not a data table, it is converted to one. If the object +#' is a data table, a copy of the object is returned. +#' @param data An object to ensure is a data table +#' @return A data table #' @keywords internal - -is_scoringutils_check <- function(data) { - - result <- is(data, "scoringutils_check") - - if (result && - any(is.null(data$cleaned_data), is.null(data$prediction_type), - is.null(data$forecast_unit), is.null(data$target_type))) { - stop("Input seems to be an output of `scoringutils_check()`, ", - "but at least one of the required list items ", - "'cleaned_data', 'prediction_type', 'forecast_unit', or - 'target_type' is missing. 
Try running check_forecasts() again.") +#' @importFrom data.table copy is.data.table as.data.table +ensure_data.table <- function(data) { + if (is.data.table(data)) { + data <- copy(data) + } else { + data <- as.data.table(data) } - - return(result) + return(data) } diff --git a/R/utils_data_handling.R b/R/utils_data_handling.R index e067fa5b3..3239f33bd 100644 --- a/R/utils_data_handling.R +++ b/R/utils_data_handling.R @@ -32,7 +32,7 @@ merge_pred_and_obs <- function(forecasts, observations, if (is.null(by)) { protected_columns <- c( - "prediction", "true_value", "sample", "quantile", + "predicted", "observed", "sample_id", "quantile", "range", "boundary" ) by <- setdiff(colnames(forecasts), protected_columns) @@ -65,12 +65,14 @@ merge_pred_and_obs <- function(forecasts, observations, basenames_y <- sub(".y$", "", colnames_y) # see whether the column name as well as the content is the same - overlapping <- (as.list(combined[, ..colnames_x]) %in% as.list(combined[, ..colnames_y])) & basenames_x == basenames_y + content_x <- as.list(combined[, ..colnames_x]) + content_y <- as.list(combined[, ..colnames_y]) + overlapping <- (content_x %in% content_y) & (basenames_x == basenames_y) overlap_names <- colnames_x[overlapping] basenames_overlap <- sub(".x$", "", overlap_names) # delete overlapping columns - if (length(basenames_overlap > 0)) { + if (length(basenames_overlap) > 0) { combined[, paste0(basenames_overlap, ".x") := NULL] combined[, paste0(basenames_overlap, ".y") := NULL] } @@ -101,14 +103,15 @@ merge_pred_and_obs <- function(forecasts, observations, sample_to_quantile <- function(data, quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95), type = 7) { - data <- data.table::as.data.table(data) - - reserved_columns <- c("prediction", "sample") + data <- ensure_data.table(data) + reserved_columns <- c("predicted", "sample_id") by <- setdiff(colnames(data), reserved_columns) + quantiles <- unique(round(c(quantiles, 1 - quantiles), digits = 10)) + data <- data[, .(quantile = quantiles, - prediction = quantile(x = prediction, prob = ..quantiles, - type = ..type, na.rm = TRUE)), + predicted = quantile(x = predicted, prob = ..quantiles, + type = ..type, na.rm = TRUE)), by = by] return(data[]) @@ -158,50 +161,112 @@ range_long_to_quantile <- function(data, } -#' @title Change Data from a Plain Quantile Format to a Long Range Format -#' +#' Transform From a Quantile Format to an Interval Format #' @description +#' **Quantile format** +#' In a quantile format, a prediction is characterised by one or multiple +#' predicted values and the corresponding quantile levels. For example, a +#' prediction in a quantile format could be represented by the 0.05, 0.25, 0.5, +#' 0.75 and 0.95 quantiles of the predictive distribution. #' -#' Transform data from a format that uses quantiles only to one that uses -#' interval ranges to denote quantiles. -#' -#' @param data a data.frame in quantile format +#' **Interval format** +#' In the interval format, two quantiles are assumed to form a prediction +#' interval. Prediction intervals need to be symmetric around the median and +#' are characterised by a lower and an upper bound. The lower bound is defined +#' by the lower quantile and the upper bound is defined by the upper quantile. +#' A 90% prediction interval, for example, covers 90% of the probability mass +#' and is defined by the 5% and 95% quantiles. A forecast could therefore +#' be characterised by one or multiple prediction intervals, e.g. 
the lower +#' and upper bounds of the 50% and 90% prediction intervals (corresponding to +#' the 0.25 and 0.75 as well as the 0.05 and 0.95 quantiles). +#' @param ... method arguments +#' @keywords data-handling +#' @export +quantile_to_interval <- function(...) { + UseMethod("quantile_to_interval") +} + + +#' @param dt a data.table with columns `quantile` and `predicted` +#' @param format the format of the output. Either "long" or "wide". If "long" +#' (the default), there will be a column `boundary` (with values either "upper" +#' or "lower") and a column `range` that contains the range of the interval. +#' If "wide", there will be a column `range` and two columns +#' `lower` and `upper` that contain the lower and upper bounds of the +#' prediction interval, respectively. #' @param keep_quantile_col keep the quantile column in the final -#' output after transformation (default is FALSE) -#' @return a data.frame in a long interval range format +#' output after transformation (default is FALSE). This only works if +#' `format = "long"`. If `format = "wide"`, the quantile column will always be +#' dropped. +#' @return +#' *quantile_to_interval.data.frame*: +#' a data.frame in an interval format (either "long" or "wide"), with or +#' without a quantile column. Rows will not be reordered. #' @importFrom data.table copy -#' @keywords internal +#' @export +#' @rdname quantile_to_interval +#' @keywords data-handling +quantile_to_interval.data.frame <- function(dt, + format = "long", + keep_quantile_col = FALSE, + ...) { + dt <- ensure_data.table(dt) -quantile_to_range_long <- function(data, - keep_quantile_col = TRUE) { - data <- data.table::as.data.table(data) - - data[, boundary := ifelse(quantile <= 0.5, "lower", "upper")] - data[, range := ifelse( - boundary == "lower", - round((1 - 2 * quantile) * 100, 10), - round((2 * quantile - 1) * 100, 10) - )] + dt[, boundary := ifelse(quantile <= 0.5, "lower", "upper")] + dt[, range := get_range_from_quantile(quantile)] # add median quantile - median <- data[quantile == 0.5, ] + median <- dt[quantile == 0.5, ] median[, boundary := "upper"] - data <- data.table::rbindlist(list(data, median)) - + dt <- data.table::rbindlist(list(dt, median)) if (!keep_quantile_col) { - data[, "quantile" := NULL] + dt[, quantile := NULL] } - # if only point forecasts are scored, we only have NA values for range and - # boundary. In that instance we need to set the type of the columns - # explicitly to avoid future collisions. - data[, `:=`( - boundary = as.character(boundary), - range = as.numeric(range) - )] + if (format == "wide") { + suppressWarnings(dt[, "quantile" := NULL]) + dt <- dcast(dt, ... ~ boundary, value.var = "predicted") + # if there are NA values in `predicted`, this introduces a column "NA" + if ("NA" %in% colnames(dt) && all(is.na(dt[["NA"]]))) { + dt[, "NA" := NULL] + } + } + return(dt[]) +} - return(data[]) + +#' @param observed a numeric vector of observed values of size n +#' @param predicted a numeric vector of predicted values of size n x N. If +#' `observed` is a single number, then `predicted` can be a vector of length N +#' @param quantile a numeric vector of quantile levels of size N +#' @return +#' *quantile_to_interval.numeric*: +#' a data.frame in a wide interval format with columns `forecast_id`, +#' `observed`, `lower`, `upper`, and `range`. The `forecast_id` column is a +#' unique identifier for each forecast. Rows will be reordered according to +#' `forecast_id` and `range`.
+#' @export +#' @rdname quantile_to_interval +#' @keywords data-handling +quantile_to_interval.numeric <- function(observed, + predicted, + quantile, + ...) { + assert_input_quantile(observed, predicted, quantile) + + n <- length(observed) + N <- length(quantile) + + dt <- data.table( + forecast_id = rep(1:n, each = N), + observed = rep(observed, each = N), + predicted = as.vector(t(predicted)), + quantile = quantile + ) + out <- quantile_to_interval(dt, format = "wide") + out <- out[order(forecast_id, range)] + return(out) } @@ -240,7 +305,29 @@ sample_to_range_long <- function(data, type = type ) - data <- quantile_to_range_long(data, keep_quantile_col = keep_quantile_col) + data <- quantile_to_interval(data, keep_quantile_col = keep_quantile_col) return(data[]) } + +#' Get Range Belonging to a Quantile +#' @description Every quantile can be thought of either as the lower or the +#' upper bound of a symmetric central prediction interval. This helper function +#' returns the range of the central prediction interval to which the quantile +#' belongs. +#' +#' Due to numeric instability that sometimes occurred in the past, ranges are +#' rounded to 10 decimal places. This is not a problem for the vast majority of +#' use cases, but it is something to be aware of. +#' @param quantile a numeric vector of quantile levels of size N +#' @return a numeric vector of interval ranges of size N +#' @keywords internal +get_range_from_quantile <- function(quantile) { + boundary <- ifelse(quantile <= 0.5, "lower", "upper") + range <- ifelse( + boundary == "lower", + round((1 - 2 * quantile) * 100, digits = 10), + round((2 * quantile - 1) * 100, digits = 10) + ) + return(range) +} diff --git a/R/validate.R b/R/validate.R new file mode 100644 index 000000000..e737ce003 --- /dev/null +++ b/R/validate.R @@ -0,0 +1,222 @@ +#' @title Create a `forecast` Object +#' @description Convert a data.frame or similar of forecasts into an object of +#' class `forecast_*` and validate it. +#' +#' `as_forecast()` determines the forecast type (binary, point, sample-based or +#' quantile-based) from the input data (using the function +#' [get_forecast_type()]. It then constructs an object of the +#' appropriate class (`forecast_binary`, `forecast_point`, `forecast_sample`, or +#' `forecast_quantile`, using the function [new_forecast()]). +#' Lastly, it calls [as_forecast()] on the object to make sure it conforms with +#' the required input formats. +#' @inheritParams score +#' @inheritSection forecast_types Forecast types and input format +#' @return Depending on the forecast type, an object of class +#' `forecast_binary`, `forecast_point`, `forecast_sample` or +#' `forecast_quantile`. +#' @export +#' @keywords check-forecasts +#' @examples +#' as_forecast(example_binary) +#' as_forecast(example_quantile) +as_forecast <- function(data, ...) { + UseMethod("as_forecast") +} + +#' @rdname as_forecast +#' @export +as_forecast.default <- function(data, ...) { + assert(check_data_columns(data)) + + # find forecast type + forecast_type <- get_forecast_type(data) + + # construct class + data <- new_forecast(data, paste0("forecast_", forecast_type)) + + # validate class + validate_forecast(data) +} + + +#' @title Validate input data +#' +#' @description +#' Methods for the different classes run [validate_general()], which performs +#' checks that are the same for all forecast types and then perform specific +#' checks for the specific forecast type. 
+#' @inheritParams score +#' @inheritSection forecast_types Forecast types and input format +#' @return Depending on the forecast type, an object of class +#' `forecast_binary`, `forecast_point`, `forecast_sample` or +#' `forecast_quantile`. +#' @importFrom data.table ':=' is.data.table +#' @importFrom checkmate assert_data_frame +#' @export +#' @keywords check-forecasts +#' @examples +#' forecast <- as_forecast(example_binary) +#' validate_forecast(forecast) +validate_forecast <- function(data, ...) { + UseMethod("validate_forecast") +} + + +#' @export +#' @keywords check-forecasts +validate_forecast.forecast_binary <- function(data, ...) { + data <- validate_general(data) + + columns_correct <- test_columns_not_present(data, c("sample_id", "quantile")) + if (!columns_correct) { + stop("Checking `data`: Input looks like a binary forecast, but an ", + "additional column called `sample_id` or `quantile` was found. ", + "Please remove the column.") + } + input_check <- check_input_binary(data$observed, data$predicted) + if (!is.logical(input_check)) { + stop("Checking `data`: ", + "Input looks like a binary forecast, but found the following issue: ", + input_check) + } + return(data[]) +} + + +#' @export +#' @keywords check-forecasts +validate_forecast.forecast_point <- function(data, ...) { + data <- validate_general(data) + + input_check <- check_input_point(data$observed, data$predicted) + if (!is.logical(input_check)) { + stop("Checking `data`: ", + "Input looks like a point forecast, but found the following issue: ", + input_check) + } + return(data[]) +} + + +#' @export +#' @keywords check-forecasts +validate_forecast.forecast_quantile <- function(data, ...) { + data <- validate_general(data) + assert_numeric(data$quantile, lower = 0, upper = 1) + return(data[]) +} + + +#' @export +#' @keywords check-forecasts +validate_forecast.forecast_sample <- function(data, ...) { + data <- validate_general(data) + return(data[]) +} + + +#' @title Apply scoringutils input checks that are the same across forecast types +#' +#' @description +#' The function runs input checks that apply to all input data, regardless of +#' forecast type. The function +#' - asserts that the data is a data.table which has columns `observed` and +#' `predicted`, as well as a column called `model`.
+#' - checks the forecast type and forecast unit +#' - checks there are no duplicate forecasts +#' - if appropriate, checks the number of samples / quantiles is the same +#' for all forecasts +#' @inheritParams get_forecast_counts +#' @return returns the input, with a few new attributes that hold additional +#' information, messages and warnings +#' @importFrom data.table ':=' is.data.table +#' @importFrom checkmate assert_data_table +#' @export +#' @keywords internal_input_check +validate_general <- function(data) { + # check that data is a data.table and that the columns look fine + assert_data_table(data) + assert(check_data_columns(data)) + data <- assure_model_column(data) + + # check that there aren't any duplicated forecasts + forecast_unit <- get_forecast_unit(data) + assert(check_duplicates(data, forecast_unit = forecast_unit)) + + # check that the number of forecasts per sample / quantile is the same + number_quantiles_samples <- check_number_per_forecast(data, forecast_unit) + if (!is.logical(number_quantiles_samples)) { + warning(number_quantiles_samples) + } + + # check whether there are any NA values + if (anyNA(data)) { + if (nrow(na.omit(data)) == 0) { + stop( + "After removing rows with NA values in the data, no forecasts are left." + ) + } + message( + "Some rows containing NA values may be removed. ", + "This is fine if not unexpected." + ) + } + + return(data[]) +} + + +#' @title Class constructor for scoringutils objects +#' +#' @description +#' Construct a class based on a data.frame or similar. The constructor +#' - coerces the data into a data.table +#' - makes sure that a column called `model` exists and if not creates one +#' - assigns a class +#' +#' @inheritParams get_forecast_counts +#' @param classname name of the class to be created +#' @return An object of the class indicated by `classname` +#' @export +#' @keywords internal +new_forecast <- function(data, classname) { + data <- as.data.table(data) + data <- assure_model_column(data) + class(data) <- c(classname, class(data)) + data <- copy(data) + return(data[]) +} + + +#' @title Validate metrics +#' +#' @description This function validates whether the list of metrics is a list +#' of valid functions. +#' +#' The function is used in [score()] to make sure that all metrics are valid +#' functions +#' +#' @param metrics A named list with metrics. Every element should be a scoring +#' function to be applied to the data. 
+#' +#' @return A named list of metrics, with those filtered out that are not +#' valid functions +#' @importFrom checkmate assert_list test_list check_function +#' @keywords internal_input_check +validate_metrics <- function(metrics) { + + assert_list(metrics, min.len = 1, names = "named") + + for (i in seq_along(metrics)) { + check_fun <- check_function(metrics[[i]]) + if (!is.logical(check_fun)) { + warning("`Metrics` element number ", i, " is not a valid function") + names(metrics)[i] <- "scoringutils_delete" + } + } + metrics[names(metrics) == "scoringutils_delete"] <- NULL + + assert_list(metrics, min.len = 1, .var.name = "valid metrics") + + return(metrics) +} diff --git a/R/z_globalVariables.R b/R/z_globalVariables.R new file mode 100644 index 000000000..bcd591290 --- /dev/null +++ b/R/z_globalVariables.R @@ -0,0 +1,84 @@ +globalVariables(c( + "..index", + "..quantiles", + "..type", + ".", + ".SD", + "adj_pval", + "ae_point", + "ae_median", + "boundary", + "bias", + "brier_score", + "component_value", + "..colnames_x", + "..colnames_y", + "..samplecols", + "calibration", + "compare_against", + "coverage", + "count", + "coverage_deviation", + "CRPS", + "crps", + "DSS", + "dss", + "existing", + "fill_col", + "forecast_id", + "hist", + "identifCol", + "Interval_Score", + "interval_range", + "interval_coverage", + "interval_coverage_deviation", + "overprediction", + "underprediction", + "quantile_coverage", + "quantile_coverage_deviation", + "LogS", + "log_score", + "lower", + "mad", + "mean_scores_ratio", + "metric", + "metrics_select", + "metrics", + "rules_binary", + "rules_point", + "rules_quantile", + "rules_sample", + "model", + "n_obs", + "n_obs wis_component_name", + "Count", + "pit_p_val", + "pit_value", + "point", + "predicted", + "pval", + "quantile", + "ratio", + "rel_to_baseline", + "relative_skill", + "rn", + "sample_id", + "scoringutils_InternalDuplicateCheck", + "scoringutils_InternalNumCheck", + "se_mean", + "sharpness", + "theta", + "observed", + "type", + "upper", + "value", + "value_scaled", + "var_of_interest", + "variable", + "weight", + "wis", + "wis_component_name", + "x", + "y", + "g" +)) diff --git a/README.Rmd b/README.Rmd index 59055342b..c21583696 100644 --- a/README.Rmd +++ b/README.Rmd @@ -26,11 +26,46 @@ library(magrittr) library(data.table) library(ggplot2) library(knitr) + +## code to determine version inspired by [pkgdown:::dev_mode_auto()] +version <- packageVersion("scoringutils") +devel <- length(unclass(package_version(version))[[1]]) > 3 +``` + +```{r note_dev, results = 'asis', echo = FALSE} +if (devel) { + cat( + "**Note**: ", + "This documentation refers to the development version of `scoringutils`. ", + "You can also view the [documentation of the stable version]", + "(https://epiforecasts.io/scoringutils).", + sep = "" + ) +} else { + cat( + "**Note**: ", + "This documentation refers to the stable version of `scoringutils`. ", + "You can also view the [documentation of the development version]", + "(https://epiforecasts.io/scoringutils/dev).", + sep = "" + ) +} +cat("\n\n") ``` -The `scoringutils` package provides a collection of metrics and proper scoring rules and aims to make it simple to score probabilistic forecasts against the true observed values. +The `scoringutils` package provides a collection of metrics and proper scoring rules and aims to make it simple to score probabilistic forecasts against observed values. 
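To make the revised workflow concrete, here is a minimal sketch of scoring with the new interface used in the README example further down in this diff. It assumes the bundled `example_quantile` data and the functions `as_forecast()`, `score()` and `summarise_scores()` as they appear in these changes; the exact columns of the resulting score table may differ.

```r
library(scoringutils)
library(magrittr)

# Sketch (assumed API, mirroring the README example below): validate the
# example data, score each forecast, then average scores by model and target.
example_quantile %>%
  as_forecast() %>%
  score() %>%
  summarise_scores(by = c("model", "target_type"))
```

Relative skill against a baseline model can then be added with `add_pairwise_comparison()`, as shown in the full README example below.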
+ +A good starting point for those wishing to use `scoringutils` are the vignettes on [Getting started](https://epiforecasts.io/scoringutils/articles/scoringutils.html), [Details on the metrics implemented](https://epiforecasts.io/scoringutils/articles/metric-details.html) and [Scoring forecasts directly](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html). + +For a detailed description of the package, its rationale and design, usage examples and how it relates to other packages in the R ecosystem, please see the corresponding paper: -You can find additional information and examples in the papers [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) [Scoring epidemiological forecasts on transformed scales](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1011393) as well as the Vignettes ([Getting started](https://epiforecasts.io/scoringutils/articles/scoringutils.html), [Details on the metrics implemented](https://epiforecasts.io/scoringutils/articles/metric-details.html) and [Scoring forecasts directly](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html)). +> Nikos I. Bosse, Hugo Gruson, Anne Cori, Edwin van Leeuwen, Sebastian Funk and Sam Abbott (2022). _`Evaluating Forecasts with scoringutils in R`_. arXiv:2205.07090 + +For further details on the specific issue of transforming forecasts for scoring see: + +> Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher\* and Sebastian Funk\* (\*: equal contribution) (2023). _`Scoring epidemiological forecasts on transformed scales`_, PLoS Comput Biol 19(8): e1011393 + +## Package overview The `scoringutils` package offers convenient automated forecast evaluation through the function `score()`. The function operates on data.frames (it uses `data.table` internally for speed and efficiency) and can easily be integrated in a workflow based on `dplyr` or `data.table`. It also provides experienced users with a set of reliable lower-level scoring metrics operating on vectors/matrices they can build upon in other applications. In addition it implements a wide range of flexible plots designed to cover many use cases. @@ -85,17 +120,18 @@ example_quantile %>% ### Scoring forecasts -Forecasts can be easily and quickly scored using the `score()` function. `score()` automatically tries to determine the `forecast_unit`, i.e. the set of columns that uniquely defines a single forecast, by taking all column names of the data into account. However, it is recommended to set the forecast unit manually using `set_forecast_unit()` as this may help to avoid errors, especially when scoringutils is used in automated pipelines. The function `set_forecast_unit()` will simply drop unneeded columns. To verify everything is in order, the function `check_forecasts()` should be used. The result of that check can then passed directly into `score()`. `score()` returns unsummarised scores, which in most cases is not what the user wants. Here we make use of additional functions from `scoringutils` to add empirical coverage-levels (`add_coverage()`), and scores relative to a baseline model (here chosen to be the EuroCOVIDhub-ensemble model). See the getting started vignette for more details. Finally we summarise these scores by model and target type. +Forecasts can be easily and quickly scored using the `score()` function. `score()` automatically tries to determine the `forecast_unit`, i.e. 
the set of columns that uniquely defines a single forecast, by taking all column names of the data into account. However, it is recommended to set the forecast unit manually using `set_forecast_unit()` as this may help to avoid errors, especially when scoringutils is used in automated pipelines. The function `set_forecast_unit()` will simply drop unneeded columns. To verify everything is in order, the function `validate_forecast()` should be used. The result of that check can then be passed directly into `score()`. `score()` returns unsummarised scores, which in most cases is not what the user wants. Here we make use of additional functions from `scoringutils` to add empirical coverage-levels (`add_coverage()`), and scores relative to a baseline model (here chosen to be the EuroCOVIDhub-ensemble model). See the getting started vignette for more details. Finally we summarise these scores by model and target type. ```{r score-example} example_quantile %>% set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>% - check_forecasts() %>% + as_forecast() %>% + add_coverage() %>% score() %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores( - by = c("model", "target_type"), - relative_skill = TRUE, + by = c("model", "target_type") + ) %>% + add_pairwise_comparison( baseline = "EuroCOVIDhub-ensemble" ) %>% summarise_scores( @@ -111,7 +147,7 @@ You may want to score forecasts based on transformations of the original data in ```{r} example_quantile %>% - .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% + .[, observed := ifelse(observed < 0, 0, observed)] %>% transform_forecasts(append = TRUE, fun = log_shift, offset = 1) %>% score %>% summarise_scores(by = c("model", "target_type", "scale")) %>% diff --git a/README.md b/README.md index 96d12d06c..50ebe6751 100644 --- a/README.md +++ b/README.md @@ -12,21 +12,41 @@ version](https://img.shields.io/github/r-package/v/epiforecasts/scoringutils) downloads](http://cranlogs.r-pkg.org/badges/grand-total/scoringutils)](https://cran.r-project.org/package=scoringutils) +**Note**: This documentation refers to the development version of +`scoringutils`. You can also view the [documentation of the stable +version](https://epiforecasts.io/scoringutils). + The `scoringutils` package provides a collection of metrics and proper scoring rules and aims to make it simple to score probabilistic -forecasts against the true observed values. - -You can find additional information and examples in the papers -[Evaluating Forecasts with scoringutils in -R](https://arxiv.org/abs/2205.07090) [Scoring epidemiological forecasts -on transformed -scales](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1011393) -as well as the Vignettes ([Getting +forecasts against observed values. + +A good starting point for those wishing to use `scoringutils` are the +vignettes on [Getting started](https://epiforecasts.io/scoringutils/articles/scoringutils.html), [Details on the metrics implemented](https://epiforecasts.io/scoringutils/articles/metric-details.html) and [Scoring forecasts -directly](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html)). +directly](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html). + +For a detailed description of the package, its rationale and design, +usage examples and how it relates to other packages in the R ecosystem, +please see the corresponding paper: + +> Nikos I.
Bosse, Hugo Gruson, Anne Cori, Edwin van Leeuwen, Sebastian +> Funk and Sam Abbott (2022). +> *`Evaluating Forecasts with scoringutils in R`*. arXiv:2205.07090 +> + +For further details on the specific issue of transforming forecasts for +scoring see: + +> Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes +> Bracher\* and Sebastian Funk\* (\*: equal contribution) (2023). +> *`Scoring epidemiological forecasts on transformed scales`*, PLoS +> Comput Biol 19(8): e1011393 +> + +## Package overview The `scoringutils` package offers convenient automated forecast evaluation through the function `score()`. The function operates on @@ -117,7 +137,7 @@ the forecast unit manually using `set_forecast_unit()` as this may help to avoid errors, especially when scoringutils is used in automated pipelines. The function `set_forecast_unit()` will simply drop unneeded columns. To verify everything is in order, the function -`check_forecasts()` should be used. The result of that check can then +`validate_forecast()` should be used. The result of that check can then passed directly into `score()`. `score()` returns unsummarised scores, which in most cases is not what the user wants. Here we make use of additional functions from `scoringutils` to add empirical @@ -129,12 +149,13 @@ scores by model and target type. ``` r example_quantile %>% set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>% - check_forecasts() %>% + as_forecast() %>% + add_coverage() %>% score() %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores( - by = c("model", "target_type"), - relative_skill = TRUE, + by = c("model", "target_type") + ) %>% + add_pairwise_comparison( baseline = "EuroCOVIDhub-ensemble" ) %>% summarise_scores( @@ -142,19 +163,20 @@ example_quantile %>% digits = 2 ) %>% kable() -#> The following messages were produced when checking inputs: -#> 1. 144 values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected. +#> Some rows containing NA values may be removed. This is fine if not unexpected. +#> Some rows containing NA values may be removed. This is fine if not unexpected. +#> Some rows containing NA values may be removed. This is fine if not unexpected. 
``` -| model | target_type | interval_score | dispersion | underprediction | overprediction | coverage_deviation | bias | ae_median | coverage_50 | coverage_90 | relative_skill | scaled_rel_skill | -|:----------------------|:------------|---------------:|-----------:|----------------:|---------------:|-------------------:|--------:|----------:|------------:|------------:|---------------:|-----------------:| -| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.30 | 1.6 | -| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 2.30 | 3.8 | -| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.82 | 1.0 | -| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.60 | 1.0 | -| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.75 | 1.3 | -| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.95 | 1.2 | -| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.98 | 1.6 | +| model | target_type | wis | overprediction | underprediction | dispersion | bias | interval_coverage_50 | interval_coverage_90 | interval_coverage_deviation | ae_median | relative_skill | scaled_rel_skill | +|:----------------------|:------------|------:|---------------:|----------------:|-----------:|--------:|---------------------:|---------------------:|----------------------------:|----------:|---------------:|-----------------:| +| EuroCOVIDhub-baseline | Cases | 28000 | 14000.0 | 10000.0 | 4100 | 0.0980 | 0.33 | 0.82 | -0.120 | 38000 | 1.30 | 1.6 | +| EuroCOVIDhub-baseline | Deaths | 160 | 66.0 | 2.1 | 91 | 0.3400 | 0.66 | 1.00 | 0.120 | 230 | 2.30 | 3.8 | +| EuroCOVIDhub-ensemble | Cases | 18000 | 10000.0 | 4200.0 | 3700 | -0.0560 | 0.39 | 0.80 | -0.100 | 24000 | 0.82 | 1.0 | +| EuroCOVIDhub-ensemble | Deaths | 41 | 7.1 | 4.1 | 30 | 0.0730 | 0.88 | 1.00 | 0.200 | 53 | 0.60 | 1.0 | +| UMass-MechBayes | Deaths | 53 | 9.0 | 17.0 | 27 | -0.0220 | 0.46 | 0.88 | -0.025 | 78 | 0.75 | 1.3 | +| epiforecasts-EpiNow2 | Cases | 21000 | 12000.0 | 3300.0 | 5700 | -0.0790 | 0.47 | 0.79 | -0.070 | 28000 | 0.95 | 1.2 | +| epiforecasts-EpiNow2 | Deaths | 67 | 19.0 | 16.0 | 32 | -0.0051 | 0.42 | 0.91 | -0.045 | 100 | 0.98 | 1.6 | `scoringutils` contains additional functionality to transform forecasts, to summarise scores at different levels, to visualise them, and to @@ -171,27 +193,33 @@ applying the natural logarithm. ``` r example_quantile %>% - .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% + .[, observed := ifelse(observed < 0, 0, observed)] %>% transform_forecasts(append = TRUE, fun = log_shift, offset = 1) %>% score %>% summarise_scores(by = c("model", "target_type", "scale")) %>% head() -#> The following messages were produced when checking inputs: -#> 1. 288 values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected. 
-#> model target_type scale interval_score dispersion -#> 1: EuroCOVIDhub-baseline Cases log 1.169972e+00 0.4373146 -#> 2: EuroCOVIDhub-baseline Cases natural 2.209046e+04 4102.5009443 -#> 3: EuroCOVIDhub-ensemble Cases log 5.500974e-01 0.1011850 -#> 4: EuroCOVIDhub-ensemble Cases natural 1.155071e+04 3663.5245788 -#> 5: epiforecasts-EpiNow2 Cases log 6.005778e-01 0.1066329 -#> 6: epiforecasts-EpiNow2 Cases natural 1.443844e+04 5664.3779484 -#> underprediction overprediction coverage_deviation bias ae_median -#> 1: 3.521964e-01 0.3804607 -0.10940217 0.09726562 1.185905e+00 -#> 2: 1.028497e+04 7702.9836957 -0.10940217 0.09726562 3.208048e+04 -#> 3: 1.356563e-01 0.3132561 -0.09785326 -0.05640625 7.410484e-01 -#> 4: 4.237177e+03 3650.0047554 -0.09785326 -0.05640625 1.770795e+04 -#> 5: 1.858699e-01 0.3080750 -0.06660326 -0.07890625 7.656591e-01 -#> 6: 3.260356e+03 5513.7058424 -0.06660326 -0.07890625 2.153070e+04 +#> Some rows containing NA values may be removed. This is fine if not unexpected. +#> model target_type scale wis overprediction +#> 1: EuroCOVIDhub-ensemble Cases natural 11550.70664 3650.004755 +#> 2: EuroCOVIDhub-baseline Cases natural 22090.45747 7702.983696 +#> 3: epiforecasts-EpiNow2 Cases natural 14438.43943 5513.705842 +#> 4: EuroCOVIDhub-ensemble Deaths natural 41.42249 7.138247 +#> 5: EuroCOVIDhub-baseline Deaths natural 159.40387 65.899117 +#> 6: UMass-MechBayes Deaths natural 52.65195 8.978601 +#> underprediction dispersion bias interval_coverage_50 +#> 1: 4237.177310 3663.52458 -0.05640625 0.3906250 +#> 2: 10284.972826 4102.50094 0.09726562 0.3281250 +#> 3: 3260.355639 5664.37795 -0.07890625 0.4687500 +#> 4: 4.103261 30.18099 0.07265625 0.8750000 +#> 5: 2.098505 91.40625 0.33906250 0.6640625 +#> 6: 16.800951 26.87239 -0.02234375 0.4609375 +#> interval_coverage_90 interval_coverage_deviation ae_median +#> 1: 0.8046875 -0.10230114 17707.95312 +#> 2: 0.8203125 -0.11437500 32080.48438 +#> 3: 0.7890625 -0.06963068 21530.69531 +#> 4: 1.0000000 0.20380682 53.13281 +#> 5: 1.0000000 0.12142045 233.25781 +#> 6: 0.8750000 -0.02488636 78.47656 ``` ## Citation diff --git a/_pkgdown.yml b/_pkgdown.yml index 8f21f55aa..ea4f1e2a0 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -21,14 +21,14 @@ reference: - title: Package documentation contents: - scoringutils - - title: Functions to check and analyse inputs + - title: Functions to analyse inputs and obtain useful information contents: - has_keyword("check-forecasts") - title: Functions for convenient forecast evaluation contents: - score - has_keyword("scoring") - - starts_with("score_") + - starts_with("score") - title: Lower-level scoring functions contents: - has_keyword("metric") @@ -37,9 +37,12 @@ reference: - has_keyword("data-handling") - title: Functions for plotting and data visualisation contents: - - starts_with("plot_") + - starts_with("plot") - has_keyword("plotting") - - title: Internal functions + - title: Internal input check functions + contents: + - has_keyword("internal_input_check") + - title: Misc internal functions contents: - has_keyword("internal") - title: Example data and information diff --git a/data/example_binary.rda b/data/example_binary.rda index 69103a8f4..e79959e10 100644 Binary files a/data/example_binary.rda and b/data/example_binary.rda differ diff --git a/data/example_continuous.rda b/data/example_continuous.rda index b668deea1..c8fdbbb02 100644 Binary files a/data/example_continuous.rda and b/data/example_continuous.rda differ diff --git a/data/example_integer.rda b/data/example_integer.rda index 
df3d161e3..334dd8384 100644 Binary files a/data/example_integer.rda and b/data/example_integer.rda differ diff --git a/data/example_point.rda b/data/example_point.rda index 04c4325b5..c657e0ba0 100644 Binary files a/data/example_point.rda and b/data/example_point.rda differ diff --git a/data/example_quantile.rda b/data/example_quantile.rda index d8127147c..47bcb36a0 100644 Binary files a/data/example_quantile.rda and b/data/example_quantile.rda differ diff --git a/data/example_quantile_forecasts_only.rda b/data/example_quantile_forecasts_only.rda index ef4bdee7e..cff223826 100644 Binary files a/data/example_quantile_forecasts_only.rda and b/data/example_quantile_forecasts_only.rda differ diff --git a/data/example_truth_only.rda b/data/example_truth_only.rda index ae829a5b3..d53839fcf 100644 Binary files a/data/example_truth_only.rda and b/data/example_truth_only.rda differ diff --git a/inst/create-example-data.R b/inst/create-example-data.R index f445a6742..c80f9506e 100644 --- a/inst/create-example-data.R +++ b/inst/create-example-data.R @@ -2,7 +2,7 @@ library(data.table) library(dplyr) library(devtools) library(here) -library(covidHubUtils) # devtools::install_github("reichlab/covidHubUtils") # nolint +library(covidHubUtils) # devtools::install_github("reichlab/covidHubUtils@v0.1.8") # nolint library(purrr) library(data.table) library(stringr) @@ -25,7 +25,7 @@ truth <- covidHubUtils::load_truth(hub = "ECDC") |> )) |> rename( target_type = target_variable, - true_value = value + observed = value ) |> select(-model) @@ -70,13 +70,13 @@ prediction_data <- map_dfr(file_paths, ), horizon = as.numeric(substr(target, 1, 1)) ) %>% - rename(prediction = value) %>% + rename(predicted = value) %>% filter( type == "quantile", grepl("inc", target) ) %>% select( - location, forecast_date, quantile, prediction, + location, forecast_date, quantile, predicted, model, target_end_date, target, target_type, horizon ) @@ -105,6 +105,8 @@ truth <- truth |> # save example data with forecasts only example_quantile_forecasts_only <- hub_data +example_quantile_forecasts_only[, quantile := round(quantile, 3)] + usethis::use_data(example_quantile_forecasts_only, overwrite = TRUE) example_truth_only <- truth @@ -119,7 +121,7 @@ usethis::use_data(example_quantile, overwrite = TRUE) # create data with point forecasts --------------------------------------------- example_point <- data.table::copy(example_quantile) -example_point[quantile == 0.5, quantile := NA_real_] +example_point <- example_point[quantile %in% c(NA, 0.5)][, quantile := NULL] usethis::use_data(example_point, overwrite = TRUE) @@ -165,13 +167,13 @@ get_samples <- function(values, quantiles, n_samples = 1000) { setDT(example_quantile) n_samples <- 40 example_continuous <- example_quantile[, .( - prediction = get_samples( - prediction, + predicted = get_samples( + predicted, quantile, n_samples = n_samples ), - sample = 1:n_samples, - true_value = unique(true_value) + sample_id = 1:n_samples, + observed = unique(observed) ), by = c( "location", "location_name", @@ -180,20 +182,17 @@ by = c( ) ] # remove unnecessary rows where no predictions are available -example_continuous[is.na(prediction), sample := NA] +example_continuous[is.na(predicted), sample_id := NA] example_continuous <- unique(example_continuous) usethis::use_data(example_continuous, overwrite = TRUE) # get integer sample data ------------------------------------------------------ example_integer <- data.table::copy(example_continuous) -example_integer <- example_integer[, prediction := 
round(prediction)] +example_integer <- example_integer[, predicted := round(predicted)] usethis::use_data(example_integer, overwrite = TRUE) - - - # get binary example data ------------------------------------------------------ # construct a binary prediction by looking at the number of samples below the # mean prediction. Construct the outcome as whether or not the actually @@ -209,22 +208,23 @@ by <- c( ) # calculate mean value -example_binary[, mean_val := mean(prediction), +example_binary[, mean_val := mean(predicted), by = by ] # calculate binary prediction as percentage above mean -example_binary[, prediction := mean(prediction > mean_val), +example_binary[, predicted := mean(predicted > mean_val), by = by ] # calculate true value as whether or not observed was above mean -example_binary[, true_value := true_value > mean_val] +example_binary[, observed := observed > mean_val] # delete unnecessary columns and take unique values example_binary[, `:=`( - sample = NULL, mean_val = NULL, - true_value = as.numeric(true_value) + sample_id = NULL, mean_val = NULL, + observed = factor(as.numeric(observed)) )] example_binary <- unique(example_binary) + usethis::use_data(example_binary, overwrite = TRUE) diff --git a/inst/create-metric-tables.R b/inst/create-metric-tables.R index d0dffe342..56accd9d4 100644 --- a/inst/create-metric-tables.R +++ b/inst/create-metric-tables.R @@ -419,4 +419,4 @@ data <- rbind(as.data.frame(crps), as.data.frame(mean_score_ratio), as.data.frame(relative_skill)) -saveRDS(data, "inst/metrics-overview/metrics-detailed.Rda") +saveRDS(data, "inst/metrics-overview/metrics-detailed.rds") diff --git a/inst/manuscript/R/00-standalone-Figure-replication.R b/inst/manuscript/R/00-standalone-Figure-replication.R index 70bb7776b..373f25ec4 100644 --- a/inst/manuscript/R/00-standalone-Figure-replication.R +++ b/inst/manuscript/R/00-standalone-Figure-replication.R @@ -29,7 +29,6 @@ p2 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 1.25)), aes(x = x)) + - # geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(sd = 1.25)) + expand_limits(y = c(0, 1.0), x = c(-3, 3)) + @@ -40,7 +39,7 @@ p2 <- p21 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 1.05)), aes(x = x)) + - geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_histogram(aes(x = x_example, y = after_stat(density)), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(sd = 1)) + ggtitle("Well calibrated") + @@ -50,7 +49,7 @@ p21 <- ggplot(data.frame(x = seq(-8, 8, 0.01), p22 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 1, sd = 1.05)), aes(x = x)) + - geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_histogram(aes(x = x_example, y = after_stat(density)), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(mean = 2, sd = 1)) + ggtitle("Badly calibrated") + @@ -60,7 +59,7 @@ p22 <- ggplot(data.frame(x = seq(-8, 8, 0.01), p23 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 1.05)), aes(x = x)) + - geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_histogram(aes(x = x_example, y = after_stat(density)), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", 
args = list(mean = 0, sd = 2.05)) + ggtitle("Badly calibrated") + @@ -98,18 +97,18 @@ df[, model := factor(`model`, levels = c("Pred: N(0, 1)", "Pred: N(0.5, 1)", "Pred: N(0, 2)", "Pred: N(0, 0.5)"))] -if (!file.exists("inst/manuscript/output/calibration-diagnostic-examples.Rda")) { +if (!file.exists("inst/manuscript/output/calibration-diagnostic-examples.rds")) { res <- score(df) pit <- pit(df, by = "model") stored <- list(res = res, pit = pit) - saveRDS(stored, "inst/manuscript/output/calibration-diagnostic-examples.Rda") + saveRDS(stored, "inst/manuscript/output/calibration-diagnostic-examples.rds") } else { - stored <- readRDS("inst/manuscript/output/calibration-diagnostic-examples.Rda") + stored <- readRDS("inst/manuscript/output/calibration-diagnostic-examples.rds") } res_summarised <- summarise_scores(stored$res,by = "model") @@ -127,10 +126,10 @@ scores_table_plot <- summarise_scores(res_summarised, fun = signif, digits = 2) pred_hist <- df |> ggplot(aes(x = true_value)) + facet_wrap(~ model, nrow = 1) + - geom_histogram(aes(y=..density..), + geom_histogram(aes(y=after_stat(density)), fill = "grey", colour = "dark grey") + - geom_density(aes(y=..density.., x = prediction), + geom_density(aes(y=after_stat(density), x = prediction), colour = "black") + theme_scoringutils() + labs(y = "Density", x = "Value") @@ -197,7 +196,7 @@ true_crps <- scoringRules::crps(y = 0, family = "normal", mean = mu, sd = sd) true_logs <- scoringRules::logs(y = 0, family = "normal", mean = mu, sd = sd) true_dss <- scoringRules::dss_norm(y = 0, mean = mu, sd = sd) -if (!file.exists("inst/manuscript/output/sample-convergence.Rda")) { +if (!file.exists("inst/manuscript/output/sample-convergence.rds")) { results <- list() for (i in sample_sizes) { samples <- as.data.table( @@ -214,9 +213,9 @@ if (!file.exists("inst/manuscript/output/sample-convergence.Rda")) { samples, metrics = c("crps", "log_score", "dss") )[, n_samples := i] } - saveRDS(results, "inst/manuscript/output/sample-convergence.Rda") + saveRDS(results, "inst/manuscript/output/sample-convergence.rds") } else { - results <- readRDS("inst/manuscript/output/sample-convergence.Rda") + results <- readRDS("inst/manuscript/output/sample-convergence.rds") } results <- rbindlist(results) @@ -472,14 +471,14 @@ grid <- expand.grid( setDT() -if (!file.exists("inst/manuscript/output/relation-to-scale-example.Rda")) { +if (!file.exists("inst/manuscript/output/relation-to-scale-example.rds")) { res <- grid |> rowwise() |> mutate(simulation := list(simulate(scale_mean = scale_mean, scale_sd = scale_sd))) - saveRDS(res, file = "inst/manuscript/output/relation-to-scale-example.Rda") + saveRDS(res, file = "inst/manuscript/output/relation-to-scale-example.rds") } else { - res <- readRDS("inst/manuscript/output/relation-to-scale-example.Rda") + res <- readRDS("inst/manuscript/output/relation-to-scale-example.rds") } df <- res |> @@ -535,10 +534,10 @@ p2 + p1 + p_true + # =============================================================================# # Figure 6 # =============================================================================# -avail_forecasts(data = example_integer, +available_forecasts(data = example_integer, by = c("model", "target_type", "forecast_date")) |> - plot_avail_forecasts(x = "forecast_date", - show_numbers = FALSE) + + plot_available_forecasts(x = "forecast_date", + show_counts = FALSE) + facet_wrap(~ target_type) + labs(y = "Model", x = "Forecast date") @@ -655,7 +654,7 @@ p1 / p2 + correlations <- example_quantile |> score() |> 
summarise_scores() |> - correlation() + correlation(digits = 2) correlations |> glimpse() diff --git a/inst/manuscript/R/illustration-relation-to-scale.R b/inst/manuscript/R/illustration-relation-to-scale.R index e0502f640..a4717680f 100644 --- a/inst/manuscript/R/illustration-relation-to-scale.R +++ b/inst/manuscript/R/illustration-relation-to-scale.R @@ -85,14 +85,14 @@ grid <- expand.grid( setDT() -if (!file.exists("inst/manuscript/output/relation-to-scale-example.Rda")) { +if (!file.exists("inst/manuscript/output/relation-to-scale-example.rds")) { res <- grid |> rowwise() |> mutate(simulation := list(simulate(scale_mean = scale_mean, scale_sd = scale_sd))) - saveRDS(res, file = "inst/manuscript/output/relation-to-scale-example.Rda") + saveRDS(res, file = "inst/manuscript/output/relation-to-scale-example.rds") } else { - res <- readRDS("inst/manuscript/output/relation-to-scale-example.Rda") + res <- readRDS("inst/manuscript/output/relation-to-scale-example.rds") } df <- res |> diff --git a/inst/manuscript/R/illustration-sharpness-calibration.R b/inst/manuscript/R/illustration-sharpness-calibration.R index dc9281205..c67430098 100644 --- a/inst/manuscript/R/illustration-sharpness-calibration.R +++ b/inst/manuscript/R/illustration-sharpness-calibration.R @@ -6,8 +6,6 @@ p1 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 0.45)), aes(x = x)) + - # geom_histogram(aes(x = x_example, y = ..density..), - # colour = "white", fill = "grey50", bins = 50) + geom_function(fun = dnorm, colour = "black", args = list(sd = 0.45)) + expand_limits(y = c(0, 1.0), x = c(-3, 3)) + @@ -19,7 +17,6 @@ p2 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 1.25)), aes(x = x)) + - # geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(sd = 1.25)) + expand_limits(y = c(0, 1.0), x = c(-3, 3)) + @@ -30,7 +27,7 @@ p2 <- p21 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 1.05)), aes(x = x)) + - geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_histogram(aes(x = x_example, y = after_stat(density)), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(sd = 1)) + ggtitle("Well calibrated") + @@ -40,7 +37,7 @@ p21 <- ggplot(data.frame(x = seq(-8, 8, 0.01), p22 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 1, sd = 1.05)), aes(x = x)) + - geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_histogram(aes(x = x_example, y = after_stat(density)), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(mean = 2, sd = 1)) + ggtitle("Badly calibrated") + @@ -50,7 +47,7 @@ p22 <- ggplot(data.frame(x = seq(-8, 8, 0.01), p23 <- ggplot(data.frame(x = seq(-8, 8, 0.01), x_example = rnorm(n = 1601, mean = 0, sd = 1.05)), aes(x = x)) + - geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_histogram(aes(x = x_example, y = after_stat(density)), colour = "white", fill = "grey50") + geom_function(fun = dnorm, colour = "black", args = list(mean = 0, sd = 2.05)) + ggtitle("Badly calibrated") + diff --git a/inst/manuscript/R/toy-example-calibration.R b/inst/manuscript/R/toy-example-calibration.R index 1267f504a..9473e905d 100644 --- a/inst/manuscript/R/toy-example-calibration.R +++ 
b/inst/manuscript/R/toy-example-calibration.R @@ -26,18 +26,18 @@ df[, model := factor(`model`, levels = c("Pred: N(0, 1)", "Pred: N(0.5, 1)", "Pred: N(0, 2)", "Pred: N(0, 0.5)"))] -if (!file.exists("inst/manuscript/output/calibration-diagnostic-examples.Rda")) { +if (!file.exists("inst/manuscript/output/calibration-diagnostic-examples.rds")) { res <- score(df) pit <- pit(df, by = "model") stored <- list(res = res, pit = pit) - saveRDS(stored, "inst/manuscript/output/calibration-diagnostic-examples.Rda") + saveRDS(stored, "inst/manuscript/output/calibration-diagnostic-examples.rds") } else { - stored <- readRDS("inst/manuscript/output/calibration-diagnostic-examples.Rda") + stored <- readRDS("inst/manuscript/output/calibration-diagnostic-examples.rds") } res_summarised <- summarise_scores(stored$res,by = "model") @@ -55,10 +55,10 @@ scores_table_plot <- summarise_scores(res_summarised, fun = signif, digits = 2) pred_hist <- df |> ggplot(aes(x = true_value)) + facet_wrap(~ model, nrow = 1) + - geom_histogram(aes(y=..density..), + geom_histogram(aes(y = after_stat(density)), fill = "grey", colour = "dark grey") + - geom_density(aes(y=..density.., x = prediction), + geom_density(aes(y = after_stat(density), x = prediction), colour = "black") + theme_scoringutils() + labs(y = "Density", x = "Value") diff --git a/inst/manuscript/R/toy-example-convergence-outliers.R b/inst/manuscript/R/toy-example-convergence-outliers.R index 001836ed8..5c02c92cf 100644 --- a/inst/manuscript/R/toy-example-convergence-outliers.R +++ b/inst/manuscript/R/toy-example-convergence-outliers.R @@ -19,7 +19,7 @@ true_crps <- scoringRules::crps(y = 0, family = "normal", mean = mu, sd = sd) true_logs <- scoringRules::logs(y = 0, family = "normal", mean = mu, sd = sd) true_dss <- scoringRules::dss_norm(y = 0, mean = mu, sd = sd) -if (!file.exists("inst/manuscript/output/sample-convergence.Rda")) { +if (!file.exists("inst/manuscript/output/sample-convergence.rds")) { results <- list() for (i in sample_sizes) { samples <- as.data.table( @@ -36,9 +36,9 @@ if (!file.exists("inst/manuscript/output/sample-convergence.Rda")) { samples, metrics = c("crps", "log_score", "dss") )[, n_samples := i] } - saveRDS(results, "inst/manuscript/output/sample-convergence.Rda") + saveRDS(results, "inst/manuscript/output/sample-convergence.rds") } else { - results <- readRDS("inst/manuscript/output/sample-convergence.Rda") + results <- readRDS("inst/manuscript/output/sample-convergence.rds") } results <- rbindlist(results) diff --git a/inst/manuscript/manuscript.Rmd b/inst/manuscript/manuscript.Rmd index 3aa6a9f3e..782dc468d 100644 --- a/inst/manuscript/manuscript.Rmd +++ b/inst/manuscript/manuscript.Rmd @@ -417,10 +417,10 @@ The values stored in the list elements \code{target_type} and \code{prediction_t ## Visualising forecast data -It is helpful to start the evaluation process by examining forecast availability, as missing forecasts can impact the evaluation if missingness correlates with performance. The function \fct{avail_forecasts} returns information about the number of available forecasts, given a level of summary that can be specified through the \code{by} argument. For example, to see how many forecasts there are per model and target_type, we can run +It is helpful to start the evaluation process by examining forecast availability, as missing forecasts can impact the evaluation if missingness correlates with performance. 
The function \fct{available_forecasts} returns information about the number of available forecasts, given a level of summary that can be specified through the \code{by} argument. For example, to see how many forecasts there are per model and target_type, we can run ```{r avail-forecasts, echo=TRUE, fig.width = 10, eval = TRUE, fig.cap="Overview of the number of available forecasts."} -avail_forecasts(data = example_integer, +available_forecasts(data = example_integer, by = c("model", "target_type")) ``` @@ -429,10 +429,10 @@ and visualise results using the function \fct{plot\_avail\_forecasts}. The plot ```{r avail-forecasts-plot, echo=TRUE, fig.pos = "!h", fig.width = 8, fig.height = 3, eval = TRUE, fig.cap="Overview of the number of available forecasts."} library("ggplot2") -avail_forecasts(data = example_integer, +available_forecasts(data = example_integer, by = c("model", "target_type", "forecast_date")) |> - plot_avail_forecasts(x = "forecast_date", - show_numbers = FALSE) + + plot_available_forecasts(x = "forecast_date", + show_counts = FALSE) + facet_wrap(~ target_type) + labs(y = "Model", x = "Forecast date") ``` @@ -637,7 +637,7 @@ It may sometimes be interesting to see how different scores correlate with each correlations <- example_quantile |> score() |> summarise_scores() |> - correlation() + correlation(digits = 2) correlations |> glimpse() @@ -690,7 +690,7 @@ We thank Community Jameel for Institute and research funding ```{r score-table-detailed, echo=FALSE, cache = FALSE, eval = FALSE} -data <- readRDS(system.file("metrics-overview/metrics-detailed.Rda", package = "scoringutils")) +data <- readRDS(system.file("metrics-overview/metrics-detailed.rds", package = "scoringutils")) data[, 1:2] |> kableExtra::kbl(format = "latex", booktabs = TRUE, @@ -725,7 +725,7 @@ Metric & Explanation\\ CRPS (Continuous) ranked probability score & The crps is a proper scoring rule that generalises the absolute error to probabilistic forecasts. It measures the 'distance' of the predictive distribution to the observed data-generating distribution. The CRPS is given as $$\text{CRPS}(F, y) = \int_{-\infty}^\infty \left( F(x) - 1(x \geq y) \right)^2 dx,$$ where y is the true observed value and F the CDF of the predictive distribution. Often an alternative representation is used: - $$ \text{CRPS}(F, y) = \frac{1}{2} \mathbb{E}_{F} |X - X'| - \mathbb{E}_P |X - y|,$$ where $X$ and $X'$ are independent realisations from the predictive distributions $F$ with finite first moment and $y$ is the true value. In this represenation we can simply replace $X$ and $X'$ by samples sum over all possible combinations to obtain the CRPS. + $$ \text{CRPS}(F, y) = \mathbb{E}_{F} |X - y| - \frac{1}{2} \mathbb{E}_{F} |X - X'|,$$ where $X$ and $X'$ are independent realisations from the predictive distribution $F$ with finite first moment and $y$ is the true value. In this representation we can simply replace $X$ and $X'$ by samples and sum over all possible combinations to obtain the CRPS. For integer-valued forecasts, the RPS is given as $$ \text{RPS}(F, y) = \sum_{x = 0}^\infty (F(x) - 1(x \geq y))^2.
$$ diff --git a/inst/manuscript/manuscript.aux b/inst/manuscript/manuscript.aux index b2af6db05..1b924ad3a 100644 --- a/inst/manuscript/manuscript.aux +++ b/inst/manuscript/manuscript.aux @@ -1,15 +1,12 @@ \relax -\providecommand\hyper@newdestlabel[2]{} \bibstyle{jss} +\providecommand\hyper@newdestlabel[2]{} \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined -\global\let\oldcontentsline\contentsline -\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} \global\let\oldnewlabel\newlabel \gdef\newlabel#1#2{\newlabelxx{#1}#2} \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} \AtEndDocument{\ifx\hyper@anchor\@undefined -\let\contentsline\oldcontentsline \let\newlabel\oldnewlabel \fi} \fi} @@ -58,7 +55,7 @@ {1}{19.15776pt}\LT@entry {1}{276.6107pt}} \newlabel{tab:metrics-summary}{{2}{5}{}{table.2}{}} -\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Summary table of scores available in {\fontseries {m}\fontseries {b}\selectfont scoringutils}. This table (including corresponding function names) can be accessed by calling \bgroup \catcode `\_12\relax \catcode `\~12\relax \catcode `\$12\relax {\normalfont \ttfamily \hyphenchar \font =-1 scoringutils::metrics}\egroup in \textsf {R}. Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, '-', or '$\sim $' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g., binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g., for 'global' and 'local' see Section \ref {localglobal}). For mathematical definitions of the metrics see Table \ref {tab:score-table-detailed}.\relax }}{5}{table.2}\protected@file@percent } +\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Summary table of scores available in {\fontseries {m}\fontseries {b}\selectfont scoringutils}. This table (including corresponding function names) can be accessed by calling \bgroup \catcode `\_12\relax \catcode `\~12\relax \catcode `\$12\relax {\normalfont \ttfamily \hyphenchar \font =-1 scoringutils::metrics}\egroup in \textsf {R}. Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, '-', or '$\sim $' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g., binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g., for 'global' and 'local' see Section \ref {localglobal}). 
For mathematical definitions of the metrics see Table \ref {tab:score-table-detailed}.\relax }}{5}{table.2}\protected@file@percent } \citation{gneitingProbabilisticForecastsCalibration2007} \citation{dawidPresentPositionPotential1984} \citation{angusProbabilityIntegralTransform1994} @@ -125,22 +122,22 @@ \@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Coloured table to visualise the computed scores}}{20}{figure.caption.14}\protected@file@percent } \newlabel{fig:score-table}{{8}{20}{Coloured table to visualise the computed scores}{figure.caption.14}{}} \newlabel{pairwisecode}{{3.5}{21}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.5}{}} +\newlabel{model-diagnostics}{{3.6}{22}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.6}{}} \@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Ratios of mean scores based on overlapping forecast sets}}{23}{figure.caption.15}\protected@file@percent } \newlabel{fig:pairwise-plot}{{9}{23}{Ratios of mean scores based on overlapping forecast sets}{figure.caption.15}{}} -\newlabel{model-diagnostics}{{3.6}{24}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.6}{}} \@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Heatmap of bias values for different models across different locations and forecast targets}}{24}{figure.caption.16}\protected@file@percent } \newlabel{fig:score-heatmap}{{10}{24}{Heatmap of bias values for different models across different locations and forecast targets}{figure.caption.16}{}} -\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction}}{25}{figure.caption.17}\protected@file@percent } -\newlabel{fig:wis-components}{{11}{25}{Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction}{figure.caption.17}{}} -\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces PIT histograms of all models stratified by forecast target}}{26}{figure.caption.18}\protected@file@percent } -\newlabel{fig:pit-plots}{{12}{26}{PIT histograms of all models stratified by forecast target}{figure.caption.18}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction}}{24}{figure.caption.17}\protected@file@percent } +\newlabel{fig:wis-components}{{11}{24}{Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction}{figure.caption.17}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces PIT histograms of all models stratified by forecast target}}{25}{figure.caption.18}\protected@file@percent } +\newlabel{fig:pit-plots}{{12}{25}{PIT histograms of all models stratified by forecast target}{figure.caption.18}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Interval coverage (A) and quantile coverage (B) plots}}{26}{figure.caption.19}\protected@file@percent } +\newlabel{fig:coverage}{{13}{26}{Interval coverage (A) and quantile coverage (B) plots}{figure.caption.19}{}} \citation{gneitingStrictlyProperScoring2007,joseCharacterizationSphericalScoring2009,macheteContrastingProbabilisticScoring2012} -\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Interval coverage (A) and quantile coverage (B) 
plots}}{27}{figure.caption.19}\protected@file@percent } -\newlabel{fig:coverage}{{13}{27}{Interval coverage (A) and quantile coverage (B) plots}{figure.caption.19}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Correlation between different scores}}{27}{figure.caption.20}\protected@file@percent } +\newlabel{fig:correlation-plot}{{14}{27}{Correlation between different scores}{figure.caption.20}{}} \newlabel{summary-and-discussion}{{3.7}{27}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.7}{}} -\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Correlation between different scores}}{28}{figure.caption.20}\protected@file@percent } -\newlabel{fig:correlation-plot}{{14}{28}{Correlation between different scores}{figure.caption.20}{}} -\newlabel{acknowledgments}{{3.8}{28}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.8}{}} +\newlabel{acknowledgments}{{3.8}{27}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.8}{}} \gdef \LT@ii {\LT@entry {1}{91.49744pt}\LT@entry {1}{346.24875pt}} diff --git a/inst/manuscript/manuscript.blg b/inst/manuscript/manuscript.blg index ca297d827..51866bde0 100644 --- a/inst/manuscript/manuscript.blg +++ b/inst/manuscript/manuscript.blg @@ -1,4 +1,4 @@ -This is BibTeX, Version 0.99d (TeX Live 2022) +This is BibTeX, Version 0.99d (TeX Live 2023/Homebrew) Capacity: max_strings=200000, hash_size=200000, hash_prime=170003 The top-level auxiliary file: manuscript.aux The style file: jss.bst diff --git a/inst/manuscript/output/calibration-diagnostic-examples.Rda b/inst/manuscript/output/calibration-diagnostic-examples.rds similarity index 100% rename from inst/manuscript/output/calibration-diagnostic-examples.Rda rename to inst/manuscript/output/calibration-diagnostic-examples.rds diff --git a/inst/manuscript/output/relation-to-scale-example.Rda b/inst/manuscript/output/relation-to-scale-example.rds similarity index 100% rename from inst/manuscript/output/relation-to-scale-example.Rda rename to inst/manuscript/output/relation-to-scale-example.rds diff --git a/inst/manuscript/output/sample-convergence.Rda b/inst/manuscript/output/sample-convergence.rds similarity index 100% rename from inst/manuscript/output/sample-convergence.Rda rename to inst/manuscript/output/sample-convergence.rds diff --git a/inst/metrics-overview/metrics-detailed.rds b/inst/metrics-overview/metrics-detailed.rds new file mode 100644 index 000000000..942584001 Binary files /dev/null and b/inst/metrics-overview/metrics-detailed.rds differ diff --git a/man/abs_error.Rd b/man/abs_error.Rd deleted file mode 100644 index bf8190207..000000000 --- a/man/abs_error.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/metrics_point_forecasts.R -\name{abs_error} -\alias{abs_error} -\title{Absolute Error} -\usage{ -abs_error(true_values, predictions) -} -\arguments{ -\item{true_values}{A vector with the true observed values of size n} - -\item{predictions}{numeric vector with predictions, corresponding to the -quantiles in a second vector, \code{quantiles}.} -} -\value{ -vector with the absolute error -} -\description{ -Calculate absolute error as - -\deqn{ - \textrm{abs}(\textrm{true\_value} - \textrm{median\_prediction}) -}{ - abs(true_value - prediction) -} -} -\examples{ -true_values <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -abs_error(true_values, predicted_values) -} -\seealso{ 
-\code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=ae_median_quantile]{ae_median_quantile()}} -} -\keyword{metric} diff --git a/man/add_coverage.Rd b/man/add_coverage.Rd index 56f2a2e3f..addd1f72d 100644 --- a/man/add_coverage.Rd +++ b/man/add_coverage.Rd @@ -1,44 +1,60 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarise_scores.R +% Please edit documentation in R/add_coverage.R \name{add_coverage} \alias{add_coverage} -\title{Add coverage of central prediction intervals} +\title{Add Coverage Values to Quantile-Based Forecasts} \usage{ -add_coverage(scores, by, ranges = c(50, 90)) +add_coverage(data) } \arguments{ -\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} - -\item{by}{character vector with column names to add the coverage for.} - -\item{ranges}{numeric vector of the ranges of the central prediction intervals -for which coverage values shall be added.} +\item{data}{A data.frame or data.table with predicted and observed values.} } \value{ -a data.table with unsummarised scores with columns added for the -coverage of the central prediction intervals. While the overall data.table -is still unsummarised, note that for the coverage columns some level of -summary is present according to the value specified in \code{by}. +a data.table with the input and columns "interval_coverage", +"interval_coverage_deviation", "quantile_coverage", +"quantile_coverage_deviation" added. } \description{ -Adds a column with the coverage of central prediction intervals -to unsummarised scores as produced by \code{\link[=score]{score()}} -} -\details{ -The coverage values that are added are computed according to the values -specified in \code{by}. If, for example, \code{by = "model"}, then there will be one -coverage value for every model and \code{\link[=add_coverage]{add_coverage()}} will compute the coverage -for every model across the values present in all other columns which define -the unit of a single forecast. +Adds interval coverage of central prediction intervals, +quantile coverage for predictive quantiles, as well as the deviation between +desired and actual coverage to a data.table. Forecasts should be in a +quantile format (following the input requirements of \code{score()}). + +\strong{Interval coverage} + +Interval coverage for a given interval range is defined as the proportion of +observations that fall within the corresponding central prediction intervals. +Central prediction intervals are symmetric around the median and are formed +by two quantiles that denote the lower and upper bound. For example, the 50\% +central prediction interval is the interval between the 0.25 and 0.75 +quantiles of the predictive distribution. + +The function \code{add_coverage()} computes the coverage per central prediction +interval, so the interval coverage will always be either \code{TRUE} +(observed value falls within the interval) or \code{FALSE} (observed value falls +outside the interval). You can summarise the interval coverage values to get +the proportion of observations that fall within the central prediction +intervals. + +\strong{Quantile coverage} + +Quantile coverage for a given quantile is defined as the proportion of +observed values that are smaller than the corresponding predictive quantile. +For example, the 0.5 quantile coverage is the proportion of observed values +that are smaller than the 0.5 quantile of the predictive distribution.
+Just as above, for a single observation and the quantile of a single +predictive distribution, the value will either be \code{TRUE} or \code{FALSE}. + +\strong{Coverage deviation} + +The coverage deviation is the difference between the desired coverage +(can be either interval or quantile coverage) and the +actual coverage. For example, if the desired coverage is 90\% and the actual +coverage is 80\%, the coverage deviation is -0.1. } \examples{ library(magrittr) # pipe operator -\dontshow{ - data.table::setDTthreads(2) # restricts number of cores used on CRAN -} -score(example_quantile) \%>\% - add_coverage(by = c("model", "target_type")) \%>\% - summarise_scores(by = c("model", "target_type")) \%>\% - summarise_scores(fun = signif, digits = 2) +example_quantile \%>\% + add_coverage() } \keyword{scoring} diff --git a/man/add_pairwise_comparison.Rd b/man/add_pairwise_comparison.Rd new file mode 100644 index 000000000..75217069c --- /dev/null +++ b/man/add_pairwise_comparison.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarise_scores.R +\name{add_pairwise_comparison} +\alias{add_pairwise_comparison} +\title{Add pairwise comparisons} +\usage{ +add_pairwise_comparison( + scores, + by = NULL, + relative_skill_metric = "auto", + baseline = NULL +) +} +\arguments{ +\item{scores}{MORE INFO HERE.} + +\item{by}{character vector with column names to summarise scores by. Default +is \code{NULL}, meaning that the only summary that takes place is summarising +over samples or quantiles (in case of quantile-based forecasts), such that +there is one score per forecast as defined by the \emph{unit of a single forecast} +(rather than one score for every sample or quantile).} + +\item{relative_skill_metric}{character with the name of the metric for which +a relative skill shall be computed. If equal to 'auto' (the default), then +this will be either interval score, CRPS or Brier score (depending on which +of these is available in the input data)} + +\item{baseline}{character string with the name of a model. If a baseline is +given, then a scaled relative skill with respect to the baseline will be +returned. By default (\code{NULL}), relative skill will not be scaled with +respect to a baseline model.} +} +\description{ +Adds a column with relative skill values computed by running +pairwise comparisons on the scores. + +A column called +'model' must be present in the input data. For more information on +the computation of relative skill, see \code{\link[=pairwise_comparison]{pairwise_comparison()}}. +Relative skill will be calculated for the aggregation level specified in +\code{by}. +WRITE MORE INFO HERE.
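To make the coverage definitions in the `add_coverage()` documentation above concrete, here is a minimal base-R sketch for a single forecast (made-up numbers, not the package's internal implementation):

```r
# Interval coverage, quantile coverage and coverage deviation for one forecast,
# following the definitions in the add_coverage() documentation above.
observed <- 12
quantile_level <- c(0.25, 0.5, 0.75)
predicted <- c(8, 11, 15) # predictive quantiles at the levels above

# 50% central prediction interval: bounded by the 0.25 and 0.75 quantiles
lower <- predicted[quantile_level == 0.25]
upper <- predicted[quantile_level == 0.75]
interval_coverage <- observed >= lower & observed <= upper          # TRUE
interval_coverage_deviation <- as.numeric(interval_coverage) - 0.5  # 0.5

# Quantile coverage: is the observed value smaller than each predictive quantile?
quantile_coverage <- observed < predicted                 # FALSE FALSE TRUE
quantile_coverage_deviation <- as.numeric(quantile_coverage) - quantile_level
```

Averaging these TRUE/FALSE values over many forecasts gives the empirical coverage proportions that can then be compared against the nominal levels.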
+} +\keyword{keyword} +\keyword{scoring} diff --git a/man/ae_median_quantile.Rd b/man/ae_median_quantile.Rd index 69da598a4..d8d463366 100644 --- a/man/ae_median_quantile.Rd +++ b/man/ae_median_quantile.Rd @@ -1,43 +1,43 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/metrics_point_forecasts.R +% Please edit documentation in R/metrics-quantile.R \name{ae_median_quantile} \alias{ae_median_quantile} \title{Absolute Error of the Median (Quantile-based Version)} \usage{ -ae_median_quantile(true_values, predictions, quantiles = NULL) +ae_median_quantile(observed, predicted, quantile) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predictions}{numeric vector with predictions, corresponding to the -quantiles in a second vector, \code{quantiles}.} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} -\item{quantiles}{numeric vector that denotes the quantile for the values -in \code{predictions}. Only those predictions where \code{quantiles == 0.5} will -be kept. If \code{quantiles} is \code{NULL}, then all \code{predictions} and -\code{true_values} will be used (this is then the same as \code{\link[=abs_error]{abs_error()}})} +\item{quantile}{vector with quantile levels of size N} } \value{ -vector with the scoring values +numeric vector of length N with the absolute error of the median } \description{ -Absolute error of the median calculated as - +Compute the absolute error of the median calculated as \deqn{ - \textrm{abs}(\textrm{true\_value} - \textrm{prediction}) + \textrm{abs}(\textrm{observed} - \textrm{median prediction}) }{ - abs(true_value - median_prediction) + abs(observed - median_prediction) } - -The function was created for internal use within \code{\link[=score]{score()}}, but can also -used as a standalone function. +The median prediction is the predicted value for which quantile == 0.5, +the function therefore requires 0.5 to be among the quantile levels in +\code{quantile}. 
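As a small illustration of the quantile-based absolute error of the median described above (made-up numbers, written in plain base R rather than as a call to `ae_median_quantile()`):

```r
# Absolute error of the median: take the predictive quantile at level 0.5
# and compute its absolute difference to the observed value.
observed <- 10
quantile_level <- c(0.1, 0.25, 0.5, 0.75, 0.9)
predicted <- c(6, 8, 11, 14, 18)

median_prediction <- predicted[quantile_level == 0.5]
abs(observed - median_prediction) # 1
```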
} \examples{ -true_values <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -ae_median_quantile(true_values, predicted_values, quantiles = 0.5) +observed <- rnorm(30, mean = 1:30) +predicted_values <- matrix(rnorm(30, mean = 1:30)) +ae_median_quantile(observed, predicted_values, quantile = 0.5) } \seealso{ -\code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=abs_error]{abs_error()}} +\code{\link[=ae_median_sample]{ae_median_sample()}} } \keyword{metric} diff --git a/man/ae_median_sample.Rd b/man/ae_median_sample.Rd index 719f4321e..6967f44fd 100644 --- a/man/ae_median_sample.Rd +++ b/man/ae_median_sample.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/metrics_point_forecasts.R +% Please edit documentation in R/metrics-sample.R \name{ae_median_sample} \alias{ae_median_sample} \title{Absolute Error of the Median (Sample-based Version)} \usage{ -ae_median_sample(true_values, predictions) +ae_median_sample(observed, predicted) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} } \value{ vector with the scoring values @@ -20,17 +20,17 @@ vector with the scoring values Absolute error of the median calculated as \deqn{% - \textrm{abs}(\textrm{true\_value} - \textrm{median\_prediction}) + \textrm{abs}(\textrm{observed} - \textrm{median\_prediction}) }{% - abs(true_value - median_prediction) + abs(observed - median_prediction) } } \examples{ -true_values <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -ae_median_sample(true_values, predicted_values) +observed <- rnorm(30, mean = 1:30) +predicted_values <- matrix(rnorm(30, mean = 1:30)) +ae_median_sample(observed, predicted_values) } \seealso{ -\code{\link[=ae_median_quantile]{ae_median_quantile()}}, \code{\link[=abs_error]{abs_error()}} +\code{\link[=ae_median_quantile]{ae_median_quantile()}} } \keyword{metric} diff --git a/man/apply_rules.Rd b/man/apply_rules.Rd new file mode 100644 index 000000000..294033e62 --- /dev/null +++ b/man/apply_rules.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score.R +\name{apply_rules} +\alias{apply_rules} +\title{Apply A List Of Functions To A Data Table Of Forecasts} +\usage{ +apply_rules(data, metrics, ...) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} + +\item{metrics}{A named list of scoring functions. Names will be used as +column names in the output. See \code{\link[=rules_point]{rules_point()}}, \code{\link[=rules_binary]{rules_binary()}}, +\code{\link[=rules_quantile]{rules_quantile()}}, and \code{\link[=rules_sample]{rules_sample()}} for more information on the +default metrics used.} + +\item{...}{additional arguments} +} +\value{ +A data table with the forecasts and the calculated metrics +} +\description{ +This helper function applies scoring rules (stored as a list of +functions) to a data table of forecasts. \code{apply_rules} is used within +\code{score()} to apply all scoring rules to the data.
+Scoring rules are wrapped in \code{\link[=run_safely]{run_safely()}} to catch errors and to make +sure that only arguments are passed to the scoring rule that are actually +accepted by it. +} +\keyword{internal} diff --git a/man/as_forecast.Rd b/man/as_forecast.Rd new file mode 100644 index 000000000..3704364a8 --- /dev/null +++ b/man/as_forecast.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{as_forecast} +\alias{as_forecast} +\alias{as_forecast.default} +\title{Create a \code{forecast} Object} +\usage{ +as_forecast(data, ...) + +\method{as_forecast}{default}(data, ...) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} + +\item{...}{additional arguments} +} +\value{ +Depending on the forecast type, an object of class +\code{forecast_binary}, \code{forecast_point}, \code{forecast_sample} or +\code{forecast_quantile}. +} +\description{ +Convert a data.frame or similar of forecasts into an object of +class \verb{forecast_*} and validate it. + +\code{as_forecast()} determines the forecast type (binary, point, sample-based or +quantile-based) from the input data (using the function +\code{\link[=get_forecast_type]{get_forecast_type()}}. It then constructs an object of the +appropriate class (\code{forecast_binary}, \code{forecast_point}, \code{forecast_sample}, or +\code{forecast_quantile}, using the function \code{\link[=new_forecast]{new_forecast()}}). +Lastly, it calls \code{\link[=as_forecast]{as_forecast()}} on the object to make sure it conforms with +the required input formats. +} +\section{Forecast types and input format}{ +Various different forecast types / forecast formats are supported. At the +moment, those are +\itemize{ +\item point forecasts +\item binary forecasts ("soft binary classification") +\item Probabilistic forecasts in a quantile-based format (a forecast is +represented as a set of predictive quantiles) +\item Probabilistic forecasts in a sample-based format (a forecast is represented +as a set of predictive samples) +} + +Forecast types are determined based on the columns present in the input data. + +\emph{Point forecasts} require a column \code{observed} of type numeric and a column +\code{predicted} of type numeric. + +\emph{Binary forecasts} require a column \code{observed} of type factor with exactly +two levels and a column \code{predicted} of type numeric with probabilities, +corresponding to the probability that \code{observed} is equal to the second +factor level. See details \link[=brier_score]{here} for more information. + +\emph{Quantile-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{quantile} of type numeric +with quantile-levels (between 0 and 1). + +\emph{Sample-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{sample_id} of type +numeric with sample indices. + +For more information see the vignettes and the example data +(\link{example_quantile}, \link{example_continuous}, \link{example_integer}, +\code{\link[=example_point]{example_point()}}, and \link{example_binary}). 
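A minimal sketch of input data that matches the column requirements listed above (the model name and all values are made up; real data would usually carry further columns such as location or date that define the forecast unit):

```r
# Quantile-based format: columns observed, predicted and quantile.
quantile_forecast <- data.frame(
  model = "illustrative model",
  observed = 25,
  quantile = c(0.1, 0.25, 0.5, 0.75, 0.9),
  predicted = c(15, 20, 24, 28, 35)
)

# Binary format: observed is a factor with two levels, predicted is the
# probability that observed equals the second factor level.
binary_forecast <- data.frame(
  model = "illustrative model",
  observed = factor(1, levels = c(0, 1)),
  predicted = 0.8
)
```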
+} + +\examples{ +as_forecast(example_binary) +as_forecast(example_quantile) +} +\keyword{check-forecasts} diff --git a/man/assert_dims_ok_point.Rd b/man/assert_dims_ok_point.Rd new file mode 100644 index 000000000..e7534eb1b --- /dev/null +++ b/man/assert_dims_ok_point.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_dims_ok_point} +\alias{assert_dims_ok_point} +\title{Assert Inputs Have Matching Dimensions} +\usage{ +assert_dims_ok_point(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a factor of length n with +exactly two levels, holding the observed values. +The highest factor level is assumed to be the reference level. This means +that \code{predicted} represents the probability that the observed value is equal +to the highest factor level.} + +\item{predicted}{Input to be checked. \code{predicted} should be a vector of +length n, holding probabilities. Alternatively, \code{predicted} can be a matrix +of size n x 1. Values represent the probability that +the corresponding value in \code{observed} will be equal to the highest +available factor level.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether input dimensions match. In the +following, n is the number of observations / forecasts. Scalar values may +be repeated to match the length of the other input. +Allowed options are therefore +\itemize{ +\item \code{observed} is vector of length 1 or length n +\item \code{predicted} is +\itemize{ +\item a vector of of length 1 or length n +\item a matrix with n rows and 1 column +} +} +} +\keyword{internal_input_check} diff --git a/man/assert_equal_length.Rd b/man/assert_equal_length.Rd new file mode 100644 index 000000000..686209bf4 --- /dev/null +++ b/man/assert_equal_length.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{assert_equal_length} +\alias{assert_equal_length} +\title{Check Length of Two Vectors is Equal} +\usage{ +assert_equal_length(..., one_allowed = TRUE, call_levels_up = 2) +} +\arguments{ +\item{...}{The variables to check} + +\item{one_allowed}{logical, allow arguments of length one that can be +recycled} + +\item{call_levels_up}{How many levels to go up when including the function +call in the error message. This is useful when calling \code{assert_equal_length()} +within another checking function.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Check whether variables all have the same length +} +\keyword{internal_input_check} diff --git a/man/assert_input_binary.Rd b/man/assert_input_binary.Rd new file mode 100644 index 000000000..e1aae3621 --- /dev/null +++ b/man/assert_input_binary.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_input_binary} +\alias{assert_input_binary} +\title{Assert that inputs are correct for binary forecast} +\usage{ +assert_input_binary(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a factor of length n with +exactly two levels, holding the observed values. +The highest factor level is assumed to be the reference level. 
This means +that \code{predicted} represents the probability that the observed value is equal +to the highest factor level.} + +\item{predicted}{Input to be checked. \code{predicted} should be a vector of +length n, holding probabilities. Alternatively, \code{predicted} can be a matrix +of size n x 1. Values represent the probability that +the corresponding value in \code{observed} will be equal to the highest +available factor level.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring binary forecasts. +} +\keyword{internal_input_check} diff --git a/man/assert_input_interval.Rd b/man/assert_input_interval.Rd new file mode 100644 index 000000000..4ab7025bc --- /dev/null +++ b/man/assert_input_interval.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_input_interval} +\alias{assert_input_interval} +\title{Assert that inputs are correct for interval-based forecast} +\usage{ +assert_input_interval(observed, lower, upper, range) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{lower}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the lower bounds of the prediction intervals.} + +\item{upper}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the upper bounds of the prediction intervals.} + +\item{range}{Input to be checked. Should be a vector of size n that +denotes the interval range in percent. E.g. a value of 50 denotes a +(25\%, 75\%) prediction interval.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring interval-based forecasts. +} +\keyword{internal_input_check} diff --git a/man/assert_input_point.Rd b/man/assert_input_point.Rd new file mode 100644 index 000000000..c2ea48b7c --- /dev/null +++ b/man/assert_input_point.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_input_point} +\alias{assert_input_point} +\title{Assert that inputs are correct for point forecast} +\usage{ +assert_input_point(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{predicted}{Input to be checked. Should be a numeric vector with the +predicted values of size n} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring point forecasts. 
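A rough sketch of what such an assertion can look like using the checkmate package (an illustration of the idea only, not the actual body of `assert_input_point()`):

```r
library(checkmate)

# Both inputs must be numeric and predicted must match the length of observed.
assert_point_inputs_sketch <- function(observed, predicted) {
  assert_numeric(observed, min.len = 1)
  assert_numeric(predicted, len = length(observed))
  invisible(NULL)
}

assert_point_inputs_sketch(observed = c(3, 5, 7), predicted = c(2.5, 6, 7.5))
```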
+} +\keyword{internal_input_check} diff --git a/man/assert_input_quantile.Rd b/man/assert_input_quantile.Rd new file mode 100644 index 000000000..380cf5e6f --- /dev/null +++ b/man/assert_input_quantile.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_input_quantile} +\alias{assert_input_quantile} +\title{Assert that inputs are correct for quantile-based forecast} +\usage{ +assert_input_quantile(observed, predicted, quantile, unique_quantiles = TRUE) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{predicted}{Input to be checked. Should be nxN matrix of predictive +quantiles, n (number of rows) being the number of data points and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{Input to be checked. Should be a vector of size N that +denotes the quantile levels corresponding to the columns of the prediction +matrix.} + +\item{unique_quantiles}{Input to be checked. Should be TRUE (default) or +FALSE. Whether the quantile levels are required to be unique or not.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring quantile-based forecasts. +} +\keyword{internal_input_check} diff --git a/man/assert_input_sample.Rd b/man/assert_input_sample.Rd new file mode 100644 index 000000000..52071391a --- /dev/null +++ b/man/assert_input_sample.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{assert_input_sample} +\alias{assert_input_sample} +\title{Assert that inputs are correct for sample-based forecast} +\usage{ +assert_input_sample(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{predicted}{Input to be checked. Should be a numeric nxN matrix of +predictive samples, n (number of rows) being the number of data points and N +(number of columns) the number of samples per forecast. +If \code{observed} is just a single number, then predicted values can just be a +vector of size N.} +} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring sample-based forecasts. +} +\keyword{internal_input_check} diff --git a/man/check_not_null.Rd b/man/assert_not_null.Rd similarity index 75% rename from man/check_not_null.Rd rename to man/assert_not_null.Rd index 7e468e46e..ce95cd10e 100644 --- a/man/check_not_null.Rd +++ b/man/assert_not_null.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/input-check-helpers.R -\name{check_not_null} -\alias{check_not_null} +% Please edit documentation in R/check-input-helpers.R +\name{assert_not_null} +\alias{assert_not_null} \title{Check Variable is not NULL} \usage{ -check_not_null(...) +assert_not_null(...) } \arguments{ \item{...}{The variables to check} @@ -18,4 +18,4 @@ Check whether a certain variable is not \code{NULL} and return the name of that variable and the function call where the variable is missing. 
This function is a helper function that should only be called within other functions } -\keyword{internal} +\keyword{internal_input_check} diff --git a/man/assign_attributes.Rd b/man/assign_attributes.Rd new file mode 100644 index 000000000..f6dfdeadb --- /dev/null +++ b/man/assign_attributes.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{assign_attributes} +\alias{assign_attributes} +\title{Assign attributes to an object from a named list} +\usage{ +assign_attributes(object, attribute_list) +} +\arguments{ +\item{object}{An object to assign attributes to} + +\item{attribute_list}{A named list of attributes to assign to the object.} +} +\value{ +The object with new attributes according to the contents of +\code{attribute_list} +} +\description{ +Every list item will be made an attribute of the object. +} +\keyword{internal} diff --git a/man/assure_model_column.Rd b/man/assure_model_column.Rd new file mode 100644 index 000000000..591588fea --- /dev/null +++ b/man/assure_model_column.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{assure_model_column} +\alias{assure_model_column} +\title{Assure that Data Has a \code{model} Column} +\usage{ +assure_model_column(data) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} +} +\value{ +The data.table with a column called \code{model} +} +\description{ +Check whether the data.table has a column called \code{model}. +If not, a column called \code{model} is added with the value \verb{Unspecified model}. +} +\keyword{internal_input_check} diff --git a/man/avail_forecasts.Rd b/man/avail_forecasts.Rd deleted file mode 100644 index 60fed26fb..000000000 --- a/man/avail_forecasts.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/avail_forecasts.R -\name{avail_forecasts} -\alias{avail_forecasts} -\title{Display Number of Forecasts Available} -\usage{ -avail_forecasts(data, by = NULL, collapse = c("quantile", "sample")) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). 
- -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} - -\item{by}{character vector or \code{NULL} (the default) that denotes the -categories over which the number of forecasts should be counted. -By default (\code{by = NULL}) this will be the unit of a single forecast (i.e. -all available columns (apart from a few "protected" columns such as -'prediction' and 'true value') plus "quantile" or "sample" where present).} - -\item{collapse}{character vector (default is \verb{c("quantile", "sample"}) with -names of categories for which the number of rows should be collapsed to one -when counting. For example, a single forecast is usually represented by a -set of several quantiles or samples and collapsing these to one makes sure -that a single forecast only gets counted once.} -} -\value{ -A data.table with columns as specified in \code{by} and an additional -column with the number of forecasts. -} -\description{ -Given a data set with forecasts, count the number of available forecasts -for arbitrary grouping (e.g. the number of forecasts per model, or the -number of forecasts per model and location). -This is useful to determine whether there are any missing forecasts. -} -\examples{ -\dontshow{ - data.table::setDTthreads(2) # restricts number of cores used on CRAN -} - -avail_forecasts(example_quantile, - collapse = c("quantile"), - by = c("model", "target_type") -) -} -\keyword{check-forecasts} diff --git a/man/bias_quantile.Rd b/man/bias_quantile.Rd index 6da89c5da..d7e781253 100644 --- a/man/bias_quantile.Rd +++ b/man/bias_quantile.Rd @@ -1,20 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bias.R +% Please edit documentation in R/metrics-quantile.R \name{bias_quantile} \alias{bias_quantile} \title{Determines Bias of Quantile Forecasts} \usage{ -bias_quantile(predictions, quantiles, true_value) +bias_quantile(observed, predicted, quantile, na.rm = TRUE) } \arguments{ -\item{predictions}{vector of length corresponding to the number of quantiles +\item{observed}{a single number representing the observed value} + +\item{predicted}{vector of length corresponding to the number of quantiles that holds predictions} -\item{quantiles}{vector of corresponding size with the quantiles for which -predictions were made. If this does not contain the median (0.5) then the -median is imputed as being the mean of the two innermost quantiles.} +\item{quantile}{vector of corresponding size with the quantile levels for +which predictions were made. If this does not contain the median (0.5) then +the median is imputed as being the mean of the two innermost quantiles.} -\item{true_value}{a single true value} +\item{na.rm}{logical. Should missing values be removed?} } \value{ scalar with the quantile bias for a single quantile prediction @@ -35,41 +37,37 @@ B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) where \eqn{Q_t} is the set of quantiles that form the predictive distribution at time \eqn{t}. They represent our -belief about what the true value $x_t$ will be. For consistency, we define +belief about what the observed value $x_t$ will be. For consistency, we define \eqn{Q_t} such that it always includes the element \eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. 
\eqn{1()} is the indicator function that is \eqn{1} if the condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is defined as the maximum percentile rank for which the corresponding quantile -is still below the true value, if the true value is smaller than the -median of the predictive distribution. If the true value is above the +is still below the observed value, if the observed value is smaller than the +median of the predictive distribution. If the observed value is above the median of the predictive distribution, then $B_t$ is the minimum percentile rank for which the corresponding quantile is still larger than the true -value. If the true value is exactly the median, both terms cancel out and +value. If the observed value is exactly the median, both terms cancel out and \eqn{B_t} is zero. For a large enough number of quantiles, the percentile rank will equal the proportion of predictive samples below the -observed true value, and this metric coincides with the one for +observed value, and this metric coincides with the one for continuous forecasts. Bias can assume values between -1 and 1 and is 0 ideally (i.e. unbiased). } \examples{ - -predictions <- c( +predicted <- c( 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 ) -quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) +quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) -true_value <- 8062 +observed <- 8062 -bias_quantile(predictions, quantiles, true_value = true_value) -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} +bias_quantile(observed, predicted, quantile) } \keyword{metric} diff --git a/man/bias_quantile_single_vector.Rd b/man/bias_quantile_single_vector.Rd new file mode 100644 index 000000000..5d7ba030d --- /dev/null +++ b/man/bias_quantile_single_vector.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R +\name{bias_quantile_single_vector} +\alias{bias_quantile_single_vector} +\title{Compute Bias for a Single Vector of Quantile Predictions} +\usage{ +bias_quantile_single_vector(observed, predicted, quantile, na.rm) +} +\arguments{ +\item{observed}{scalar with the observed value} + +\item{predicted}{vector of length N corresponding to the number of quantiles +that holds predictions} + +\item{quantile}{vector of corresponding size N with the quantile levels for +which predictions were made. If this does not contain the median (0.5) then +the median is imputed as being the mean of the two innermost quantiles.} + +\item{na.rm}{logical. Should missing values be removed?} +} +\value{ +scalar with the quantile bias for a single quantile prediction +} +\description{ +Internal function to compute bias for a single observed value, +a vector of predicted values and a vector of quantiles. 
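The following base-R sketch implements the bias definition given above for a single observation and one set of predictive quantiles (simplified: it assumes the 0.5 quantile level is present and skips the median imputation and `na.rm` handling of `bias_quantile()`):

```r
# Quantile-based bias for one forecast: +1 means the observation lies below
# all predictive quantiles (overprediction), -1 means it lies above all of
# them (underprediction), 0 means it coincides with the predictive median.
bias_quantile_sketch <- function(observed, predicted, quantile) {
  below <- quantile[predicted <= observed] # levels whose quantiles are <= observed
  above <- quantile[predicted >= observed] # levels whose quantiles are >= observed
  median_prediction <- predicted[quantile == 0.5]
  if (observed <= median_prediction) {
    # c(0, ...) mimics the convention that level 0 corresponds to -Inf
    1 - 2 * max(c(0, below))
  } else {
    # c(1, ...) mimics the convention that level 1 corresponds to +Inf
    1 - 2 * min(c(1, above))
  }
}

predicted <- c(10, 20, 30, 40, 50)
quantile_level <- c(0.1, 0.25, 0.5, 0.75, 0.9)
bias_quantile_sketch(observed = 42, predicted, quantile_level) # -0.8
```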
+} +\keyword{internal} diff --git a/man/bias_range.Rd b/man/bias_range.Rd deleted file mode 100644 index 3a9cf7557..000000000 --- a/man/bias_range.Rd +++ /dev/null @@ -1,98 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bias.R -\name{bias_range} -\alias{bias_range} -\title{Determines Bias of Quantile Forecasts based on the range of the -prediction intervals} -\usage{ -bias_range(lower, upper, range, true_value) -} -\arguments{ -\item{lower}{vector of length corresponding to the number of central -prediction intervals that holds predictions for the lower bounds of a -prediction interval} - -\item{upper}{vector of length corresponding to the number of central -prediction intervals that holds predictions for the upper bounds of a -prediction interval} - -\item{range}{vector of corresponding size with information about the width -of the central prediction interval} - -\item{true_value}{a single true value} -} -\value{ -scalar with the quantile bias for a single quantile prediction -} -\description{ -Determines bias from quantile forecasts based on the range of the -prediction intervals. For an increasing number of quantiles this measure -converges against the sample based bias version for integer and continuous -forecasts. -} -\details{ -For quantile forecasts, bias is measured as - -\deqn{ -B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) -\mathbf{1}( x_t \leq q_{t, 0.5}) \\ -+ (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) - \mathbf{1}( x_t \geq q_{t, 0.5}), -}{ -B_t = (1 - 2 * max(i | q_{t,i} in Q_t and q_{t,i} <= x_t\)) -1( x_t <= q_{t, 0.5}) + (1 - 2 * min(i | q_{t,i} in Q_t and q_{t,i} >= x_t)) - 1( x_t >= q_{t, 0.5}), -} - -where \eqn{Q_t} is the set of quantiles that form the predictive -distribution at time \eqn{t}. They represent our -belief about what the true value \eqn{x_t} will be. For consistency, we -define -\eqn{Q_t} such that it always includes the element -\eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. -\eqn{\mathbf{1}()}{1()} is the indicator function that is \eqn{1} if the -condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is -defined as the maximum percentile rank for which the corresponding quantile -is still below the true value, if the true value is smaller than the -median of the predictive distribution. If the true value is above the -median of the predictive distribution, then $B_t$ is the minimum percentile -rank for which the corresponding quantile is still larger than the true -value. If the true value is exactly the median, both terms cancel out and -\eqn{B_t} is zero. For a large enough number of quantiles, the -percentile rank will equal the proportion of predictive samples below the -observed true value, and this metric coincides with the one for -continuous forecasts. - -Bias can assume values between --1 and 1 and is 0 ideally. 
-} -\examples{ - -lower <- c( - 6341.000, 6329.500, 6087.014, 5703.500, - 5451.000, 5340.500, 4821.996, 4709.000, - 4341.500, 4006.250, 1127.000, 705.500 -) - -upper <- c( - 6341.000, 6352.500, 6594.986, 6978.500, - 7231.000, 7341.500, 7860.004, 7973.000, - 8340.500, 8675.750, 11555.000, 11976.500 -) - -range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) - -true_value <- 8062 - -bias_range( - lower = lower, upper = upper, - range = range, true_value = true_value -) -} -\seealso{ -bias_quantile bias_sample -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} -\keyword{metric} diff --git a/man/bias_sample.Rd b/man/bias_sample.Rd index a0c80f27f..98d7e19d5 100644 --- a/man/bias_sample.Rd +++ b/man/bias_sample.Rd @@ -1,21 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bias.R +% Please edit documentation in R/metrics-sample.R \name{bias_sample} \alias{bias_sample} \title{Determines bias of forecasts} \usage{ -bias_sample(true_values, predictions) +bias_sample(observed, predicted) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} } \value{ vector of length n with the biases of the predictive samples with -respect to the true values. +respect to the observed values. } \description{ Determines bias from predictive Monte-Carlo samples. The function @@ -30,7 +30,7 @@ B_t (P_t, x_t) = 1 - 2 * (P_t (x_t)) } where \eqn{P_t} is the empirical cumulative distribution function of the -prediction for the true value \eqn{x_t}. Computationally, \eqn{P_t (x_t)} is +prediction for the observed value \eqn{x_t}. Computationally, \eqn{P_t (x_t)} is just calculated as the fraction of predictive samples for \eqn{x_t} that are smaller than \eqn{x_t}. @@ -48,14 +48,14 @@ In both cases, Bias can assume values between \examples{ ## integer valued forecasts -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -bias_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +bias_sample(observed, predicted) ## continuous forecasts -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(30, mean = 1:30)) -bias_sample(true_values, predictions) +observed <- rnorm(30, mean = 1:30) +predicted <- replicate(200, rnorm(30, mean = 1:30)) +bias_sample(observed, predicted) } \references{ The integer valued Bias function is discussed in diff --git a/man/brier_score.Rd b/man/brier_score.Rd deleted file mode 100644 index 263150e89..000000000 --- a/man/brier_score.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/brier_score.R -\name{brier_score} -\alias{brier_score} -\title{Brier Score} -\usage{ -brier_score(true_values, predictions) -} -\arguments{ -\item{true_values}{A vector with the true observed values of size n with -all values equal to either 0 or 1} - -\item{predictions}{A vector with a predicted probability -that true_value = 1.} -} -\value{ -A numeric value with the Brier Score, i.e. 
the mean squared -error of the given probability forecasts -} -\description{ -Computes the Brier Score for probabilistic forecasts of binary outcomes. -} -\details{ -The Brier score is a proper score rule that assesses the accuracy of -probabilistic binary predictions. The outcomes can be either 0 or 1, -the predictions must be a probability that the true outcome will be 1. - -The Brier Score is then computed as the mean squared error between the -probabilistic prediction and the true outcome. - -\deqn{ - \textrm{Brier\_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\textrm{prediction}_t - - \textrm{outcome}_t)^2 -}{ - Brier_Score = 1/N sum_{t = 1}^{n} (prediction_t - outcome_t)² -} -} -\examples{ -true_values <- sample(c(0, 1), size = 30, replace = TRUE) -predictions <- runif(n = 30, min = 0, max = 1) - -brier_score(true_values, predictions) -} -\keyword{metric} diff --git a/man/check_attribute_conflict.Rd b/man/check_attribute_conflict.Rd new file mode 100644 index 000000000..588771565 --- /dev/null +++ b/man/check_attribute_conflict.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_attribute_conflict} +\alias{check_attribute_conflict} +\title{Check Whether There Is a Conflict Between Data and Attributes} +\usage{ +check_attribute_conflict(object, attribute, expected) +} +\arguments{ +\item{object}{The object to check} + +\item{attribute}{The name of the attribute to check} + +\item{expected}{The expected value of the attribute} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Check whether there is a conflict between a stored attribute and the +same value as inferred from the data. For example, this could be if +an attribute \code{forecast_unit} is stored, but is different from the +\code{forecast_unit} inferred from the data. The check is successful if +the stored and the inferred value are the same. +} +\keyword{internal_input_check} diff --git a/man/check_columns_present.Rd b/man/check_columns_present.Rd new file mode 100644 index 000000000..392839b07 --- /dev/null +++ b/man/check_columns_present.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_columns_present} +\alias{check_columns_present} +\title{Check column names are present in a data.frame} +\usage{ +check_columns_present(data, columns) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} + +\item{columns}{A character vector of column names to check} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +The functions loops over the column names and checks whether they are +present. If an issue is encountered, the function immediately stops +and returns a message with the first issue encountered. 
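All of these internal check functions share the same return convention: TRUE on success, an error message as a string otherwise. As a rough illustration, and explicitly not the package's own implementation, a check of this kind could be written as follows (check_columns_present_sketch is a hypothetical name).

# Hypothetical stand-in used only to illustrate the convention: loop over
# the columns and return a message for the first problem encountered,
# otherwise return TRUE.
check_columns_present_sketch <- function(data, columns) {
  for (col in columns) {
    if (!col %in% colnames(data)) {
      return(paste0("Column '", col, "' not found in data"))
    }
  }
  return(TRUE)
}

check_columns_present_sketch(data.frame(observed = 1, predicted = 2), c("observed", "predicted")) # TRUE
check_columns_present_sketch(data.frame(observed = 1), c("observed", "predicted")) # error message string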
+} +\keyword{internal_input_check} diff --git a/man/check_data_columns.Rd b/man/check_data_columns.Rd new file mode 100644 index 000000000..6115db266 --- /dev/null +++ b/man/check_data_columns.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_data_columns} +\alias{check_data_columns} +\title{Check whether data is data.frame with correct columns} +\usage{ +check_data_columns(data) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Checks whether data is a data.frame, whether columns +"observed" and "predicted" are present, and checks that only one of +"quantile" and "sample_id" is present. +} +\keyword{internal_input_check} diff --git a/man/check_dims_ok_point.Rd b/man/check_dims_ok_point.Rd new file mode 100644 index 000000000..b0fb82ab8 --- /dev/null +++ b/man/check_dims_ok_point.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_dims_ok_point} +\alias{check_dims_ok_point} +\title{Check Inputs Have Matching Dimensions} +\usage{ +check_dims_ok_point(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a factor of length n with +exactly two levels, holding the observed values. +The highest factor level is assumed to be the reference level. This means +that \code{predicted} represents the probability that the observed value is equal +to the highest factor level.} + +\item{predicted}{Input to be checked. \code{predicted} should be a vector of +length n, holding probabilities. Alternatively, \code{predicted} can be a matrix +of size n x 1. Values represent the probability that +the corresponding value in \code{observed} will be equal to the highest +available factor level.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether input dimensions match. In the +following, n is the number of observations / forecasts. Scalar values may +be repeated to match the length of the other input. +Allowed options are therefore +\itemize{ +\item \code{observed} is vector of length 1 or length n +\item \code{predicted} is +\itemize{ +\item a vector of of length 1 or length n +\item a matrix with n rows and 1 column +} +} +} +\keyword{internal_input_check} diff --git a/man/check_duplicates.Rd b/man/check_duplicates.Rd new file mode 100644 index 000000000..dff552eb5 --- /dev/null +++ b/man/check_duplicates.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_duplicates} +\alias{check_duplicates} +\title{Check that there are no duplicate forecasts} +\usage{ +check_duplicates(data, forecast_unit = NULL) +} +\arguments{ +\item{data}{A data.frame as used for \code{\link[=score]{score()}}} + +\item{forecast_unit}{A character vector with the column names that define +the unit of a single forecast. 
If \code{NULL} (the default) the function tries +to infer the unit of a single forecast.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Runs \code{\link[=get_duplicate_forecasts]{get_duplicate_forecasts()}} and returns a message if an issue is encountered +} +\keyword{internal_input_check} diff --git a/man/check_equal_length.Rd b/man/check_equal_length.Rd deleted file mode 100644 index 38edbcd9c..000000000 --- a/man/check_equal_length.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/input-check-helpers.R -\name{check_equal_length} -\alias{check_equal_length} -\title{Check Length} -\usage{ -check_equal_length(..., one_allowed = TRUE) -} -\arguments{ -\item{...}{The variables to check} - -\item{one_allowed}{logical, allow arguments of length one that can be -recycled} -} -\value{ -The function returns \code{NULL}, but throws an error if variable lengths -differ -} -\description{ -Check whether variables all have the same length -} -\keyword{internal} diff --git a/man/check_forecasts.Rd b/man/check_forecasts.Rd deleted file mode 100644 index 018678405..000000000 --- a/man/check_forecasts.Rd +++ /dev/null @@ -1,92 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_forecasts.R -\name{check_forecasts} -\alias{check_forecasts} -\title{Check forecasts} -\usage{ -check_forecasts(data) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} -} -\value{ -A list with elements that give information about what \code{scoringutils} -thinks you are trying to do and potential issues. -\itemize{ -\item \code{target_type} the type of the prediction target as inferred from the -input: 'binary', if all values in \code{true_value} are either 0 or 1 and values -in \code{prediction} are between 0 and 1, 'discrete' if all true values are -integers. -and 'continuous' if not. -\item \code{prediction_type} inferred type of the prediction. 'quantile', if there is -a column called 'quantile', else 'discrete' if all values in \code{prediction} -are integer, else 'continuous. -\item \code{forecast_unit} unit of a single forecast, i.e. 
the grouping that uniquely -defines a single forecast. This is assumed to be all -present columns apart from the following protected columns: -\code{c("prediction", "true_value", "sample", "quantile","range", "boundary")}. -It is important that you remove all unnecessary columns before scoring. -\item \code{rows_per_forecast} a data.frame that shows how many rows (usually -quantiles or samples there are available per forecast. If a forecast model -has several entries, then there a forecasts with differing numbers of -quantiles / samples. -\item \code{unique_values} A data.frame that shows how many unique values there are -present per model and column in the data. This doesn't directly show missing -values, but rather the maximum number of unique values across the whole data. -\item \code{warnings} A vector with warnings. These can be ignored if you know what -you are doing. -\item \code{errors} A vector with issues that will cause an error when running -\code{\link[=score]{score()}}. -\item \code{messages} A verbal explanation of the information provided above. -} -} -\description{ -Function to check the input data before running -\code{\link[=score]{score()}}. - -The data should come in one of three different formats: -\itemize{ -\item A format for binary predictions (see \link{example_binary}) -\item A sample-based format for discrete or continuous predictions -(see \link{example_continuous} and \link{example_integer}) -\item A quantile-based format (see \link{example_quantile}) -} -} -\examples{ -check <- check_forecasts(example_quantile) -print(check) -check_forecasts(example_binary) -} -\seealso{ -Function to move from sample-based to quantile format: -\code{\link[=sample_to_quantile]{sample_to_quantile()}} -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} -\keyword{check-forecasts} diff --git a/man/check_has_attribute.Rd b/man/check_has_attribute.Rd new file mode 100644 index 000000000..5111aac38 --- /dev/null +++ b/man/check_has_attribute.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_has_attribute} +\alias{check_has_attribute} +\title{Check whether an attribute is present} +\usage{ +check_has_attribute(object, attribute) +} +\arguments{ +\item{object}{An object to be checked} + +\item{attribute}{name of an attribute to be checked} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Checks whether an object has an attribute +} +\keyword{internal_input_check} diff --git a/man/check_input_binary.Rd b/man/check_input_binary.Rd new file mode 100644 index 000000000..86d4d2f9e --- /dev/null +++ b/man/check_input_binary.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_input_binary} +\alias{check_input_binary} +\title{Check that inputs are correct for binary forecast} +\usage{ +check_input_binary(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a factor of length n with +exactly two levels, holding the observed values. +The highest factor level is assumed to be the reference level. This means +that \code{predicted} represents the probability that the observed value is equal +to the highest factor level.} + +\item{predicted}{Input to be checked. \code{predicted} should be a vector of +length n, holding probabilities. Alternatively, \code{predicted} can be a matrix +of size n x 1. 
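For orientation, a hedged sketch of what a valid binary input as described here could look like. It assumes the revised brier_score(observed, predicted) interface; the old brier_score() documentation is deleted in this changeset, but the function itself remains exported.

# Sketch of the binary format: observed is a factor with two levels,
# predicted is the probability that observed equals the highest (second)
# factor level. Assumes brier_score(observed, predicted) as the interface.
observed <- factor(c("no", "yes", "yes", "no"), levels = c("no", "yes"))
predicted <- c(0.1, 0.8, 0.6, 0.3) # P(observed == "yes")
brier_score(observed, predicted)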
Values represent the probability that +the corresponding value in \code{observed} will be equal to the highest +available factor level.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring binary forecasts. +} +\keyword{internal_input_check} diff --git a/man/check_input_interval.Rd b/man/check_input_interval.Rd new file mode 100644 index 000000000..7b68f92f5 --- /dev/null +++ b/man/check_input_interval.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_input_interval} +\alias{check_input_interval} +\title{Check that inputs are correct for interval-based forecast} +\usage{ +check_input_interval(observed, lower, upper, range) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{lower}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the lower bounds of the prediction intervals.} + +\item{upper}{Input to be checked. Should be a numeric vector of size n that +holds the predicted value for the upper bounds of the prediction intervals.} + +\item{range}{Input to be checked. Should be a vector of size n that +denotes the interval range in percent. E.g. a value of 50 denotes a +(25\%, 75\%) prediction interval.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring interval-based forecasts. +} +\keyword{internal_input_check} diff --git a/man/check_input_point.Rd b/man/check_input_point.Rd new file mode 100644 index 000000000..31e838db4 --- /dev/null +++ b/man/check_input_point.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_input_point} +\alias{check_input_point} +\title{Check that inputs are correct for point forecast} +\usage{ +check_input_point(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{predicted}{Input to be checked. Should be a numeric vector with the +predicted values of size n} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring point forecasts. +} +\keyword{internal_input_check} diff --git a/man/check_input_quantile.Rd b/man/check_input_quantile.Rd new file mode 100644 index 000000000..e0a9877b1 --- /dev/null +++ b/man/check_input_quantile.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_input_quantile} +\alias{check_input_quantile} +\title{Check that inputs are correct for quantile-based forecast} +\usage{ +check_input_quantile(observed, predicted, quantile) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{predicted}{Input to be checked. Should be nxN matrix of predictive +quantiles, n (number of rows) being the number of data points and N +(number of columns) the number of quantiles per forecast. 
+If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{Input to be checked. Should be a vector of size N that +denotes the quantile levels corresponding to the columns of the prediction +matrix.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring quantile-based forecasts. +} +\keyword{internal_input_check} diff --git a/man/check_input_sample.Rd b/man/check_input_sample.Rd new file mode 100644 index 000000000..32724b8f7 --- /dev/null +++ b/man/check_input_sample.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-inputs-scoring-functions.R +\name{check_input_sample} +\alias{check_input_sample} +\title{Check that inputs are correct for sample-based forecast} +\usage{ +check_input_sample(observed, predicted) +} +\arguments{ +\item{observed}{Input to be checked. Should be a numeric vector with the +observed values of size n} + +\item{predicted}{Input to be checked. Should be a numeric nxN matrix of +predictive samples, n (number of rows) being the number of data points and N +(number of columns) the number of samples per forecast. +If \code{observed} is just a single number, then predicted values can just be a +vector of size N.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function assesses whether the inputs correspond to the +requirements for scoring sample-based forecasts. +} +\keyword{internal_input_check} diff --git a/man/check_metrics.Rd b/man/check_metrics.Rd deleted file mode 100644 index 11fb19035..000000000 --- a/man/check_metrics.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/input-check-helpers.R -\name{check_metrics} -\alias{check_metrics} -\title{Check whether the desired metrics are available in scoringutils} -\usage{ -check_metrics(metrics) -} -\arguments{ -\item{metrics}{character vector with desired metrics} -} -\value{ -A character vector with metrics that can be used for downstream -computation -} -\description{ -Helper function to check whether desired metrics are -available. If the input is \code{NULL}, all metrics will be returned. -} -\keyword{internal} diff --git a/man/check_no_NA_present.Rd b/man/check_no_NA_present.Rd new file mode 100644 index 000000000..bb742779d --- /dev/null +++ b/man/check_no_NA_present.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_no_NA_present} +\alias{check_no_NA_present} +\title{Check columns in data.frame don't have NA values} +\usage{ +check_no_NA_present(data, columns) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} + +\item{columns}{A character vector of column names to check} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function checks whether any of the columns in a data.frame, +as specified in \code{columns}, have NA values. If so, it returns a string with +an error message, otherwise it returns TRUE. 
+} +\keyword{internal_input_check} diff --git a/man/check_number_per_forecast.Rd b/man/check_number_per_forecast.Rd new file mode 100644 index 000000000..f15ef1a54 --- /dev/null +++ b/man/check_number_per_forecast.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_number_per_forecast} +\alias{check_number_per_forecast} +\title{Check that all forecasts have the same number of quantiles or samples} +\usage{ +check_number_per_forecast(data, forecast_unit) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} + +\item{forecast_unit}{Character vector denoting the unit of a single forecast.} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Function checks the number of quantiles or samples per forecast. +If the number of quantiles or samples is the same for all forecasts, it +returns TRUE and a string with an error message otherwise. +} +\keyword{internal_input_check} diff --git a/man/check_numeric_vector.Rd b/man/check_numeric_vector.Rd new file mode 100644 index 000000000..1b736bc01 --- /dev/null +++ b/man/check_numeric_vector.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_numeric_vector} +\alias{check_numeric_vector} +\title{Check whether an input is an atomic vector of mode 'numeric'} +\usage{ +check_numeric_vector(x, ...) +} +\arguments{ +\item{x}{input to check} + +\item{...}{ + Arguments passed on to \code{\link[checkmate:checkNumeric]{checkmate::check_numeric}} + \describe{ + \item{\code{lower}}{[\code{numeric(1)}]\cr +Lower value all elements of \code{x} must be greater than or equal to.} + \item{\code{upper}}{[\code{numeric(1)}]\cr +Upper value all elements of \code{x} must be lower than or equal to.} + \item{\code{finite}}{[\code{logical(1)}]\cr +Check for only finite values? Default is \code{FALSE}.} + \item{\code{any.missing}}{[\code{logical(1)}]\cr +Are vectors with missing values allowed? Default is \code{TRUE}.} + \item{\code{all.missing}}{[\code{logical(1)}]\cr +Are vectors with no non-missing values allowed? Default is \code{TRUE}. +Note that empty vectors do not have non-missing values.} + \item{\code{len}}{[\code{integer(1)}]\cr +Exact expected length of \code{x}.} + \item{\code{min.len}}{[\code{integer(1)}]\cr +Minimal length of \code{x}.} + \item{\code{max.len}}{[\code{integer(1)}]\cr +Maximal length of \code{x}.} + \item{\code{unique}}{[\code{logical(1)}]\cr +Must all values be unique? Default is \code{FALSE}.} + \item{\code{sorted}}{[\code{logical(1)}]\cr +Elements must be sorted in ascending order. Missing values are ignored.} + \item{\code{names}}{[\code{character(1)}]\cr +Check for names. See \code{\link[checkmate]{checkNamed}} for possible values. +Default is \dQuote{any} which performs no check at all. +Note that you can use \code{\link[checkmate]{checkSubset}} to check for a specific set of names.} + \item{\code{typed.missing}}{[\code{logical(1)}]\cr +If set to \code{FALSE} (default), all types of missing values (\code{NA}, \code{NA_integer_}, +\code{NA_real_}, \code{NA_character_} or \code{NA_character_}) as well as empty vectors are allowed +while type-checking atomic input. +Set to \code{TRUE} to enable strict type checking.} + \item{\code{null.ok}}{[\code{logical(1)}]\cr +If set to \code{TRUE}, \code{x} may also be \code{NULL}. 
+In this case only a type check of \code{x} is performed, all additional checks are disabled.} + }} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Helper function +} +\keyword{internal_input_check} diff --git a/man/check_predictions.Rd b/man/check_predictions.Rd deleted file mode 100644 index 682ea3a95..000000000 --- a/man/check_predictions.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/input-check-helpers.R -\name{check_predictions} -\alias{check_predictions} -\title{Check Prediction Input For Lower-level Scoring Functions} -\usage{ -check_predictions( - predictions, - true_values = NULL, - type = c("continuous", "integer", "binary"), - class = c("vector", "matrix") -) -} -\arguments{ -\item{predictions}{an object with predictions. Depending on whether -\code{class = vector} or \code{class = "matrix"} this can be either a vector of length -n (corresponding to the length of the true_values) or a nxN matrix of -predictive samples, n (number of rows) being the number of data points and -N (number of columns) the number of Monte Carlo samples} - -\item{true_values}{A vector with the true observed values of size n} - -\item{type}{character, one of "continuous" (default), "integer" or "binary" that -defines the type of the forecast} - -\item{class}{character, either "vector" (default) or "matrix" that determines the -class the input has to correspond to} -} -\description{ -Helper function to check inputs for lower-level score functions. -} -\keyword{internal} diff --git a/man/check_quantiles.Rd b/man/check_quantiles.Rd index b0a2edb0b..3b3ae08b4 100644 --- a/man/check_quantiles.Rd +++ b/man/check_quantiles.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/input-check-helpers.R +% Please edit documentation in R/check-input-helpers.R \name{check_quantiles} \alias{check_quantiles} \title{Check that quantiles are valid} @@ -21,7 +21,7 @@ Helper function to check that input quantiles are valid. Quantiles must be in the range specified, increase monotonically, and contain no duplicates. -This is used in \code{\link[=bias_range]{bias_range()}} and \code{\link[=bias_quantile]{bias_quantile()}} to +This is used in \url{bias_range()} and \url{bias_quantile()} to provide informative errors to users. } -\keyword{internal} +\keyword{internal_input_check} diff --git a/man/check_summary_params.Rd b/man/check_summary_params.Rd index 042d565e9..9b605f999 100644 --- a/man/check_summary_params.Rd +++ b/man/check_summary_params.Rd @@ -19,22 +19,6 @@ input data that do not correspond to a metric produced by \code{\link[=score]{sc indicate indicate a grouping of forecasts (for example there may be one forecast per day, location and model). Adding additional, unrelated, columns may alter results in an unpredictable way.} - -\item{relative_skill}{logical, whether or not to compute relative -performance between models based on pairwise comparisons. -If \code{TRUE} (default is \code{FALSE}), then a column called -'model' must be present in the input data. For more information on -the computation of relative skill, see \code{\link[=pairwise_comparison]{pairwise_comparison()}}. -Relative skill will be calculated for the aggregation level specified in -\code{by}.} - -\item{baseline}{character string with the name of a model. If a baseline is -given, then a scaled relative skill with respect to the baseline will be -returned. 
By default (\code{NULL}), relative skill will not be scaled with -respect to a baseline model.} - -\item{metric}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Deprecated in 1.1.0. Use -\code{relative_skill_metric} instead.} } \description{ A helper function to check the input parameters for diff --git a/man/check_true_values.Rd b/man/check_true_values.Rd deleted file mode 100644 index da5489dcc..000000000 --- a/man/check_true_values.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/input-check-helpers.R -\name{check_true_values} -\alias{check_true_values} -\title{Check Observed Value Input For Lower-level Scoring Functions} -\usage{ -check_true_values(true_values, type = c("continuous", "integer", "binary")) -} -\arguments{ -\item{true_values}{A vector with the true observed values of size n} - -\item{type}{character, one of "continuous" (default), "integer" or "binary" that -defines the type of the forecast} -} -\description{ -Helper function to check inputs for lower-level score functions. -} -\keyword{internal} diff --git a/man/check_try.Rd b/man/check_try.Rd new file mode 100644 index 000000000..94574cf48 --- /dev/null +++ b/man/check_try.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{check_try} +\alias{check_try} +\title{Helper function to convert assert statements into checks} +\usage{ +check_try(expr) +} +\arguments{ +\item{expr}{an expression to be evaluated} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Tries to execute an expression. Internally, this is used to +see whether assertions fail when checking inputs (i.e. to convert an +\verb{assert_*()} statement into a check). If the expression fails, the error +message is returned. If the expression succeeds, \code{TRUE} is returned. +} +\keyword{internal_input_check} diff --git a/man/collapse_messages.Rd b/man/collapse_messages.Rd index 47f3abaed..ec2d9f224 100644 --- a/man/collapse_messages.Rd +++ b/man/collapse_messages.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_forecasts.R +% Please edit documentation in R/utils.R \name{collapse_messages} \alias{collapse_messages} \title{Collapse several messages to one} @@ -16,6 +16,6 @@ string with the message or warning } \description{ Internal helper function to facilitate generating messages -and warnings in \code{\link[=check_forecasts]{check_forecasts()}} +and warnings. } \keyword{internal} diff --git a/man/compare_two_models.Rd b/man/compare_two_models.Rd index 840658c23..1d4686a1c 100644 --- a/man/compare_two_models.Rd +++ b/man/compare_two_models.Rd @@ -22,9 +22,8 @@ compare_two_models( \item{name_model2}{character, name of the model to compare against} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", -"crps", or "brier_score" will be selected where available. -See \code{\link[=available_metrics]{available_metrics()}} for available metrics.} +comparison on. 
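The idea behind check_try(), documented above, can be illustrated with a small stand-in (check_try_sketch is a hypothetical name, not the package code): force the expression inside try() and return either TRUE or the captured error message.

# Hypothetical sketch of converting an assert-style call into a check:
# the assertion is evaluated inside try(); on failure its error message
# is returned as a string, otherwise TRUE is returned.
check_try_sketch <- function(expr) {
  result <- try(expr, silent = TRUE)
  if (inherits(result, "try-error")) {
    return(conditionMessage(attr(result, "condition")))
  }
  return(TRUE)
}

check_try_sketch(checkmate::assert_numeric(1:3)) # TRUE
check_try_sketch(checkmate::assert_numeric("not a number")) # error message string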
The default is "auto", meaning that either "wis", +"crps", or "brier_score" will be selected where available.} \item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided instead of a two-sided test to determine significance in a pairwise diff --git a/man/correlation.Rd b/man/correlation.Rd index 77de3a469..59bda9e42 100644 --- a/man/correlation.Rd +++ b/man/correlation.Rd @@ -4,7 +4,7 @@ \alias{correlation} \title{Correlation Between Metrics} \usage{ -correlation(scores, metrics = NULL) +correlation(scores, metrics = NULL, digits = NULL) } \arguments{ \item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} @@ -12,6 +12,9 @@ correlation(scores, metrics = NULL) \item{metrics}{A character vector with the metrics to show. If set to \code{NULL} (default), all metrics present in \code{scores} will be shown} + +\item{digits}{A number indicating how many decimal places the result should +be rounded to. By default (\code{digits = NULL}) no rounding takes place.} } \value{ A data.table with correlations for the different metrics @@ -22,6 +25,6 @@ scores as produced by \code{\link[=score]{score()}}. } \examples{ scores <- score(example_quantile) -correlation(scores) +correlation(scores, digits = 2) } \keyword{scoring} diff --git a/man/crps_sample.Rd b/man/crps_sample.Rd index a91490ed4..c29099f23 100644 --- a/man/crps_sample.Rd +++ b/man/crps_sample.Rd @@ -1,17 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scoringRules_wrappers.R +% Please edit documentation in R/metrics-sample.R \name{crps_sample} \alias{crps_sample} \title{Ranked Probability Score} \usage{ -crps_sample(true_values, predictions) +crps_sample(observed, predicted, ...) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. 
Alternatively, \code{predicted} can just be a vector of size n.} + +\item{...}{additional arguments passed to +\link[scoringRules:scores_sample_univ]{crps_sample()} from the scoringRules package.} } \value{ vector with the scoring values @@ -23,9 +26,9 @@ function from the valued forecasts } \examples{ -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -crps_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +crps_sample(observed, predicted) } \references{ Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic diff --git a/man/delete_columns.Rd b/man/delete_columns.Rd deleted file mode 100644 index 124b86625..000000000 --- a/man/delete_columns.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{delete_columns} -\alias{delete_columns} -\title{Delete Columns From a Data.table} -\usage{ -delete_columns(df, cols_to_delete, make_unique = FALSE) -} -\arguments{ -\item{df}{A data.table or data.frame from which columns shall be deleted} - -\item{cols_to_delete}{character vector with names of columns to be deleted} - -\item{make_unique}{whether to make the data set unique after removing columns} -} -\value{ -A data.table -} -\description{ -take a vector of column names and delete the columns if they -are present in the data.table -} -\keyword{internal} diff --git a/man/document_assert_functions.Rd b/man/document_assert_functions.Rd new file mode 100644 index 000000000..c8046bf67 --- /dev/null +++ b/man/document_assert_functions.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/documentation-templates.R +\name{document_assert_functions} +\alias{document_assert_functions} +\title{Documentation template for check functions} +\value{ +Returns NULL invisibly if the assertion was successful and throws an +error otherwise. 
+} +\description{ +Documentation template for check functions +} +\keyword{internal} diff --git a/man/document_check_functions.Rd b/man/document_check_functions.Rd new file mode 100644 index 000000000..256db46f5 --- /dev/null +++ b/man/document_check_functions.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/documentation-templates.R +\name{document_check_functions} +\alias{document_check_functions} +\title{Documentation template for check functions} +\arguments{ +\item{data}{A data.frame or similar to be checked} + +\item{columns}{A character vector of column names to check} +} +\value{ +Returns TRUE if the check was successful and a string with an +error message otherwise +} +\description{ +Documentation template for check functions +} +\keyword{internal} diff --git a/man/document_score_data.Rd b/man/document_score_data.Rd new file mode 100644 index 000000000..9b8b19558 --- /dev/null +++ b/man/document_score_data.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/documentation-templates.R +\name{document_score_data} +\alias{document_score_data} +\title{Documentation template for scoring input data} +\arguments{ +\item{data}{A data frame (or similar) of forecasts following the +specifications detailed in \code{\link[=score]{score()}}.} + +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} +} +\description{ +Documentation template for scoring input data +} +\keyword{internal} diff --git a/man/document_test_functions.Rd b/man/document_test_functions.Rd new file mode 100644 index 000000000..68fe7c7a5 --- /dev/null +++ b/man/document_test_functions.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/documentation-templates.R +\name{document_test_functions} +\alias{document_test_functions} +\title{Documentation template for test functions} +\value{ +Returns TRUE if the check was successful and FALSE otherwise +} +\description{ +Documentation template for test functions +} +\keyword{internal} diff --git a/man/dss_sample.Rd b/man/dss_sample.Rd index 771beda7d..8aa792517 100644 --- a/man/dss_sample.Rd +++ b/man/dss_sample.Rd @@ -1,17 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scoringRules_wrappers.R +% Please edit documentation in R/metrics-sample.R \name{dss_sample} \alias{dss_sample} \title{Dawid-Sebastiani Score} \usage{ -dss_sample(true_values, predictions) +dss_sample(observed, predicted, ...) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} + +\item{...}{additional arguments passed to +\link[scoringRules:scores_sample_univ]{dss_sample()} from the scoringRules package.} } \value{ vector with scoring values @@ -22,9 +25,9 @@ function from the \pkg{scoringRules} package. 
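Because dss_sample() (like crps_sample() above) is documented as a wrapper around the corresponding scoringRules function, a quick consistency check against a direct call is possible; a sketch, assuming the wrapper adds no transformation of its own:

# Sketch: the wrapper should reproduce scoringRules::dss_sample(y, dat).
observed <- rpois(30, lambda = 1:30)
predicted <- replicate(200, rpois(n = 30, lambda = 1:30))
all.equal(
  dss_sample(observed, predicted),
  scoringRules::dss_sample(y = observed, dat = predicted)
)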
} \examples{ -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -dss_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +dss_sample(observed, predicted) } \references{ Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic diff --git a/man/ensure_data.table.Rd b/man/ensure_data.table.Rd new file mode 100644 index 000000000..6d2457ee5 --- /dev/null +++ b/man/ensure_data.table.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{ensure_data.table} +\alias{ensure_data.table} +\title{Ensure That an Object is a Data Table} +\usage{ +ensure_data.table(data) +} +\arguments{ +\item{data}{An object to ensure is a data table} +} +\value{ +A data table +} +\description{ +This function ensures that an object is a data table. +If the object is not a data table, it is converted to one. If the object +is a data table, a copy of the object is returned. +} +\keyword{internal} diff --git a/man/example_binary.Rd b/man/example_binary.Rd index 7cdfcfb3c..efd78cb9c 100644 --- a/man/example_binary.Rd +++ b/man/example_binary.Rd @@ -11,15 +11,15 @@ A data frame with 346 rows and 10 columns: \item{location_name}{name of the country for which a prediction was made} \item{target_end_date}{the date for which a prediction was made} \item{target_type}{the target to be predicted (cases or deaths)} -\item{true_value}{true observed values} +\item{observed}{A factor with observed values} \item{forecast_date}{the date on which a prediction was made} \item{model}{name of the model that generated the forecasts} \item{horizon}{forecast horizon in weeks} -\item{prediction}{predicted value} +\item{predicted}{predicted value} } } \source{ -\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_binary diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd index 7c5de286d..17386704d 100644 --- a/man/example_continuous.Rd +++ b/man/example_continuous.Rd @@ -10,17 +10,17 @@ A data frame with 13,429 rows and 10 columns: \item{location}{the country for which a prediction was made} \item{target_end_date}{the date for which a prediction was made} \item{target_type}{the target to be predicted (cases or deaths)} -\item{true_value}{true observed values} +\item{observed}{observed values} \item{location_name}{name of the country for which a prediction was made} \item{forecast_date}{the date on which a prediction was made} \item{model}{name of the model that generated the forecasts} \item{horizon}{forecast horizon in weeks} -\item{prediction}{predicted value} -\item{sample}{id for the corresponding sample} +\item{predicted}{predicted value} +\item{sample_id}{id for the corresponding sample} } } \source{ -\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_continuous diff --git a/man/example_integer.Rd b/man/example_integer.Rd index 5bbd059fe..6589e3ce3 100644 --- a/man/example_integer.Rd +++ b/man/example_integer.Rd @@ -10,15 +10,18 @@ A data frame with 13,429 rows and 10 
columns: \item{location}{the country for which a prediction was made} \item{target_end_date}{the date for which a prediction was made} \item{target_type}{the target to be predicted (cases or deaths)} -\item{true_value}{true observed values} +\item{observed}{observed values} \item{location_name}{name of the country for which a prediction was made} \item{forecast_date}{the date on which a prediction was made} \item{model}{name of the model that generated the forecasts} \item{horizon}{forecast horizon in weeks} -\item{prediction}{predicted value} -\item{sample}{id for the corresponding sample} +\item{predicted}{predicted value} +\item{sample_id}{id for the corresponding sample} } } +\source{ +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +} \usage{ example_integer } diff --git a/man/example_point.Rd b/man/example_point.Rd index 1a3036617..5ce43c8fa 100644 --- a/man/example_point.Rd +++ b/man/example_point.Rd @@ -10,17 +10,16 @@ A data frame with \item{location}{the country for which a prediction was made} \item{target_end_date}{the date for which a prediction was made} \item{target_type}{the target to be predicted (cases or deaths)} -\item{true_value}{true observed values} +\item{observed}{observed values} \item{location_name}{name of the country for which a prediction was made} \item{forecast_date}{the date on which a prediction was made} -\item{quantile}{quantile of the corresponding prediction} -\item{prediction}{predicted value} +\item{predicted}{predicted value} \item{model}{name of the model that generated the forecasts} \item{horizon}{forecast horizon in weeks} } } \source{ -\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_point diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd index 0e996eef3..dd048ec8a 100644 --- a/man/example_quantile.Rd +++ b/man/example_quantile.Rd @@ -10,17 +10,17 @@ A data frame with \item{location}{the country for which a prediction was made} \item{target_end_date}{the date for which a prediction was made} \item{target_type}{the target to be predicted (cases or deaths)} -\item{true_value}{true observed values} +\item{observed}{Numeric: observed values} \item{location_name}{name of the country for which a prediction was made} \item{forecast_date}{the date on which a prediction was made} \item{quantile}{quantile of the corresponding prediction} -\item{prediction}{predicted value} +\item{predicted}{predicted value} \item{model}{name of the model that generated the forecasts} \item{horizon}{forecast horizon in weeks} } } \source{ -\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd index 00520c8cd..30782740f 100644 --- a/man/example_quantile_forecasts_only.Rd +++ b/man/example_quantile_forecasts_only.Rd @@ -12,13 +12,13 @@ A data frame with 7,581 rows and 9 columns: \item{target_type}{the target to be predicted (cases or deaths)} \item{forecast_date}{the date on which a prediction was made} \item{quantile}{quantile of the 
corresponding prediction} -\item{prediction}{predicted value} +\item{predicted}{predicted value} \item{model}{name of the model that generated the forecasts} \item{horizon}{forecast horizon in weeks} } } \source{ -\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile_forecasts_only diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd index 8cfa41498..ce32e8a33 100644 --- a/man/example_truth_only.Rd +++ b/man/example_truth_only.Rd @@ -10,12 +10,12 @@ A data frame with 140 rows and 5 columns: \item{location}{the country for which a prediction was made} \item{target_end_date}{the date for which a prediction was made} \item{target_type}{the target to be predicted (cases or deaths)} -\item{true_value}{true observed values} +\item{observed}{observed values} \item{location_name}{name of the country for which a prediction was made} } } \source{ -\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/european-modelling-hubs/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_truth_only diff --git a/man/filter_function_args.Rd b/man/filter_function_args.Rd new file mode 100644 index 000000000..39bc93272 --- /dev/null +++ b/man/filter_function_args.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{filter_function_args} +\alias{filter_function_args} +\title{Filter function arguments} +\usage{ +filter_function_args(fun, args) +} +\arguments{ +\item{fun}{A function to which arguments shall be passed} + +\item{args}{A list of arguments that shall be passed to fun} +} +\value{ +A list of function arguments (a subset of \code{args}) that \code{fun} can +accept. +} +\description{ +This function compares a list of arguments with the arguments +that a function can accept. It only returns those arguments that can be +passed to the function. + +The function is used in \code{\link[=score]{score()}} to handle additional arguments passed to +\code{\link[=score]{score()}} that get then passed along to the different scoring functions. +} +\keyword{internal} diff --git a/man/forecast_types.Rd b/man/forecast_types.Rd new file mode 100644 index 000000000..b0d825f76 --- /dev/null +++ b/man/forecast_types.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/documentation-templates.R +\name{forecast_types} +\alias{forecast_types} +\title{Documentation template for forecast types} +\description{ +Documentation template for forecast types +} +\section{Forecast types and input format}{ +Various different forecast types / forecast formats are supported. At the +moment, those are +\itemize{ +\item point forecasts +\item binary forecasts ("soft binary classification") +\item Probabilistic forecasts in a quantile-based format (a forecast is +represented as a set of predictive quantiles) +\item Probabilistic forecasts in a sample-based format (a forecast is represented +as a set of predictive samples) +} + +Forecast types are determined based on the columns present in the input data. + +\emph{Point forecasts} require a column \code{observed} of type numeric and a column +\code{predicted} of type numeric. 
+ +\emph{Binary forecasts} require a column \code{observed} of type factor with exactly +two levels and a column \code{predicted} of type numeric with probabilities, +corresponding to the probability that \code{observed} is equal to the second +factor level. See details \link[=brier_score]{here} for more information. + +\emph{Quantile-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{quantile} of type numeric +with quantile-levels (between 0 and 1). + +\emph{Sample-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{sample_id} of type +numeric with sample indices. + +For more information see the vignettes and the example data +(\link{example_quantile}, \link{example_continuous}, \link{example_integer}, +\code{\link[=example_point]{example_point()}}, and \link{example_binary}). +} + +\section{Forecast unit}{ +In order to score forecasts, \code{scoringutils} needs to know which of the rows +of the data belong together and jointly form a single forecast. This is +easy e.g. for point forecasts, where there is one row per forecast. For +quantile or sample-based forecasts, however, there are multiple rows that +belong to a single forecast. + +The \emph{forecast unit} or \emph{unit of a single forecast} is then described by the +combination of columns that uniquely identify a single forecast. +For example, we could have forecasts made by different models in various +locations at different time points, each for several weeks into the future. +The forecast unit could then be described as +\code{forecast_unit = c("model", "location", "forecast_date", "forecast_horizon")}. +\code{scoringutils} automatically tries to determine the unit of a single +forecast. It uses all existing columns for this, which means that no columns +should be present that are unrelated to the forecast unit. As a very simplistic +example, if you had an additional column, "even", that is one if the row number +is even and zero otherwise, this would mess up scoring, as \code{scoringutils} +would think that this column was relevant in defining the forecast unit. + +In order to avoid issues, we recommend using the function +\code{\link[=set_forecast_unit]{set_forecast_unit()}} to determine the forecast unit manually. +The function simply drops unneeded columns, while making sure that all +necessary, 'protected columns' like "predicted" or "observed" are retained.
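A short sketch of that recommendation applied to the bundled example data; the selected columns are one plausible forecast unit for example_quantile, not the only valid choice.

# Sketch: fix the forecast unit manually, then inspect what is inferred
# from the remaining (non-protected) columns.
dat <- set_forecast_unit(
  example_quantile,
  c("location", "target_end_date", "target_type", "horizon", "model")
)
get_forecast_unit(dat) # the five columns given above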
+} + +\keyword{internal} diff --git a/man/geom_mean_helper.Rd b/man/geom_mean_helper.Rd deleted file mode 100644 index 8853a3719..000000000 --- a/man/geom_mean_helper.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{geom_mean_helper} -\alias{geom_mean_helper} -\title{Calculate Geometric Mean} -\usage{ -geom_mean_helper(x) -} -\arguments{ -\item{x}{numeric vector of values for which to calculate the geometric mean} -} -\value{ -the geometric mean of the values in \code{x} -} -\description{ -Calculate Geometric Mean -} -\keyword{internal} diff --git a/man/geometric_mean.Rd b/man/geometric_mean.Rd new file mode 100644 index 000000000..316715299 --- /dev/null +++ b/man/geometric_mean.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pairwise-comparisons.R +\name{geometric_mean} +\alias{geometric_mean} +\title{Calculate Geometric Mean} +\usage{ +geometric_mean(x) +} +\arguments{ +\item{x}{numeric vector of values for which to calculate the geometric mean} +} +\value{ +the geometric mean of the values in \code{x}. \code{NA} values are ignored. +} +\description{ +Calculate Geometric Mean +} +\details{ +Used in \code{\link[=pairwise_comparison]{pairwise_comparison()}}. +} +\keyword{internal} diff --git a/man/find_duplicates.Rd b/man/get_duplicate_forecasts.Rd similarity index 64% rename from man/find_duplicates.Rd rename to man/get_duplicate_forecasts.Rd index 13838b931..26f77e857 100644 --- a/man/find_duplicates.Rd +++ b/man/get_duplicate_forecasts.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_forecasts.R -\name{find_duplicates} -\alias{find_duplicates} +% Please edit documentation in R/get_-functions.R +\name{get_duplicate_forecasts} +\alias{get_duplicate_forecasts} \title{Find duplicate forecasts} \usage{ -find_duplicates(data, forecast_unit) +get_duplicate_forecasts(data, forecast_unit = NULL) } \arguments{ \item{data}{A data.frame as used for \code{\link[=score]{score()}}} \item{forecast_unit}{A character vector with the column names that define -the unit of a single forecast. If missing the function tries to infer the -unit of a single forecast.} +the unit of a single forecast. If \code{NULL} (the default) the function tries +to infer the unit of a single forecast.} } \value{ A data.frame with all rows for which a duplicate forecast was found @@ -23,6 +23,6 @@ target. } \examples{ example <- rbind(example_quantile, example_quantile[1000:1010]) -find_duplicates(example) +get_duplicate_forecasts(example) } \keyword{check-forecasts} diff --git a/man/get_forecast_counts.Rd b/man/get_forecast_counts.Rd new file mode 100644 index 000000000..52ea524e7 --- /dev/null +++ b/man/get_forecast_counts.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/available_forecasts.R +\name{get_forecast_counts} +\alias{get_forecast_counts} +\title{Count Number of Available Forecasts} +\usage{ +get_forecast_counts(data, by = NULL, collapse = c("quantile", "sample_id")) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} + +\item{by}{character vector or \code{NULL} (the default) that denotes the +categories over which the number of forecasts should be counted. +By default (\code{by = NULL}) this will be the unit of a single forecast (i.e. 
+all available columns (apart from a few "protected" columns such as
+'predicted' and 'observed') plus "quantile" or "sample_id" where present).}
+
+\item{collapse}{character vector (default is \verb{c("quantile", "sample_id")})
+with names of categories for which the number of rows should be collapsed to
+one when counting. For example, a single forecast is usually represented by a
+set of several quantiles or samples and collapsing these to one makes sure
+that a single forecast only gets counted once. Setting \code{collapse = c()}
+would mean that all quantiles / samples would be counted as individual
+forecasts.}
+}
+\value{
+A data.table with columns as specified in \code{by} and an additional
+column "count" with the number of forecasts.
+}
+\description{
+Given a data set with forecasts, count the number of available forecasts
+for arbitrary grouping (e.g. the number of forecasts per model, or the
+number of forecasts per model and location).
+This is useful to determine whether there are any missing forecasts.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2) # restricts number of cores used on CRAN
+}
+
+get_forecast_counts(example_quantile,
+ by = c("model", "target_type")
+)
+}
+\keyword{check-forecasts} diff --git a/man/get_forecast_type.Rd b/man/get_forecast_type.Rd new file mode 100644 index 000000000..27c4e08b2 --- /dev/null +++ b/man/get_forecast_type.Rd @@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_-functions.R
+\name{get_forecast_type}
+\alias{get_forecast_type}
+\title{Infer Forecast Type}
+\usage{
+get_forecast_type(data)
+}
+\arguments{
+\item{data}{A data.frame or data.table with predicted and observed values.}
+}
+\value{
+Character vector of length one with either "binary", "quantile",
+"sample" or "point".
+}
+\description{
+Helper function to infer the forecast type based on a
+data.frame or similar with predictions. Please check the vignettes to
+learn more about forecast types.
+
+Possible forecast types are
+\itemize{
+\item "sample-based"
+\item "quantile-based"
+\item "binary"
+\item "point"
+}
+
+The function runs additional checks to make sure the data satisfies the
+requirements of the respective forecast type and throws an
+informative error if any issues are found.
+}
+\keyword{check-forecasts} diff --git a/man/get_forecast_unit.Rd b/man/get_forecast_unit.Rd index b0be599f8..ff2785030 100644 --- a/man/get_forecast_unit.Rd +++ b/man/get_forecast_unit.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
+% Please edit documentation in R/get_-functions.R
\name{get_forecast_unit}
\alias{get_forecast_unit}
\title{Get unit of a single forecast}
@@ -7,32 +7,7 @@ get_forecast_unit(data)
}
\arguments{
-\item{data}{A data.frame or data.table with the predictions and observations.
-For scoring using \code{\link[=score]{score()}}, the following columns need to be present:
-\itemize{
-\item \code{true_value} - the true observed values
-\item \code{prediction} - predictions or predictive samples for one
-true value. (You only don't need to provide a prediction column if
-you want to score quantile forecasts in a wide range format.)}
-For scoring integer and continuous forecasts a \code{sample} column is needed:
-\itemize{
-\item \code{sample} - an index to identify the predictive samples in the
-prediction column generated by one model for one true value.
Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} +\item{data}{A data.frame or data.table with predicted and observed values.} } \value{ A character vector with the column names that define the unit of @@ -40,6 +15,10 @@ a single forecast } \description{ Helper function to get the unit of a single forecast, i.e. -the column names that define where a single forecast was made for +the column names that define where a single forecast was made for. +This just takes all columns that are available in the data and subtracts +the columns that are protected, i.e. those returned by +\code{\link[=get_protected_columns]{get_protected_columns()}} as well as the names of the metrics that were +specified during scoring, if any. } -\keyword{internal} +\keyword{check-forecasts} diff --git a/man/get_metrics.Rd b/man/get_metrics.Rd new file mode 100644 index 000000000..aed3b5e81 --- /dev/null +++ b/man/get_metrics.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{get_metrics} +\alias{get_metrics} +\title{Get metrics that were used for scoring} +\usage{ +get_metrics(scores) +} +\arguments{ +\item{scores}{A data.table with an attribute \code{metric_names}} +} +\value{ +Character vector with the metrics that were used for scoring. +} +\description{ +Internal helper function to get the metrics that were used +to score forecasts. +} +\keyword{internal_input_check} diff --git a/man/get_prediction_type.Rd b/man/get_prediction_type.Rd deleted file mode 100644 index 4ea08edbe..000000000 --- a/man/get_prediction_type.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{get_prediction_type} -\alias{get_prediction_type} -\title{Get prediction type of a forecast} -\usage{ -get_prediction_type(data) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. 
Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} -} -\value{ -Character vector of length one with either "quantile", "integer", or -"continuous". -} -\description{ -Internal helper function to get the prediction type of a -forecast. That is inferred based on the properties of the values in the -\code{prediction} column. -} -\keyword{internal} diff --git a/man/get_protected_columns.Rd b/man/get_protected_columns.Rd index 79af67598..4bbf8f6da 100644 --- a/man/get_protected_columns.Rd +++ b/man/get_protected_columns.Rd @@ -1,41 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/get_-functions.R \name{get_protected_columns} \alias{get_protected_columns} \title{Get protected columns from a data frame} \usage{ -get_protected_columns(data) +get_protected_columns(data = NULL) } \arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} +\item{data}{A data.frame or data.table with predicted and observed values.} } \value{ -A character vector with the names of protected columns in the data +A character vector with the names of protected columns in the data. +If data is \code{NULL} (default) then it returns a list of all columns that are +protected in scoringutils. 
} \description{ Helper function to get the names of all columns in a data frame diff --git a/man/get_range_from_quantile.Rd b/man/get_range_from_quantile.Rd new file mode 100644 index 000000000..eca15af36 --- /dev/null +++ b/man/get_range_from_quantile.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_data_handling.R +\name{get_range_from_quantile} +\alias{get_range_from_quantile} +\title{Get Range Belonging to a Quantile} +\usage{ +get_range_from_quantile(quantile) +} +\arguments{ +\item{quantile}{a numeric vector of quantile levels of size N} +} +\value{ +a numeric vector of interval ranges of size N +} +\description{ +Every quantile can be thought of either as the lower or the +upper bound of a symmetric central prediction interval. This helper function +returns the range of the central prediction interval to which the quantile +belongs. + +Due to numeric instability that sometimes occurred in the past, ranges are +rounded to 10 decimal places. This is not a problem for the vast majority of +use cases, but it is something to be aware of. +} +\keyword{internal} diff --git a/man/get_scoringutils_attributes.Rd b/man/get_scoringutils_attributes.Rd new file mode 100644 index 000000000..5f2770a6a --- /dev/null +++ b/man/get_scoringutils_attributes.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{get_scoringutils_attributes} +\alias{get_scoringutils_attributes} +\title{Get a list of all attributes of a scoringutils object} +\usage{ +get_scoringutils_attributes(object) +} +\arguments{ +\item{object}{A object of class \code{forecast_}} +} +\value{ +A named list with the attributes of that object. +} +\description{ +Get a list of all attributes of a scoringutils object +} +\keyword{internal} diff --git a/man/get_target_type.Rd b/man/get_target_type.Rd deleted file mode 100644 index ae8ee5fea..000000000 --- a/man/get_target_type.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{get_target_type} -\alias{get_target_type} -\title{Get type of the target true values of a forecast} -\usage{ -get_target_type(data) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). 
- -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} -} -\value{ -Character vector of length one with either "binary", "integer", or -"continuous" -} -\description{ -Internal helper function to get the type of the target -true values of a forecast. That is inferred based on the which columns -are present in the data. -} -\keyword{internal} diff --git a/man/get_type.Rd b/man/get_type.Rd new file mode 100644 index 000000000..596cf4c65 --- /dev/null +++ b/man/get_type.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{get_type} +\alias{get_type} +\title{Get type of a vector or matrix of observed values or predictions} +\usage{ +get_type(x) +} +\arguments{ +\item{x}{Input used to get the type.} +} +\value{ +Character vector of length one with either "classification", +"integer", or "continuous" +} +\description{ +Internal helper function to get the type of a vector (usually +of observed or predicted values). The function checks whether the input is +a factor, or else whether it is integer (or can be coerced to integer) or +whether it's continuous. +} +\keyword{internal_input_check} diff --git a/man/infer_rel_skill_metric.Rd b/man/infer_rel_skill_metric.Rd index 82413818b..323cd84fd 100644 --- a/man/infer_rel_skill_metric.Rd +++ b/man/infer_rel_skill_metric.Rd @@ -13,5 +13,6 @@ infer_rel_skill_metric(scores) Helper function to infer the metric for which pairwise comparisons shall be made. The function simply checks the names of the available columns and chooses the most widely used metric. +Used in \code{\link[=pairwise_comparison]{pairwise_comparison()}}. } \keyword{internal} diff --git a/man/interval_coverage.Rd b/man/interval_coverage.Rd new file mode 100644 index 000000000..681a07581 --- /dev/null +++ b/man/interval_coverage.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R +\name{interval_coverage} +\alias{interval_coverage} +\title{Interval Coverage (For Quantile-Based Forecasts)} +\usage{ +interval_coverage(observed, predicted, quantile, range = 50) +} +\arguments{ +\item{observed}{numeric vector of size n with the observed values} + +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{vector with quantile levels of size N} + +\item{range}{A single number with the range of the prediction interval in +percent (e.g. 50 for a 50\% prediction interval) for which you want to compute +interval coverage.} +} +\value{ +A vector of length n with elements either TRUE, +if the observed value is within the corresponding prediction interval, and +FALSE otherwise. +} +\description{ +Check whether the observed value is within a given central +prediction interval. The prediction interval is defined by a lower and an +upper bound formed by a pair of predictive quantiles. For example, a 50\% +prediction interval is formed by the 0.25 and 0.75 quantiles of the +predictive distribution. 
+}
+\examples{
+observed <- c(1, -15, 22)
+predicted <- rbind(
+ c(-1, 0, 1, 2, 3),
+ c(-2, 1, 2, 2, 4),
+ c(-2, 0, 3, 3, 4)
+)
+quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+interval_coverage(observed, predicted, quantile)
+}
+\keyword{metric} diff --git a/man/interval_coverage_deviation.Rd b/man/interval_coverage_deviation.Rd new file mode 100644 index 000000000..c82825f12 --- /dev/null +++ b/man/interval_coverage_deviation.Rd @@ -0,0 +1,77 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/metrics-quantile.R
+\name{interval_coverage_deviation}
+\alias{interval_coverage_deviation}
+\title{Interval Coverage Deviation (For Quantile-Based Forecasts)}
+\usage{
+interval_coverage_deviation(observed, predicted, quantile)
+}
+\arguments{
+\item{observed}{numeric vector of size n with the observed values}
+
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
+
+\item{quantile}{vector with quantile levels of size N}
+}
+\value{
+A numeric vector of length n with the interval coverage deviation
+for each forecast (comprising one or multiple prediction intervals).
+}
+\description{
+Check the agreement between desired and actual interval coverage
+of a forecast.
+
+The function is similar to \code{\link[=interval_coverage]{interval_coverage()}},
+but takes all provided prediction intervals into account and
+compares nominal interval coverage (i.e. the desired interval coverage) with
+the actual observed interval coverage.
+
+A central symmetric prediction interval is defined by a lower and an
+upper bound formed by a pair of predictive quantiles. For example, a 50\%
+prediction interval is formed by the 0.25 and 0.75 quantiles of the
+predictive distribution. Ideally, a forecaster should aim to cover about
+50\% of all observed values with their 50\% prediction intervals, 90\% of all
+observed values with their 90\% prediction intervals, and so on.
+
+For every prediction interval, the deviation is computed as the difference
+between the observed interval coverage and the nominal interval coverage.
+For a single observed value and a single prediction interval, coverage is
+always either 0 or 1 (\code{FALSE} or \code{TRUE}). This is not the case for a single
+observed value and multiple prediction intervals, but comparing nominal
+(desired) coverage and actual coverage for a single observation is still of
+limited use. Coverage deviation therefore only becomes meaningful as a
+metric when averaged across multiple observations.
+
+Positive values of interval coverage deviation are an indication of
+underconfidence, i.e. the forecaster could likely have issued a narrower
+forecast. Negative values are an indication of overconfidence, i.e. the
+forecasts were too narrow.
+
+\deqn{
+\textrm{interval coverage deviation} =
+\mathbf{1}(\textrm{observed value falls within interval}) -
+\textrm{nominal interval coverage}
+}{
+interval coverage deviation =
+1(observed value falls within interval) - nominal interval coverage
+}
+The interval coverage deviation is then averaged across all prediction
+intervals. The median is ignored when computing coverage deviation.
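To illustrate the averaging with a small worked example (an editor's sketch, not part of the generated documentation): suppose a single observed value falls inside the 50\% interval but outside the 90\% interval.

  coverage <- c(1, 0)       # inside the 50\% interval, outside the 90\% interval
  nominal  <- c(0.5, 0.9)   # nominal coverage of the two intervals
  mean(coverage - nominal)  # -0.2, i.e. the forecast looks overconfident (too narrow)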
+} +\examples{ +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +interval_coverage_deviation(observed, predicted, quantile) +} +\keyword{metric} diff --git a/man/interval_score.Rd b/man/interval_score.Rd index 8520393ec..3616eabfb 100644 --- a/man/interval_score.Rd +++ b/man/interval_score.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/interval_score.R +% Please edit documentation in R/metrics-range.R \name{interval_score} \alias{interval_score} \title{Interval Score} \usage{ interval_score( - true_values, + observed, lower, upper, interval_range, @@ -14,7 +14,7 @@ interval_score( ) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} \item{lower}{vector of size n with the prediction for the lower quantile of the given range} @@ -52,14 +52,14 @@ The score is computed as \deqn{ \textrm{score} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower} - - \textrm{true\_value}) * -\mathbf{1}(\textrm{true\_value} < \textrm{lower}) + -\frac{2}{\alpha}(\textrm{true\_value} - \textrm{upper}) * -\mathbf{1}(\textrm{true\_value} > \textrm{upper}) + - \textrm{observed}) * +\mathbf{1}(\textrm{observed} < \textrm{lower}) + +\frac{2}{\alpha}(\textrm{observed} - \textrm{upper}) * +\mathbf{1}(\textrm{observed} > \textrm{upper}) }{ -score = (upper - lower) + 2/alpha * (lower - true_value) * -1(true_values < lower) + 2/alpha * (true_value - upper) * -1(true_value > upper) +score = (upper - lower) + 2/alpha * (lower - observed) * +1(observed < lower) + 2/alpha * (observed - upper) * +1(observed > upper) } where \eqn{\mathbf{1}()}{1()} is the indicator function and indicates how much is outside the prediction interval. @@ -76,25 +76,27 @@ as the lower bound and the 0.7 quantile as the upper). Non-symmetric quantiles can be scored using the function \code{\link[=quantile_score]{quantile_score()}}. 
} \examples{ -true_values <- rnorm(30, mean = 1:30) +observed <- rnorm(30, mean = 1:30) interval_range <- rep(90, 30) alpha <- (100 - interval_range) / 100 lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) -upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) +upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 11:40)) -interval_score( - true_values = true_values, +scoringutils:::interval_score( + observed = observed, lower = lower, upper = upper, interval_range = interval_range ) # gives a warning, as the interval_range should likely be 50 instead of 0.5 -interval_score(true_value = 4, upper = 2, lower = 8, interval_range = 0.5) +scoringutils:::interval_score( + observed = 4, upper = 8, lower = 2, interval_range = 0.5 +) # example with missing values and separate results -interval_score( - true_values = c(true_values, NA), +scoringutils:::interval_score( + observed = c(observed, NA), lower = c(lower, NA), upper = c(NA, upper), separate_results = TRUE, diff --git a/man/is_scoringutils_check.Rd b/man/is_scoringutils_check.Rd deleted file mode 100644 index fa51f9e18..000000000 --- a/man/is_scoringutils_check.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{is_scoringutils_check} -\alias{is_scoringutils_check} -\title{Check whether object has been checked with check_forecasts()} -\usage{ -is_scoringutils_check(data) -} -\arguments{ -\item{data}{An object of class \code{scoringutils_check()} as produced by -\code{\link[=check_forecasts]{check_forecasts()}}.} -} -\value{ -Logical, either TRUE or FALSE -} -\description{ -Helper function to determine whether an object has been checked -by and passed \code{\link[=check_forecasts]{check_forecasts()}}. -} -\keyword{internal} diff --git a/man/log_shift.Rd b/man/log_shift.Rd index 66e0ffb53..d72667305 100644 --- a/man/log_shift.Rd +++ b/man/log_shift.Rd @@ -30,7 +30,7 @@ log_shift(1:10) log_shift(0:9, offset = 1) transform_forecasts( - example_quantile[true_value > 0, ], + example_quantile[observed > 0, ], fun = log_shift, offset = 1 ) diff --git a/man/logs_binary.Rd b/man/logs_binary.Rd deleted file mode 100644 index 0cc1a03ff..000000000 --- a/man/logs_binary.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/log_score.R -\name{logs_binary} -\alias{logs_binary} -\title{Log Score for Binary outcomes} -\usage{ -logs_binary(true_values, predictions) -} -\arguments{ -\item{true_values}{A vector with the true observed values of size n with -all values equal to either 0 or 1} - -\item{predictions}{A vector with a predicted probability -that true_value = 1.} -} -\value{ -A numeric value with the Log Score, i.e. the mean squared -error of the given probability forecasts -} -\description{ -Computes the Log Score for probabilistic forecasts of binary outcomes. -} -\details{ -The Log Score is a proper score rule suited to assessing the accuracy of -probabilistic binary predictions. The outcomes can be either 0 or 1, -the predictions must be a probability that the true outcome will be 1. - -The Log Score is then computed as the negative logarithm of the probability -assigned to the true outcome. Reporting the negative logarithm means that -smaller values are better. 
-} -\examples{ -true_values <- sample(c(0, 1), size = 30, replace = TRUE) -predictions <- runif(n = 30, min = 0, max = 1) -logs_binary(true_values, predictions) -} -\keyword{metric} diff --git a/man/logs_sample.Rd b/man/logs_sample.Rd index 29b9b9392..aebb6ac5c 100644 --- a/man/logs_sample.Rd +++ b/man/logs_sample.Rd @@ -1,17 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scoringRules_wrappers.R +% Please edit documentation in R/metrics-sample.R \name{logs_sample} \alias{logs_sample} \title{Logarithmic score} \usage{ -logs_sample(true_values, predictions) +logs_sample(observed, predicted, ...) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} + +\item{...}{additional arguments passed to +\link[scoringRules:scores_sample_univ]{logs_sample()} from the scoringRules package.} } \value{ vector with the scoring values @@ -28,9 +31,9 @@ integer valued probability distributions. See the scoringRules package for more details. } \examples{ -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -logs_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +logs_sample(observed, predicted) } \references{ Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic diff --git a/man/mad_sample.Rd b/man/mad_sample.Rd index 032bf187d..7594caa8e 100644 --- a/man/mad_sample.Rd +++ b/man/mad_sample.Rd @@ -1,15 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sharpness.R +% Please edit documentation in R/metrics-sample.R \name{mad_sample} \alias{mad_sample} \title{Determine dispersion of a probabilistic forecast} \usage{ -mad_sample(predictions) +mad_sample(observed = NULL, predicted, ...) } \arguments{ -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{observed}{place holder, argument will be ignored and exists only for +consistency with other scoring functions. The output does not depend on +any observed values.} + +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} + +\item{...}{additional arguments passed to \link[stats:mad]{mad()}.} } \value{ vector with dispersion values @@ -23,14 +29,14 @@ narrow range and dispersion is the lack thereof. It is a data-independent measure, and is purely a feature of the forecasts themselves. -Dispersion of predictive samples corresponding to one single true value is +Dispersion of predictive samples corresponding to one single observed value is measured as the normalised median of the absolute deviation from the median of the predictive samples. For details, see \link[stats:mad]{mad()} and the explanations given in Funk et al. 
(2019) } \examples{ -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -mad_sample(predictions) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +mad_sample(predicted = predicted) } \references{ Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ (2019) diff --git a/man/make_NA.Rd b/man/make_NA.Rd index be4cb3afa..1b1814c0e 100644 --- a/man/make_NA.Rd +++ b/man/make_NA.Rd @@ -10,35 +10,10 @@ make_NA(data = NULL, what = c("truth", "forecast", "both"), ...) make_na(data = NULL, what = c("truth", "forecast", "both"), ...) } \arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} +\item{data}{A data.frame or data.table with predicted and observed values.} \item{what}{character vector that determines which values should be turned -into \code{NA}. If \code{what = "truth"}, values in the column 'true_value' will be +into \code{NA}. If \code{what = "truth"}, values in the column 'observed' will be turned into \code{NA}. If \code{what = "forecast"}, values in the column 'prediction' will be turned into \code{NA}. If \code{what = "both"}, values in both column will be turned into \code{NA}.} diff --git a/man/new_forecast.Rd b/man/new_forecast.Rd new file mode 100644 index 000000000..772c5e1b4 --- /dev/null +++ b/man/new_forecast.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{new_forecast} +\alias{new_forecast} +\title{Class constructor for scoringutils objects} +\usage{ +new_forecast(data, classname) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} + +\item{classname}{name of the class to be created} +} +\value{ +An object of the class indicated by \code{classname} +} +\description{ +Construct a class based on a data.frame or similar. 
The constructor +\itemize{ +\item coerces the data into a data.table +\item makes sure that a column called \code{model} exists and if not creates one +\item assigns a class +} +} +\keyword{internal} diff --git a/man/pairwise_comparison.Rd b/man/pairwise_comparison.Rd index d45442b7e..f95b29bbc 100644 --- a/man/pairwise_comparison.Rd +++ b/man/pairwise_comparison.Rd @@ -25,9 +25,8 @@ splitting) and the pairwise comparisons will be computed separately for the split data.frames.} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", -"crps", or "brier_score" will be selected where available. -See \code{\link[=available_metrics]{available_metrics()}} for available metrics.} +comparison on. The default is "auto", meaning that either "wis", +"crps", or "brier_score" will be selected where available.} \item{baseline}{character vector of length one that denotes the baseline model against which to compare other models.} diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index b176523bb..df59b1472 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -10,9 +10,8 @@ pairwise_comparison_one_group(scores, metric, baseline, by, ...) \item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", -"crps", or "brier_score" will be selected where available. -See \code{\link[=available_metrics]{available_metrics()}} for available metrics.} +comparison on. The default is "auto", meaning that either "wis", +"crps", or "brier_score" will be selected where available.} \item{baseline}{character vector of length one that denotes the baseline model against which to compare other models.} diff --git a/man/permutation_test.Rd b/man/permutation_test.Rd index e889ee17e..56a26ecb9 100644 --- a/man/permutation_test.Rd +++ b/man/permutation_test.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/pairwise-comparisons.R \name{permutation_test} \alias{permutation_test} \title{Simple permutation test} @@ -12,13 +12,34 @@ permutation_test( comparison_mode = c("difference", "ratio") ) } +\arguments{ +\item{scores1}{vector of scores to compare against another vector of scores} + +\item{scores2}{A second vector of scores to compare against the first} + +\item{n_permutation}{The number of replications to use for a permutation +test. More replications yield more exact results, but require more +computation.} + +\item{one_sided}{Whether or not to compute a one-sided test. Default is +\code{FALSE},} + +\item{comparison_mode}{How to compute the test statistic for the comparison +of the two scores. Should be either "difference" or "ratio".} +} \value{ p-value of the permutation test } \description{ -#' The implementation of the permutation test follows the +The implementation of the permutation test follows the function \code{permutationTest} from the \code{surveillance} package by Michael Höhle, Andrea Riebler and Michaela Paul. +The function compares two vectors of scores. It computes the mean of each +vector independently and then takes either the difference or the ratio of +the two. This observed difference or ratio is compared against the same +test statistic based on permutations of the original data. 
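For intuition, a rough base R sketch of the idea (an editor's illustration, not the internal implementation; it assumes paired scores, uses the "difference" comparison mode, and permutes by randomly swapping labels within pairs):

  set.seed(123)
  scores1 <- rnorm(50, mean = 1.0)
  scores2 <- rnorm(50, mean = 1.2)
  obs <- mean(scores1) - mean(scores2)
  perm <- replicate(999, {
    swap <- runif(length(scores1)) < 0.5
    mean(ifelse(swap, scores2, scores1)) - mean(ifelse(swap, scores1, scores2))
  })
  (sum(abs(perm) >= abs(obs)) + 1) / (length(perm) + 1)  # two-sided p-value estimate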
+ +Used in \code{\link[=pairwise_comparison]{pairwise_comparison()}}. } \keyword{internal} diff --git a/man/pit.Rd b/man/pit.Rd index bae1fc9d1..6e73c7da8 100644 --- a/man/pit.Rd +++ b/man/pit.Rd @@ -7,8 +7,8 @@ pit(data, by, n_replicates = 100) } \arguments{ -\item{data}{a data.frame with the following columns: \code{true_value}, -\code{prediction}, \code{sample}.} +\item{data}{a data.frame with the following columns: \code{observed}, +\code{predicted}, \code{sample_id}.} \item{by}{Character vector with the columns according to which the PIT values shall be grouped. If you e.g. have the columns 'model' and diff --git a/man/pit_sample.Rd b/man/pit_sample.Rd index 335b66574..bfcf1bcc6 100644 --- a/man/pit_sample.Rd +++ b/man/pit_sample.Rd @@ -4,23 +4,23 @@ \alias{pit_sample} \title{Probability Integral Transformation (sample-based version)} \usage{ -pit_sample(true_values, predictions, n_replicates = 100) +pit_sample(observed, predicted, n_replicates = 100) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} \item{n_replicates}{the number of draws for the randomised PIT for integer predictions.} } \value{ A vector with PIT-values. For continuous forecasts, the vector will -correspond to the length of \code{true_values}. For integer forecasts, a +correspond to the length of \code{observed}. For integer forecasts, a randomised PIT will be returned of length -\code{length(true_values) * n_replicates} +\code{length(observed) * n_replicates} } \description{ Uses a Probability Integral Transformation (PIT) (or a @@ -81,15 +81,15 @@ PIT is a necessary but not sufficient condition of calibration. 
} ## continuous predictions -true_values <- rnorm(20, mean = 1:20) -predictions <- replicate(100, rnorm(n = 20, mean = 1:20)) -pit <- pit_sample(true_values, predictions) +observed <- rnorm(20, mean = 1:20) +predicted <- replicate(100, rnorm(n = 20, mean = 1:20)) +pit <- pit_sample(observed, predicted) plot_pit(pit) ## integer predictions -true_values <- rpois(50, lambda = 1:50) -predictions <- replicate(2000, rpois(n = 50, lambda = 1:50)) -pit <- pit_sample(true_values, predictions, n_replicates = 30) +observed <- rpois(50, lambda = 1:50) +predicted <- replicate(2000, rpois(n = 50, lambda = 1:50)) +pit <- pit_sample(observed, predicted, n_replicates = 30) plot_pit(pit) } \references{ diff --git a/man/plot_correlation.Rd b/man/plot_correlation.Rd index 390c90f20..0bade90cb 100644 --- a/man/plot_correlation.Rd +++ b/man/plot_correlation.Rd @@ -20,7 +20,8 @@ Plots a heatmap of correlations between different metrics \examples{ scores <- score(example_quantile) correlations <- correlation( - summarise_scores(scores) + summarise_scores(scores), + digits = 2 ) plot_correlation(correlations) } diff --git a/man/plot_avail_forecasts.Rd b/man/plot_forecast_counts.Rd similarity index 54% rename from man/plot_avail_forecasts.Rd rename to man/plot_forecast_counts.Rd index 6b9871afd..c5285c9dd 100644 --- a/man/plot_avail_forecasts.Rd +++ b/man/plot_forecast_counts.Rd @@ -1,32 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plot.R -\name{plot_avail_forecasts} -\alias{plot_avail_forecasts} +\name{plot_forecast_counts} +\alias{plot_forecast_counts} \title{Visualise Where Forecasts Are Available} \usage{ -plot_avail_forecasts( - avail_forecasts, +plot_forecast_counts( + forecast_counts, + x, y = "model", - x = "forecast_date", - make_x_factor = TRUE, - show_numbers = TRUE + x_as_factor = TRUE, + show_counts = TRUE ) } \arguments{ -\item{avail_forecasts}{data.frame with a column called \verb{Number forecasts} as -produced by \code{\link[=avail_forecasts]{avail_forecasts()}}} +\item{forecast_counts}{a data.table (or similar) with a column \code{count} +holding forecast counts, as produced by \code{\link[=get_forecast_counts]{get_forecast_counts()}}} + +\item{x}{character vector of length one that denotes the name of the column +to appear on the x-axis of the plot.} \item{y}{character vector of length one that denotes the name of the column to appear on the y-axis of the plot. Default is "model".} -\item{x}{character vector of length one that denotes the name of the column -to appear on the x-axis of the plot. Default is "forecast_date".} - -\item{make_x_factor}{logical (default is TRUE). Whether or not to convert +\item{x_as_factor}{logical (default is TRUE). Whether or not to convert the variable on the x-axis to a factor. This has an effect e.g. 
if dates are shown on the x-axis.} -\item{show_numbers}{logical (default is \code{TRUE}) that indicates whether +\item{show_counts}{logical (default is \code{TRUE}) that indicates whether or not to show the actual count numbers on the plot} } \value{ @@ -37,11 +37,11 @@ Visualise Where Forecasts Are Available } \examples{ library(ggplot2) -avail_forecasts <- avail_forecasts( +forecast_counts <- get_forecast_counts( example_quantile, by = c("model", "target_type", "target_end_date") ) -plot_avail_forecasts( - avail_forecasts, x = "target_end_date", show_numbers = FALSE +plot_forecast_counts( + forecast_counts, x = "target_end_date", show_counts = FALSE ) + facet_wrap("target_type") } diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd index 8b1aac549..837ef4243 100644 --- a/man/plot_heatmap.Rd +++ b/man/plot_heatmap.Rd @@ -29,7 +29,7 @@ different locations. } \examples{ scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +scores <- summarise_scores(scores, by = c("model", "target_type")) plot_heatmap(scores, x = "target_type", metric = "bias") } diff --git a/man/plot_interval_coverage.Rd b/man/plot_interval_coverage.Rd index 8c95634ac..315a6972e 100644 --- a/man/plot_interval_coverage.Rd +++ b/man/plot_interval_coverage.Rd @@ -24,7 +24,7 @@ Plot interval coverage \dontshow{ data.table::setDTthreads(2) # restricts number of cores used on CRAN } -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "range")) -plot_interval_coverage(scores) +data_coverage <- add_coverage(example_quantile) +summarised <- summarise_scores(data_coverage, by = c("model", "range")) +plot_interval_coverage(summarised) } diff --git a/man/plot_pit.Rd b/man/plot_pit.Rd index af95f421a..7aafebed2 100644 --- a/man/plot_pit.Rd +++ b/man/plot_pit.Rd @@ -37,9 +37,9 @@ visually check whether a uniform distribution seems likely. 
} # PIT histogram in vector based format -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -pit <- pit_sample(true_values, predictions) +observed <- rnorm(30, mean = 1:30) +predicted <- replicate(200, rnorm(n = 30, mean = 1:30)) +pit <- pit_sample(observed, predicted) plot_pit(pit) # quantile-based pit diff --git a/man/plot_quantile_coverage.Rd b/man/plot_quantile_coverage.Rd index 2e6ef489e..f7be4c318 100644 --- a/man/plot_quantile_coverage.Rd +++ b/man/plot_quantile_coverage.Rd @@ -21,7 +21,7 @@ ggplot object with a plot of interval coverage Plot quantile coverage } \examples{ -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "quantile")) -plot_quantile_coverage(scores) +data_coverage <- add_coverage(example_quantile) +summarised <- summarise_scores(data_coverage, by = c("model", "quantile")) +plot_quantile_coverage(summarised) } diff --git a/man/plot_ranges.Rd b/man/plot_ranges.Rd index a4a999ff2..0635de44c 100644 --- a/man/plot_ranges.Rd +++ b/man/plot_ranges.Rd @@ -4,7 +4,7 @@ \alias{plot_ranges} \title{Plot Metrics by Range of the Prediction Interval} \usage{ -plot_ranges(scores, y = "interval_score", x = "model", colour = "range") +plot_ranges(scores, y = "wis", x = "model", colour = "range") } \arguments{ \item{scores}{A data.frame of scores based on quantile forecasts as @@ -12,7 +12,7 @@ produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summ in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}} \item{y}{The variable from the scores you want to show on the y-Axis. -This could be something like "interval_score" (the default) or "dispersion"} +This could be something like "wis" (the default) or "dispersion"} \item{x}{The variable from the scores you want to show on the x-Axis. Usually this will be "model"} @@ -31,13 +31,14 @@ sharpness / dispersion changes by range. } \examples{ library(ggplot2) -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "target_type", "range")) - -plot_ranges(scores, x = "model") + +ex <- example_quantile +ex$interval_range <- scoringutils:::get_range_from_quantile(ex$quantile) +scores <- score(ex, metrics = list("wis" = wis)) +scores$range <- scores$interval_range +summarised <- summarise_scores( + scores, + by = c("model", "target_type", "range") +) +plot_ranges(summarised, x = "model") + facet_wrap(~target_type, scales = "free") - -# visualise dispersion instead of interval score -plot_ranges(scores, y = "dispersion", x = "model") + - facet_wrap(~target_type) } diff --git a/man/prediction_is_quantile.Rd b/man/prediction_is_quantile.Rd deleted file mode 100644 index c7dee479e..000000000 --- a/man/prediction_is_quantile.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{prediction_is_quantile} -\alias{prediction_is_quantile} -\title{Check if predictions are quantile forecasts} -\usage{ -prediction_is_quantile(data) -} -\arguments{ -\item{data}{Data frame containing forecast predictions} -} -\value{ -Logical indicating whether predictions are quantile forecasts -} -\description{ -Internal helper function to check if a data frame contains -quantile forecast predictions. This is determined by checking if the -"quantile" column is present. 
-
-}
-\keyword{internal} diff --git a/man/print.scoringutils_check.Rd b/man/print.scoringutils_check.Rd index 19a0e03b3..f411e226c 100644 --- a/man/print.scoringutils_check.Rd +++ b/man/print.scoringutils_check.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/check_forecasts.R
+% Please edit documentation in R/utils.R
\name{print.scoringutils_check}
\alias{print.scoringutils_check}
\title{Print output from \code{check_forecasts()}}
@@ -7,17 +7,17 @@
\method{print}{scoringutils_check}(x, ...)
}
\arguments{
-\item{x}{An object of class 'scoringutils_check' as produced by
-\code{\link[=check_forecasts]{check_forecasts()}}}
+\item{x}{An object of class 'scoringutils_check' as produced by
+\code{check_forecasts()}}
\item{...}{additional arguments (not used here)}
}
\description{
Helper function that prints the output generated by
-\code{\link[=check_forecasts]{check_forecasts()}}
+\code{check_forecasts()}
}
\examples{
-check <- check_forecasts(example_quantile)
+check <- as_forecast(example_quantile)
print(check)
}
\keyword{check-forecasts} diff --git a/man/quantile_score.Rd b/man/quantile_score.Rd index 0fbc09244..0cab0b160 100644 --- a/man/quantile_score.Rd +++ b/man/quantile_score.Rd @@ -1,19 +1,19 @@ % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/interval_score.R
+% Please edit documentation in R/metrics-quantile.R
\name{quantile_score}
\alias{quantile_score}
\title{Quantile Score}
\usage{
-quantile_score(true_values, predictions, quantiles, weigh = TRUE)
+quantile_score(observed, predicted, quantile, weigh = TRUE)
}
\arguments{
-\item{true_values}{A vector with the true observed values of size n}
+\item{observed}{A vector with observed values of size n}
-\item{predictions}{nxN matrix of predictive samples, n (number of rows) being
-the number of data points and N (number of columns) the number of Monte
-Carlo samples. Alternatively, predictions can just be a vector of size n.}
+\item{predicted}{nxN matrix of predictive samples, n (number of rows) being
+the number of data points and N (number of columns) the number of Monte
+Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.}
-\item{quantiles}{vector of size n with the quantile values of the
+\item{quantile}{vector of size n with the quantile levels of the
corresponding predictions.}
\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged
@@ -29,24 +29,24 @@ vector with the scoring values
\description{
Proper Scoring Rule to score quantile predictions. Smaller values are better.
The quantile score is
-closely related to the Interval score (see \code{\link[=interval_score]{interval_score()}}) and is
+closely related to the Interval score (see \code{\link[=wis]{wis()}}) and is
the quantile equivalent that works with single quantiles instead of
central prediction intervals.
}
\examples{
-true_values <- rnorm(10, mean = 1:10)
+observed <- rnorm(10, mean = 1:10)
alpha <- 0.5
lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10))
upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10))
-qs_lower <- quantile_score(true_values,
- predictions = lower,
- quantiles = alpha / 2
+qs_lower <- quantile_score(observed,
+ predicted = lower,
+ quantile = alpha / 2
)
-qs_upper <- quantile_score(true_values,
- predictions = upper,
- quantiles = 1 - alpha / 2
+qs_upper <- quantile_score(observed,
+ predicted = upper,
+ quantile = 1 - alpha / 2
)
interval_score <- (qs_lower + qs_upper) / 2
} diff --git a/man/quantile_to_interval.Rd b/man/quantile_to_interval.Rd new file mode 100644 index 000000000..b61498eee --- /dev/null +++ b/man/quantile_to_interval.Rd @@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils_data_handling.R
+\name{quantile_to_interval}
+\alias{quantile_to_interval}
+\alias{quantile_to_interval.data.frame}
+\alias{quantile_to_interval.numeric}
+\title{Transform From a Quantile Format to an Interval Format}
+\usage{
+quantile_to_interval(...)
+
+\method{quantile_to_interval}{data.frame}(dt, format = "long", keep_quantile_col = FALSE, ...)
+
+\method{quantile_to_interval}{numeric}(observed, predicted, quantile, ...)
+}
+\arguments{
+\item{...}{method arguments}
+
+\item{dt}{a data.table with columns \code{quantile} and \code{predicted}}
+
+\item{format}{the format of the output. Either "long" or "wide". If "long"
+(the default), there will be a column \code{boundary} (with values either "upper"
+or "lower") and a column \code{range} that contains the range of the interval.
+If "wide", there will be a column \code{range} and two columns
+\code{lower} and \code{upper} that contain the lower and upper bounds of the
+prediction interval, respectively.}
+
+\item{keep_quantile_col}{keep the quantile column in the final
+output after transformation (default is FALSE). This only works if
+\code{format = "long"}. If \code{format = "wide"}, the quantile column will always be
+dropped.}
+
+\item{observed}{a numeric vector of observed values of size n}
+
+\item{predicted}{a numeric vector of predicted values of size n x N. If
+\code{observed} is a single number, then \code{predicted} can be a vector of length N}
+
+\item{quantile}{a numeric vector of quantile levels of size N}
+}
+\value{
+\emph{quantile_to_interval.data.frame}:
+a data.frame in an interval format (either "long" or "wide"), with or
+without a quantile column. Rows will not be reordered.
+
+\emph{quantile_to_interval.numeric}:
+a data.frame in a wide interval format with columns \code{forecast_id},
+\code{observed}, \code{lower}, \code{upper}, and \code{range}. The \code{forecast_id} column is a
+unique identifier for each forecast. Rows will be reordered according to
+\code{forecast_id} and \code{range}.
+}
+\description{
+\strong{Quantile format}
+In a quantile format, a prediction is characterised by one or multiple
+predicted values and the corresponding quantile levels. For example, a
+prediction in a quantile format could be represented by the 0.05, 0.25, 0.5,
+0.75 and 0.95 quantiles of the predictive distribution.
+
+\strong{Interval format}
+In the interval format, two quantiles are assumed to form a prediction
+interval. Prediction intervals need to be symmetric around the median and
+are characterised by a lower and an upper bound. The lower bound is defined
+by the lower quantile and the upper bound is defined by the upper quantile.
+A 90\% prediction interval, for example, covers 90\% of the probability mass
+and is defined by the 5\% and 95\% quantiles. A forecast could therefore
+be characterised by one or multiple prediction intervals, e.g. the lower
+and upper bounds of the 50\% and 90\% prediction intervals (corresponding to
+the 0.25 and 0.75 as well as the 0.05 and 0.95 quantiles).
+}
+\keyword{data-handling} diff --git a/man/quantile_to_range_long.Rd b/man/quantile_to_range_long.Rd deleted file mode 100644 index ad214f3e5..000000000 --- a/man/quantile_to_range_long.Rd +++ /dev/null @@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils_data_handling.R
-\name{quantile_to_range_long}
-\alias{quantile_to_range_long}
-\title{Change Data from a Plain Quantile Format to a Long Range Format}
-\usage{
-quantile_to_range_long(data, keep_quantile_col = TRUE)
-}
-\arguments{
-\item{data}{a data.frame in quantile format}
-
-\item{keep_quantile_col}{keep the quantile column in the final
-output after transformation (default is FALSE)}
-}
-\value{
-a data.frame in a long interval range format
-}
-\description{
-Transform data from a format that uses quantiles only to one that uses
-interval ranges to denote quantiles.
-}
-\keyword{internal} diff --git a/man/rules_binary.Rd b/man/rules_binary.Rd new file mode 100644 index 000000000..f5c455e35 --- /dev/null +++ b/man/rules_binary.Rd @@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/default-scoring-rules.R
+\name{rules_binary}
+\alias{rules_binary}
+\title{Scoring Rules for Binary Forecasts}
+\usage{
+rules_binary(select = NULL, exclude = NULL)
+}
+\arguments{
+\item{select}{A character vector of scoring rules to select from the list.
+If \code{select} is \code{NULL} (the default), all possible scoring rules are returned.}
+
+\item{exclude}{A character vector of scoring rules to exclude from the list.
+If \code{select} is not \code{NULL}, this argument is ignored.}
+}
+\value{
+A list of scoring rules.
+}
+\description{
+Helper function that returns a named list of default
+scoring rules suitable for binary forecasts.
+
+The default scoring rules are:
+\itemize{
+\item "brier_score" = \code{\link[=brier_score]{brier_score()}}
+\item "log_score" = \code{\link[=logs_binary]{logs_binary()}}
+}
+}
+\examples{
+rules_binary()
+rules_binary(select = "brier_score")
+rules_binary(exclude = "log_score")
+}
+\keyword{metric} diff --git a/man/rules_point.Rd b/man/rules_point.Rd new file mode 100644 index 000000000..55f562dc9 --- /dev/null +++ b/man/rules_point.Rd @@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/default-scoring-rules.R
+\name{rules_point}
+\alias{rules_point}
+\title{Scoring Rules for Point Forecasts}
+\usage{
+rules_point(select = NULL, exclude = NULL)
+}
+\arguments{
+\item{select}{A character vector of scoring rules to select from the list.
+If \code{select} is \code{NULL} (the default), all possible scoring rules are returned.}
+
+\item{exclude}{A character vector of scoring rules to exclude from the list.
+If \code{select} is not \code{NULL}, this argument is ignored.}
+}
+\value{
+A list of scoring rules.
+}
+\description{
+Helper function that returns a named list of default
+scoring rules suitable for point forecasts.
+
+The default scoring rules are:
+\itemize{
+\item "ae_point" = \link[Metrics:ae]{ae()}
+\item "se_point" = \link[Metrics:se]{se()}
+\item "ape" = \link[Metrics:ape]{ape()}
+}
+}
+\examples{
+rules_point()
+rules_point(select = "ape")
+}
+\keyword{metric} diff --git a/man/rules_quantile.Rd b/man/rules_quantile.Rd new file mode 100644 index 000000000..a20861961 --- /dev/null +++ b/man/rules_quantile.Rd @@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/default-scoring-rules.R
+\name{rules_quantile}
+\alias{rules_quantile}
+\title{Scoring Rules for Quantile-Based Forecasts}
+\usage{
+rules_quantile(select = NULL, exclude = NULL)
+}
+\arguments{
+\item{select}{A character vector of scoring rules to select from the list.
+If \code{select} is \code{NULL} (the default), all possible scoring rules are returned.}
+
+\item{exclude}{A character vector of scoring rules to exclude from the list.
+If \code{select} is not \code{NULL}, this argument is ignored.}
}
+\value{
+A list of scoring rules.
+}
+\description{
+Helper function that returns a named list of default
+scoring rules suitable for forecasts in a quantile-based format.
+
+The default scoring rules are:
+\itemize{
+\item "wis" = \link{wis}
+\item "overprediction" = \code{\link[=overprediction]{overprediction()}}
+\item "underprediction" = \code{\link[=underprediction]{underprediction()}}
+\item "dispersion" = \code{\link[=dispersion]{dispersion()}}
+\item "bias" = \code{\link[=bias_quantile]{bias_quantile()}}
+\item "interval_coverage_50" = \code{\link[=interval_coverage]{interval_coverage()}}
+\item "interval_coverage_90" = function(...) \{
+run_safely(..., range = 90, fun = \link{interval_coverage})
+\}
+\item "interval_coverage_deviation" = \code{\link[=interval_coverage_deviation]{interval_coverage_deviation()}}
+\item "ae_median" = \code{\link[=ae_median_quantile]{ae_median_quantile()}}
+}
+
+Note: The \code{interval_coverage_90} scoring rule is created as a wrapper around
+\code{\link[=interval_coverage]{interval_coverage()}}, making use of the function \code{\link[=run_safely]{run_safely()}}.
+This construct allows the function to deal with arbitrary arguments in \code{...},
+while making sure that only those that \code{\link[=interval_coverage]{interval_coverage()}} can
+accept get passed on to it. \code{range = 90} is set in the function definition,
+as passing an argument \code{range = 90} to \code{\link[=score]{score()}} would mean it would also
+get passed to \code{interval_coverage_50}.
+}
+\examples{
+rules_quantile()
+rules_quantile(select = "wis")
+}
+\keyword{metric} diff --git a/man/rules_sample.Rd b/man/rules_sample.Rd new file mode 100644 index 000000000..0a6399beb --- /dev/null +++ b/man/rules_sample.Rd @@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/default-scoring-rules.R
+\name{rules_sample}
+\alias{rules_sample}
+\title{Scoring Rules for Sample-Based Forecasts}
+\usage{
+rules_sample(select = NULL, exclude = NULL)
+}
+\arguments{
+\item{select}{A character vector of scoring rules to select from the list.
+If \code{select} is \code{NULL} (the default), all possible scoring rules are returned.}
+
+\item{exclude}{A character vector of scoring rules to exclude from the list.
+If \code{select} is not \code{NULL}, this argument is ignored.}
+}
+\value{
+A list of scoring rules.
+} +\description{ +Helper function that returns a named list of default +scoring rules suitable for forecasts in a sample-based format + +The default scoring rules are: +\itemize{ +\item "mad" = \code{\link[=mad_sample]{mad_sample()}} +\item "bias" = \code{\link[=bias_sample]{bias_sample()}} +\item "dss" = \code{\link[=dss_sample]{dss_sample()}} +\item "crps" = \code{\link[=crps_sample]{crps_sample()}} +\item "log_score" = \code{\link[=logs_sample]{logs_sample()}} +\item "mad" = \code{\link[=mad_sample]{mad_sample()}} +\item "ae_median" = \code{\link[=ae_median_sample]{ae_median_sample()}} +\item "se_mean" = \code{\link[=se_mean_sample]{se_mean_sample()}} +} +} +\examples{ +rules_sample() +rules_sample(select = "mad") +} +\keyword{metric} diff --git a/man/run_safely.Rd b/man/run_safely.Rd new file mode 100644 index 000000000..a4f809040 --- /dev/null +++ b/man/run_safely.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{run_safely} +\alias{run_safely} +\title{Run a function safely} +\usage{ +run_safely(..., fun) +} +\arguments{ +\item{...}{Arguments to pass to \code{fun}} + +\item{fun}{A function to execute} +} +\value{ +The result of \code{fun} or \code{NULL} if \code{fun} errors +} +\description{ +This is a wrapper function designed to run a function safely +when it is not completely clear what arguments could be passed to the +function. + +All named arguments in \code{...} that are not accepted by \code{fun} are removed. +All unnamed arguments are passed on to the function. In case \code{fun} errors, +the error will be converted to a warning and \code{run_safely} returns \code{NULL}. + +\code{run_safely} can be useful when constructing functions to be used as +metrics in \code{\link[=score]{score()}}. +} +\examples{ +f <- function(x) {x} +run_safely(2, fun = f) +run_safely(2, y = 3, fun = f) +run_safely(fun = f) +run_safely(y = 3, fun = f) +} +\keyword{scoring} diff --git a/man/score.Rd b/man/score.Rd index 4804efcd0..79edbb24d 100644 --- a/man/score.Rd +++ b/man/score.Rd @@ -2,92 +2,127 @@ % Please edit documentation in R/score.R \name{score} \alias{score} -\title{Evaluate forecasts} +\alias{score.default} +\alias{score.forecast_binary} +\alias{score.forecast_point} +\alias{score.forecast_sample} +\alias{score.forecast_quantile} +\title{Evaluate forecasts in a data.frame format} \usage{ -score(data, metrics = NULL, ...) +score(data, ...) + +\method{score}{default}(data, ...) + +\method{score}{forecast_binary}(data, metrics = rules_binary(), ...) + +\method{score}{forecast_point}(data, metrics = rules_point(), ...) + +\method{score}{forecast_sample}(data, metrics = rules_sample(), ...) + +\method{score}{forecast_quantile}(data, metrics = rules_quantile(), ...) } \arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. 
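The \code{run_safely()} helper documented above is what makes wrappers such as the \code{interval_coverage_90} rule in \code{rules_quantile()} work: named arguments that the wrapped function does not accept are dropped rather than causing an error. A minimal sketch of that pattern, assuming the development version of \code{scoringutils} from this changeset (\code{toy_metric} and its arguments are invented for illustration):

library(scoringutils)

# a toy metric that only knows about `observed` and `predicted`
toy_metric <- function(observed, predicted) {
  abs(observed - predicted)
}

# wrap it so that extra arguments passed along by score() (e.g. `quantile`)
# are silently discarded instead of triggering an "unused argument" error
toy_metric_safe <- function(...) {
  run_safely(..., fun = toy_metric)
}

toy_metric_safe(observed = 3, predicted = 5, quantile = 0.5)
# behaves like toy_metric(observed = 3, predicted = 5)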
Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: +\item{data}{A data.frame or data.table with predicted and observed values.} + +\item{...}{additional arguments} + +\item{metrics}{A named list of scoring functions. Names will be used as +column names in the output. See \code{\link[=rules_point]{rules_point()}}, \code{\link[=rules_binary]{rules_binary()}}, +\code{\link[=rules_quantile]{rules_quantile()}}, and \code{\link[=rules_sample]{rules_sample()}} for more information on the +default metrics used.} +} +\value{ +A data.table with unsummarised scores. This will generally be +one score per forecast (as defined by the unit of a single forecast). + +For quantile-based forecasts, one score per quantile will be returned +instead. This is done as scores can be computed and may be of interest +for individual quantiles. You can call \code{\link[=summarise_scores]{summarise_scores()}}) on the +unsummarised scores to obtain one score per forecast unit for quantile-based +forecasts. +} +\description{ +\code{score()} applies a selection of scoring metrics to a data.frame +of forecasts. It is the workhorse of the \code{scoringutils} package. +\code{score()} is a generic that dispatches to different methods depending on the +class of the input data. + +We recommend that users call \code{\link[=as_forecast]{as_forecast()}} prior to calling \code{score()} to +validate the input data and convert it to a forecast object (though +\code{score.default()} will do this if it hasn't happened before). +See below for more information on forecast types and input formats. +For additional help and examples, check out the \href{https://epiforecasts.io/scoringutils/articles/scoringutils.html}{Getting Started Vignette} as +well as the paper \href{https://arxiv.org/abs/2205.07090}{Evaluating Forecasts with scoringutils in R}. +} +\section{Forecast types and input format}{ +Various different forecast types / forecast formats are supported. At the +moment, those are \itemize{ -\item \code{quantile}: quantile to which the prediction corresponds +\item point forecasts +\item binary forecasts ("soft binary classification") +\item Probabilistic forecasts in a quantile-based format (a forecast is +represented as a set of predictive quantiles) +\item Probabilistic forecasts in a sample-based format (a forecast is represented +as a set of predictive samples) } -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). +Forecast types are determined based on the columns present in the input data. -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} +\emph{Point forecasts} require a column \code{observed} of type numeric and a column +\code{predicted} of type numeric. -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. 
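As a concrete illustration of the new \code{metrics} argument shown in the usage above, the following sketch scores the bundled binary example data with only the Brier score. It assumes the development API from this changeset (\code{as_forecast()}, \code{rules_binary()}, \code{summarise_scores()}):

library(scoringutils)

scores <- example_binary |>
  as_forecast() |>
  score(metrics = rules_binary(select = "brier_score"))

# one summarised Brier score per model
summarise_scores(scores, by = "model")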
For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.} +\emph{Binary forecasts} require a column \code{observed} of type factor with exactly +two levels and a column \code{predicted} of type numeric with probabilities, +corresponding to the probability that \code{observed} is equal to the second +factor level. See details \link[=brier_score]{here} for more information. -\item{...}{additional parameters passed down to \code{\link[=score_quantile]{score_quantile()}} (internal -function used for scoring forecasts in a quantile-based format).} -} -\value{ -A data.table with unsummarised scores. There will be one score per -quantile or sample, which is usually not desired, so you should almost -always run \code{\link[=summarise_scores]{summarise_scores()}} on the unsummarised scores. +\emph{Quantile-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{quantile} of type numeric +with quantile-levels (between 0 and 1). + +\emph{Sample-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{sample_id} of type +numeric with sample indices. + +For more information see the vignettes and the example data +(\link{example_quantile}, \link{example_continuous}, \link{example_integer}, +\code{\link[=example_point]{example_point()}}, and \link{example_binary}). } -\description{ -This function allows automatic scoring of forecasts using a -range of metrics. For most users it will be the workhorse for -scoring forecasts as it wraps the lower level functions package functions. -However, these functions are also available if you wish to make use of them -independently. - -A range of forecasts formats are supported, including quantile-based, -sample-based, binary forecasts. Prior to scoring, users may wish to make use -of \code{\link[=check_forecasts]{check_forecasts()}} to ensure that the input data is in a supported -format though this will also be run internally by \code{\link[=score]{score()}}. Examples for -each format are also provided (see the documentation for \code{data} below or in -\code{\link[=check_forecasts]{check_forecasts()}}). - -Each format has a set of required columns (see below). Additional columns may -be present to indicate a grouping of forecasts. For example, we could have -forecasts made by different models in various locations at different time -points, each for several weeks into the future. It is important, that there -are only columns present which are relevant in order to group forecasts. -A combination of different columns should uniquely define the -\emph{unit of a single forecast}, meaning that a single forecast is defined by the -values in the other columns. Adding additional unrelated columns may alter -results. - -To obtain a quick overview of the currently supported evaluation metrics, -have a look at the \link{metrics} data included in the package. The column -\code{metrics$Name} gives an overview of all available metric names that can be -computed. If interested in an unsupported metric please open a \href{https://github.com/epiforecasts/scoringutils/issues}{feature request} or consider -contributing a pull request. 
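As a worked example of the column requirements listed above, this sketch builds a minimal quantile-based forecast by hand. The column names follow this changeset (\code{observed}, \code{predicted}, \code{quantile}); the model name, location, dates and values are invented:

library(data.table)

toy_quantile <- data.table(
  model = "toy-model",
  location = "DE",
  target_end_date = as.Date("2021-01-02"),
  observed = 12,                   # observed value, repeated for each quantile row
  quantile = c(0.25, 0.5, 0.75),   # quantile levels between 0 and 1
  predicted = c(10, 12, 15)        # one predictive quantile per row
)

# with the development version of scoringutils, this could then be
# validated and scored via as_forecast(toy_quantile) |> score()
toy_quantile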
-
-For additional help and examples, check out the \href{https://epiforecasts.io/scoringutils/articles/scoringutils.html}{Getting Started Vignette}
-as well as the paper \href{https://arxiv.org/abs/2205.07090}{Evaluating Forecasts with scoringutils in R}.
+
+\section{Forecast unit}{
+In order to score forecasts, \code{scoringutils} needs to know which of the rows
+of the data belong together and jointly form a single forecast. This is
+easy e.g. for point forecasts, where there is one row per forecast. For
+quantile or sample-based forecasts, however, there are multiple rows that
+belong to a single forecast.
+
+The \emph{forecast unit} or \emph{unit of a single forecast} is then described by the
+combination of columns that uniquely identify a single forecast.
+For example, we could have forecasts made by different models in various
+locations at different time points, each for several weeks into the future.
+The forecast unit could then be described as
+\code{forecast_unit = c("model", "location", "forecast_date", "forecast_horizon")}.
+\code{scoringutils} automatically tries to determine the unit of a single
+forecast. It uses all existing columns for this, which means that no columns
+should be present that are unrelated to the forecast unit. As a very simplistic
+example, if you had an additional column, "even", that is one if the row number
+is even and zero otherwise, then this would mess up scoring, as \code{scoringutils}
+would then think that this column was relevant in defining the forecast unit.
+
+In order to avoid issues, we recommend using the function
+\code{\link[=set_forecast_unit]{set_forecast_unit()}} to determine the forecast unit manually.
+The function simply drops unneeded columns, while making sure that all
+necessary, 'protected columns' like "predicted" or "observed" are retained.
+}
+
\examples{
library(magrittr) # pipe operator
\dontshow{
data.table::setDTthreads(2) # restricts number of cores used on CRAN
}
-check_forecasts(example_quantile)
-score(example_quantile) \%>\%
- add_coverage(by = c("model", "target_type")) \%>\%
+validated <- as_forecast(example_quantile)
+score(validated) \%>\%
summarise_scores(by = c("model", "target_type"))

# set forecast unit manually (to avoid issues with scoringutils trying to
@@ -96,27 +131,22 @@ example_quantile \%>\%
set_forecast_unit(
c("location", "target_end_date", "target_type", "horizon", "model")
) \%>\%
- check_forecasts() \%>\%
+ as_forecast() \%>\%
score()

# forecast formats with different metrics
\dontrun{
score(example_binary)
score(example_quantile)
+score(example_point)
score(example_integer)
score(example_continuous)
}
-
-# score point forecasts (marked by 'NA' in the quantile column)
-score(example_point) \%>\%
- summarise_scores(by = "model", na.rm = TRUE)
-
}
\references{
-Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ
-(2019) Assessing the performance of real-time epidemic forecasts: A
-case study of Ebola in the Western Area region of Sierra Leone, 2014-15.
-PLoS Comput Biol 15(2): e1006785. \doi{10.1371/journal.pcbi.1006785}
+Bosse NI, Gruson H, Cori A, van Leeuwen E, Funk S, Abbott S
+(2022) Evaluating Forecasts with scoringutils in R.
+\doi{10.48550/arXiv.2205.07090} } \author{ Nikos Bosse \email{nikosbosse@gmail.com} diff --git a/man/score_binary.Rd b/man/score_binary.Rd deleted file mode 100644 index c1a78e6a1..000000000 --- a/man/score_binary.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/score_binary.R -\name{score_binary} -\alias{score_binary} -\title{Evaluate forecasts in a Binary Format} -\usage{ -score_binary(data, forecast_unit, metrics) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} - -\item{forecast_unit}{A character vector with the column names that define -the unit of a single forecast, i.e. a forecast was made for a combination -of the values in \code{forecast_unit}.} - -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.} -} -\value{ -A data.table with appropriate scores. For more information see -\code{\link[=score]{score()}}. -} -\description{ -Evaluate forecasts in a Binary Format -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} -\keyword{internal} diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd deleted file mode 100644 index 2f5660206..000000000 --- a/man/score_quantile.Rd +++ /dev/null @@ -1,87 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/score_quantile.R -\name{score_quantile} -\alias{score_quantile} -\title{Evaluate forecasts in a Quantile-Based Format} -\usage{ -score_quantile( - data, - forecast_unit, - metrics, - weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE -) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. 
(You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} - -\item{forecast_unit}{A character vector with the column names that define -the unit of a single forecast, i.e. a forecast was made for a combination -of the values in \code{forecast_unit}} - -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.} - -\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged -into an interval score that, in the limit, corresponds to CRPS. Alpha is the -decimal value that represents how much is outside a central prediction -interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) -Default: \code{TRUE}.} - -\item{count_median_twice}{logical that controls whether or not to count the -median twice when summarising (default is \code{FALSE}). Counting the -median twice would conceptually treat it as a 0\\% prediction interval, where -the median is the lower as well as the upper bound. The alternative is to -treat the median as a single quantile forecast instead of an interval. The -interval score would then be better understood as an average of quantile -scores.} - -\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate -parts of the interval score (dispersion penalty, penalties for over- and -under-prediction get returned as separate elements of a list). If you want a -\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.} -} -\value{ -A data.table with appropriate scores. For more information see -\code{\link[=score]{score()}} -} -\description{ -Evaluate forecasts in a Quantile-Based Format -} -\references{ -Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ -(2019) Assessing the performance of real-time epidemic forecasts: A -case study of Ebola in the Western Area region of Sierra Leone, 2014-15. -PLoS Comput Biol 15(2): e1006785. 
\doi{10.1371/journal.pcbi.1006785} -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} -\keyword{internal} diff --git a/man/score_sample.Rd b/man/score_sample.Rd deleted file mode 100644 index c94a1fea8..000000000 --- a/man/score_sample.Rd +++ /dev/null @@ -1,63 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/score_continuous_integer.R -\name{score_sample} -\alias{score_sample} -\title{Evaluate forecasts in a Sample-Based Format (Integer or Continuous)} -\usage{ -score_sample(data, forecast_unit, metrics, prediction_type) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} - -\item{forecast_unit}{A character vector with the column names that define -the unit of a single forecast, i.e. a forecast was made for a combination -of the values in \code{forecast_unit}} - -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.} - -\item{prediction_type}{character, should be either "continuous" or "integer"} -} -\value{ -A data.table with appropriate scores. For more information see -\code{\link[=score]{score()}} -} -\description{ -Evaluate forecasts in a Sample-Based Format (Integer or Continuous) -} -\references{ -Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ -(2019) Assessing the performance of real-time epidemic forecasts: A -case study of Ebola in the Western Area region of Sierra Leone, 2014-15. -PLoS Comput Biol 15(2): e1006785. 
\doi{10.1371/journal.pcbi.1006785}
-}
-\author{
-Nikos Bosse \email{nikosbosse@gmail.com}
-}
-\keyword{internal}
diff --git a/man/scoring-functions-binary.Rd b/man/scoring-functions-binary.Rd
new file mode 100644
index 000000000..9a06018a1
--- /dev/null
+++ b/man/scoring-functions-binary.Rd
@@ -0,0 +1,78 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/metrics-binary.R
+\name{scoring-functions-binary}
+\alias{scoring-functions-binary}
+\alias{brier_score}
+\alias{logs_binary}
+\title{Metrics for Binary Outcomes}
+\usage{
+brier_score(observed, predicted)
+
+logs_binary(observed, predicted)
+}
+\arguments{
+\item{observed}{A factor of length n with exactly two levels, holding
+the observed values.
+The highest factor level is assumed to be the reference level. This means
+that \code{predicted} represents the probability that the observed value is
+equal to the highest factor level.}
+
+\item{predicted}{A numeric vector of length n, holding probabilities.
+Values represent the probability that the corresponding outcome is equal to
+the highest level of the factor \code{observed}.}
+}
+\value{
+A numeric vector of size n with the Brier scores
+
+A numeric vector of size n with log scores
+}
+\description{
+\strong{Brier score}
+
+The Brier Score is the mean squared error between the probabilistic
+prediction and the observed outcome. The Brier score is a proper scoring
+rule. Small values are better (best is 0, the worst is 1).
+
+\deqn{
+  \textrm{Brier\_Score} = \frac{1}{N} \sum_{t = 1}^{N} (\textrm{prediction}_t -
+  \textrm{outcome}_t)^2,
+}{
+  Brier_Score = 1/N sum_{t = 1}^{N} (prediction_t - outcome_t)²,
+} where \eqn{\textrm{outcome}_t \in \{0, 1\}}{outcome_t in {0, 1}}, and
+\eqn{\textrm{prediction}_t \in [0, 1]}{prediction_t in [0, 1]} represents
+the probability that the outcome is equal to 1.
+
+\strong{Log score for binary outcomes}
+
+The Log Score is the negative logarithm of the probability
+assigned to the observed value. It is a proper scoring rule. Small values
+are better (best is zero, worst is infinity).
+}
+\details{
+The functions require users to provide observed values as a factor in order
+to distinguish their input from the input format required for scoring point
+forecasts. Internally, however, factors will be converted to numeric values.
+A factor \verb{observed = factor(c(0, 1, 1, 0, 1))} with two levels (\code{0} and \code{1})
+would internally be coerced to a numeric vector (in this case this would
+result in the numeric vector c(1, 2, 2, 1, 2)). After subtracting 1, the
+resulting vector (\code{c(0, 1, 1, 0, 1)} in this case) is used for internal
+calculations. All predictions are assumed to represent the probability that
+the outcome is equal to the highest factor level (in this case that the
+outcome is equal to 1).
+
+You could alternatively also provide a vector like
+\code{observed = factor(c("a", "b", "b", "a"))} (with two levels, \code{a} and \code{b}),
+which would result in exactly the same internal representation. Probabilities
+then represent the probability that the outcome is equal to "b".
+If you want your predictions to be probabilities that the outcome is "a",
+then you could of course make \code{observed} a factor with levels swapped, i.e.
+\code{observed = factor(c("a", "b", "b", "a"), levels = c("b", "a"))} +} +\examples{ +observed <- factor(sample(c(0, 1), size = 30, replace = TRUE)) +predicted <- runif(n = 30, min = 0, max = 1) + +brier_score(observed, predicted) +logs_binary(observed, predicted) +} +\keyword{metric} diff --git a/man/se_mean_sample.Rd b/man/se_mean_sample.Rd index da2b47cde..4a5d62aab 100644 --- a/man/se_mean_sample.Rd +++ b/man/se_mean_sample.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/metrics_point_forecasts.R +% Please edit documentation in R/metrics-sample.R \name{se_mean_sample} \alias{se_mean_sample} \title{Squared Error of the Mean (Sample-based Version)} \usage{ -se_mean_sample(true_values, predictions) +se_mean_sample(observed, predicted) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{observed}{A vector with observed values of size n} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +\item{predicted}{nxN matrix of predictive samples, n (number of rows) being the number of data points and N (number of columns) the number of Monte -Carlo samples. Alternatively, predictions can just be a vector of size n.} +Carlo samples. Alternatively, \code{predicted} can just be a vector of size n.} } \value{ vector with the scoring values @@ -20,17 +20,15 @@ vector with the scoring values Squared error of the mean calculated as \deqn{ - \textrm{mean}(\textrm{true\_value} - \textrm{prediction})^2 + \textrm{mean}(\textrm{observed} - \textrm{mean prediction})^2 }{ - mean(true_value - mean_prediction)^2 + mean(observed - mean prediction)^2 } +The mean prediction is calculated as the mean of the predictive samples. } \examples{ -true_values <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -se_mean_sample(true_values, predicted_values) -} -\seealso{ -\code{\link[=squared_error]{squared_error()}} +observed <- rnorm(30, mean = 1:30) +predicted_values <- matrix(rnorm(30, mean = 1:30)) +se_mean_sample(observed, predicted_values) } \keyword{metric} diff --git a/man/select_rules.Rd b/man/select_rules.Rd new file mode 100644 index 000000000..605d7176b --- /dev/null +++ b/man/select_rules.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/default-scoring-rules.R +\name{select_rules} +\alias{select_rules} +\title{Select Scoring Rules From A List of Possible Scoring Rules} +\usage{ +select_rules(rules, select = NULL, exclude = NULL) +} +\arguments{ +\item{rules}{A list of scoring rules.} + +\item{select}{A character vector of scoring rules to select from the list. +If \code{select} is \code{NULL} (the default), all possible scoring rules are returned.} + +\item{exclude}{A character vector of scoring rules to exclude from the list. +If \code{select} is not \code{NULL}, this argument is ignored.} +} +\value{ +A list of scoring rules. +} +\description{ +Helper function to return only the scoring rules selected by +the user from a list of possible scoring rules. 
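Returning to the binary metrics documented above: the Brier score definition can be checked against a manual computation. A minimal sketch, assuming the \code{brier_score()} interface from this changeset (the toy data are randomly generated):

library(scoringutils)

set.seed(123)
observed <- factor(sample(c(0, 1), size = 10, replace = TRUE))
predicted <- runif(10)

# encode the factor as 0/1 (second factor level = 1), as described in the details
outcome <- as.numeric(observed) - 1

# per-observation squared difference between predicted probability and outcome
manual <- (predicted - outcome)^2

# should agree with the package implementation if the encoding above is right
all.equal(manual, brier_score(observed, predicted))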
+} +\examples{ +select_rules( + rules = rules_binary(), + select = "brier_score" +) +select_rules( + rules = rules_binary(), + exclude = "log_score" +) +} +\keyword{metric} diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index 8482a27d7..d1d5e5639 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -7,32 +7,7 @@ set_forecast_unit(data, forecast_unit) } \arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} +\item{data}{A data.frame or data.table with predicted and observed values.} \item{forecast_unit}{Character vector with the names of the columns that uniquely identify a single forecast.} @@ -45,14 +20,14 @@ scoring or denote the unit of a single forecast as specified by the user. Helper function to set the unit of a single forecast (i.e. the combination of columns that uniquely define a single forecast) manually. This simple function keeps the columns specified in \code{forecast_unit} (plus -additional protected columns, e.g. for true values, predictions or quantile -levels) and removes duplicate rows. +additional protected columns, e.g. for observed values, predictions or +quantile levels) and removes duplicate rows. If not done manually, \code{scoringutils} attempts to determine the unit of a single forecast automatically by simply assuming that all column names are relevant to determine the forecast unit. This may lead to unexpected behaviour, so setting the forecast unit explicitly can help make the code easier to debug and easier to read. When used as part of a workflow, -\code{set_forecast_unit()} can be directly piped into \code{check_forecasts()} to +\code{set_forecast_unit()} can be directly piped into \code{as_forecast()} to check everything is in order. 
} \examples{ diff --git a/man/squared_error.Rd b/man/squared_error.Rd deleted file mode 100644 index c07299561..000000000 --- a/man/squared_error.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/metrics_point_forecasts.R -\name{squared_error} -\alias{squared_error} -\title{Squared Error} -\usage{ -squared_error(true_values, predictions) -} -\arguments{ -\item{true_values}{A vector with the true observed values of size n} - -\item{predictions}{A vector with predicted values of size n} -} -\value{ -vector with the scoring values -} -\description{ -Squared Error SE calculated as - -\deqn{ - (\textrm{true\_values} - \textrm{predicted\_values})^2 -}{ - (true_values - predicted_values)^2 -} -} -\examples{ -true_values <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -squared_error(true_values, predicted_values) -} -\keyword{metric} diff --git a/man/strip_attributes.Rd b/man/strip_attributes.Rd new file mode 100644 index 000000000..5f727f539 --- /dev/null +++ b/man/strip_attributes.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{strip_attributes} +\alias{strip_attributes} +\title{Strip attributes from an object} +\usage{ +strip_attributes(object, attributes) +} +\arguments{ +\item{object}{An object to remove attributes from} + +\item{attributes}{A character vector of attribute names to remove from the +object} +} +\value{ +The object with attributes removed +} +\description{ +This function removes all attributes from an object that are +specified in the \code{attributes} argument. +} +\keyword{internal} diff --git a/man/summarise_scores.Rd b/man/summarise_scores.Rd index d41d5a6ae..addd49b96 100644 --- a/man/summarise_scores.Rd +++ b/man/summarise_scores.Rd @@ -5,29 +5,9 @@ \alias{summarize_scores} \title{Summarise scores as produced by \code{\link[=score]{score()}}} \usage{ -summarise_scores( - scores, - by = NULL, - across = NULL, - fun = mean, - relative_skill = FALSE, - relative_skill_metric = "auto", - metric = deprecated(), - baseline = NULL, - ... -) +summarise_scores(scores, by = NULL, across = NULL, fun = mean, ...) -summarize_scores( - scores, - by = NULL, - across = NULL, - fun = mean, - relative_skill = FALSE, - relative_skill_metric = "auto", - metric = deprecated(), - baseline = NULL, - ... -) +summarize_scores(scores, by = NULL, across = NULL, fun = mean, ...) } \arguments{ \item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} @@ -52,31 +32,15 @@ and \code{by} may be used at a time.} \item{fun}{a function used for summarising scores. Default is \code{mean}.} -\item{relative_skill}{logical, whether or not to compute relative -performance between models based on pairwise comparisons. -If \code{TRUE} (default is \code{FALSE}), then a column called -'model' must be present in the input data. For more information on -the computation of relative skill, see \code{\link[=pairwise_comparison]{pairwise_comparison()}}. -Relative skill will be calculated for the aggregation level specified in -\code{by}.} - -\item{relative_skill_metric}{character with the name of the metric for which -a relative skill shall be computed. 
If equal to 'auto' (the default), then -this will be either interval score, CRPS or Brier score (depending on which -of these is available in the input data)} - -\item{metric}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Deprecated in 1.1.0. Use -\code{relative_skill_metric} instead.} - -\item{baseline}{character string with the name of a model. If a baseline is -given, then a scaled relative skill with respect to the baseline will be -returned. By default (\code{NULL}), relative skill will not be scaled with -respect to a baseline model.} - \item{...}{additional parameters that can be passed to the summary function provided to \code{fun}. For more information see the documentation of the respective function.} } +\value{ +a data.table with summarised scores. Scores are summarised according +to the names of the columns of the original data specified in \code{by} or +\code{across} using the \code{fun} passed to \code{summarise_scores()}. +} \description{ Summarise scores as produced by \code{\link[=score]{score()}} } @@ -85,10 +49,10 @@ Summarise scores as produced by \code{\link[=score]{score()}} data.table::setDTthreads(2) # restricts number of cores used on CRAN } library(magrittr) # pipe operator - +\dontrun{ scores <- score(example_continuous) summarise_scores(scores) - +} # summarise over samples or quantiles to get one score per forecast scores <- score(example_quantile) diff --git a/man/test_columns_not_present.Rd b/man/test_columns_not_present.Rd new file mode 100644 index 000000000..5f7b71994 --- /dev/null +++ b/man/test_columns_not_present.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{test_columns_not_present} +\alias{test_columns_not_present} +\title{Test whether column names are NOT present in a data.frame} +\usage{ +test_columns_not_present(data, columns) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} + +\item{columns}{A character vector of column names to check} +} +\value{ +Returns TRUE if none of the columns are present and FALSE otherwise +} +\description{ +The function checks whether all column names are NOT present. +If none of the columns are present, the function returns TRUE. If one or +more columns are present, the function returns FALSE. +} +\keyword{internal_input_check} diff --git a/man/test_columns_present.Rd b/man/test_columns_present.Rd new file mode 100644 index 000000000..1931ca59c --- /dev/null +++ b/man/test_columns_present.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-input-helpers.R +\name{test_columns_present} +\alias{test_columns_present} +\title{Test whether all column names are present in a data.frame} +\usage{ +test_columns_present(data, columns) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} + +\item{columns}{A character vector of column names to check} +} +\value{ +Returns TRUE if all columns are present and FALSE otherwise +} +\description{ +The function checks whether all column names are present. If +one or more columns are missing, the function returns FALSE. If all columns +are present, the function returns TRUE. 
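Coming back to \code{summarise_scores()} above: the \code{by} argument controls the grouping and \code{fun} the summary function applied within each group. A short sketch, assuming the development API from this changeset and the bundled \code{example_quantile} data:

library(scoringutils)

scores <- score(as_forecast(example_quantile))

# mean score (the default) per model and target type
summarise_scores(scores, by = c("model", "target_type"))

# the same grouping, but using the median as a more robust summary
summarise_scores(scores, by = c("model", "target_type"), fun = median)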
+} +\keyword{internal_input_check} diff --git a/man/test_forecast_type_is_binary.Rd b/man/test_forecast_type_is_binary.Rd new file mode 100644 index 000000000..79c952e17 --- /dev/null +++ b/man/test_forecast_type_is_binary.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{test_forecast_type_is_binary} +\alias{test_forecast_type_is_binary} +\title{Test whether data could be a binary forecast.} +\usage{ +test_forecast_type_is_binary(data) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} +} +\value{ +Returns TRUE if basic requirements are satisfied and FALSE otherwise +} +\description{ +Checks type of the necessary columns. +} +\keyword{internal_input_check} diff --git a/man/test_forecast_type_is_point.Rd b/man/test_forecast_type_is_point.Rd new file mode 100644 index 000000000..ad1c624fc --- /dev/null +++ b/man/test_forecast_type_is_point.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{test_forecast_type_is_point} +\alias{test_forecast_type_is_point} +\title{Test whether data could be a point forecast.} +\usage{ +test_forecast_type_is_point(data) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} +} +\value{ +Returns TRUE if basic requirements are satisfied and FALSE otherwise +} +\description{ +Checks type of the necessary columns. +} +\keyword{internal_input_check} diff --git a/man/test_forecast_type_is_quantile.Rd b/man/test_forecast_type_is_quantile.Rd new file mode 100644 index 000000000..7f77043f3 --- /dev/null +++ b/man/test_forecast_type_is_quantile.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{test_forecast_type_is_quantile} +\alias{test_forecast_type_is_quantile} +\title{Test whether data could be a quantile forecast.} +\usage{ +test_forecast_type_is_quantile(data) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} +} +\value{ +Returns TRUE if basic requirements are satisfied and FALSE otherwise +} +\description{ +Checks type of the necessary columns. +} +\keyword{internal_input_check} diff --git a/man/test_forecast_type_is_sample.Rd b/man/test_forecast_type_is_sample.Rd new file mode 100644 index 000000000..be7fea95c --- /dev/null +++ b/man/test_forecast_type_is_sample.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_-functions.R +\name{test_forecast_type_is_sample} +\alias{test_forecast_type_is_sample} +\title{Test whether data could be a sample-based forecast.} +\usage{ +test_forecast_type_is_sample(data) +} +\arguments{ +\item{data}{A data.frame or similar to be checked} +} +\value{ +Returns TRUE if basic requirements are satisfied and FALSE otherwise +} +\description{ +Checks type of the necessary columns. +} +\keyword{internal_input_check} diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 3fe0de992..a77043a95 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -7,34 +7,9 @@ transform_forecasts(data, fun = log_shift, append = TRUE, label = "log", ...) } \arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -For scoring using \code{\link[=score]{score()}}, the following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. 
(You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For scoring integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For scoring predictions in a quantile-format forecast you should provide -a column called \code{quantile}: -\itemize{ -\item \code{quantile}: quantile to which the prediction corresponds -} - -In addition a \code{model} column is suggested and if not present this will be -flagged and added to the input data with all forecasts assigned as an -"unspecified model"). - -You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there -are examples for each format (\link{example_quantile}, \link{example_continuous}, -\link{example_integer}, and \link{example_binary}).} +\item{data}{A data.frame or data.table with predicted and observed values.} -\item{fun}{A function used to transform both true values and predictions. +\item{fun}{A function used to transform both observed values and predictions. The default function is \code{\link[=log_shift]{log_shift()}}, a custom function that is essentially the same as \code{\link[=log]{log()}}, but has an additional arguments (\code{offset}) that allows you add an offset before applying the logarithm. This is often @@ -65,7 +40,8 @@ additional column, `scale', present which will be set to "natural" for the untransformed forecasts. } \description{ -Function to transform forecasts and true values before scoring. +Function to transform forecasts and observed values before +scoring. } \details{ There are a few reasons, depending on the circumstances, for @@ -91,7 +67,7 @@ library(magrittr) # pipe operator # transform forecasts using the natural logarithm # negative values need to be handled (here by replacing them with 0) example_quantile \%>\% - .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + .[, observed := ifelse(observed < 0, 0, observed)] \%>\% # Here we use the default function log_shift() which is essentially the same # as log(), but has an additional arguments (offset) that allows you add an # offset before applying the logarithm. 
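Because \code{log_shift()} does the actual work in these transformation examples, here is a quick sketch of what it computes. It assumes \code{log_shift()} simply applies \code{log(x + offset)}, as described in the argument documentation above:

library(scoringutils)

x <- c(0, 1, 10, 100)

# an offset of 1 avoids -Inf for zero-valued counts
log_shift(x, offset = 1)

# which should agree with shifting manually before taking the log
log(x + 1)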
@@ -108,20 +84,20 @@ example_quantile \%>\% # specifying an offset for the log transformation removes the # warning caused by zeros in the data example_quantile \%>\% - .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + .[, observed := ifelse(observed < 0, 0, observed)] \%>\% transform_forecasts(offset = 1, append = FALSE) \%>\% head() # adding square root transformed forecasts to the original ones example_quantile \%>\% - .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + .[, observed := ifelse(observed < 0, 0, observed)] \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% score() \%>\% summarise_scores(by = c("model", "scale")) # adding multiple transformations example_quantile \%>\% - .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + .[, observed := ifelse(observed < 0, 0, observed)] \%>\% transform_forecasts(fun = log_shift, offset = 1) \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% head() diff --git a/man/validate_forecast.Rd b/man/validate_forecast.Rd new file mode 100644 index 000000000..542e84dd5 --- /dev/null +++ b/man/validate_forecast.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_forecast} +\alias{validate_forecast} +\title{Validate input data} +\usage{ +validate_forecast(data, ...) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} + +\item{...}{additional arguments} +} +\value{ +Depending on the forecast type, an object of class +\code{forecast_binary}, \code{forecast_point}, \code{forecast_sample} or +\code{forecast_quantile}. +} +\description{ +Methods for the different classes run \code{\link[=validate_general]{validate_general()}}, which performs +checks that are the same for all forecast types and then perform specific +checks for the specific forecast type. +} +\section{Forecast types and input format}{ +Various different forecast types / forecast formats are supported. At the +moment, those are +\itemize{ +\item point forecasts +\item binary forecasts ("soft binary classification") +\item Probabilistic forecasts in a quantile-based format (a forecast is +represented as a set of predictive quantiles) +\item Probabilistic forecasts in a sample-based format (a forecast is represented +as a set of predictive samples) +} + +Forecast types are determined based on the columns present in the input data. + +\emph{Point forecasts} require a column \code{observed} of type numeric and a column +\code{predicted} of type numeric. + +\emph{Binary forecasts} require a column \code{observed} of type factor with exactly +two levels and a column \code{predicted} of type numeric with probabilities, +corresponding to the probability that \code{observed} is equal to the second +factor level. See details \link[=brier_score]{here} for more information. + +\emph{Quantile-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{quantile} of type numeric +with quantile-levels (between 0 and 1). + +\emph{Sample-based forecasts} require a column \code{observed} of type numeric, +a column \code{predicted} of type numeric, and a column \code{sample_id} of type +numeric with sample indices. + +For more information see the vignettes and the example data +(\link{example_quantile}, \link{example_continuous}, \link{example_integer}, +\code{\link[=example_point]{example_point()}}, and \link{example_binary}). 
+} + +\examples{ +forecast <- as_forecast(example_binary) +validate_forecast(forecast) +} +\keyword{check-forecasts} diff --git a/man/validate_general.Rd b/man/validate_general.Rd new file mode 100644 index 000000000..e379c5a0f --- /dev/null +++ b/man/validate_general.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_general} +\alias{validate_general} +\title{Apply scoringutls input checks that are the same across forecast types} +\usage{ +validate_general(data) +} +\arguments{ +\item{data}{A data.frame or data.table with predicted and observed values.} +} +\value{ +returns the input, with a few new attributes that hold additional +information, messages and warnings +} +\description{ +The function runs input checks that apply to all input data, regardless of +forecast type. The function +\itemize{ +\item asserts that the data is a data.table which has columns \code{observed} and +\code{predicted}, as well as a column called \code{model}. +\item checks the forecast type and forecast unit +\item checks there are no duplicate forecasts +\item if appropriate, checks the number of samples / quantiles is the same +for all forecasts +} +} +\keyword{internal_input_check} diff --git a/man/validate_metrics.Rd b/man/validate_metrics.Rd new file mode 100644 index 000000000..d373eaa58 --- /dev/null +++ b/man/validate_metrics.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_metrics} +\alias{validate_metrics} +\title{Validate metrics} +\usage{ +validate_metrics(metrics) +} +\arguments{ +\item{metrics}{A named list with metrics. Every element should be a scoring +function to be applied to the data.} +} +\value{ +A named list of metrics, with those filtered out that are not +valid functions +} +\description{ +This function validates whether the list of metrics is a list +of valid functions. + +The function is used in \code{\link[=score]{score()}} to make sure that all metrics are valid +functions +} +\keyword{internal_input_check} diff --git a/man/wis.Rd b/man/wis.Rd new file mode 100644 index 000000000..eb749f309 --- /dev/null +++ b/man/wis.Rd @@ -0,0 +1,154 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R +\name{wis} +\alias{wis} +\alias{dispersion} +\alias{overprediction} +\alias{underprediction} +\title{Weighted Interval Score (WIS)} +\usage{ +wis( + observed, + predicted, + quantile, + separate_results = FALSE, + weigh = TRUE, + count_median_twice = FALSE, + na.rm = TRUE +) + +dispersion(observed, predicted, quantile, ...) + +overprediction(observed, predicted, quantile, ...) + +underprediction(observed, predicted, quantile, ...) +} +\arguments{ +\item{observed}{numeric vector of size n with the observed values} + +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{vector with quantile levels of size N} + +\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate +parts of the interval score (dispersion penalty, penalties for over- and +under-prediction get returned as separate elements of a list). 
If you want a
+\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.}
+
+\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged
+into an interval score that, in the limit, corresponds to CRPS. Alpha is the
+decimal value that represents how much is outside a central prediction
+interval (e.g. for a 90 percent central prediction interval, alpha is 0.1).
+Default: \code{TRUE}.}
+
+\item{count_median_twice}{if TRUE, count the median twice in the score}
+
+\item{na.rm}{if TRUE, ignore NA values when computing the score}
+
+\item{...}{Additional arguments passed on to \code{wis()} from functions
+\code{overprediction()}, \code{underprediction()} and \code{dispersion()}}
+}
+\value{
+\code{wis()}: a numeric vector with WIS values of size n (one per observation),
+or a list with separate entries if \code{separate_results} is \code{TRUE}.
+
+\code{dispersion()}: a numeric vector with dispersion values (one per observation)
+
+\code{overprediction()}: a numeric vector with overprediction values (one per
+observation)
+
+\code{underprediction()}: a numeric vector with underprediction values (one per
+observation)
+}
+\description{
+The WIS is a proper scoring rule used to evaluate forecasts in an interval- /
+quantile-based format. See Bracher et al. (2021). Smaller values are better.
+
+As the name suggests, the score assumes that a forecast comes in the form of
+one or multiple central prediction intervals. A prediction interval is
+characterised by a lower and an upper bound formed by a pair of predictive
+quantiles. For example, a 50\% central prediction interval is formed by the
+0.25 and 0.75 quantiles of the predictive distribution.
+
+\strong{Interval score}
+
+The interval score (IS) is the sum of three components:
+overprediction, underprediction and dispersion. For a single prediction
+interval, at most one of the over- and underprediction penalties is non-zero.
+If the observed value is below the lower bound, the forecast has
+overpredicted, and a penalty proportional to the difference between the
+lower bound and the observed value is added ("overprediction").
+"Underprediction" is defined analogously for observed values above the
+upper bound. If the observed value falls within the bounds of the prediction
+interval, then the interval score is equal to the width of the prediction
+interval, i.e. the difference between the upper and lower bound ("dispersion").
+For a single interval, we therefore have:
+
+\deqn{
+\textrm{IS} = (\textrm{upper} - \textrm{lower}) + \frac{2}{\alpha}(\textrm{lower}
+ - \textrm{observed}) *
+\mathbf{1}(\textrm{observed} < \textrm{lower}) +
+\frac{2}{\alpha}(\textrm{observed} - \textrm{upper}) *
+\mathbf{1}(\textrm{observed} > \textrm{upper})
+}{
+score = (upper - lower) + 2/alpha * (lower - observed) *
+1(observed < lower) + 2/alpha * (observed - upper) *
+1(observed > upper)
+}
+where \eqn{\mathbf{1}()}{1()} is the indicator function.
+\eqn{\alpha}{alpha} is the decimal value that indicates how much is outside
+the prediction interval. For a 90\% prediction interval, for example,
+\eqn{\alpha}{alpha} is equal to 0.1. No specific distribution is assumed,
+but the range has to be symmetric (i.e. you can't use the 0.1 quantile
+as the lower bound and the 0.7 quantile as the upper).
+Non-symmetric quantiles can be scored using the function \code{\link[=quantile_score]{quantile_score()}}.
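To make the interval score formula above concrete, here is a small worked example in plain R for a single 90\% central prediction interval (so alpha = 0.1; all numbers are invented):

# a single 90% central prediction interval
lower <- 10
upper <- 20
alpha <- 0.1

interval_score <- function(observed, lower, upper, alpha) {
  (upper - lower) +
    2 / alpha * (lower - observed) * (observed < lower) +
    2 / alpha * (observed - upper) * (observed > upper)
}

interval_score(observed = 15, lower, upper, alpha) # inside: just the width, 10
interval_score(observed = 25, lower, upper, alpha) # above (underprediction): 10 + 20 * 5 = 110
interval_score(observed = 8,  lower, upper, alpha) # below (overprediction):  10 + 20 * 2 = 50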
+ +Usually the interval score is weighted by a factor that makes sure that the +average score across an increasing number of equally spaced +quantiles, converges to the continuous ranked probability score (CRPS). This +weighted score is called the weihted interval score (WIS). +The weight commonly used is \eqn{\alpha / 2}{alpha / 2}. + +\strong{Quantile score} + +In addition to the interval score, there also exists a quantile score (QS) +(see \code{\link[=quantile_score]{quantile_score()}}), which is equal to the so-called pinball loss. +The quantile score can be computed for a single quantile (whereas the +interval score requires two quantiles that form an interval). However, +the intuitive decomposition into overprediction, underprediction and +dispersion does not exist for the quantile score. + +\strong{Two versions of the weighted interval score} + +There are two ways to conceptualise the weighted interval score across +several quantiles / prediction intervals and the median. + +In one view, you would treat the WIS as the average of quantile scores (and +the median as 0.5-quantile) (this is the default for \code{wis()}). In another +view, you would treat the WIS as the average of several interval scores + +the difference between observed value and median forecast. The effect of +that is that in contrast to the first view, the median has twice as much +weight (because it is weighted like a prediction interval, rather than like +a single quantile). Both are valid ways to conceptualise the WIS and you +can control the behvaviour with the \code{count_median_twice}-argument. + +\strong{WIS components}: +WIS components can be computed individually using the functions +\code{overprediction}, \code{underprediction}, and \code{dispersion.} +} +\examples{ +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +wis(observed, predicted, quantile) +} +\keyword{metric} diff --git a/tests/testthat/_snaps/plot_avail_forecasts/plot-avail-forecasts.svg b/tests/testthat/_snaps/plot_avail_forecasts/plot-available-forecasts.svg similarity index 86% rename from tests/testthat/_snaps/plot_avail_forecasts/plot-avail-forecasts.svg rename to tests/testthat/_snaps/plot_avail_forecasts/plot-available-forecasts.svg index e70c93f14..6909ba780 100644 --- a/tests/testthat/_snaps/plot_avail_forecasts/plot-avail-forecasts.svg +++ b/tests/testthat/_snaps/plot_avail_forecasts/plot-available-forecasts.svg @@ -25,42 +25,54 @@ - - - - - - - + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + @@ -70,54 +82,54 @@ - - - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + @@ -201,23 +213,23 @@ target_end_date model - -4 -6 -8 -10 -12 -Number forecasts - - - - - - - - - - -plot_avail_forecasts + +0 +3 +6 +9 +12 +Count + + + + + + + + + + +plot_available_forecasts diff --git a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg index f56619980..d51d9d2b0 100644 --- a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg +++ b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg @@ -20,110 +20,152 @@ - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -1 -0.46 -1 -0.28 -0.15 -1 -0.94 -0.32 --0.03 -1 --0.34 --0.12 --0.33 --0.25 -1 -0.11 -0.11 --0.35 -0.22 -0.06 -1 -0.99 -0.54 -0.34 -0.9 
--0.38 -0.1 -1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +1 +0.94 +1 +0.28 +-0.03 +1 +0.46 +0.32 +0.15 +1 +0.11 +0.22 +-0.35 +0.11 +1 +-0.21 +-0.15 +-0.21 +-0.09 +0.01 +1 +-0.41 +-0.32 +-0.36 +-0.09 +0.1 +0.37 +1 +-0.34 +-0.25 +-0.33 +-0.12 +0.06 +0.85 +0.64 +1 +0.99 +0.9 +0.34 +0.54 +0.1 +-0.25 +-0.41 +-0.38 +1 - -interval_score -dispersion -underprediction -overprediction -coverage_deviation -bias -ae_median - - - - - - - - - - - - - - - -ae_median -bias -coverage_deviation -overprediction -underprediction -dispersion -interval_score - -0.0 -0.5 -1.0 -Correlation - - - - - - -plot__correlation + +wis +overprediction +underprediction +dispersion +bias +interval_coverage_50 +interval_coverage_90 +interval_coverage_deviation +ae_median + + + + + + + + + + + + + + + + + + + +ae_median +interval_coverage_deviation +interval_coverage_90 +interval_coverage_50 +bias +dispersion +underprediction +overprediction +wis + +0.0 +0.5 +1.0 +Correlation + + + + + + +plot__correlation diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg index 528aefb09..8c4222d9a 100644 --- a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg +++ b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg @@ -25,174 +25,20 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 -0.06 --0.06 --0.06 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 +0.1 -0.08 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.07 -0.07 -0.07 -0.07 0.07 -0.07 -0.07 -0.07 -0.07 -0.07 -0.07 -0.07 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 +0.34 -0.02 -0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg index 22e2408dc..458487011 100644 --- a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg +++ b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg @@ -28,8 +28,8 @@ - + @@ -37,8 +37,8 @@ < 0.001 < 0.001 1 -0.298 < 0.001 +0.298 1 0.298 < 0.001 @@ -56,9 +56,9 @@ + - @@ -72,9 +72,9 @@ < 0.001 < 0.001 1 +< 0.001 < 0.001 < 0.001 -< 0.001 1 0.007 < 0.001 diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg index 1ff397cfe..3a7599da4 100644 --- a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg +++ b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg @@ -28,8 +28,8 @@ - + @@ -37,8 +37,8 @@ 1.37 1.59 1 -0.86 0.63 +0.86 1 1.16 0.73 @@ -56,9 +56,9 @@ + - @@ -72,9 +72,9 @@ 3.03 3.85 1 +0.26 0.62 0.79 -0.26 1 0.74 1.27 diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg index 812e1600f..4ad667f8a 100644 --- a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg +++ b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg @@ -25,42 +25,42 @@ - - - - - - - - - - - + - - - - - - - - - - + - - - - - - - - - - - + - + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -82,54 +82,54 @@ - - - - - - - - - - - + - - - - - - - - - - + + - - - - - - - - - - - + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg index 15c9ee3e3..98a9a883c 100644 --- a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg +++ b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg @@ -25,42 +25,42 @@ - - - - - - - - - - - + - - - - - - - - - - + - - - - - - - - - - - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -82,54 +82,54 @@ - - - - - - - - - - - + - - - - - - - - - - + + - - - - - - - - - - - + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -203,7 +203,7 @@ model -interval_score +wis 0 25 diff --git a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg index 95f9fe247..ead05c2a1 100644 --- a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg +++ b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg @@ -20,93 +20,113 @@ - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -10000 -9000 -50 -10000 -2000 -2000 -30 -3000 -5000 -2000 -20 -2000 -7000 -5000 -9 -6000 -0.002 -0.05 --0.02 --0.06 -0.2 -0.008 --0.02 --0.04 -20000 -10000 -80 -10000 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +9000 +10000 +10000 +50 +5000 +7000 +6000 +9 +2000 +5000 +2000 +20 +2000 +2000 +3000 +30 +0.008 +0.2 +-0.04 +-0.02 +0.6 +0.5 +0.4 +0.5 +0.9 +0.9 +0.8 +0.9 +0.05 +0.002 +-0.06 +-0.02 +10000 +20000 +10000 +80 - -EuroCOVIDhub-baseline -EuroCOVIDhub-ensemble -UMass-MechBayes -epiforecasts-EpiNow2 - - - - - - - - - - - - -interval_score -dispersion -underprediction -overprediction -coverage_deviation -bias -ae_median + +EuroCOVIDhub-baseline +EuroCOVIDhub-ensemble +UMass-MechBayes +epiforecasts-EpiNow2 + + + + + + + + + + + + + + +wis +overprediction +underprediction +dispersion +bias +interval_coverage_50 +interval_coverage_90 +interval_coverage_deviation +ae_median plot_score_table diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg index 758a3c147..c315cf06d 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg @@ -25,14 +25,14 @@ - + - + - + @@ -43,16 +43,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg index 987072ca4..fea309214 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg @@ -25,14 +25,14 @@ - + - + - + @@ -43,16 +43,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg index 5328b4779..a2bdf8653 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis.svg @@ -25,14 +25,14 @@ - + - + - + @@ -43,16 +43,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/score.md b/tests/testthat/_snaps/score.md deleted file mode 100644 index 78c7810cd..000000000 --- a/tests/testthat/_snaps/score.md +++ /dev/null @@ -1,17 +0,0 @@ -# score() can support a sample column when a quantile forecast is used - - Code - 
summarise_scores(summarise_scores(scores, by = "model"), by = "model", fun = signif, - digits = 2) - Output - model interval_score dispersion underprediction - 1: EuroCOVIDhub-baseline 8500 850 0 - 2: EuroCOVIDhub-ensemble NA NA NA - 3: epiforecasts-EpiNow2 13000 4100 0 - 4: UMass-MechBayes 120 77 39 - overprediction coverage_deviation bias ae_median - 1: 7600 -0.081 0.62 13000 - 2: 11000 NA 0.58 21000 - 3: 8600 0.050 0.50 22000 - 4: 0 0.050 -0.50 210 - diff --git a/tests/testthat/_snaps/utils_data_handling.md b/tests/testthat/_snaps/utils_data_handling.md new file mode 100644 index 000000000..285514031 --- /dev/null +++ b/tests/testthat/_snaps/utils_data_handling.md @@ -0,0 +1,70 @@ +# quantile_to_range works - scalar and vector case + + Code + out1 + Output + forecast_id observed range lower upper + 1: 1 5 0 5 5 + 2: 1 5 20 4 6 + 3: 1 5 40 3 7 + 4: 1 5 60 2 8 + 5: 1 5 80 1 9 + +--- + + Code + out3 + Output + forecast_id observed range lower upper + 1: 1 5 0 5 5 + 2: 1 5 20 4 6 + 3: 1 5 40 3 7 + 4: 1 5 60 2 8 + 5: 1 5 80 1 NA + +--- + + Code + out4 + Output + forecast_id observed range lower upper + 1: 1 5 0 5 5 + 2: 1 5 20 4 6 + 3: 1 5 40 3 7 + 4: 1 5 60 2 8 + 5: 1 5 80 1 NA + 6: 1 5 90 NA 9 + +# quantile_to_range works - matrix case + + Code + out1 + Output + forecast_id observed range lower upper + 1: 1 21 0 21 21 + 2: 1 21 20 16 26 + 3: 1 21 40 11 31 + 4: 1 21 60 6 36 + 5: 1 21 80 1 41 + 6: 2 22 0 22 22 + 7: 2 22 20 17 27 + 8: 2 22 40 12 32 + 9: 2 22 60 7 37 + 10: 2 22 80 2 42 + 11: 3 23 0 23 23 + 12: 3 23 20 18 28 + 13: 3 23 40 13 33 + 14: 3 23 60 8 38 + 15: 3 23 80 3 43 + 16: 4 24 0 24 24 + 17: 4 24 20 19 29 + 18: 4 24 40 14 34 + 19: 4 24 60 9 39 + 20: 4 24 80 4 44 + 21: 5 25 0 25 25 + 22: 5 25 20 20 30 + 23: 5 25 40 15 35 + 24: 5 25 60 10 40 + 25: 5 25 80 5 45 + forecast_id observed range lower upper + diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 11342633b..e035f09c2 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -1,7 +1,21 @@ # load common required test packages library(ggplot2, quietly = TRUE) +library(data.table) suppressMessages(library(magrittr)) data.table::setDTthreads(2) # restricts number of cores used on CRAN -# compute quantile scores -scores <- suppressMessages(score(example_quantile)) +metrics_no_cov <- rules_quantile( + exclude = c("interval_coverage_50", "interval_coverage_90", + "interval_coverage_deviation") +) +metrics_no_cov_no_ae <- rules_quantile( + exclude = c("interval_coverage_50", "interval_coverage_90", + "interval_coverage_deviation", "ae_median") +) + + +# compute scores +scores_quantile <- suppressMessages(score(example_quantile)) +scores_continuous <- suppressMessages(score(data = example_continuous)) +scores_point <- suppressMessages(score(example_point)) +scores_binary <- suppressMessages(score(example_binary)) diff --git a/tests/testthat/test-add_coverage.R b/tests/testthat/test-add_coverage.R index 4ceb68d9a..32b722a8a 100644 --- a/tests/testthat/test-add_coverage.R +++ b/tests/testthat/test-add_coverage.R @@ -1,16 +1,16 @@ +ex_coverage <- example_quantile[model == "EuroCOVIDhub-ensemble"] + test_that("add_coverage() works as expected", { - expect_error(add_coverage(scores)) - expect_error( - add_coverage(scores, by = c("model", "target_type"), range = c()) + expect_message( + cov <- add_coverage(example_quantile), + "Some rows containing NA values may be removed." 
) - expect_error( - add_coverage(scores, by = c("model", "target_type")), NA - ) - cov <- add_coverage( - scores, by = c("model", "target_type"), range = c(10, 50, 80) - ) - expect_equal( - grep("coverage_", colnames(cov), value = TRUE), - c("coverage_deviation", "coverage_10", "coverage_50", "coverage_80") + + required_names <- c( + "range", "interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation" ) + expect_equal(colnames(cov), c(colnames(example_quantile), required_names)) + + expect_equal(nrow(cov), nrow(example_quantile)) }) diff --git a/tests/testthat/test-avail_forecasts.R b/tests/testthat/test-avail_forecasts.R deleted file mode 100644 index f24b55e4d..000000000 --- a/tests/testthat/test-avail_forecasts.R +++ /dev/null @@ -1,29 +0,0 @@ -test_that("avail_forecasts() works as expected", { - af <- suppressMessages( - avail_forecasts(example_quantile, - by = c("model", "target_type", "target_end_date") - ) - ) - expect_type(af, "list") - expect_type(af$target_type, "character") - expect_type(af$`Number forecasts`, "integer") - expect_equal(nrow(af[is.na(`Number forecasts`)]), 0) - af <- suppressMessages( - avail_forecasts(example_quantile, - by = "model" - ) - ) - expect_equal(nrow(af), 4) - expect_equal(af$`Number forecasts`, c(256, 256, 247, 128)) - af <- suppressMessages( - avail_forecasts(example_quantile, - by = "model", collapse = c() - ) - ) - expect_equal(nrow(af), 4) - expect_equal(af$`Number forecasts`, c(5888, 5888, 5681, 2944)) - af <- suppressMessages( - avail_forecasts(example_quantile) - ) - expect_equal(nrow(af), 887) -}) \ No newline at end of file diff --git a/tests/testthat/test-available_forecasts.R b/tests/testthat/test-available_forecasts.R new file mode 100644 index 000000000..4507820b1 --- /dev/null +++ b/tests/testthat/test-available_forecasts.R @@ -0,0 +1,33 @@ +test_that("get_forecast_counts() works as expected", { + af <- suppressMessages( + get_forecast_counts(example_quantile, + by = c("model", "target_type", "target_end_date") + ) + ) + expect_type(af, "list") + expect_type(af$target_type, "character") + expect_type(af$`count`, "integer") + expect_equal(nrow(af[is.na(`count`)]), 0) + af <- get_forecast_counts(na.omit(example_quantile), by = "model") + expect_equal(nrow(af), 4) + expect_equal(af$`count`, c(256, 256, 128, 247)) + + # Setting `collapse = c()` means that all quantiles and samples are counted + af <- get_forecast_counts( + na.omit(example_quantile), + by = "model", collapse = c() + ) + expect_equal(nrow(af), 4) + expect_equal(af$`count`, c(5888, 5888, 2944, 5681)) + + # setting by = NULL, the default, results in by equal to forecast unit + af <- get_forecast_counts(na.omit(example_quantile)) + expect_equal(nrow(af), 50688) + + # check whether collapsing also works for model-based forecasts + af <- get_forecast_counts(na.omit(example_integer), by = "model") + expect_equal(nrow(af), 4) + + af <- get_forecast_counts(na.omit(example_integer), by = "model", collapse = c()) + expect_equal(af$count, c(10240, 10240, 5120, 9880)) +}) diff --git a/tests/testthat/test-bias.R b/tests/testthat/test-bias.R deleted file mode 100644 index 88f84ab93..000000000 --- a/tests/testthat/test-bias.R +++ /dev/null @@ -1,242 +0,0 @@ -test_that("bias_sample() throws an error when missing true_values", { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - bias_sample(predictions = predictions), - "true_values argument is missing" - ) -}) - 
-test_that("bias_sample() throws an error when missing 'predictions'", { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - bias_sample(true_values = true_values), - "argument 'predictions' missing" - ) -}) - -test_that("bias_sample() works for integer true_values and predictions", { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(10, rpois(10, lambda = 1:10)) - output <- bias_sample( - true_values = true_values, - predictions = predictions - ) - expect_equal( - length(output), - length(true_values) - ) - expect_equal( - class(output), - "numeric" - ) -}) - -test_that("bias_sample() works for continuous true_values and predictions", { - true_values <- rnorm(10) - predictions <- replicate(10, rnorm(10)) - output <- bias_sample( - true_values = true_values, - predictions = predictions - ) - expect_equal( - length(output), - length(true_values) - ) - expect_equal( - class(output), - "numeric" - ) -}) - -test_that("bias_sample() works as expected", { - true_values <- rpois(30, lambda = 1:30) - predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) - expect_true(all(bias_sample(true_values, predictions) == bias_sample(true_values, predictions))) - - ## continuous forecasts - true_values <- rnorm(30, mean = 1:30) - predictions <- replicate(200, rnorm(30, mean = 1:30)) - - scoringutils2 <- bias_sample(true_values, predictions) - scoringutils <- bias_sample(true_values, predictions) - - expect_equal(scoringutils, scoringutils2) -}) - -test_that("bias_quantile() handles NA values", { - predictions <- c(NA, 1, 2, 3) - quantiles <- c(0.1, 0.5, 0.9) - - expect_error( - bias_quantile(predictions, quantiles, true_value = 2), - "predictions and quantiles must have the same length" - ) -}) - -test_that("bias_quantile() returns NA if no predictions", { - expect_true(is.na(bias_quantile(numeric(0), numeric(0), true_value = 2))) -}) - -test_that("bias_quantile() returns correct bias if value below the median", { - predictions <- c(1, 2, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - suppressMessages( - expect_equal(bias_quantile(predictions, quantiles, true_value = 1), 0.8) - ) -}) - -test_that("bias_quantile() returns correct bias if value above median", { - predictions <- c(1, 2, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - suppressMessages( - expect_equal(bias_quantile(predictions, quantiles, true_value = 5), -0.8) - ) -}) - -test_that("bias_quantile() returns correct bias if value at the median", { - predictions <- c(1, 2, 3, 4) - quantiles <- c(0.1, 0.3, 0.5, 0.7) - - expect_equal(bias_quantile(predictions, quantiles, true_value = 3), 0) -}) - -test_that("bias_quantile() returns 1 if true value below min prediction", { - predictions <- c(2, 3, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - - suppressMessages( - expect_equal(bias_quantile(predictions, quantiles, true_value = 1), 1) - ) -}) - -test_that("bias_quantile() returns -1 if true value above max prediction", { - predictions <- c(1, 2, 3, 4) - quantiles <- c(0.1, 0.3, 0.5, 0.7) - - expect_equal(bias_quantile(predictions, quantiles, true_value = 6), -1) -}) - -test_that("bias_quantile(): quantiles must be between 0 and 1", { - predictions <- 1:4 - - # Failing example - quantiles <- c(-0.1, 0.3, 0.5, 0.8) - expect_error(bias_quantile(predictions, quantiles, true_value = 3), - "quantiles must be between 0 and 1") - - # Passing counter example - quantiles <- c(0.1, 0.3, 0.5, 0.8) - expect_silent(bias_quantile(predictions, quantiles, true_value = 3)) -}) - 
-test_that("bias_quantile(): quantiles must be increasing", { - predictions <- 1:4 - - # Failing example - quantiles <- c(0.8, 0.3, 0.5, 0.9) - expect_error(bias_quantile(predictions, quantiles, true_value = 3), - "quantiles must be increasing") - - # Passing counter example - quantiles <- c(0.3, 0.5, 0.8, 0.9) - expect_silent(bias_quantile(predictions, quantiles, true_value = 3)) -}) - -test_that("bias_quantile(): predictions must be increasing", { - predictions <- c(1, 2, 4, 3) - quantiles <- c(0.1, 0.3, 0.5, 0.9) - - expect_error( - bias_quantile(predictions, quantiles, true_value = 3), - "predictions must be increasing" - ) - expect_silent(bias_quantile(1:4, quantiles, true_value = 3)) -}) - -test_that("bias_quantile(): quantiles must be unique", { - predictions <- 1:4 - - # Failing example - quantiles <- c(0.3, 0.3, 0.5, 0.8) - expect_error(bias_quantile(predictions, quantiles, true_value = 3), - "quantiles must be increasing") - - # Passing example - quantiles <- c(0.3, 0.5, 0.8, 0.9) - expect_silent(bias_quantile(predictions, quantiles, true_value = 3)) -}) - -test_that("bias_sample() approx equals bias_quantile() for many samples", { - set.seed(123) - - # Generate true value - true_value <- 3 - - # Generate many sample predictions - predictions <- sample(rnorm(1000, mean = true_value, sd = 2), 1000) - - # Get sample based bias - bias_sample_result <- bias_sample( - true_value, matrix(predictions, nrow = 1) - ) - - # Convert predictions to quantiles - quantiles <- seq(0, 1, length.out = 100) - quantile_preds <- quantile(predictions, probs = quantiles) - - # Get quantile based bias - bias_quantile_result <- suppressMessages( - bias_quantile(quantile_preds, quantiles, true_value) - ) - - # Difference should be small - expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1) -}) - -test_that("bias_quantile() and bias_range() give the same result", { - predictions <- sort(rnorm(23)) - lower <- rev(predictions[1:12]) - upper <- predictions[12:23] - - range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) - quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) - true_value <- rnorm(1) - - range_bias <- bias_range( - lower = lower, upper = upper, - range = range, true_value = true_value - ) - range_quantile <- bias_quantile( - predictions = predictions, quantiles = quantiles, - true_value = true_value - ) - expect_equal(range_bias, range_quantile) -}) - -test_that("bias_range() works with point forecasts", { - predictions <- 1 - true_value <- 1 - range <- c(0) - - expect_equal(bias_range(predictions, predictions, range, true_value), 0) -}) - -test_that("bias_range(): ranges must be between 0 and 100", { - lower <- 4:1 - upper <- 5:8 - - # Failing example - range <- c(-10, 0, 10, 20) - expect_error( - bias_range(lower, upper, range, true_value = 3), - "range must be between 0 and 100" - ) - - # Passing counter example - range <- c(0, 10, 20, 30) - expect_silent(bias_range(lower, upper, range, true_value = 3)) -}) \ No newline at end of file diff --git a/tests/testthat/test-check_forecasts.R b/tests/testthat/test-check_forecasts.R index 043a85a83..447596768 100644 --- a/tests/testthat/test-check_forecasts.R +++ b/tests/testthat/test-check_forecasts.R @@ -1,64 +1,102 @@ -test_that("check_forecasts() function works", { - check <- suppressMessages(check_forecasts(example_quantile)) - expect_s3_class(check, "scoringutils_check") +test_that("as_forecast() function works", { + check <- suppressMessages(as_forecast(example_quantile)) + expect_s3_class(check, 
"forecast_quantile") }) -test_that("check_forecasts() function has an error for empty data.frame", { - expect_error(suppressMessages(check_forecasts(data.frame()))) +test_that("as_forecast() function has an error for empty data.frame", { + expect_error(suppressMessages(as_forecast(data.frame()))) }) -test_that("check_forecasts() function returns a message with NA in the data", { - expect_message( - { check <- check_forecasts(example_quantile) }, - "\\d+ values for `prediction` are NA" +test_that("check_columns_present() works", { + expect_equal( + check_columns_present(example_quantile, c("observed", "predicted", "nop")), + "Column 'nop' not found in data" ) - expect_match( - unlist(check$messages), - "\\d+ values for `prediction` are NA" + expect_true( + check_columns_present(example_quantile, c("observed", "predicted")) ) }) -test_that("check_forecasts() function returns messages with NA in the data", { - example <- data.table::copy(example_quantile) - example[horizon == 2, true_value := NA] - check <- suppressMessages(check_forecasts(example)) +test_that("check_duplicates() works", { + bad <- rbind( + example_quantile[1000:1010], + example_quantile[1000:1010] + ) - expect_equal(length(check$messages), 2) + expect_equal(scoringutils:::check_duplicates(bad), + "There are instances with more than one forecast for the same target. This can't be right and needs to be resolved. Maybe you need to check the unit of a single forecast and add missing columns? Use the function get_duplicate_forecasts() to identify duplicate rows" + ) }) -test_that("check_forecasts() function throws an error with duplicate forecasts", { +# test_that("as_forecast() function returns a message with NA in the data", { +# expect_message( +# { check <- as_forecast(example_quantile) }, +# "\\d+ values for `predicted` are NA" +# ) +# expect_match( +# unlist(check$messages), +# "\\d+ values for `predicted` are NA" +# ) +# }) + +# test_that("as_forecast() function returns messages with NA in the data", { +# example <- data.table::copy(example_quantile) +# example[horizon == 2, observed := NA] +# check <- suppressMessages(as_forecast(example)) +# +# expect_equal(length(check$messages), 2) +# }) + +test_that("as_forecast() function throws an error with duplicate forecasts", { example <- rbind(example_quantile, example_quantile[1000:1010]) - expect_error(suppressMessages(suppressWarnings(check_forecasts(example)))) + expect_error( + suppressMessages(suppressWarnings(as_forecast(example))), + "Assertion on 'data' failed: There are instances with more than one forecast for the same target. This can't be right and needs to be resolved. Maybe you need to check the unit of a single forecast and add missing columns? 
Use the function get_duplicate_forecasts() to identify duplicate rows.", #nolint + fixed = TRUE + ) }) -test_that("check_forecasts() function throws an error when no model column is +test_that("as_forecast() function creates a message when no model column is present", { - no_model <- data.table::copy(example_quantile)[, model := NULL] - expect_error(suppressMessages(suppressWarnings(check_forecasts(no_model)))) + no_model <- data.table::copy(example_quantile[model == "EuroCOVIDhub-ensemble"])[, model := NULL][] + expect_message( + suppressWarnings(as_forecast(no_model)), + "There is no column called `model` in the data.scoringutils assumes that all forecasts come from the same model") }) -test_that("check_forecasts() function throws an error when no predictions or - true values are present", { - expect_error(suppressMessages(suppressWarnings(check_forecasts( - data.table::copy(example_quantile)[, prediction := NA] - )))) - expect_error(suppressMessages(suppressWarnings(check_forecasts( - data.table::copy(example_quantile)[, true_value := NA] - )))) -}) +test_that("as_forecast() function throws an error when no predictions or observed values are present", { + expect_error(suppressMessages(suppressWarnings(as_forecast( + data.table::copy(example_quantile)[, predicted := NULL] + ))), + "Assertion on 'data' failed: Both columns `observed` and predicted` are needed.") -test_that("check_forecasts() function throws an sample/quantile not present", { - expect_error(suppressMessages(suppressWarnings(check_forecasts( - data.table::copy(example_quantile)[, quantile := NULL] - )))) + expect_error(suppressMessages(suppressWarnings(as_forecast( + data.table::copy(example_quantile)[, observed := NULL] + ))), + "Assertion on 'data' failed: Both columns `observed` and predicted` are needed.") }) +# test_that("as_forecast() function throws an error when no predictions or observed values are present", { +# expect_error(suppressMessages(suppressWarnings(as_forecast( +# data.table::copy(example_quantile)[, predicted := NA] +# )))) +# expect_error(suppressMessages(suppressWarnings(check_forecasts( +# data.table::copy(example_quantile)[, observed := NA] +# )))) +# }) + +# test_that("as_forecast() function throws an sample/quantile not present", { +# expect_error(suppressMessages(suppressWarnings(as_forecast( +# data.table::copy(example_quantile)[, quantile := NULL] +# )))) +# }) + test_that("output of check_forecasts() is accepted as input to score()", { - check <- suppressMessages(check_forecasts(example_binary)) + check <- suppressMessages(as_forecast(example_binary)) expect_no_error( - score_check <- score(check) + score_check <- score(na.omit(check)) ) expect_equal(score_check, suppressMessages(score(example_binary))) }) diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index 4fea72fa9..7d20c03ba 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -1,21 +1,21 @@ test_that("function transform_forecasts works", { - predictions_original <- example_quantile$prediction + predictions_original <- example_quantile$predicted predictions <- transform_forecasts( example_quantile, fun = function(x) pmax(0, x), append = FALSE ) - expect_equal(predictions$prediction, pmax(0, predictions_original)) + expect_equal(predictions$predicted, pmax(0, predictions_original)) one <- transform_forecasts(predictions, offset = 1) - expect_equal(one$prediction, - c(predictions$prediction, log(predictions$prediction + 1))) + 
expect_equal(one$predicted, + c(predictions$predicted, log(predictions$predicted + 1))) two <- transform_forecasts(predictions, fun = sqrt, label = "sqrt") - expect_equal(two$prediction, - c(predictions$prediction, sqrt(predictions$prediction))) + expect_equal(two$predicted, + c(predictions$predicted, sqrt(predictions$predicted))) # expect a warning if existing transformation is overwritten @@ -30,54 +30,63 @@ test_that("function transform_forecasts works", { # multiple transformations without append four <- transform_forecasts(two, fun = log_shift, offset = 1, append = FALSE) compare <- c( - one$prediction[one$scale == "log"], - three$prediction[three$scale == "sqrt"] + one$predicted[one$scale == "log"], + three$predicted[three$scale == "sqrt"] ) - expect_equal(four$prediction, compare) + expect_equal(four$predicted, compare) }) +# ============================================================================ # +# `set_forecast_unit()` +# ============================================================================ # test_that("function set_forecast_unit() works", { - # some columns in the example data have duplicated information. So we can remove # these and see whether the result stays the same. - - scores1 <- suppressMessages(score(example_quantile)) - scores1 <- scores1[order(location, target_end_date, target_type, horizon, model), ] + scores1 <- scores_quantile[order(location, target_end_date, target_type, horizon, model), ] ex2 <- set_forecast_unit( example_quantile, c("location", "target_end_date", "target_type", "horizon", "model") ) - scores2 <- suppressMessages(score(ex2)) + scores2 <- score(na.omit(ex2)) scores2 <- scores2[order(location, target_end_date, target_type, horizon, model), ] expect_equal(scores1$interval_score, scores2$interval_score) }) +test_that("set_forecast_unit() works on input that's not a data.table", { + df <- data.frame( + a = 1:2, + b = 2:3, + c = 3:4 + ) + expect_equal( + colnames(set_forecast_unit(df, c("a", "b"))), + c("a", "b") + ) + # apparently it also works on a matrix... 
good to know :) + expect_equal( + names(set_forecast_unit(as.matrix(df), "a")), + "a" + ) +}) -test_that("function set_forecast_unit() gives warning when column is not there", { +test_that("function set_forecast_unit() gives warning when column is not there", { expect_warning( set_forecast_unit( example_quantile, - c("location", "target_end_date", "target_type", "horizon", "model", "test") + c("location", "target_end_date", "target_type", "horizon", "model", "test1", "test2") ) ) }) - test_that("function get_forecast_unit() and set_forecast_unit() work together", { - fu_set <- c("location", "target_end_date", "target_type", "horizon", "model") - - ex <- set_forecast_unit( - example_binary, - fu_set - ) - + ex <- set_forecast_unit(example_binary, fu_set) fu_get <- get_forecast_unit(ex) expect_equal(fu_set, fu_get) }) diff --git a/tests/testthat/test-default-scoring-rules.R b/tests/testthat/test-default-scoring-rules.R new file mode 100644 index 000000000..a53dcbe00 --- /dev/null +++ b/tests/testthat/test-default-scoring-rules.R @@ -0,0 +1,76 @@ +test_that("`select_rules` works as expected", { + + expect_equal( + scoringutils:::select_rules(rules_point(), select = NULL), + rules_point() + ) + + expect_equal( + scoringutils:::select_rules(rules_point(), select = NULL), + scoringutils:::select_rules(rules_point()) + ) + + expect_equal( + names(scoringutils:::select_rules(rules_point(), select = "ape")), + "ape" + ) + + expect_equal( + length(scoringutils:::select_rules(rules_point(), select = NULL, exclude = "ape")), + length(rules_point()) - 1 + ) + + # if both select and exclude are specified, exclude is ignored + expect_equal( + names(scoringutils:::select_rules(rules_point(), select = "ape", exclude = "ape")), + "ape" + ) + + # expect error if possibilities is not a list + expect_error( + scoringutils:::select_rules(rules_point, select = NULL), + "Assertion on 'rules' failed: Must be of type 'list', not 'closure'." 
+ ) +}) + + +test_that("default rules work as expected", { + + expect_true( + all(c( + is.list(rules_point()), + is.list(rules_binary()), + is.list(rules_quantile()), + is.list(rules_sample())) + ) + ) + + expect_equal( + names(rules_point(select = "ape")), + "ape" + ) + + expect_equal( + length(rules_binary(select = NULL, exclude = "brier_score")), + length(rules_binary()) - 1 + ) + + # if both select and exclude are specified, exclude is ignored + expect_equal( + names(scoringutils:::select_rules(rules_quantile(), select = "wis", exclude = "wis")), + "wis" + ) + + # expect error if select is not included in the default possibilities + expect_error( + rules_sample(select = "not-included"), + "Must be a subset of" + ) + + # expect error if exclude is not included in the default possibilities + expect_error( + rules_quantile(exclude = "not-included"), + "Must be a subset of" + ) +}) + diff --git a/tests/testthat/test-find_duplicates.R b/tests/testthat/test-find_duplicates.R deleted file mode 100644 index c9354c433..000000000 --- a/tests/testthat/test-find_duplicates.R +++ /dev/null @@ -1,8 +0,0 @@ -test_that("find_duplicates() works as expected", { - expect_equal(nrow(find_duplicates(example_quantile)), 0) - expect_equal( - nrow( - find_duplicates(rbind(example_quantile, example_quantile[1000:1010]))), - 22 - ) -}) \ No newline at end of file diff --git a/tests/testthat/test-get_-functions.R b/tests/testthat/test-get_-functions.R new file mode 100644 index 000000000..573abb75b --- /dev/null +++ b/tests/testthat/test-get_-functions.R @@ -0,0 +1,195 @@ +# ============================================================================== +# `get_forecast_unit()` +# ============================================================================== +test_that("get_forecast_unit() works as expected", { + expect_equal( + get_forecast_unit(example_quantile), + c("location", "target_end_date", "target_type", "location_name", + "forecast_date", "model", "horizon") + ) + + expect_equal( + get_forecast_unit(scores_quantile), + c("location", "target_end_date", "target_type", "location_name", + "forecast_date", "model", "horizon") + ) +}) + + +# ============================================================================== +# Test removing `NA` values from the data +# ============================================================================== +test_that("removing NA rows from data works as expected", { + expect_equal(nrow(na.omit(example_quantile)), 20401) + + ex <- data.frame(observed = c(NA, 1:3), predicted = 1:4) + expect_equal(nrow(na.omit(ex)), 3) + + ex$predicted <- c(1:3, NA) + expect_equal(nrow(na.omit(ex)), 2) + + # test that attributes and classes are retained + ex <- as_forecast(na.omit(example_integer)) + expect_equal( + class(na.omit(ex)), + c("forecast_sample", "data.table", "data.frame") + ) + + attributes <- get_scoringutils_attributes(ex) + expect_equal( + get_scoringutils_attributes(na.omit(ex)), + attributes + ) +}) + + +# ============================================================================== +# `get_type()` +# ============================================================================== +test_that("get_type() works as expected with vectors", { + expect_equal(get_type(1:3), "integer") + expect_equal(get_type(factor(1:2)), "classification") + expect_equal(get_type(c(1.0, 2)), "integer") + expect_equal(get_type(c(1.0, 2.3)), "continuous") + expect_error( + get_type(c("a", "b")), + "Assertion on 'as.vector(x)' failed: Must be of type 'numeric', not 'character'.", + fixed = TRUE + ) 
+}) + +test_that("get_type() works as expected with matrices", { + expect_equal(get_type(matrix(1:4, nrow = 2)), "integer") + expect_equal(get_type(matrix(c(1.0, 2:4))), "integer") + expect_equal(get_type(matrix(c(1.0, 2.3, 3, 4))), "continuous") + + # matrix of factors doesn't work + expect_error( + get_type(matrix(factor(1:4), nrow = 2)), + "Assertion on 'as.vector(x)' failed: Must be of type 'numeric', not 'character'.", + fixed = TRUE + ) + + expect_error( + get_type(matrix(c("a", "b", "c", "d"))), + "Assertion on 'as.vector(x)' failed: Must be of type 'numeric', not 'character'.", + fixed = TRUE + ) +}) + + +test_that("new `get_type()` is equal to old `prediction_type()", { + get_prediction_type <- function(data) { + if (is.data.frame(data)) { + data <- data$predicted + } + if ( + isTRUE(all.equal(as.vector(data), as.integer(data))) && + !all(is.na(as.integer(data))) + ) { + return("integer") + } else if (suppressWarnings(!all(is.na(as.numeric(data))))) { + return("continuous") + } else { + stop("Input is not numeric and cannot be coerced to numeric") + } + } + + check_data <- list( + 1:2, + # factor(1:2) # old function would classify as "continuous" + c(1.0, 2), + c(1.0, 2.3), + matrix(1:4, nrow = 2), + matrix(c(1.0, 2:4)), + matrix(c(1.0, 2.3, 3, 4)) + ) + + for (i in seq_along(check_data)) { + expect_equal( + get_prediction_type(check_data[[i]]), + get_type(check_data[[i]]) + ) + } +}) + +test_that("get_type() handles `NA` values", { + expect_equal(get_type(c(1, NA, 3)), "integer") + expect_equal(get_type(c(1, NA, 3.2)), "continuous") + expect_error(get_type(NA), "Can't get type: all values of are NA") +}) + + +# `get_duplicate_forecasts()` ================================================== +test_that("get_duplicate_forecasts() works as expected for quantile", { + expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))), + 22 + ) +}) + +test_that("get_duplicate_forecasts() works as expected for sample", { + expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))), + 22 + ) +}) + + +test_that("get_duplicate_forecasts() works as expected for binary", { + expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))), + 22 + ) +}) + +test_that("get_duplicate_forecasts() works as expected for point", { + expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))), + 22 + ) +}) + + +# ============================================================================== +# `get_forecast_type` +# ============================================================================== +test_that("get_forecast_type() works as expected", { + expect_equal(get_forecast_type(as.data.frame(example_quantile)), "quantile") + expect_equal(get_forecast_type(example_continuous), "sample") + expect_equal(get_forecast_type(example_integer), "sample") + expect_equal(get_forecast_type(example_binary), "binary") + expect_equal(get_forecast_type(example_point), "point") + + expect_error( + get_forecast_type(data.frame(x = 1:10)), + "Assertion on 'data' failed: Columns 'observed', 'predicted' not found in data.", + fixed = TRUE + ) + + df <- data.frame(observed = 1:10, predicted = factor(1:10)) + 
expect_error( + get_forecast_type(df), + "Checking `data`: input doesn't satisfy criteria for any forecast type. Are you missing a column `quantile` or `sample_id`? Please check the vignette for additional info.", + fixed = TRUE + ) + + data <- as_forecast(na.omit(example_integer)) + attr(data, "forecast_type") <- "binary" + expect_warning( + get_forecast_type(data), + "Object has an attribute `forecast_type`, but it looks different from what's expected based on the data. +Existing: binary +Expected: sample +Running `as_forecast()` again might solve the problem", + fixed = TRUE + ) +}) diff --git a/tests/testthat/test-input-check-helpers.R b/tests/testthat/test-input-check-helpers.R index d8ac875fb..d3dc8cd2d 100644 --- a/tests/testthat/test-input-check-helpers.R +++ b/tests/testthat/test-input-check-helpers.R @@ -1,6 +1,6 @@ test_that("Check equal length works if all arguments have length 1", { out <- interval_score( - true_values = 5, + observed = 5, lower = 4, upper = 6, interval_range = 95, @@ -11,9 +11,9 @@ test_that("Check equal length works if all arguments have length 1", { }) -test_that("Check_not_null works", { +test_that("assert_not_null works", { test_function <- function(argument = NULL) { - scoringutils:::check_not_null("argument" = argument) + scoringutils:::assert_not_null("argument" = argument) return(paste("Input:", argument)) } out <- test_function("works") diff --git a/tests/testthat/test-interval_score.R b/tests/testthat/test-interval_score.R deleted file mode 100644 index 481754f31..000000000 --- a/tests/testthat/test-interval_score.R +++ /dev/null @@ -1,428 +0,0 @@ -test_that("wis works, median only", { - y <- c(1, -15, 22) - lower <- upper <- c(1, 2, 3) - quantile_probs <- 0.5 - - actual <- interval_score(y, - lower = lower, upper = upper, - weigh = TRUE, - interval_range = 0 - ) - expected <- abs(y - lower) - - expect_identical(actual, expected) -}) - -test_that("WIS works within score for median forecast", { - test_data <- data.frame( - true_value = c(1, -15, 22), - prediction = 1:3, - quantile = rep(c(0.5), each = 3), - model = "model1", - date = 1:3 - ) - eval <- scoringutils::score(test_data, - count_median_twice = TRUE - ) - expect_equal(eval$ae_median, eval$interval_score) -}) - -test_that("wis works, 1 interval only", { - y <- c(1, -15, 22) - lower <- c(0, 1, 0) - upper <- c(2, 2, 3) - quantile_probs <- c(0.25, 0.75) - - alpha <- 0.5 - - actual <- scoringutils::interval_score(y, - lower = lower, upper = upper, - weigh = TRUE, - interval_range = 50 - ) - expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - - expect_identical(actual, expected) -}) - -test_that("WIS works within score for one interval", { - test_data <- data.frame( - true_value = rep(c(1, -15, 22), times = 2), - quantile = rep(c(0.25, 0.75), each = 3), - prediction = c(c(0, 1, 0), c(2, 2, 3)), - model = c("model1"), - date = rep(1:3, times = 2) - ) - - eval <- suppressMessages(scoringutils::score(test_data, - count_median_twice = TRUE - )) - - eval <- summarise_scores(eval, by = c("model", "date")) - - lower <- c(0, 1, 0) - upper <- c(2, 2, 3) - alpha <- 0.5 - - expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - - expect_equal(expected, eval$interval_score) -}) - -test_that("wis works, 1 interval and median", { - test_data <- data.frame( - true_value = rep(c(1, -15, 22), times = 3), - quantile = rep(c(0.25, 0.5, 0.75), each = 3), - prediction = c(c(0, 1, 0), c(1, 2, 3), c(2, 2, 3)), - model = c("model1"), - date = rep(1:3, times = 3) - ) - - eval <- 
scoringutils::score(test_data, - count_median_twice = TRUE - ) - - eval <- summarise_scores(eval, by = c("model", "date")) - - y <- c(1, -15, 22) - quantiles <- rbind(c(0, 1, 2), c(1, 2, 2), c(0, 3, 3)) - quantile_probs <- c(0.25, 0.5, 0.75) - - alpha <- 0.5 - - expected <- 0.5 * ( - abs(y - quantiles[, 2]) + - (quantiles[, 3] - quantiles[, 1]) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - ) - - expect_identical(eval$interval_score, expected) -}) - -test_that("wis works, 2 intervals and median", { - test_data <- data.frame( - true_value = rep(c(1, -15, 22), times = 5), - quantile = rep(c(0.1, 0.25, 0.5, 0.75, 0.9), each = 3), - prediction = c( - c(-1, -2, -2), c(0, 1, 0), c(1, 2, 3), - c(2, 2, 3), c(3, 4, 4) - ), - model = c("model1"), - date = rep(1:3, times = 5) - ) - - eval <- scoringutils::score(test_data, - count_median_twice = TRUE - ) - - eval <- summarise_scores(eval, by = c("model", "date")) - - y <- c(1, -15, 22) - quantiles <- rbind(c(-1, 0, 1, 2, 3), c(-2, 1, 2, 2, 4), c(-2, 0, 3, 3, 4)) - quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - - alpha1 <- 0.2 - alpha2 <- 0.5 - - expected <- (1 / 3) * ( - abs(y - quantiles[, 3]) + - (quantiles[, 5] - quantiles[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + - (quantiles[, 4] - quantiles[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) - ) - - expect_equal( - as.numeric(eval$interval_score), - as.numeric(expected) - ) -}) - -# additional tests from the covidhubutils repo -test_that("wis is correct, median only - test corresponds to covidHubUtils", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] - forecast_quantile_probs <- forecast_quantile_probs[3] - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - - forecast_target_end_dates <- - rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) - forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) - forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) - forecast_target_variables <- - rep(target_variables, times = ncol(forecast_quantiles_matrix)) - forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) - forecast_quantiles <- forecast_quantiles_matrix - dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - - test_truth <- data.frame( - model = rep("truth_source", length(y)), - target_variable = target_variables, - target_end_date = target_end_dates, - location = locations, - value = y, - stringsAsFactors = FALSE - ) - - n_forecasts <- length(forecast_quantiles) - test_forecasts <- data.frame( - model = rep("m1", n_forecasts), - forecast_date = rep(as.Date("2020-01-01"), n_forecasts), - location = forecast_locations, - horizon = forecast_horizons, - temporal_resolution = rep("wk", n_forecasts), - target_variable = forecast_target_variables, - target_end_date = forecast_target_end_dates, - type = rep("quantile", n_forecasts), - quantile = forecast_quantile_probs, - value = forecast_quantiles, - stringsAsFactors = FALSE - ) - - # make a version that conforms to scoringutils format - truth_formatted <- data.table::as.data.table(test_truth) - truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "true_value") - - forecasts_formated <- 
data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formated, old = "value", new = "prediction") - - data_formatted <- merge(forecasts_formated, truth_formatted) - - eval <- scoringutils::score(data_formatted, - count_median_twice = FALSE - ) - - expected <- abs(y - forecast_quantiles_matrix[, 1]) - - expect_equal(eval$interval_score, expected) -}) - - -test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE] - forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)] - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - - forecast_target_end_dates <- - rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) - forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) - forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) - forecast_target_variables <- - rep(target_variables, times = ncol(forecast_quantiles_matrix)) - forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) - forecast_quantiles <- forecast_quantiles_matrix - dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - - test_truth <- data.frame( - model = rep("truth_source", length(y)), - target_variable = target_variables, - target_end_date = target_end_dates, - location = locations, - value = y, - stringsAsFactors = FALSE - ) - - n_forecasts <- length(forecast_quantiles) - test_forecasts <- data.frame( - model = rep("m1", n_forecasts), - forecast_date = rep(as.Date("2020-01-01"), n_forecasts), - location = forecast_locations, - horizon = forecast_horizons, - temporal_resolution = rep("wk", n_forecasts), - target_variable = forecast_target_variables, - target_end_date = forecast_target_end_dates, - type = rep("quantile", n_forecasts), - quantile = forecast_quantile_probs, - value = forecast_quantiles, - stringsAsFactors = FALSE - ) - - # make a version that conforms to scoringutils format - truth_formatted <- data.table::as.data.table(test_truth) - truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "true_value") - - forecasts_formated <- data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formated, old = "value", new = "prediction") - - data_formatted <- merge(forecasts_formated, truth_formatted) - - eval <- suppressMessages(scoringutils::score(data_formatted, - count_median_twice = FALSE - )) - - eval <- summarise_scores(eval, - by = c( - "model", "location", "target_variable", - "target_end_date", "forecast_date", "horizon" - ) - ) - - alpha1 <- 0.2 - expected <- (forecast_quantiles_matrix[, 2] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + - c(0, (-2) - (-15), 22 - 4) - - expect_equal(eval$interval_score, expected) -}) - - -test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc 
death", length(y)) - - forecast_target_end_dates <- - rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) - forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) - forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) - forecast_target_variables <- - rep(target_variables, times = ncol(forecast_quantiles_matrix)) - forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) - forecast_quantiles <- forecast_quantiles_matrix - dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - - test_truth <- data.frame( - model = rep("truth_source", length(y)), - target_variable = target_variables, - target_end_date = target_end_dates, - location = locations, - value = y, - stringsAsFactors = FALSE - ) - - n_forecasts <- length(forecast_quantiles) - test_forecasts <- data.frame( - model = rep("m1", n_forecasts), - forecast_date = rep(as.Date("2020-01-01"), n_forecasts), - location = forecast_locations, - horizon = forecast_horizons, - temporal_resolution = rep("wk", n_forecasts), - target_variable = forecast_target_variables, - target_end_date = forecast_target_end_dates, - type = rep("quantile", n_forecasts), - quantile = forecast_quantile_probs, - value = forecast_quantiles, - stringsAsFactors = FALSE - ) - - # make a version that conforms to scoringutils format - truth_formatted <- data.table::as.data.table(test_truth) - truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "true_value") - - forecasts_formated <- data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formated, old = "value", new = "prediction") - - data_formatted <- merge(forecasts_formated, truth_formatted) - - eval <- scoringutils::score(data_formatted, - count_median_twice = FALSE - ) - - eval <- summarise_scores(eval, - by = c( - "model", "location", "target_variable", - "target_end_date", "forecast_date", "horizon" - ) - ) - - alpha1 <- 0.2 - alpha2 <- 0.5 - expected <- (1 / 2.5) * ( - 0.5 * abs(y - forecast_quantiles_matrix[, 3]) + - (forecast_quantiles_matrix[, 5] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + - (forecast_quantiles_matrix[, 4] - forecast_quantiles_matrix[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) - ) - - expect_equal(eval$interval_score, expected) -}) - -test_that("Quantlie score and interval score yield the same result, weigh = FALSE", { - true_values <- rnorm(10, mean = 1:10) - alphas <- c(0.1, 0.5, 0.9) - - for (alpha in alphas) { - lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) - upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) - - w <- FALSE - is <- interval_score( - true_values = true_values, - lower = lower, - upper = upper, - interval_range = (1 - alpha) * 100, - weigh = w - ) - - qs_lower <- quantile_score(true_values, - predictions = lower, - quantiles = alpha / 2, - weigh = w - ) - qs_upper <- quantile_score(true_values, - predictions = upper, - quantiles = 1 - alpha / 2, - weigh = w - ) - expect_equal((qs_lower + qs_upper) / 2, is) - } -}) - -test_that("Quantlie score and interval score yield the same result, weigh = TRUE", { - true_values <- rnorm(10, mean = 1:10) - alphas <- c(0.1, 0.5, 0.9) - - for (alpha in alphas) { - lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) - upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) - - w <- TRUE - is <- interval_score( - true_values = true_values, - lower = lower, - upper = upper, - interval_range = (1 - alpha) * 100, - weigh = w - ) - - qs_lower <- 
quantile_score(true_values, - predictions = lower, - quantiles = alpha / 2, - weigh = w - ) - qs_upper <- quantile_score(true_values, - predictions = upper, - quantiles = 1 - alpha / 2, - weigh = w - ) - expect_equal((qs_lower + qs_upper) / 2, is) - } -}) diff --git a/tests/testthat/test-lower-level-check-functions.R b/tests/testthat/test-lower-level-check-functions.R deleted file mode 100644 index 0f2966cf7..000000000 --- a/tests/testthat/test-lower-level-check-functions.R +++ /dev/null @@ -1,117 +0,0 @@ -test_that("Lower-level input check functions work", { - true_values <- rpois(30, lambda = 1:30) - predictions <- replicate(20, rpois(n = 30, lambda = 1:30)) - expect_equal(length(crps_sample(true_values, predictions)), 30) - - # should error when wrong prediction type is given - predictions2 <- rpois(30, lambda = 1) - expect_error(crps_sample(true_values, predictions2), - "'predictions' should be a matrix. Instead `integer` was found", - fixed = TRUE - ) - - # predictions have wrong number of rows - predictions3 <- replicate(20, rpois(n = 31, lambda = 1)) - expect_error(crps_sample(true_values, predictions3), - "Mismatch: 'true_values' has length `30`, but 'predictions' has `31` rows.", - fixed = TRUE - ) - - # error with missing argument - expect_error(crps_sample(predictions = predictions), - "true_values argument is missing", - fixed = TRUE - ) - - # checks work for binary forecasts - true_values <- sample(c(0, 1), size = 10, replace = TRUE) - predictions <- runif(n = 10) - expect_equal(length(brier_score(true_values, predictions)), 10) - - # true values are not either 0 or 1 - true_values2 <- true_values + 2 - expect_error(brier_score(true_values2, predictions), - "For a binary forecast, all true_values should be either 0 or 1.", - fixed = TRUE - ) - - # predictions are not between 0 and 1 - predictions2 <- predictions + 2 - expect_error(brier_score(true_values, predictions2), - "For a binary forecast, all predictions should be probabilities between 0 or 1.", - fixed = TRUE - ) -}) - - -test_that("function throws an error when missing true_values or predictions", { - true_values <- sample(c(0, 1), size = 10, replace = TRUE) - predictions <- replicate( - 20, - sample(c(0, 1), size = 10, replace = TRUE) - ) - - expect_error( - brier_score(predictions = predictions), - "true_values argument is missing" - ) - - expect_error( - brier_score(true_values = true_values), - "argument 'predictions' missing" - ) -}) - - - -test_that("function throws an error for wrong format of true_value", { - true_values <- rpois(10, lambda = 1:10) - predictions <- runif(10, min = 0, max = 1) - - expect_error( - brier_score( - true_values = true_values, - predictions = predictions - ), - "For a binary forecast, all true_values should be either 0 or 1." - ) - - true_values <- rnorm(10) - expect_error( - brier_score( - true_values = true_values, - predictions = predictions - ), - "For a binary forecast, all true_values should be either 0 or 1." - ) -}) - -test_that("function throws an error for wrong format of predictions", { - true_values <- sample(c(0, 1), size = 10, replace = TRUE) - predictions <- runif(10, min = 0, max = 3) - expect_error( - brier_score( - true_values = true_values, - predictions = predictions - ), - "For a binary forecast, all predictions should be probabilities between 0 or 1." 
- ) - - predictions <- runif(10, min = 0, max = 1) - expect_error( - brier_score( - true_values = true_values, - predictions = list(predictions) - ), - "Mismatch: 'true_values' has length `10`, but 'predictions' has length `1`" - ) - - predictions <- runif(15, min = 0, max = 1) - expect_error( - brier_score( - true_values = true_values, - predictions = predictions - ), - "Mismatch: 'true_values' has length `10`, but 'predictions' has length `15`" - ) -}) diff --git a/tests/testthat/test-metrics-binary.R b/tests/testthat/test-metrics-binary.R new file mode 100644 index 000000000..fae7476c5 --- /dev/null +++ b/tests/testthat/test-metrics-binary.R @@ -0,0 +1,187 @@ +observed <- factor(rbinom(10, size = 1, prob = 0.5)) +predicted <- c(0.425, 0.55, 0.541, 0.52, 0.13, 0.469, 0.86, 0.22, 0.74, 0.9) +df <- data.table( + observed = observed, + predicted = predicted, + model = "m1", + id = 1:10 +) + +observed_point <- rnorm(10) +predicted_point <- rnorm(10) + +# ============================================================================== +# Test Input Checks - this also checks point inputs where functions are similar +# ============================================================================== +test_that("correct input works", { + expect_no_condition(assert_input_binary(observed, predicted)) + expect_no_condition(assert_input_point(observed_point, predicted_point)) + + # observed is a single number and does not have the same length as predicted + expect_no_condition( + assert_input_binary(factor(1, levels = c(0, 1)), predicted) + ) + expect_no_condition( + assert_input_point(1, predicted_point) + ) + + # predicted is a single number and does not have the same length as observed + expect_no_condition(assert_input_binary(observed, predicted = 0.2)) + expect_no_condition(assert_input_point(observed_point, predicted = 0.2)) + + # predicted is a matrix with nrow equal to observed + expect_no_condition(assert_input_binary(observed, matrix(predicted))) + expect_no_condition(assert_input_point(observed_point, matrix(predicted_point))) +}) + +# test input handling +test_that("function throws an error for wrong input formats", { + # observed value not as expected + expect_error( + assert_input_binary(observed = rnorm(10), predicted = predicted), + "Assertion on 'observed' failed: Must be of type 'factor', not 'double'." + ) + expect_error( + assert_input_binary(1:10, predicted), + "Assertion on 'observed' failed: Must be of type 'factor', not 'integer'." + ) + expect_error( + assert_input_binary(observed = observed, predicted = as.list(predicted)), + "Assertion on 'predicted' failed: Must be of type 'numeric', not 'list'." + ) + expect_error( + assert_input_point(observed = factor(rnorm(10)), predicted = predicted), + "Assertion on 'observed' failed: Must be of type 'numeric', not 'factor'." + ) + expect_error( + assert_input_point(observed = observed_point, list(predicted_point)), + "Assertion on 'predicted' failed: Must be of type 'numeric', not 'list'." + ) + + # observed value has not 2 levels + expect_error( + assert_input_binary(factor(1:10), predicted), + "Assertion on 'observed' failed: Must have exactly 2 levels." + ) + + # wrong length + expect_error( + assert_input_binary(observed = observed, predicted = runif(15, min = 0, max = 1)), + "`observed` and `predicted` must either be of length 1 or of equal length. 
Found 10 and 15", + fixed = TRUE + ) + expect_error( + assert_input_point(observed_point, runif(15, min = 0, max = 1)), + "Assertion on 'observed' failed: `observed` and `predicted` must either be of length 1 or of equal length. Found 10 and 15.", + fixed = TRUE + ) + + # predicted > 1 + expect_error( + assert_input_binary(observed, predicted + 1), + "Assertion on 'predicted' failed: Element 1 is not <= 1." + ) + + # predicted < 0 + expect_error( + assert_input_binary(observed, predicted - 1), + "Assertion on 'predicted' failed: Element 1 is not >= 0." + ) + + # predicted is a matrix with one row + expect_error( + assert_input_binary(observed, predicted = matrix(0.2)), + "Assertion failed. One of the following must apply:\n * check_vector(predicted): Must be of type 'vector', not 'matrix'\n * check_matrix(predicted): Must have exactly 10 rows, but has 1 rows", + fixed = TRUE) + expect_error( + assert_input_point(observed_point, predicted = matrix(0.2)), + "Assertion failed. One of the following must apply:\n * check_vector(predicted): Must be of type 'vector', not 'matrix'\n * check_matrix(predicted): Must have exactly 10 rows, but has 1 rows", + fixed = TRUE) + + # predicted is a matrix with 2 rows + expect_error( + assert_input_binary(observed, matrix(rep(predicted, 2), ncol = 2)), + "Assertion failed. One of the following must apply:\n * check_vector(predicted): Must be of type 'vector', not 'matrix'\n * check_matrix(predicted): Must have exactly 1 cols, but has 2 cols", + fixed = TRUE + ) +}) + + +# ============================================================================== +# Test Binary Metrics +# ============================================================================== + +test_that("function throws an error when missing observed or predicted", { + expect_error( + brier_score(predicted = predicted), + 'argument "observed" is missing, with no default' + ) + + expect_error( + brier_score(observed = observed), + 'argument "predicted" is missing, with no default' + ) +}) + +test_that("Brier score works with different inputs", { + # observed is a single number and does not have the same length as predicted + expect_equal( + brier_score(factor(1, levels = c(0, 1)), predicted), + (1 - predicted)^2 + ) + + # predicted is a single number and does not have the same length as observed + expect_equal( + brier_score(observed, predicted = 0.2), + ifelse(observed == 1, (1 - 0.2)^2, (0.2)^2) + ) + + # predicted is a matrix with 1 row + expect_error( + brier_score(observed, predicted = matrix(0.2)), + "Assertion failed. One of the following must apply:\n * check_vector(predicted): Must be of type 'vector', not 'matrix'\n * check_matrix(predicted): Must have exactly 10 rows, but has 1 rows", + fixed = TRUE + ) + + # predicted is an array + expect_error( + brier_score(observed, predicted = array(0.2)), + "Assertion failed. 
One of the following must apply:\n * check_vector(predicted): Must be of type 'vector', not 'array'\n * check_matrix(predicted): Must be of type 'matrix', not 'array'", + fixed = TRUE + ) +}) + + +test_that("Binary metrics work within and outside of `score()`", { + result <- score(df) + expect_equal( + brier_score(observed, predicted), + result$brier_score + ) + + expect_equal( + logs_binary(observed, predicted), + result$log_score + ) +}) + +test_that("`logs_binary()` works as expected", { + # check against the function Metrics::ll + obs2 <- as.numeric(as.character(observed)) + expect_equal( + logs_binary(observed, predicted), + Metrics::ll(obs2, predicted) + ) + + # check this works for a single observed value + expect_equal( + logs_binary(observed[1], predicted), + Metrics::ll(obs2[1], predicted) + ) + + # check this works for a single predicted value + expect_equal( + logs_binary(observed, predicted[1]), + Metrics::ll(obs2, predicted[1]) + ) +}) diff --git a/tests/testthat/test-absolute_error.R b/tests/testthat/test-metrics-point.R similarity index 77% rename from tests/testthat/test-absolute_error.R rename to tests/testthat/test-metrics-point.R index c9c76dfec..54aeb623c 100644 --- a/tests/testthat/test-absolute_error.R +++ b/tests/testthat/test-metrics-point.R @@ -1,25 +1,20 @@ -test_that("absolute error (sample based) works", { - true_values <- rnorm(30, mean = 1:30) - predicted_values <- rnorm(30, mean = 1:30) - - scoringutils <- scoringutils::ae_median_sample(true_values, predicted_values) - - ae <- abs(true_values - predicted_values) - expect_equal(ae, scoringutils) -}) - - -# covidHubUtils-tests +# covidHubUtils-tests on absolute error ======================================== +# test are adapted from the package +# covidHubUtils, https://github.com/reichlab/covidHubUtils/ +y <- c(1, -15, 22) +forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + +target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) +horizons <- c("1", "2", "1") +locations <- c("01", "01", "02") +target_variables <- rep("inc death", length(y)) test_that("abs error is correct within score, point forecast only", { - # test is adapted from the package covidHubUtils, https://github.com/reichlab/covidHubUtils/ - y <- c(1, -15, 22) - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - forecast_target_end_dates <- rep(target_end_dates, times = 1) forecast_horizons <- rep(horizons, times = 1) @@ -57,41 +52,25 @@ test_that("abs error is correct within score, point forecast only", { # bring in scoringutils format truth_scoringutils <- data.table::as.data.table(test_truth) fc_scoringutils <- data.table::as.data.table(test_forecasts) - data.table::setnames(truth_scoringutils, old = "value", new = "true_value") - data.table::setnames(fc_scoringutils, old = "value", new = "prediction") + data.table::setnames(truth_scoringutils, old = "value", new = "observed") + data.table::setnames(fc_scoringutils, old = "value", new = "predicted") truth_scoringutils[, model := NULL] data_scoringutils <- merge_pred_and_obs( forecasts = fc_scoringutils, observations = truth_scoringutils - ) + )[, quantile := NULL] eval <- scoringutils::score(data_scoringutils) - # actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth) - expected <- abs(y - point_forecast) - - # expect_equal(actual$abs_error, 
expected) - expect_equal(eval$ae_point, expected) + expect_equal(eval$ae, expected) }) test_that("abs error is correct, point and median forecasts different", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[3] - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - forecast_target_end_dates <- rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix)) forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix)) @@ -131,38 +110,27 @@ test_that("abs error is correct, point and median forecasts different", { # bring in scoringutils format truth_scoringutils <- data.table::as.data.table(test_truth) fc_scoringutils <- data.table::as.data.table(test_forecasts) - data.table::setnames(truth_scoringutils, old = "value", new = "true_value") - data.table::setnames(fc_scoringutils, old = "value", new = "prediction") + data.table::setnames(truth_scoringutils, old = "value", new = "observed") + data.table::setnames(fc_scoringutils, old = "value", new = "predicted") truth_scoringutils[, model := NULL] data_scoringutils <- merge_pred_and_obs( forecasts = fc_scoringutils, observations = truth_scoringutils - ) + )[, quantile := NULL] + eval <- scoringutils::score(data_scoringutils) expected <- abs(y - point_forecast) # expect_equal(actual$abs_error, expected) - expect_equal(eval[!is.na(ae_point)]$ae_point, expected) + expect_equal(eval[type == "point"]$ae_point, expected) }) test_that("abs error is correct, point and median forecasts same", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[3] - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - forecast_target_end_dates <- rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix)) forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix)) @@ -199,12 +167,11 @@ test_that("abs error is correct, point and median forecasts same", { stringsAsFactors = FALSE ) - # bring in scoringutils format truth_scoringutils <- data.table::as.data.table(test_truth) fc_scoringutils <- data.table::as.data.table(test_forecasts) - data.table::setnames(truth_scoringutils, old = "value", new = "true_value") - data.table::setnames(fc_scoringutils, old = "value", new = "prediction") + data.table::setnames(truth_scoringutils, old = "value", new = "observed") + data.table::setnames(fc_scoringutils, old = "value", new = "predicted") truth_scoringutils[, model := NULL] data_scoringutils <- merge_pred_and_obs( @@ -212,20 +179,17 @@ test_that("abs error is correct, point and median forecasts same", { observations = truth_scoringutils ) - eval <- score(data = data_scoringutils) + data_forecast_point <- data_scoringutils[type == "point"][, quantile := NULL] + + eval <- score(data = data_forecast_point) eval <- summarise_scores(eval, by = c( "location", "target_end_date", - 
"target_variable", "horizon" + "horizon" ), na.rm = TRUE ) - - # actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth) - expected <- abs(y - point_forecast) - - # expect_equal(actual$abs_error, expected) expect_equal(eval$ae_point, expected) }) diff --git a/tests/testthat/test-metrics-quantile.R b/tests/testthat/test-metrics-quantile.R new file mode 100644 index 000000000..8da2c14f9 --- /dev/null +++ b/tests/testthat/test-metrics-quantile.R @@ -0,0 +1,824 @@ +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) + +# covidHubUtils test: +y <- c(1, -15, 22) +forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + + +# ============================================================================ # +# Input handling =============================================================== +# ============================================================================ # +test_that("Input checking for quantile forecasts works", { + # everything correct + expect_no_condition( + scoringutils:::assert_input_quantile(observed, predicted, quantile) + ) + + # quantile > 1 + expect_error( + scoringutils:::assert_input_quantile(observed, predicted, quantile + 1), + "Assertion on 'quantile' failed: Element 1 is not <= 1." + ) + + # quantile < 0 + expect_error( + scoringutils:::assert_input_quantile(observed, predicted, quantile - 1), + "Assertion on 'quantile' failed: Element 1 is not >= 0." + ) + + # 10 observations, but only 3 forecasts + expect_error( + scoringutils:::assert_input_quantile(1:10, predicted, quantile), + "Assertion on 'predicted' failed: Must have exactly 10 rows, but has 3 rows." + ) + + # observed value is a factor + expect_error( + scoringutils:::assert_input_quantile(factor(1:10), predicted, quantile), + "Assertion on 'observed' failed: Must be of type 'numeric', not 'factor'." + ) + + # observed is a single number and does not have the same length as predicted + # There seems to be an issue with the error message: there is one \n to many + # such that the test fails when executed alone, but works when executed + # together with others. + expect_error( + scoringutils:::assert_input_quantile(1, predicted, quantile), + "Assertion failed. One of the following must apply:\n * check_numeric_vector(predicted): Must be of type 'atomic vector',\n * not 'matrix'\n * check_matrix(predicted): Must have exactly 1 rows, but has 3 rows", + fixed = TRUE + ) + + # predicted is a vector + expect_error( + scoringutils:::assert_input_quantile(observed, as.vector(predicted), quantile), + "Assertion on 'predicted' failed: Must be of type 'matrix', not 'double'." 
+ ) +}) + + +# ============================================================================ # +# wis ========================================================================== +# ============================================================================ #a +test_that("wis works standalone, median only", { + y <- c(1, -15, 22) + lower <- upper <- predicted_quantile <- c(1, 2, 3) + quantile_probs <- 0.5 + + actual <- interval_score(y, + lower = lower, upper = upper, + weigh = TRUE, + interval_range = 0 + ) + + actual_wis <- wis( + observed = y, + predicted = matrix(predicted_quantile), + quantile = quantile_probs, + ) + + expected <- abs(y - lower) + + expect_identical(actual, expected) +}) + +test_that("`wis()` works within score for median forecast", { + test_data <- data.frame( + observed = c(1, -15, 22), + predicted = 1:3, + quantile = rep(c(0.5), each = 3), + model = "model1", + date = 1:3 + ) + eval <- score( + test_data, + count_median_twice = TRUE, metrics = metrics_no_cov + ) + expect_equal(eval$ae_median, eval$wis) +}) + +test_that("`wis()` equals `interval_score()`, 1 interval only", { + y <- c(1, -15, 22) + lower <- c(0, 1, 0) + upper <- c(2, 2, 3) + quantile_probs <- c(0.25, 0.75) + predicted <- matrix(c(lower, upper), ncol = 2) + + alpha <- 0.5 + expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) + + actual <- interval_score( + y, + lower = lower, upper = upper, + weigh = TRUE, + interval_range = 50 + ) + + actual_wis <- wis( + observed = y, + predicted = predicted, + quantile = quantile_probs, + ) + + expect_identical(actual, expected) + expect_identical(actual_wis, expected) +}) + +test_that("wis() works within score for one interval", { + test_data <- data.frame( + observed = rep(c(1, -15, 22), times = 2), + quantile = rep(c(0.25, 0.75), each = 3), + predicted = c(c(0, 1, 0), c(2, 2, 3)), + model = c("model1"), + date = rep(1:3, times = 2) + ) + + eval <- score( + test_data, + count_median_twice = TRUE, metrics = list(wis = wis) + ) + + eval <- summarise_scores(eval, by = c("model", "date")) + + lower <- c(0, 1, 0) + upper <- c(2, 2, 3) + alpha <- 0.5 + + expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) + + expect_equal(expected, eval$wis) +}) + +test_that("`wis()` works 1 interval and median", { + test_data <- data.frame( + observed = rep(c(1, -15, 22), times = 3), + quantile = rep(c(0.25, 0.5, 0.75), each = 3), + predicted = c(c(0, 1, 0), c(1, 2, 3), c(2, 2, 3)), + model = c("model1"), + date = rep(1:3, times = 3) + ) + + eval <- score( + test_data, + count_median_twice = TRUE, metrics = metrics_no_cov + ) + + eval <- summarise_scores(eval, by = c("model", "date")) + + y <- c(1, -15, 22) + quantile <- rbind(c(0, 1, 2), c(1, 2, 2), c(0, 3, 3)) + quantile_probs <- c(0.25, 0.5, 0.75) + + alpha <- 0.5 + expected <- 0.5 * ( + abs(y - quantile[, 2]) + + (quantile[, 3] - quantile[, 1]) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) + ) + + actual_wis <- wis( + observed = y, + predicted = quantile, + quantile = quantile_probs, + count_median_twice = TRUE + ) + + expect_identical(eval$wis, expected) + expect_identical(actual_wis, expected) +}) + +test_that("wis works, 2 intervals and median", { + test_data <- data.frame( + observed = rep(c(1, -15, 22), times = 5), + quantile = rep(c(0.1, 0.25, 0.5, 0.75, 0.9), each = 3), + predicted = c( + c(-1, -2, -2), c(0, 1, 0), c(1, 2, 3), + c(2, 2, 3), c(3, 4, 4) + ), + model = c("model1"), + date = rep(1:3, times = 5) + ) + + eval <- score( + test_data, + count_median_twice = TRUE, metrics = metrics_no_cov + ) 
+ + eval <- summarise_scores(eval, by = c("model", "date")) + + quantile <- forecast_quantiles_matrix + quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + + alpha1 <- 0.2 + alpha2 <- 0.5 + + expected <- (1 / 3) * ( + abs(y - quantile[, 3]) + + (quantile[, 5] - quantile[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + + (quantile[, 4] - quantile[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) + ) + + actual_wis <- wis( + observed = y, + predicted = quantile, + quantile = quantile_probs, + count_median_twice = TRUE + ) + + expect_equal( + as.numeric(eval$wis), + as.numeric(expected) + ) + expect_identical(actual_wis, expected) +}) + +# additional tests from the covidhubutils repo +test_that("wis is correct, median only - test corresponds to covidHubUtils", { + forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] + forecast_quantile_probs <- forecast_quantile_probs[3] + + target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) + horizons <- c("1", "2", "1") + locations <- c("01", "01", "02") + target_variables <- rep("inc death", length(y)) + + forecast_target_end_dates <- + rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) + forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) + forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) + forecast_target_variables <- + rep(target_variables, times = ncol(forecast_quantiles_matrix)) + forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) + forecast_quantiles <- forecast_quantiles_matrix + dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) + + test_truth <- data.frame( + model = rep("truth_source", length(y)), + target_variable = target_variables, + target_end_date = target_end_dates, + location = locations, + value = y, + stringsAsFactors = FALSE + ) + + n_forecasts <- length(forecast_quantiles) + test_forecasts <- data.frame( + model = rep("m1", n_forecasts), + forecast_date = rep(as.Date("2020-01-01"), n_forecasts), + location = forecast_locations, + horizon = forecast_horizons, + temporal_resolution = rep("wk", n_forecasts), + target_variable = forecast_target_variables, + target_end_date = forecast_target_end_dates, + type = rep("quantile", n_forecasts), + quantile = forecast_quantile_probs, + value = forecast_quantiles, + stringsAsFactors = FALSE + ) + + # make a version that conforms to scoringutils format + truth_formatted <- data.table::as.data.table(test_truth) + truth_formatted[, `:=`(model = NULL)] + data.table::setnames(truth_formatted, old = "value", new = "observed") + + forecasts_formated <- data.table::as.data.table(test_forecasts) + data.table::setnames(forecasts_formated, old = "value", new = "predicted") + + data_formatted <- merge(forecasts_formated, truth_formatted) + + eval <- score( + data_formatted, + count_median_twice = FALSE, metrics = metrics_no_cov + ) + + expected <- abs(y - forecast_quantiles_matrix[, 1]) + + actual_wis <- wis( + observed = y, + predicted = matrix(forecast_quantiles_matrix), + quantile = 0.5, + count_median_twice = FALSE + ) + + expect_equal(eval$wis, expected) + expect_equal(actual_wis, expected) +}) + +test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", { + forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE] + forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)] + + target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) + horizons <- c("1", "2", "1") + locations <- c("01", "01", "02") + target_variables <- rep("inc death", 
length(y)) + + forecast_target_end_dates <- + rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) + forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) + forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) + forecast_target_variables <- + rep(target_variables, times = ncol(forecast_quantiles_matrix)) + forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) + forecast_quantiles <- forecast_quantiles_matrix + dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) + + test_truth <- data.frame( + model = rep("truth_source", length(y)), + target_variable = target_variables, + target_end_date = target_end_dates, + location = locations, + value = y, + stringsAsFactors = FALSE + ) + + n_forecasts <- length(forecast_quantiles) + test_forecasts <- data.frame( + model = rep("m1", n_forecasts), + forecast_date = rep(as.Date("2020-01-01"), n_forecasts), + location = forecast_locations, + horizon = forecast_horizons, + temporal_resolution = rep("wk", n_forecasts), + target_variable = forecast_target_variables, + target_end_date = forecast_target_end_dates, + type = rep("quantile", n_forecasts), + quantile = forecast_quantile_probs, + value = forecast_quantiles, + stringsAsFactors = FALSE + ) + + # make a version that conforms to scoringutils format + truth_formatted <- data.table::as.data.table(test_truth) + truth_formatted[, `:=`(model = NULL)] + data.table::setnames(truth_formatted, old = "value", new = "observed") + + forecasts_formated <- data.table::as.data.table(test_forecasts) + data.table::setnames(forecasts_formated, old = "value", new = "predicted") + + data_formatted <- merge(forecasts_formated, truth_formatted) + + eval <- suppressMessages(score(data_formatted, + count_median_twice = FALSE, metrics = metrics_no_cov_no_ae + )) + + eval <- summarise_scores(eval, + by = c( + "model", "location", "target_variable", + "target_end_date", "forecast_date", "horizon" + ) + ) + + alpha1 <- 0.2 + expected <- (forecast_quantiles_matrix[, 2] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + + c(0, (-2) - (-15), 22 - 4) + + actual_wis <- wis( + observed = y, + predicted = forecast_quantiles_matrix, + quantile = c(0.1, 0.9), + count_median_twice = FALSE + ) + + expect_equal(eval$wis, expected) + expect_equal(actual_wis, expected) +}) + +test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", { + target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) + horizons <- c("1", "2", "1") + locations <- c("01", "01", "02") + target_variables <- rep("inc death", length(y)) + + forecast_target_end_dates <- + rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) + forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) + forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) + forecast_target_variables <- + rep(target_variables, times = ncol(forecast_quantiles_matrix)) + forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) + forecast_quantiles <- forecast_quantiles_matrix + dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) + + test_truth <- data.frame( + model = rep("truth_source", length(y)), + target_variable = target_variables, + target_end_date = target_end_dates, + location = locations, + value = y, + stringsAsFactors = FALSE + ) + + n_forecasts <- length(forecast_quantiles) + test_forecasts <- data.frame( + model = rep("m1", n_forecasts), + forecast_date = rep(as.Date("2020-01-01"), n_forecasts), + location = 
forecast_locations, + horizon = forecast_horizons, + temporal_resolution = rep("wk", n_forecasts), + target_variable = forecast_target_variables, + target_end_date = forecast_target_end_dates, + type = rep("quantile", n_forecasts), + quantile = forecast_quantile_probs, + value = forecast_quantiles, + stringsAsFactors = FALSE + ) + + # make a version that conforms to scoringutils format + truth_formatted <- data.table::as.data.table(test_truth) + truth_formatted[, `:=`(model = NULL)] + data.table::setnames(truth_formatted, old = "value", new = "observed") + + forecasts_formated <- data.table::as.data.table(test_forecasts) + data.table::setnames(forecasts_formated, old = "value", new = "predicted") + + data_formatted <- merge(forecasts_formated, truth_formatted) + + eval <- score(data_formatted, + count_median_twice = FALSE, metrics = metrics_no_cov + ) + + eval <- summarise_scores(eval, + by = c( + "model", "location", "target_variable", + "target_end_date", "forecast_date", "horizon" + ) + ) + + alpha1 <- 0.2 + alpha2 <- 0.5 + expected <- (1 / 2.5) * ( + 0.5 * abs(y - forecast_quantiles_matrix[, 3]) + + (forecast_quantiles_matrix[, 5] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + + (forecast_quantiles_matrix[, 4] - forecast_quantiles_matrix[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) + ) + + actual_wis <- wis( + observed = y, + predicted = forecast_quantiles_matrix, + quantile = c(0.1, 0.25, 0.5, 0.75, 0.9), + count_median_twice = FALSE + ) + + expect_equal(eval$wis, expected) +}) + +test_that("Quantile score and interval score yield the same result, weigh = FALSE", { + observed <- rnorm(10, mean = 1:10) + alphas <- c(0.1, 0.5, 0.9) + + for (alpha in alphas) { + lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) + upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 11:20)) + + w <- FALSE + is <- interval_score( + observed = observed, + lower = lower, + upper = upper, + interval_range = (1 - alpha) * 100, + weigh = w + ) + + wis <- wis( + observed = observed, + predicted = cbind(lower, upper), + quantile = c(alpha / 2, 1 - alpha / 2), + count_median_twice = FALSE, + weigh = w + ) + + qs_lower <- quantile_score(observed, + predicted = lower, + quantile = alpha / 2, + weigh = w + ) + qs_upper <- quantile_score(observed, + predicted = upper, + quantile = 1 - alpha / 2, + weigh = w + ) + expect_equal((qs_lower + qs_upper) / 2, is) + expect_equal(wis, is) + } +}) + + +# ============================================================================ # +# Quantile score ============================================================= # +# ============================================================================ # +test_that("Quantile score and interval score yield the same result, weigh = TRUE", { + observed <- rnorm(10, mean = 1:10) + alphas <- c(0.1, 0.5, 0.9) + + for (alpha in alphas) { + lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) + upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 11:20)) + + w <- TRUE + is <- interval_score( + observed = observed, + lower = lower, + upper = upper, + interval_range = (1 - alpha) * 100, + weigh = w + ) + + wis <- wis( + observed = observed, + predicted = cbind(lower, upper), + quantile = c(alpha / 2, 1 - alpha / 2), + count_median_twice = FALSE, + weigh = w + ) + + qs_lower <- quantile_score(observed, + predicted = lower, + quantile = alpha / 2, + weigh = w + ) + qs_upper <- quantile_score(observed, + predicted = upper, + quantile = 1 - alpha / 2, + weigh = w + ) + expect_equal((qs_lower + qs_upper) / 2, is) +
expect_equal(wis, is) + } +}) + +test_that("wis works with separate results", { + wis <- wis( + observed = y, + predicted = forecast_quantiles_matrix, + quantile = forecast_quantile_probs, + separate_results = TRUE + ) + expect_equal(wis$wis, wis$dispersion + wis$overprediction + wis$underprediction) +}) + + +# ============================================================================ # +# overprediction, underprediction, dispersion ================================ # +# ============================================================================ # +test_that("wis is the sum of overprediction, underprediction, dispersion", { + wis <- wis( + observed = y, + predicted = forecast_quantiles_matrix, + quantile = forecast_quantile_probs + ) + + d <- dispersion(y, forecast_quantiles_matrix, forecast_quantile_probs) + o <- overprediction(y, forecast_quantiles_matrix, forecast_quantile_probs) + u <- underprediction(y, forecast_quantiles_matrix, forecast_quantile_probs) + + expect_equal(wis, d + o + u) +}) + + +# ============================================================================ # +# `interval_coverage` =============================================== # +# ============================================================================ # +test_that("interval_coverage works", { + expect_equal( + interval_coverage(observed, predicted, quantile, range = 50), + c(TRUE, FALSE, FALSE) + ) +}) + +test_that("interval_coverage rejects wrong inputs", { + expect_error( + interval_coverage(observed, predicted, quantile, range = c(50, 0)), + "Assertion on 'range' failed: Must have length 1." + ) +}) + +test_that("interval_coverage_quantile throws a warning when a required quantile is not available", { + dropped_quantile_pred <- predicted[, -4] + dropped_quantiles <- quantile[-4] + expect_warning( + interval_coverage( + observed, dropped_quantile_pred, dropped_quantiles, range = 50 + ), + "To compute the interval coverage for a range of 50%, the quantiles `0.25, 0.75` are required. Returning `NA`" + ) +}) + + +# ============================================================================ # +# `interval_coverage_deviation` ===================================== # +# ============================================================================ # +test_that("interval_coverage_deviation works", { + existing_ranges <- unique(get_range_from_quantile(quantile)) + expect_equal(existing_ranges, c(80, 50, 0)) + + cov_50 <- interval_coverage(observed, predicted, quantile, range = c(50)) + cov_80 <- interval_coverage(observed, predicted, quantile, range = c(80)) + manual <- 0.5 * (cov_50 - 0.5) + 0.5 * (cov_80 - 0.8) + + expect_equal( + interval_coverage_deviation(observed, predicted, quantile), + manual + ) + expect_warning( + interval_coverage_deviation( + observed, predicted, c(quantile[-4], 0.76) + ), + "To compute inteval coverage deviation, all quantiles must form central symmetric prediction intervals. Missing quantiles: 0.24, 0.75. Returning `NA`." 
+ ) +}) + + +# ============================================================================ # +# `bias_quantile` ============================================================== +# ============================================================================ # +test_that("bias_quantile() works as expected", { + predicted <- c(1, 2, 3) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + 0 + ) + predicted <- c(0, 1, 2) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + -0.8 + ) + + predicted <- c( + 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, + 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, + 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, + 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 + ) + + quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) + + observed <- 8062 + expect_equal(bias_quantile(observed, predicted, quantile), -0.8) +}) + +test_that("bias_quantile handles matrix input", { + observed <- seq(10, 0, length.out = 4) + predicted <- matrix(1:12, ncol = 3) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed, predicted, quantiles), + c(-1.0, -0.8, 0.8, 1.0) + ) +}) + + +test_that("bias_quantile() handles vector that is too long", { + predicted <- c(NA, 1, 2, 3) + quantiles <- c(0.1, 0.5, 0.9) + + expect_error( + bias_quantile(observed = 2, predicted, quantiles), + "Assertion on 'quantile' failed: Must have length 4, but has length 3." + ) +}) + +test_that("bias_quantile() handles NA values", { + predicted <- c(NA, 1, 2) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + -0.8 + ) + predicted <- c(0, 1, 2) + quantiles <- c(0.1, 0.5, NA) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + -1 + ) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles, na.rm = FALSE), + NA_real_ + ) +}) + +test_that("bias_quantile() errors if no predictions", { + expect_error( + bias_quantile(observed = 2, numeric(0), numeric(0)), + "Assertion on 'quantile' failed: Must have length >= 1, but has length 0" + ) +}) + +test_that("bias_quantile() returns correct bias if value below the median", { + predicted <- c(1, 2, 4, 5) + quantiles <- c(0.1, 0.3, 0.7, 0.9) + suppressMessages( + expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8) + ) +}) + +test_that("bias_quantile() returns correct bias if value above median", { + predicted <- c(1, 2, 4, 5) + quantiles <- c(0.1, 0.3, 0.7, 0.9) + suppressMessages( + expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8) + ) +}) + +test_that("bias_quantile() returns correct bias if value at the median", { + predicted <- c(1, 2, 3, 4) + quantiles <- c(0.1, 0.3, 0.5, 0.7) + + expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0) +}) + +test_that("bias_quantile() returns 1 if true value below min prediction", { + predicted <- c(2, 3, 4, 5) + quantiles <- c(0.1, 0.3, 0.7, 0.9) + + suppressMessages( + expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1) + ) +}) + +test_that("bias_quantile() returns -1 if true value above max prediction", { + predicted <- c(1, 2, 3, 4) + quantiles <- c(0.1, 0.3, 0.5, 0.7) + + expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1) +}) + +test_that("bias_quantile(): quantiles must be between 0 and 1", { + predicted <- 1:4 + + # Failing example + quantiles <- c(-0.1, 0.3, 0.5, 0.8) + expect_error( + bias_quantile(observed 
= 3, predicted, quantiles), + "Assertion on 'quantile' failed: Element 1 is not >= 0." + ) + + # Passing counter example + quantiles <- c(0.1, 0.3, 0.5, 0.8) + expect_silent(bias_quantile(observed = 3, predicted, quantiles)) +}) + +test_that("bias_quantile(): quantiles must be increasing", { + predicted <- 1:4 + + # Failing example + quantiles <- c(0.8, 0.3, 0.5, 0.9) + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Predictions must not be decreasing with increasing quantile level" + ) + + # Passing counter example + quantiles <- c(0.3, 0.5, 0.8, 0.9) + expect_silent(bias_quantile(observed = 3, predicted, quantiles)) +}) + +test_that("bias_quantile(): predictions must be increasing", { + predicted <- c(1, 2, 4, 3) + quantiles <- c(0.1, 0.3, 0.5, 0.9) + + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Predictions must not be decreasing with increasing quantile level" + ) + expect_silent(bias_quantile( observed = 3, 1:4, quantiles)) +}) + +test_that("bias_quantile(): quantiles must be unique", { + predicted <- 1:4 + + # Failing example + quantiles <- c(0.3, 0.3, 0.5, 0.8) + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Assertion on 'quantile' failed: Contains duplicated values, position 2." + ) + + # Passing example + quantiles <- c(0.3, 0.5, 0.8, 0.9) + expect_silent(bias_quantile(observed = 3, predicted, quantiles)) +}) + +test_that("bias_quantile only produces one message", { + expect_message( + bias_quantile(observed, predicted[, -3], quantile[-3]), + "Median not available, computing bias as mean of the two innermost quantiles in order to compute bias." + ) +}) + +test_that("bias_quantile() works with point forecasts", { + predicted <- 1 + observed <- 1 + quantile <- 0.5 + expect_equal(bias_quantile(observed, predicted, quantile), 0) +}) diff --git a/tests/testthat/test-metrics-sample.R b/tests/testthat/test-metrics-sample.R new file mode 100644 index 000000000..ded7b52ae --- /dev/null +++ b/tests/testthat/test-metrics-sample.R @@ -0,0 +1,146 @@ +test_that("Input handling", { + observed <- rpois(30, lambda = 1:30) + predicted <- replicate(20, rpois(n = 30, lambda = 1:30)) + expect_equal(length(crps_sample(observed, predicted)), 30) + + # should error when wrong prediction type is given + predicted2 <- rpois(30, lambda = 1) + expect_error(crps_sample(observed, predicted2), + "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'", + fixed = TRUE + ) + + # predictions have wrong number of rows + predicted3 <- replicate(20, rpois(n = 31, lambda = 1)) + expect_error( + crps_sample(observed, predicted3), + "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.", + # "Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.", + fixed = TRUE + ) + + # error with missing argument + expect_error(crps_sample(predicted = predicted), + 'argument "observed" is missing, with no default', + fixed = TRUE + ) +}) + + + +test_that("bias_sample() throws an error when missing observed", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(predicted = predicted), + 'argument "observed" is missing, with no default' + ) +}) + +test_that("bias_sample() throws an error when missing 'predicted'", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(observed = observed), + 'argument "predicted" is missing, with no default' + ) +}) + 
+test_that("bias_sample() works for integer observed and predicted", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(10, rpois(10, lambda = 1:10)) + output <- bias_sample( + observed = observed, + predicted = predicted + ) + expect_equal( + length(output), + length(observed) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("bias_sample() works for continuous observed values and predicted", { + observed <- rnorm(10) + predicted <- replicate(10, rnorm(10)) + output <- bias_sample( + observed = observed, + predicted = predicted + ) + expect_equal( + length(output), + length(observed) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("bias_sample() works as expected", { + observed <- rpois(30, lambda = 1:30) + predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) + expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted))) + + ## continuous forecasts + observed <- rnorm(30, mean = 1:30) + predicted <- replicate(200, rnorm(30, mean = 1:30)) + + scoringutils2 <- bias_sample(observed, predicted) + scoringutils <- bias_sample(observed, predicted) + + expect_equal(scoringutils, scoringutils2) +}) + + +test_that("bias_sample() approx equals bias_quantile() for many samples", { + set.seed(123) + + # Generate true value + observed <- 3 + + # Generate many sample predictions + predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000) + + # Get sample based bias + bias_sample_result <- bias_sample( + observed, matrix(predicted, nrow = 1) + ) + + # Convert predictions to quantiles + quantiles <- seq(0, 1, length.out = 100) + quantile_preds <- quantile(predicted, probs = quantiles) + + # Get quantile based bias + bias_quantile_result <- suppressMessages( + bias_quantile(observed, quantile_preds, quantiles) + ) + + # Difference should be small + expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1) +}) + + +# `ae_median_sample` =========================================================== +test_that("ae_median_sample works", { + observed <- rnorm(30, mean = 1:30) + predicted_values <- rnorm(30, mean = 1:30) + scoringutils <- ae_median_sample(observed, matrix(predicted_values)) + ae <- abs(observed - predicted_values) + expect_equal(ae, scoringutils) +}) + +# `mad_sample()` =============================================================== +test_that("function throws an error when missing 'predicted'", { + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + mad_sample() + ) +}) + diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R index 87b9dc20c..74275a06a 100644 --- a/tests/testthat/test-pairwise_comparison.R +++ b/tests/testthat/test-pairwise_comparison.R @@ -28,7 +28,7 @@ test_that("pairwise_comparison() works", { set.seed(123) test_forecasts[, value := sapply(value, function(x) { abs(rnorm(n = 1, mean = x, sd = model_sd)) - }), + }), by = .(model, target_end_date, location, target_variable) ] @@ -43,9 +43,9 @@ test_that("pairwise_comparison() works", { # make a version of that data that conforms to scoringutils format truth_formatted <- data.table::as.data.table(test_truth) truth_formatted[, `:=`(model = NULL)] - data.table::setnames(truth_formatted, old = "value", new = "true_value") + data.table::setnames(truth_formatted, old = "value", new = "observed") forecasts_formatted <- data.table::as.data.table(test_forecasts) - data.table::setnames(forecasts_formatted, old = "value", new = "prediction") + 
data.table::setnames(forecasts_formatted, old = "value", new = "predicted") data_formatted <- scoringutils::merge_pred_and_obs( forecasts_formatted, @@ -53,31 +53,25 @@ test_that("pairwise_comparison() works", { ) # evaluate the toy forecasts, once with and once without a baseline model specified - eval_without_baseline <- suppressMessages(score(data_formatted)) - - eval_without_baseline <- suppressMessages( - summarise_scores(eval_without_baseline, - relative_skill = TRUE, - by = c( - "model", "location", "target_end_date", - "target_variable" - ) + eval <- score(data_formatted) + + # check with relative skills + eval_without_rel_skill <- summarise_scores( + eval, + by = c( + "model", "location", "target_end_date", + "target_variable" ) ) - eval_with_baseline <- suppressMessages(score(data_formatted, - count_median_twice = FALSE - )) + eval_without_baseline <- suppressMessages( + add_pairwise_comparison(eval_without_rel_skill) + ) + eval_with_baseline <- suppressMessages( - summarise_scores(eval_with_baseline, - baseline = "m1", - relative_skill = TRUE, - by = c( - "model", "location", "target_end_date", - "target_variable" - ) - ) + add_pairwise_comparison(eval_without_rel_skill, baseline = "m1") ) + # extract the relative_skill values relative_skills_without <- eval_without_baseline[, .( model = unique(model), @@ -91,7 +85,7 @@ test_that("pairwise_comparison() works", { # prepare scores for the code Johannes Bracher wrote scores_johannes <- data.table::copy(eval_without_baseline) # doesn't matter which one data.table::setnames(scores_johannes, - old = c("location", "target_end_date", "interval_score"), + old = c("location", "target_end_date", "wis"), new = c("unit", "timezero", "wis") ) @@ -162,15 +156,15 @@ test_that("pairwise_comparison() works", { # -----------------------------------------------------------------------------# # compare results without a baseline specified - geom_mean_ratios <- exp(rowMeans(log(results_ratio), na.rm = TRUE)) - names(geom_mean_ratios) <- NULL - expect_equal(relative_skills_without$relative_skill, geom_mean_ratios) + geometric_mean_ratios <- exp(rowMeans(log(results_ratio), na.rm = TRUE)) + names(geometric_mean_ratios) <- NULL + expect_equal(relative_skills_without$relative_skill, geometric_mean_ratios) # comparison with a baseline ind_baseline <- which(rownames(results_ratio) == "m1") - geom_mean_ratios <- exp(rowMeans(log(results_ratio[, -ind_baseline]), na.rm = TRUE)) + geometric_mean_ratios <- exp(rowMeans(log(results_ratio[, -ind_baseline]), na.rm = TRUE)) ratios_baseline <- results_ratio[, "m1"] - ratios_scaled <- geom_mean_ratios / geom_mean_ratios["m1"] + ratios_scaled <- geometric_mean_ratios / geometric_mean_ratios["m1"] names(ratios_scaled) <- NULL expect_equal(relative_skills_with$relative_skill, ratios_scaled) @@ -201,20 +195,14 @@ test_that("pairwise_comparison() works", { } } ind_baseline <- which(rownames(results_ratio) == "m1") - geom_mean_ratios <- exp(rowMeans(log(results_ratio[, -ind_baseline]), na.rm = TRUE)) + geometric_mean_ratios <- exp(rowMeans(log(results_ratio[, -ind_baseline]), na.rm = TRUE)) ratios_baseline <- results_ratio[, "m1"] - ratios_scaled <- geom_mean_ratios / geom_mean_ratios["m1"] + ratios_scaled <- geometric_mean_ratios / geometric_mean_ratios["m1"] names(ratios_scaled) <- NULL - eval_with_baseline <- suppressMessages( - suppressMessages(score(data_formatted, - count_median_twice = FALSE - ))) - eval_with_baseline <- summarise_scores(eval_with_baseline, - baseline = "m1", - relative_skill = TRUE, - by = 
c("model", "location") - ) + eval <- score(data_formatted) + eval_summarised <- summarise_scores(eval, by = c("model", "location")) + eval_with_baseline <- add_pairwise_comparison(eval_summarised, baseline = "m1") relative_skills_with <- eval_with_baseline[ location == "location_3", @@ -229,20 +217,16 @@ test_that("pairwise_comparison() works", { test_that("pairwise_comparison() work in score() with integer data", { eval <- suppressMessages(score(data = example_integer)) - eval <- suppressMessages( - summarise_scores(eval, by = "model", relative_skill = TRUE) - ) - + eval_summarised <- summarise_scores(eval, by = "model") + eval <- add_pairwise_comparison(eval_summarised) expect_true("relative_skill" %in% colnames(eval)) }) test_that("pairwise_comparison() work in score() with binary data", { eval <- suppressMessages(score(data = example_binary)) - eval <- suppressMessages( - summarise_scores(eval, by = "model", relative_skill = TRUE) - ) - + eval_summarised <- summarise_scores(eval, by = "model") + eval <- add_pairwise_comparison(eval_summarised) expect_true("relative_skill" %in% colnames(eval)) }) @@ -254,7 +238,7 @@ test_that("pairwise_comparison() works", { model = rep(c("model1", "model2", "model3"), each = 10), date = as.Date("2020-01-01") + rep(1:5, each = 2), location = c(1, 2), - interval_score = (abs(rnorm(30))), + wis = (abs(rnorm(30))), ae_median = (abs(rnorm(30))) ) @@ -270,15 +254,15 @@ test_that("pairwise_comparison() works", { test_that("pairwise_comparison() works inside and outside of score()", { - eval <- suppressMessages(score(data = example_continuous)) + eval <- scores_continuous pairwise <- suppressMessages(pairwise_comparison(eval, by = "model", metric = "crps" )) - eval2 <- suppressMessages(score(data = example_continuous)) - eval2 <- summarise_scores(eval2, by = "model", relative_skill = TRUE) + eval2_summarised <- summarise_scores(scores_continuous, by = "model") + eval2 <- add_pairwise_comparison(eval2_summarised) expect_equal( sort(unique(pairwise$relative_skill)), sort(eval2$relative_skill) @@ -286,10 +270,18 @@ test_that("pairwise_comparison() works inside and outside of score()", { }) test_that("pairwise_comparison() realises when there is no baseline model", { - - scores <- suppressMessages(score(example_quantile)) - expect_error( - pairwise_comparison(scores, baseline = "missing_model"), "missing" + pairwise_comparison(scores_quantile, baseline = "missing_model"), "missing" ) }) + +test_that("Order of `add_pairwise_comparison()` and `summarise_scores()` doesn't matter", { + pw1 <- suppressMessages(add_pairwise_comparison(scores_quantile)) + pw1_sum <- summarise_scores(pw1, by = "model") + + pw2 <- summarise_scores(scores_quantile, by = "model") + pw2 <- add_pairwise_comparison(pw2) + + expect_true(all(pw1_sum == pw2, na.rm = TRUE)) + expect_true(all(names(attributes(pw2)) == names(attributes(pw1_sum)))) +}) diff --git a/tests/testthat/test-pit.R b/tests/testthat/test-pit.R index 9fa11e020..87f5325a4 100644 --- a/tests/testthat/test-pit.R +++ b/tests/testthat/test-pit.R @@ -1,30 +1,30 @@ -test_that("pit_sample() function throws an error when missing true_values", { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) +test_that("pit_sample() function throws an error when missing observed", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) expect_error( - pit_sample(predictions = predictions), - "true_values` or `predictions` missing in function 'pit_sample()" 
+ pit_sample(predicted = predicted), + "observed` or `predicted` missing in function 'pit_sample()" ) }) -test_that("pit_sample() function throws an error when missing 'predictions'", { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) +test_that("pit_sample() function throws an error when missing 'predicted'", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) expect_error( - pit_sample(predictions = predictions), - "true_values` or `predictions` missing in function 'pit_sample()" + pit_sample(predicted = predicted), + "observed` or `predicted` missing in function 'pit_sample()" ) }) -test_that("pit_sample() function works for integer true_values and predictions", { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(10, rpois(10, lambda = 1:10)) +test_that("pit_sample() function works for integer observed and predicted", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(10, rpois(10, lambda = 1:10)) output <- pit_sample( - true_values = true_values, - predictions = predictions, + observed = observed, + predicted = predicted, n_replicates = 56 ) expect_equal( @@ -33,12 +33,12 @@ test_that("pit_sample() function works for integer true_values and predictions", ) }) -test_that("pit_sample() function works for continuous true_values and predictions", { - true_values <- rnorm(10) - predictions <- replicate(10, rnorm(10)) +test_that("pit_sample() function works for continuous observed and predicted", { + observed <- rnorm(10) + predicted <- replicate(10, rnorm(10)) output <- pit_sample( - true_values = true_values, - predictions = predictions, + observed = observed, + predicted = predicted, n_replicates = 56 ) expect_equal( @@ -60,3 +60,4 @@ test_that("pit function works for continuous integer and quantile data", { expect_equal(names(pit2), c("model", "target_type", "pit_value")) expect_equal(names(pit3), c("model", "location", "pit_value")) }) + diff --git a/tests/testthat/test-plot_avail_forecasts.R b/tests/testthat/test-plot_avail_forecasts.R index 77617427e..4240552cb 100644 --- a/tests/testthat/test-plot_avail_forecasts.R +++ b/tests/testthat/test-plot_avail_forecasts.R @@ -1,14 +1,13 @@ -test_that("plot_avail_forecasts() works as expected", { - avail_forecasts <- suppressMessages( - avail_forecasts(example_quantile, - by = c("model", "target_type", "target_end_date") - ) +test_that("plot.forecast_counts() works as expected", { + available_forecasts <- get_forecast_counts( + na.omit(example_quantile), + by = c("model", "target_type", "target_end_date") ) - p <- plot_avail_forecasts(avail_forecasts, - x = "target_end_date", show_numbers = FALSE + p <- plot_forecast_counts(available_forecasts, + x = "target_end_date", show_counts = FALSE ) + facet_wrap("target_type") expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_avail_forecasts", p) -}) \ No newline at end of file + vdiffr::expect_doppelganger("plot_available_forecasts", p) +}) diff --git a/tests/testthat/test-plot_correlation.R b/tests/testthat/test-plot_correlation.R index 9d6280d6e..0b170b963 100644 --- a/tests/testthat/test-plot_correlation.R +++ b/tests/testthat/test-plot_correlation.R @@ -1,5 +1,5 @@ test_that("plot_correlation() works as expected", { - correlations <- correlation(summarise_scores(scores)) + correlations <- correlation(summarise_scores(scores_quantile), digits = 2) p <- plot_correlation(correlations) expect_s3_class(p, "ggplot") skip_on_cran() diff 
--git a/tests/testthat/test-plot_heatmap.R b/tests/testthat/test-plot_heatmap.R index 3246fbd64..d6453bd45 100644 --- a/tests/testthat/test-plot_heatmap.R +++ b/tests/testthat/test-plot_heatmap.R @@ -1,11 +1,9 @@ library(ggplot2, quietly = TRUE) test_that("plot_heatmap() works as expected", { - scores <- suppressMessages( - summarise_scores(scores, by = c("model", "target_type", "range")) - ) + scores <- summarise_scores(scores_quantile, by = c("model", "target_type")) p <- plot_heatmap(scores, x = "target_type", metric = "bias") expect_s3_class(p, "ggplot") skip_on_cran() vdiffr::expect_doppelganger("plot_heatmap", p) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-plot_interval_coverage.R b/tests/testthat/test-plot_interval_coverage.R index 6180704a4..f8f3461ad 100644 --- a/tests/testthat/test-plot_interval_coverage.R +++ b/tests/testthat/test-plot_interval_coverage.R @@ -1,11 +1,8 @@ -library(ggplot2, quietly = TRUE) - test_that("plot_interval_coverage() works as expected", { - scores <- suppressMessages( - summarise_scores(scores, by = c("model", "range")) - ) - p <- plot_interval_coverage(scores) + coverage <- add_coverage(na.omit(example_quantile)) %>% + summarise_scores(by = c("model", "range")) + p <- plot_interval_coverage(coverage) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_interval_coverage", p) -}) \ No newline at end of file + suppressWarnings(vdiffr::expect_doppelganger("plot_interval_coverage", p)) +}) diff --git a/tests/testthat/test-plot_pairwise_comparison.R b/tests/testthat/test-plot_pairwise_comparison.R index 4cd477e90..ffbf15374 100644 --- a/tests/testthat/test-plot_pairwise_comparison.R +++ b/tests/testthat/test-plot_pairwise_comparison.R @@ -1,5 +1,5 @@ pairwise <- suppressMessages( - pairwise_comparison(scores, by = "target_type") + pairwise_comparison(scores_quantile, by = "target_type") ) test_that("plot_pairwise_comparison() works as expected", { diff --git a/tests/testthat/test-plot_pit.R b/tests/testthat/test-plot_pit.R index 43ce97326..7ca8a9fd0 100644 --- a/tests/testthat/test-plot_pit.R +++ b/tests/testthat/test-plot_pit.R @@ -19,11 +19,11 @@ test_that("plot_pit() works as expected with integer forecasts", { }) test_that("plot_pit() works as expected with sample forecasts", { - true_values <- rnorm(30, mean = 1:30) - predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) - pit <- pit_sample(true_values, predictions) + observed <- rnorm(30, mean = 1:30) + predicted <- replicate(200, rnorm(n = 30, mean = 1:30)) + pit <- pit_sample(observed, predicted) p <- plot_pit(pit) expect_s3_class(p, "ggplot") skip_on_cran() vdiffr::expect_doppelganger("plot_pit_sample", p) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-plot_quantile_coverage.R b/tests/testthat/test-plot_quantile_coverage.R index 9b210bfc9..9b44b8bf1 100644 --- a/tests/testthat/test-plot_quantile_coverage.R +++ b/tests/testthat/test-plot_quantile_coverage.R @@ -1,9 +1,9 @@ test_that("plot_quantile_coverage() works as expected", { - scores <- suppressMessages( - summarise_scores(scores, by = c("model", "quantile")) - ) - p <- plot_quantile_coverage(scores) + coverage <- add_coverage(na.omit(example_quantile)) %>% + summarise_scores(by = c("model", "quantile")) + + p <- plot_quantile_coverage(coverage) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_quantile_coverage", p) -}) \ No newline at end of file + suppressWarnings(vdiffr::expect_doppelganger("plot_quantile_coverage", p)) +}) diff --git 
a/tests/testthat/test-plot_ranges.R b/tests/testthat/test-plot_ranges.R index d773d4f91..36c2ea41c 100644 --- a/tests/testthat/test-plot_ranges.R +++ b/tests/testthat/test-plot_ranges.R @@ -1,13 +1,19 @@ -sum_scores <- suppressMessages( - summarise_scores(scores, by = c("model", "target_type", "range")) -) +m <- modifyList(metrics_no_cov_no_ae, list("bias" = NULL)) + +sum_scores <- copy(example_quantile) %>% + na.omit() %>% + .[, interval_range := scoringutils:::get_range_from_quantile(quantile)] %>% + score(metrics = m) %>% + summarise_scores(by = c("model", "target_type", "interval_range")) + +sum_scores[, range := interval_range] test_that("plot_ranges() works as expected with interval score", { p <- plot_ranges(sum_scores, x = "model") + - facet_wrap(~target_type, scales = "free") + facet_wrap(~target_type, scales = "free") expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_ranges_interval", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_interval", p)) }) test_that("plot_ranges() works as expected with dispersion", { @@ -15,5 +21,5 @@ test_that("plot_ranges() works as expected with dispersion", { facet_wrap(~target_type) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_ranges_dispersion", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_dispersion", p)) }) diff --git a/tests/testthat/test-plot_score_table.R b/tests/testthat/test-plot_score_table.R index 662a1cefc..5ffc0b029 100644 --- a/tests/testthat/test-plot_score_table.R +++ b/tests/testthat/test-plot_score_table.R @@ -1,7 +1,6 @@ test_that("plot_score_table() works as expected", { p <- suppressMessages( - scores %>% - add_coverage(by = c("model")) %>% + scores_quantile %>% summarise_scores(by = c("model")) %>% summarise_scores(by = c("model"), fun = signif, digits = 1) %>% plot_score_table() diff --git a/tests/testthat/test-plot_wis.R b/tests/testthat/test-plot_wis.R index 6e3c92fed..9e3c03409 100644 --- a/tests/testthat/test-plot_wis.R +++ b/tests/testthat/test-plot_wis.R @@ -1,5 +1,5 @@ sum_scores <- suppressMessages( - summarise_scores(scores, by = c("model", "target_type")) + summarise_scores(scores_quantile, by = c("model", "target_type")) ) test_that("plot_wis() works as expected with relative contributions", { @@ -34,4 +34,4 @@ test_that("plot_wis() works as expected when flipped", { expect_s3_class(p, "ggplot") skip_on_cran() vdiffr::expect_doppelganger("plot_wis_flip", p) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-score.R b/tests/testthat/test-score.R index f40cd579d..7b0eca9ad 100644 --- a/tests/testthat/test-score.R +++ b/tests/testthat/test-score.R @@ -3,26 +3,24 @@ test_that("function throws an error if data is missing", { expect_error(suppressMessages(score(data = NULL))) }) -test_that("score() warns if column name equals a metric name", { - data <- data.frame( - true_value = rep(1:10, each = 2), - prediction = rep(c(-0.3, 0.3), 10) + rep(1:10, each = 2), - model = "Model 1", - date = as.Date("2020-01-01") + rep(1:10, each = 2), - quantile = rep(c(0.1, 0.9), times = 10), - bias = 3 - ) - - expect_warning(suppressMessages(score(data = data))) -}) +# test_that("score() warns if column name equals a metric name", { +# data <- data.frame( +# observed = rep(1:10, each = 2), +# predicted = rep(c(-0.3, 0.3), 10) + rep(1:10, each = 2), +# model = "Model 1", +# date = as.Date("2020-01-01") + rep(1:10, each = 2), +# quantile = rep(c(0.1, 0.9), times = 10), +# bias = 3 +# ) +# +# 
expect_warning(suppressMessages(score(data = data))) +# }) # test binary case ------------------------------------------------------------- test_that("function produces output for a binary case", { - binary_example <- data.table::setDT(scoringutils::example_binary) - eval <- suppressMessages(score(binary_example[!is.na(prediction)])) - eval <- summarise_scores(eval, by = c("model", "target_type")) + eval <- summarise_scores(scores_binary, by = c("model", "target_type")) expect_equal( nrow(eval) > 1, @@ -36,59 +34,165 @@ test_that("function produces output for a binary case", { "log_score" ) ) + + expect_true("brier_score" %in% names(eval)) }) +test_that("score.forecast_binary() errors with only NA values", { + only_nas <- copy(example_binary)[, predicted := NA_real_] + expect_error( + score(only_nas), + "After removing rows with NA values in the data, no forecasts are left." + ) +}) -test_that("function produces score for a binary case", { - binary_example <- data.table::setDT(scoringutils::example_binary) - eval <- suppressMessages(score(binary_example[!is.na(prediction)])) - eval <- summarise_scores(eval, by = c("model", "target_type")) - expect_true("brier_score" %in% names(eval)) +test_that("score() gives same result for binary as regular function", { + manual_eval <- brier_score( + factor(example_binary$observed), + example_binary$predicted + ) + expect_equal(scores_binary$brier_score, manual_eval[!is.na(manual_eval)]) }) +test_that( + "passing additional functions to score binary works handles them", { + test_fun <- function(x, y, ...) { + if (hasArg("test")) { + message("test argument found") + } + return(y) + } + + df <- example_binary[model == "EuroCOVIDhub-ensemble" & + target_type == "Cases" & location == "DE"] + + # passing a simple function works + expect_equal( + score(df, + metrics = list("identity" = function(x, y) {return(y)}))$identity, + df$predicted + ) + # passing an additional argument that is not part of the function + # definition works + expect_equal( + score(df, + metrics = list("identity" = function(x, y) {return(y)}), + additional_arg = "something")$identity, + df$predicted + ) -# test quantile case ----------------------------------------------------------- -test_that("function produces output for a quantile format case", { - quantile_example <- data.table::setDT(scoringutils::example_quantile) - eval <- suppressMessages(score(quantile_example[!is.na(prediction)])) + # passing an additional function to one that accepts ... 
works + expect_message( + score(df, + metrics = list("test_function" = test_fun), + test = "something"), + "test argument found" + ) + + # passing an argument that's the same as a named argument + expect_equal( + unique( + score(df, + metrics = list("test_function" = test_fun), + y = "something")$test_function + ), + "something" + ) + + + ## Additional tests for validate_metrics() + # passing in something that's not a function or a known metric + expect_warning( + expect_warning( + score(df, metrics = list( + "test1" = test_fun, "test" = test_fun, "hi" = "hi", "2" = 3) + ), + "`Metrics` element number 3 is not a valid function" + ), + "`Metrics` element number 4 is not a valid function") + + # passing a single named argument for metrics by position + expect_contains( + names(score(df, list("hi" = test_fun))), + "hi") + + # providing an additional, unrelated function argument works + expect_no_error( + score(df, unnecessary_argument = "unnecessary") + ) + expect_no_error( + score(df, metrics = list("brier_score" = brier_score), + unnecessary_argument = "unnecessary") + ) + } +) + + +# test point case -------------------------------------------------------------- +test_that("function produces output for a point case", { + eval <- summarise_scores(scores_point, by = c("model", "target_type")) expect_equal( nrow(eval) > 1, TRUE ) + expect_equal( + colnames(eval), + c("model", "target_type", names(rules_point())) + ) }) -test_that("score_quantile correctly handles separate results = FALSE", { - quantile_example <- data.table::setDT(scoringutils::example_quantile) - eval <- suppressMessages( - score( - quantile_example[!is.na(prediction)], - separate_results = FALSE - ) +test_that("Changing metrics names works", { + metrics_test <- rules_point() + names(metrics_test)[1] = "just_testing" + eval <- suppressMessages(score(example_point, metrics = metrics_test)) + eval_summarised <- summarise_scores(eval, by = "model") + expect_equal( + colnames(eval_summarised), + c("model", "just_testing", names(rules_point())[-1]) ) +}) + + +test_that("score.forecast_point() errors with only NA values", { + only_nas <- copy(example_point)[, predicted := NA_real_] + expect_error( + score(only_nas), + "After removing rows with NA values in the data, no forecasts are left." 
+ ) +}) + +# test quantile case ----------------------------------------------------------- +test_that("score_quantile correctly handles separate results = FALSE", { + df <- example_quantile[model == "EuroCOVIDhub-ensemble" & + target_type == "Cases" & location == "DE"] + eval <- score(df[!is.na(predicted)], separate_results = FALSE) expect_equal( nrow(eval) > 1, TRUE ) + expect_true(all(names(rules_quantile()) %in% colnames(eval))) }) test_that("score() quantile produces desired metrics", { data <- data.frame( - true_value = rep(1:10, each = 2), - prediction = rep(c(-0.3, 0.3), 10) + rep(1:10, each = 2), + observed = rep(1:10, each = 2), + predicted = rep(c(-0.3, 0.3), 10) + rep(1:10, each = 2), model = "Model 1", date = as.Date("2020-01-01") + rep(1:10, each = 2), quantile = rep(c(0.1, 0.9), times = 10) ) - out <- suppressMessages(score(data = data)) + out <- suppressWarnings(suppressMessages( + score(data = data, metrics = metrics_no_cov)) + ) metric_names <- c( "dispersion", "underprediction", "overprediction", - "bias", "ae_median", "coverage_deviation" + "bias", "ae_median" ) expect_true(all(metric_names %in% colnames(out))) @@ -96,14 +200,10 @@ test_that("score() quantile produces desired metrics", { test_that("calculation of ae_median is correct for a quantile format case", { - eval <- suppressMessages( - score(scoringutils::example_quantile[!is.na(prediction)]) - ) - - eval <- summarise_scores(eval,by = "model") + eval <- summarise_scores(scores_quantile,by = "model") example <- scoringutils::example_quantile - ae <- example[quantile == 0.5, ae := abs(true_value - prediction)][!is.na(model), .(mean = mean(ae, na.rm = TRUE)), + ae <- example[quantile == 0.5, ae := abs(observed - predicted)][!is.na(model), .(mean = mean(ae, na.rm = TRUE)), by = "model" ]$mean @@ -112,13 +212,12 @@ test_that("calculation of ae_median is correct for a quantile format case", { test_that("all quantile and range formats yield the same result", { - quantile_example1 <- data.table::setDT(scoringutils::example_quantile) + eval1 <- summarise_scores(scores_quantile, by = "model") - eval1 <- suppressMessages(score(quantile_example1[!is.na(prediction)])) - eval1 <- summarise_scores(eval1, by = "model") + df <- data.table::copy(example_quantile) - ae <- quantile_example1[ - quantile == 0.5, ae := abs(true_value - prediction)][ + ae <- df[ + quantile == 0.5, ae := abs(observed - predicted)][ !is.na(model), .(mean = mean(ae, na.rm = TRUE)), by = "model" ]$mean @@ -126,27 +225,25 @@ test_that("all quantile and range formats yield the same result", { expect_equal(sort(eval1$ae_median), sort(ae)) }) -test_that("function produces output even if only some metrics are chosen", { - example <- scoringutils::example_quantile +test_that("WIS is the same with other metrics omitted or included", { + eval <- suppressMessages(score(example_quantile, + metrics = list("wis" = wis) + )) - eval <- suppressMessages(score(example, metrics = "coverage")) + eval2 <- scores_quantile expect_equal( - nrow(eval) > 1, - TRUE + sum(eval$wis), + sum(eval2$wis) ) }) -test_that("WIS is the same with other metrics omitted or included", { - eval <- suppressMessages(score(example_quantile, - metrics = "interval_score" - )) - - eval2 <- suppressMessages(score(example_quantile)) - expect_equal( - sum(eval$interval_score), - sum(eval2$interval_score) +test_that("score.forecast_quantile() errors with only NA values", { + only_nas <- copy(example_quantile)[, predicted := NA_real_] + expect_error( + score(only_nas), + "After removing rows with NA values 
in the data, no forecasts are left." ) }) @@ -156,32 +253,55 @@ test_that("WIS is the same with other metrics omitted or included", { # test integer and continuous case --------------------------------------------- test_that("function produces output for a continuous format case", { - example <- data.table::setDT(scoringutils::example_continuous) - eval <- suppressMessages(score(example[!is.na(prediction)])) - eval2 <- suppressMessages(score(example)) + eval <- scores_continuous - data.table::setcolorder(eval2, colnames(eval)) - eval <- eval[order(model)] - eval2 <- eval2[order(model)] - all(eval == eval2, na.rm = TRUE) + only_nas <- copy(example_continuous)[, predicted := NA_real_] + expect_error( + score(only_nas), + "After removing rows with NA values in the data, no forecasts are left." + ) expect_equal( nrow(eval) > 1, TRUE ) + + expect_equal( + nrow(eval), + 887 + ) }) -test_that( - "score() can support a sample column when a quantile forecast is used", { - ex <- example_quantile[!is.na(quantile)][1:200, ] - ex <- rbind( - data.table::copy(ex)[, sample := 1], - ex[, sample := 2] - ) - scores <- suppressWarnings(score(ex)) - expect_snapshot(summarise_scores( - summarise_scores(scores, by = "model"), by = "model", - fun = signif, digits = 2 - )) - }) +test_that("function throws an error if data is missing", { + expect_error(suppressMessages(score(data = NULL))) +}) + +# ============================================================================= +# `apply_rules()` +# ============================================================================= + +test_that("apply_rules() works", { + + dt <- data.table::data.table(x = 1:10) + scoringutils:::apply_rules( + data = dt, metrics = list("test" = function(x) x + 1), + dt$x + ) + expect_equal(dt$test, 2:11) + + # additional named argument works + expect_no_condition( + scoringutils:::apply_rules( + data = dt, metrics = list("test" = function(x) x + 1), + dt$x, y = dt$test) + ) + + # additional unnamed argument does not work + + expect_warning( + scoringutils:::apply_rules( + data = dt, metrics = list("test" = function(x) x + 1), + dt$x, dt$test) + ) +}) diff --git a/tests/testthat/test-sharpness.R b/tests/testthat/test-sharpness.R deleted file mode 100644 index 4d50190e5..000000000 --- a/tests/testthat/test-sharpness.R +++ /dev/null @@ -1,8 +0,0 @@ -test_that("function throws an error when missing 'predictions'", { - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - mad_sample(), - "argument 'predictions' missing" - ) -}) diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R index a1cf0355f..dbd2cde4b 100644 --- a/tests/testthat/test-summarise_scores.R +++ b/tests/testthat/test-summarise_scores.R @@ -1,10 +1,8 @@ test_that("summarise_scores() works without any arguments", { - expect_true("quantile" %in% names(scores)) + summarised_scores <- summarise_scores(scores_quantile) + expect_false("quantile" %in% names(summarised_scores)) - scores <- summarise_scores(scores) - expect_false("quantile" %in% names(scores)) - - s2 <- summarise_scores(scores, + s2 <- summarise_scores(scores_quantile, by = c( "location", "target_end_date", "target_type", "location_name", "forecast_date", "model", @@ -12,81 +10,63 @@ test_that("summarise_scores() works without any arguments", { ) ) - expect_equal(nrow(scores), nrow(s2)) + expect_equal(nrow(summarised_scores), nrow(s2)) }) test_that("summarise_scores() handles wrong by argument well", { - expect_error(summarise_scores(scores, by = 
"not_present"), - "The following items in `by` are notvalid column names of the data: 'not_present'. Check and run `summarise_scores()` again", # nolint + expect_error( + summarise_scores(scores_quantile, by = "not_present"), + "Column 'not_present' not found in data.", # nolint fixed = TRUE ) - expect_error(summarise_scores(scores, by = "sample"), - "The following items in `by` are notvalid column names of the data: 'sample'. Check and run `summarise_scores()` again", # nolint + expect_error( + summarise_scores(scores_quantile, by = "sample_id"), + "Column 'sample_id' not found in data.", fixed = TRUE ) }) -test_that("summarise_scores() works with point forecasts in a quantile format", { - ex <- data.table::copy(example_quantile) - ex <- ex[quantile == 0.5][, quantile := NA_real_] - - scores <- suppressMessages(score(ex)) +test_that("summarise_scores() works with point forecasts", { + summarised_scores <- summarise_scores(scores_point, by = "model") - summarise_scores(scores, by = "model", na.rm = TRUE) - expect_warning( - expect_warning( - summarise_scores( - scores, by = "model", relative_skill = TRUE, na.rm = TRUE) + expect_no_condition( + pw_point <- add_pairwise_comparison( + summarised_scores, + relative_skill_metric = "se_point" ) ) - scores_point <- suppressMessages(score(example_point[is.na(quantile)])) + pw_manual <- pairwise_comparison( + scores_point, by = "model", metric = "se_point" + ) - expect_warning( - expect_warning( - summarise_scores( - scores_point, by = "model", relative_skill = TRUE, na.rm = TRUE) - ) + expect_equal( + pw_point$relative_skill, + unique(pw_manual$relative_skill) ) }) test_that("summarise_scores() can compute relative measures", { - ex <- data.table::copy(example_quantile) - scores <- suppressMessages(score(ex)) + scores_with <- add_pairwise_comparison( + summarise_scores(scores_quantile, by = "model") + ) expect_equal( - summarise_scores( - scores, by = "model", relative_skill = TRUE - )[, relative_skill], + scores_with[, relative_skill], c(1.6, 0.81, 0.75, 1.03), tolerance = 0.01 ) - expect_equal( - summarise_scores( - scores, by = "model", relative_skill = TRUE, - relative_skill_metric = "ae_median" - )[, relative_skill], - c(1.6, 0.78, 0.77, 1.04), tolerance = 0.01 + scores_with <- add_pairwise_comparison( + summarise_scores(scores_quantile, by = "model"), + relative_skill_metric = "ae_median" ) -}) - -test_that("summarise_scores() metric is deprecated", { - ex <- data.table::copy(example_quantile) - scores <- suppressMessages(score(ex)) expect_equal( - suppressWarnings(summarise_scores( - scores, by = "model", metric = "auto", relative_skill = TRUE - ))[, relative_skill], - c(1.6, 0.81, 0.75, 1.03), tolerance = 0.01 - ) - expect_snapshot( - x <- summarise_scores( - scores, by = "model", metric = "auto", relative_skill = TRUE - ) - ) + scores_with[, relative_skill], + c(1.6, 0.78, 0.77, 1.04), tolerance = 0.01 + ) }) test_that("summarise_scores() across argument works as expected", { @@ -99,7 +79,7 @@ test_that("summarise_scores() across argument works as expected", { ), regexp = "You cannot specify both" ) - expect_error( + expect_error( summarise_scores( scores, across = "horizons" ), diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index f75d68524..1afc9df1e 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -2,7 +2,7 @@ test_that("get_protected_columns() returns the correct result", { data <- example_quantile manual <- protected_columns <- c( - "prediction", "true_value", "sample", "quantile", 
"upper", "lower", + "predicted", "observed", "sample_id", "quantile", "upper", "lower", "pit_value", "range", "boundary", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) @@ -14,7 +14,7 @@ test_that("get_protected_columns() returns the correct result", { data <- example_binary manual <- protected_columns <- c( - "prediction", "true_value", "sample", "quantile", "upper", "lower", + "predicted", "observed", "sample_id", "quantile", "upper", "lower", "pit_value", "range", "boundary", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) @@ -25,7 +25,7 @@ test_that("get_protected_columns() returns the correct result", { data <- example_continuous manual <- protected_columns <- c( - "prediction", "true_value", "sample", "quantile", "upper", "lower", + "predicted", "observed", "sample_id", "quantile", "upper", "lower", "pit_value", "range", "boundary", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) @@ -35,137 +35,79 @@ test_that("get_protected_columns() returns the correct result", { expect_equal(sort(manual), sort(auto)) }) -test_that("get_prediction_type() correctly identifies quantile predictions", { - data <- data.frame( - prediction = 1:3, - quantile = c(0.1, 0.5, 0.9) - ) - - expect_equal(get_prediction_type(data), "quantile") -}) - -test_that("get_prediction_type() correctly identifies integer predictions", { - data <- data.frame( - prediction = as.integer(1:5) - ) - - expect_equal(get_prediction_type(data), "integer") - - data <- matrix(as.integer(1:9), nrow = 3) - expect_equal(get_prediction_type(data), "integer") -}) -test_that("get_prediction_type() correctly identifies continuous predictions", { - data <- data.frame( - prediction = rnorm(5) +test_that("run_safely() works as expected", { + f <- function(x) {x} + expect_equal(run_safely(2, fun = f), 2) + expect_equal(run_safely(2, y = 3, fun = f), 2) + expect_warning( + run_safely(fun = f), + 'Function execution failed, returning NULL. 
Error: argument "x" is missing, with no default', + fixed = TRUE ) - - expect_equal(get_prediction_type(data), "continuous") + expect_equal(suppressWarnings(run_safely(y = 3, fun = f)), NULL) }) -test_that("works with vector input", { - predictions <- rnorm(5) - - expect_equal(get_prediction_type(predictions), "continuous") -}) - -test_that("get_prediction_type() returns error on invalid input", { - suppressWarnings(expect_error(get_prediction_type("foo"))) -}) - -test_that("get_prediction_type() handles NA values across prediction types", { - # Quantile - data <- data.frame( - prediction = c(1, NA, 3), - quantile = c(0.1, 0.5, 0.9) - ) - expect_equal(get_prediction_type(data), "quantile") - - # Integer - data <- data.frame( - prediction = c(1, NA, 3) - ) - expect_equal(get_prediction_type(data), "integer") - - # Continuous - data <- data.frame( - prediction = c(1.1, NA, 3.2) - ) - expect_equal(get_prediction_type(data), "continuous") - predictions <- c(1.1, NA, 3.5) - expect_equal(get_prediction_type(predictions), "continuous") - - # All NA - data <- data.frame(prediction = NA) - expect_error( - get_prediction_type(data), - "Input is not numeric and cannot be coerced to numeric" - ) - expect_error( - get_prediction_type(NA_real_), - "Input is not numeric and cannot be coerced to numeric" - ) -}) - -test_that("prediction_is_quantile() correctly identifies quantile predictions", { - data <- data.frame( - prediction = 1:3, - quantile = c(0.1, 0.5, 0.9) - ) - - expect_true(prediction_is_quantile(data)) -}) -test_that("prediction_is_quantile() returns false for non-quantile predictions", { - data <- data.frame( - prediction = rnorm(5) - ) - - expect_false(prediction_is_quantile(data)) -}) - -test_that("prediction_is_quantile() returns false if quantile column has wrong values", { - data <- data.frame( - prediction = rnorm(5), - quantile = rnorm(5) - ) - - expect_true(prediction_is_quantile(data)) -}) - -test_that("prediction_is_quantile() returns false if quantile column is character", { - data <- data.frame( - prediction = rep(rnorm(5), 3), - quantile = c("A", "B", "C") - ) - - expect_true(prediction_is_quantile(data)) -}) - -test_that("prediction_is_quantile() errors on non-data.frame input", { - expect_error(prediction_is_quantile(1:5)) -}) - -test_that("prediction_is_quantile() handles empty data frame", { - data <- data.frame(prediction = numeric(0)) - - expect_false(prediction_is_quantile(data)) -}) - -test_that("prediction_is_quantile() handles NA values", { - data <- data.frame( - prediction = c(1, NA, 3), - quantile = c(0.1, NA, 0.5) - ) - - expect_true(prediction_is_quantile(data)) -}) - -test_that("is_scoringutils_check() is working", { - checked <- suppressMessages(check_forecasts(example_binary)) - expect_true(is_scoringutils_check(checked)) - - checked$cleaned_data <- NULL - expect_error(is_scoringutils_check(checked)) -}) +# test_that("prediction_is_quantile() correctly identifies quantile predictions", { +# data <- data.frame( +# predicted = 1:3, +# quantile = c(0.1, 0.5, 0.9) +# ) +# +# expect_true(prediction_is_quantile(data)) +# }) + +# test_that("prediction_is_quantile() returns false for non-quantile predictions", { +# data <- data.frame( +# predicted = rnorm(5) +# ) +# +# expect_false(prediction_is_quantile(data)) +# }) + +# test_that("prediction_is_quantile() returns false if quantile column has wrong values", { +# data <- data.frame( +# predicted = rnorm(5), +# quantile = rnorm(5) +# ) +# +# expect_true(prediction_is_quantile(data)) +# }) + +# 
test_that("prediction_is_quantile() returns false if quantile column is character", { +# data <- data.frame( +# predicted = rep(rnorm(5), 3), +# quantile = c("A", "B", "C") +# ) +# +# expect_true(prediction_is_quantile(data)) +# }) + +# test_that("prediction_is_quantile() errors on non-data.frame input", { +# expect_error(prediction_is_quantile(1:5)) +# }) +# +# test_that("prediction_is_quantile() handles empty data frame", { +# data <- data.frame(predicted = numeric(0)) +# +# expect_false(prediction_is_quantile(data)) +# }) +# +# test_that("prediction_is_quantile() handles NA values", { +# data <- data.frame( +# predicted = c(1, NA, 3), +# quantile = c(0.1, NA, 0.5) +# ) +# +# expect_true(prediction_is_quantile(data)) +# }) + +# test_that("is_scoringutils_check() is working", { +# checked <- suppressMessages(validate_forecast(example_binary)) +# expect_true(is_scoringutils_check(checked)) +# +# checked$cleaned_data <- NULL +# expect_error(is_scoringutils_check(checked)) +# }) diff --git a/tests/testthat/test-utils_data_handling.R b/tests/testthat/test-utils_data_handling.R index 930ccc002..30b8c3a6f 100644 --- a/tests/testthat/test-utils_data_handling.R +++ b/tests/testthat/test-utils_data_handling.R @@ -2,8 +2,8 @@ test_that("range_long_to_quantile works", { long <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), + observed = 1:10, + predicted = c(2:11, 4:13), range = 50, boundary = rep(c("lower", "upper"), each = 10) ) @@ -11,8 +11,8 @@ test_that("range_long_to_quantile works", { quantile <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), + observed = 1:10, + predicted = c(2:11, 4:13), quantile = rep(c(0.25, 0.75), each = 10) ) @@ -22,35 +22,54 @@ test_that("range_long_to_quantile works", { -test_that("quantile_to_range_long works", { +test_that("quantile_to_interval.data.frame() works", { quantile <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), + observed = 1:10, + predicted = c(2:11, 4:13), quantile = rep(c(0.25, 0.75), each = 10) ) - long <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), + observed = 1:10, + predicted = c(2:11, 4:13), range = 50, boundary = rep(c("lower", "upper"), each = 10) ) - - long2 <- as.data.frame(scoringutils:::quantile_to_range_long(quantile, + long2 <- as.data.frame(quantile_to_interval( + quantile, keep_quantile_col = FALSE )) - data.table::setcolorder(long2, names(long)) - # for some reason this is needed to pass the unit tests on gh actions long2$boundary <- as.character(long2$boundary) long$boundary <- as.character(long$boundary) - expect_equal(long, as.data.frame(long2)) + + # check that it handles NA values + setDT(quantile) + quantile[c(1, 3, 11, 13), c("observed", "predicted", "quantile") := NA] + # in this instance, a problem appears because there is an NA value both + # for the upper and lower bound. 
+ expect_message( + quantile_to_interval( + quantile, + keep_quantile_col = FALSE, + format = "wide" + ), + "Aggregate function missing, defaulting to 'length'" + ) + quantile <- quantile[-c(1, 3), ] + wide2 <- scoringutils:::quantile_to_interval( + quantile, + keep_quantile_col = FALSE, + format = "wide" + ) + expect_equal(nrow(wide2), 10) + expect_true(!("NA") %in% colnames(wide2)) + expect_equal(sum(wide2$lower, na.rm = TRUE), 59) }) @@ -58,20 +77,20 @@ test_that("sample_to_quantiles works", { samples <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), - sample = rep(1:5, each = 10) + observed = 1:10, + predicted = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), + sample_id = rep(1:5, each = 10) ) quantile <- data.frame( date = rep(as.Date("2020-01-01") + 1:10, each = 2), model = "model1", - true_value = rep(1:10, each = 2), + observed = rep(1:10, each = 2), quantile = c(0.25, 0.75), - prediction = rep(2:11, each = 2) + c(0, 2) + predicted = rep(2:11, each = 2) + c(0, 2) ) - quantile2 <- scoringutils::sample_to_quantile(samples, quantiles = c(0.25, 0.75)) + quantile2 <- sample_to_quantile(samples, quantiles = c(0.25, 0.75)) expect_equal(quantile, as.data.frame(quantile2)) @@ -90,22 +109,32 @@ test_that("sample_to_quantiles works", { }) +test_that("sample_to_quantiles issue 557 fix", { + + out <- example_integer %>% + sample_to_quantile( + quantiles = c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) + ) %>% + score() + + expect_equal(any(is.na(out$interval_coverage_deviation)), FALSE) +}) test_that("sample_to_range_long works", { samples <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), - sample = rep(1:5, each = 10) + observed = 1:10, + predicted = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), + sample_id = rep(1:5, each = 10) ) long <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), + observed = 1:10, + predicted = c(2:11, 4:13), range = 50, boundary = rep(c("lower", "upper"), each = 10) ) @@ -123,3 +152,120 @@ test_that("sample_to_range_long works", { expect_equal(long, as.data.frame(long2)) }) + +test_that("quantile_to_range works - scalar and vector case", { + predicted <- 9:1 + quantile <- rev(seq(0.1, 0.9, 0.1)) + observed <- 5 + + # check output is produced + out1 <- quantile_to_interval(observed, predicted, quantile) + expect_snapshot(out1) + + # check order of predictions doesn't matter + predicted <- 1:9 + quantile <- seq(0.1, 0.9, 0.1) + out2 <- quantile_to_interval(observed, predicted, quantile) + expect_equal(out1, out2) + + # check error if observed is a vector and predicted is a vector as well + expect_error(quantile_to_interval( + observed = c(1, 2), predicted = c(1, 2), quantile = c(0.1, 0.9)), + "Assertion on 'predicted' failed: Must be of type 'matrix', not 'double'." 
+ ) + + # check NA values are handled gracefully - there should just be NA in the output + predicted <- c(1:8, NA) + quantile <- seq(0.1, 0.9, 0.1) + observed <- 5 + out3 <- quantile_to_interval(observed, predicted, quantile) + expect_snapshot(out3) + + # check non-symmetrical intervals are handled gracefully + # result should be newly introduced ranges where one value is NA + predicted <- c(1:9) + quantile <- c(seq(0.1, 0.8, 0.1), 0.95) + observed <- 5 + out4 <- quantile_to_interval(observed, predicted, quantile) + expect_snapshot(out4) + + # check function works without a median + predicted <- c(1:8) + quantile <- c(seq(0.1, 0.4, 0.1), seq(0.6, 0.9, 0.1)) + observed <- 5 + expect_no_condition( + quantile_to_interval(observed, predicted, quantile) + ) + + # check a one-dimensional matrix works fine + predicted <- matrix(1:9, nrow = 1) + quantile <- seq(0.1, 0.9, 0.1) + expect_no_condition( + quantile_to_interval(observed, predicted, quantile) + ) +}) + +test_that("quantile_to_range works - matrix case", { + n <- 5 + N <- 9 + predicted <- matrix(1:45, nrow = n, ncol = N) + quantile <- seq(0.1, 0.9, 0.1) + observed <- seq(21, 25, 1) + + # check output is produced + out1 <- quantile_to_interval(observed, predicted, quantile) + expect_snapshot(out1) + + # check order of predictions doesn't matter + predicted <- matrix( + c(41:45, 36:40, 31:35, 26:30, 21:25, 16:20, 11:15, 6:10, 1:5), + nrow = n, + ncol = N + ) + quantile <- rev(seq(0.1, 0.9, 0.1)) + out2 <- quantile_to_interval(observed, predicted, quantile) + expect_equal(out1, out2) + + # check NA values are fine + predicted[1, 1] <- NA + expect_no_condition( + quantile_to_interval(observed, predicted, quantile) + ) +}) + + +test_that("quantile_to_interval works - data.frame case", { + dt <- data.table( + observed = 5, + predicted = 1:9, + quantile = seq(0.1, 0.9, 0.1) + ) + + expect_no_condition( + quantile_to_interval(dt) + ) + + expect_no_condition( + quantile_to_interval(dt, format = "wide") + ) + + # check that the number of rows after transformation is equal to the number + # of rows plus the number of medians added (as upper boundary of a 0% + # prediction interval) + ex <- example_quantile[!is.na(predicted)] + n_preds <- nrow(ex) + n_medians <- nrow(ex[quantile == 0.5]) + ex_interval <- quantile_to_interval(ex, keep_quantile_col = TRUE) + expect_equal( + nrow(ex_interval), + n_preds + n_medians + ) + + expect_equal( + colnames(ex_interval), + c(colnames(ex), "boundary", "range") + ) +}) + + + diff --git a/vignettes/metric-details.Rmd b/vignettes/metric-details.Rmd index 71f423bbb..384bb71f1 100644 --- a/vignettes/metric-details.Rmd +++ b/vignettes/metric-details.Rmd @@ -26,8 +26,8 @@ library(data.table) This table gives an overview for when which metric can be applied and gives a very brief description. Note that this table on shows the metrics as implemented in `scoringutils`. For example, only scoring of sample-based discrete and continuous distributions is implemented in `scoringutils`, but closed-form solutions often exist (e.g. in the `scoringRules` package). 
```{r, echo = FALSE, results = "asis"} -data <- copy(metrics) -setnames(data, old = c("Discrete", "Continuous", "Binary", "Quantile"), +data <- copy(metrics) +setnames(data, old = c("Discrete", "Continuous", "Binary", "Quantile"), new = c("D", "C", "B", "Q")) data[, c("Name", "Functions") := NULL] @@ -48,8 +48,8 @@ data <- data[, 1:6] %>% data %>% kbl(format = "html", escape = FALSE, - align = c("lccccl"), - linesep = c('\\addlinespace')) %>% + align = "lccccl", + linesep = "\\addlinespace") %>% column_spec(1, width = "3.2cm") %>% column_spec(2, width = "1.5cm") %>% column_spec(3, width = "1.5cm") %>% @@ -65,9 +65,11 @@ data %>% ## Detailed explanation of the metrics implemented in `scoringutils` ```{r, echo = FALSE, results = "asis"} - data <- readRDS( - system.file("metrics-overview/metrics-detailed.Rda", package = "scoringutils") + system.file( + "metrics-overview", "metrics-detailed.rds", + package = "scoringutils" + ) ) data[, 1:2] %>% diff --git a/vignettes/scoring-forecasts-directly.Rmd b/vignettes/scoring-forecasts-directly.Rmd index e6083f205..bd956092a 100644 --- a/vignettes/scoring-forecasts-directly.Rmd +++ b/vignettes/scoring-forecasts-directly.Rmd @@ -17,7 +17,6 @@ knitr::opts_chunk$set( ) library(scoringutils) library(data.table) -library(ggplot2) ``` A variety of metrics and scoring rules can also be accessed directly through @@ -35,7 +34,7 @@ For continuous forecasts, Bias is measured as $$B_t (P_t, x_t) = 1 - 2 \cdot (P_t (x_t))$$ where $P_t$ is the empirical cumulative distribution function of the -prediction for the true value $x_t$. Computationally, $P_t (x_t)$ is +prediction for the observed value $x_t$. Computationally, $P_t (x_t)$ is just calculated as the fraction of predictive samples for $x_t$ that are smaller than $x_t$. @@ -48,14 +47,14 @@ assume values between -1 and 1 and is 0 ideally. ```{r} ## integer valued forecasts -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -bias_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +bias_sample(observed, predicted) ## continuous forecasts -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(30, mean = 1:30)) -bias_sample(true_values, predictions) +observed <- rnorm(30, mean = 1:30) +predicted <- replicate(200, rnorm(30, mean = 1:30)) +bias_sample(observed, predicted) ``` @@ -64,13 +63,13 @@ Sharpness is the ability of the model to generate predictions within a narrow range. It is a data-independent measure, and is purely a feature of the forecasts themselves. -Sharpness / dispersion of predictive samples corresponding to one single true value is +Sharpness / dispersion of predictive samples corresponding to one single observed value is measured as the normalised median of the absolute deviation from the median of the predictive samples. For details, see `?stats::mad` ```{r} -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -mad_sample(predictions) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +mad_sample(predicted = predicted) ``` # Calibration @@ -126,9 +125,9 @@ Wrapper around the `crps_sample()` function from the integer valued forecasts. Smaller values are better. 
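For sample-based forecasts, the CRPS can be estimated directly from the predictive samples as $\text{CRPS}(P_t, x_t) = E|X - x_t| - \frac{1}{2} E|X - X'|$, where $X$ and $X'$ are independent draws from the predictive distribution $P_t$. The following minimal sketch (an illustrative calculation only, assuming the `scoringRules` package is available for comparison) computes this estimator by hand; it should closely match `crps_sample()`.

```r
set.seed(123)
observed <- 3                       # a single observed value
predicted <- rnorm(1000, mean = 2)  # predictive samples for that observation

# CRPS of the empirical predictive distribution: E|X - y| - 0.5 * E|X - X'|
crps_manual <- mean(abs(predicted - observed)) -
  0.5 * mean(abs(outer(predicted, predicted, "-")))

crps_manual
scoringRules::crps_sample(y = observed, dat = predicted)
```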
```{r} -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -crps_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +crps_sample(observed, predicted) ``` @@ -140,9 +139,9 @@ Wrapper around the `dss_sample()` function from the integer valued forecasts. Smaller values are better. ```{r} -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -dss_sample(true_values, predictions) +observed <- rpois(30, lambda = 1:30) +predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) +dss_sample(observed, predicted) ``` # Log Score @@ -154,27 +153,27 @@ forecasts they require a kernel density estimate which is not well defined for discrete values. Smaller values are better. ```{r} -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -logs_sample(true_values, predictions) +observed <- rnorm(30, mean = 1:30) +predicted <- replicate(200, rnorm(n = 30, mean = 1:30)) +logs_sample(observed, predicted) ``` # Brier Score The Brier score is a proper score rule that assesses the accuracy of probabilistic binary predictions. The outcomes can be either 0 or 1, -the predictions must be a probability that the true outcome will be 1. +the predictions must be a probability that the observed outcome will be 1. The Brier Score is then computed as the mean squared error between the -probabilistic prediction and the true outcome. +probabilistic prediction and the observed outcome. $$\text{Brier_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\text{prediction}_t - \text{outcome}_t)^2$$ ```{r} -true_values <- sample(c(0, 1), size = 30, replace = TRUE) -predictions <- runif(n = 30, min = 0, max = 1) +observed <- factor(sample(c(0, 1), size = 30, replace = TRUE)) +predicted <- runif(n = 30, min = 0, max = 1) -brier_score(true_values, predictions) +brier_score(observed, predicted) ``` ## Interval Score @@ -184,9 +183,9 @@ following Gneiting and Raftery (2007). Smaller values are better. The score is computed as $$ \text{score} = (\text{upper} - \text{lower}) + \\ -\frac{2}{\alpha} \cdot (\text{lower} - \text{true_value}) \cdot 1(\text{true_values} < \text{lower}) + \\ -\frac{2}{\alpha} \cdot (\text{true_value} - \text{upper}) \cdot -1(\text{true_value} > \text{upper})$$ +\frac{2}{\alpha} \cdot (\text{lower} - \text{observed}) \cdot 1(\text{observed} < \text{lower}) + \\ +\frac{2}{\alpha} \cdot (\text{observed} - \text{upper}) \cdot +1(\text{observed} > \text{upper})$$ where $1()$ is the indicator function and $\alpha$ is the decimal value that @@ -203,17 +202,13 @@ the Interval Score converges to the CRPS for increasing number of quantiles. 
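To make the formula concrete, here is a minimal sketch that scores a single central prediction interval straight from the definition above (the helper name `interval_score_manual` is made up for illustration and is not part of the package):

```r
# interval score of one central prediction interval, computed from the formula
interval_score_manual <- function(observed, lower, upper, interval_range) {
  alpha <- (100 - interval_range) / 100
  (upper - lower) +
    2 / alpha * (lower - observed) * (observed < lower) +
    2 / alpha * (observed - upper) * (observed > upper)
}

# a 90% interval from 5 to 10; the observation 12 lies above the upper bound
interval_score_manual(observed = 12, lower = 5, upper = 10, interval_range = 90)
#> (10 - 5) + 2 / 0.1 * (12 - 10) = 45
```

The weighted interval score computed by `wis()` in the example below aggregates interval scores of this form across all central prediction intervals implied by the provided quantiles.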
```{r} -true_values <- rnorm(30, mean = 1:30) -interval_range <- 90 -alpha <- (100 - interval_range) / 100 -lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) -upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) - -interval_score( - true_values = true_values, - lower = lower, - upper = upper, - interval_range = interval_range +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) ) -``` +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +wis(observed, predicted, quantile) +``` diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd index 022e0a2ad..5c35cd492 100644 --- a/vignettes/scoringutils.Rmd +++ b/vignettes/scoringutils.Rmd @@ -23,7 +23,7 @@ library(knitr) data.table::setDTthreads(2) ``` -The `scoringutils` package provides a collection of metrics and proper scoring rules that make it simple to score probabilistic forecasts against the true observed values. You can find more information in the paper [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) as well as the [Metrics-Vignette](https://epiforecasts.io/scoringutils/articles/metric-details.html) and the [Scoring forecasts directly Vignette](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html). +The `scoringutils` package provides a collection of metrics and proper scoring rules that make it simple to score probabilistic forecasts against observed values. You can find more information in the paper [Evaluating Forecasts with scoringutils in R](https://arxiv.org/abs/2205.07090) as well as the [Metrics-Vignette](https://epiforecasts.io/scoringutils/articles/metric-details.html) and the [Scoring forecasts directly Vignette](https://epiforecasts.io/scoringutils/articles/scoring-forecasts-directly.html). The `scoringutils` package offers convenient automated forecast evaluation in a `data.table` format (using the function `score()`), but also provides experienced users with a set of reliable lower-level scoring metrics operating on vectors/matriced they can build upon in other applications. In addition it implements a wide range of flexible plots that are able to cover many use cases. @@ -41,8 +41,8 @@ requirements <- data.table( "quantile-based", "sample-based", "binary", "pairwise-comparisons" ), `Required columns` = c( - "'true_value', 'prediction', 'quantile'", - "'true_value', 'prediction', 'sample'", "'true_value', 'prediction'", + "'observed', 'predicted', 'quantile'", + "'observed', 'predicted', 'sample'", "'observed', 'predicted'", "additionally a column 'model'" ) ) @@ -50,7 +50,7 @@ kable(requirements) ``` Additional columns may be present to indicate a grouping of forecasts. For example, we could have forecasts made by different models in various locations at different time points, each for several weeks into the future. -`scoringutils` automatically tries to determine the *unit of a single forecast*, i.e. the combination of existing columns that are able to uniquely identify a single forecast. It uses all existing columns for this, which can sometimes lead to issues. We therefore recommend using the function `set_forecast_unit()` to determine the forecast unit manually. The function simply drops unneeded columns, while making sure that some necessary, 'protected columns' like "prediction" or "true_value" are retained. +`scoringutils` automatically tries to determine the *unit of a single forecast*, i.e. the combination of existing columns that are able to uniquely identify a single forecast. 
It uses all existing columns for this, which can sometimes lead to issues. We therefore recommend using the function `set_forecast_unit()` to determine the forecast unit manually. The function simply drops unneeded columns, while making sure that some necessary, 'protected columns' like "predicted" or "observed" are retained. ```{r} colnames(example_quantile) @@ -62,46 +62,78 @@ set_forecast_unit( colnames() ``` +## Constructing a forecast object -## Checking the input data - -The function `check_forecasts()` can be used to check the input data. It gives a summary of what `scoringutils` thinks you are trying to achieve. It infers the type of the prediction target, the prediction format, and the unit of a single forecasts, gives an overview of the number of unique values per column (helpful for spotting missing data) and returns warnings or errors. +The function `as_forecast()` is used to construct a forecast object and check the input data. It determines the forecast type, creates an object of the appropriate class (`forecast_binary`, `forecast_quantile` or `forecast_sample`), and validates the input data. Objects of class `forecast_*` have a print method that provides additional information. ```{r} head(example_quantile) ``` ```{r} -check_forecasts(example_quantile) +forecast_quantile <- example_quantile %>% + set_forecast_unit( + c("model", "location", "target_end_date", "forecast_date", + "target_type", "horizon") + ) %>% + as_forecast() + +forecast_quantile ``` If you are unsure what your input data should look like, have a look at the `example_quantile`, `example_integer`, `example_continuous` and `example_binary` data sets provided in the package. -The output of `check_forecasts()` can later be directly used as input to `score()` (otherwise, `score()` will just run `check_forecasts()` anyway internally). +The output of `as_forecast()` can later be directly used as input to `score()`. This is the recommended workflow (however, `score()` will run `as_forecast()` internally if this hasn't happened before). + +Note that in the above example some columns contain duplicated information with regards to the forecast unit, e.g. "location" and "location_name", and can be dropped. If we drop essential information, for example, the "target_type" column, we'll get an error informing us that the forecasts aren't uniquely identified any more. + +```{r, error=TRUE} +example_quantile %>% + set_forecast_unit( + c("location", "target_end_date", + "forecast_date", "model", "horizon") + ) %>% + as_forecast() +``` + +The function `get_duplicate_forecasts()` may help to investigate the issue. When filtering for only a single quantile of the EuroCOVIDhub-ensemble, we can see that there are indeed two forecasts for every date, location and horizon. + +```{r} +duplicates <- example_quantile %>% + set_forecast_unit( + c("location", "target_end_date", + "forecast_date", "model", "horizon") + ) %>% + get_duplicate_forecasts() + +duplicates[quantile == 0.5 & model == "EuroCOVIDhub-ensemble", ] %>% + head() +``` + ## Showing available forecasts -The function `avail_forecasts()` may also be helpful to determine where forecasts are available. Using the `by` argument you can specify the level of summary. For example, to see how many forecasts there are per model and target_type, we can run +The function `get_forecast_counts()` may also be helpful to determine where forecasts are available. Using the `by` argument you can specify the level of summary.
For example, to see how many forecasts there are per model and target_type, we can run ```{r} -avail_forecasts(example_quantile, by = c("model", "target_type")) +get_forecast_counts(forecast_quantile, by = c("model", "target_type")) ``` We see that 'epiforecasts-EpiNow2' has some missing forecasts for the deaths forecast target and that UMass-MechBayes has no case forecasts. -This information can also be visualised using the `plot_avail_forecasts()` function: +This information can also be visualised using `plot_forecast_counts()`: ```{r, fig.width=11, fig.height=6} -example_quantile %>% - avail_forecasts(by = c("model", "forecast_date", "target_type")) %>% - plot_avail_forecasts() + +forecast_quantile %>% + get_forecast_counts(by = c("model", "forecast_date", "target_type")) %>% + plot_forecast_counts(x = "forecast_date") + facet_wrap(~ target_type) ``` You can also visualise forecasts directly using the `plot_predictions()` function: ```{r, fig.width = 9, fig.height = 6} -example_quantile %>% +forecast_quantile %>% make_NA( what = "truth", target_end_date >= "2021-07-15", @@ -123,82 +155,24 @@ example_quantile %>% Forecasts can easily be scored using the `score()` function. -For clarity, we suggest setting the forecast unit explicitly and you may also want to call `check_forecasts()` explicitly. +For clarity, we suggest setting the forecast unit explicitly and calling `as_forecast()` explicitly. ```{r} -scores <- example_quantile %>% - set_forecast_unit( - c("location", "target_end_date", "target_type", "location_name", - "forecast_date", "model", "horizon") - ) %>% - check_forecasts() %>% +scores <- forecast_quantile %>% score() -head(scores) -``` - -Note that in the above example some columns contain duplicated information with regards to the forecast unit, e.g. "location" and "location_name", and could be dropped. -```{r} -example_quantile %>% - set_forecast_unit( - c("location", "target_end_date", "target_type", - "forecast_date", "model", "horizon") - ) %>% - check_forecasts() +print(scores, 2) ``` -If we drop essential information, for example, the "target_type" column, we'll get an error informing us that the forecasts aren't uniquely identified any more. +The function `score()` returns unsummarised scores, which in most cases is not what the user wants. It returns a single score per forecast (as determined by the forecast unit). -```{r, error=TRUE} -example_quantile %>% - set_forecast_unit( - c("location", "target_end_date", - "forecast_date", "model", "horizon") - ) %>% - check_forecasts() -``` - -The function `find_duplicates()` may help to investigate the issue. When filtering for only median forecasts of the EuroCOVIDhub-ensemble, we can see that there are indeed two forecasts for every date, location and horizon. - -```{r} -duplicates <- example_quantile %>% - set_forecast_unit( - c("location", "target_end_date", - "forecast_date", "model", "horizon") - ) %>% - find_duplicates() - -duplicates[quantile == 0.5 & model == "EuroCOVIDhub-ensemble", ] %>% - head() -``` - -The function `score()` returns unsumarised scores, which in most cases is not what the user wants. It returns a single score per forecast (as determined by the forecast unit). For forecasts in a quantile format, it returns one score per quantile. - -A second function, `summarise_scores()` takes care of summarising these scores to the level specified by the user. The `by` argument can be used to define the level of summary. By default, `by = NULL` and the summary unit is assumed to be the same as the unit of a single forecast.
For continuous forecasts, this means that nothing happens if `by` isn't specified. +A second function, `summarise_scores()` takes care of summarising these scores to the level specified by the user. The `by` argument can be used to define the level of summary. By default, `by = NULL` and the summary unit is assumed to be the same as the unit of a single forecast (meaning that nothing happens if `by` isn't specified). ```{r} scores <- score(example_continuous) all(scores == summarise_scores(scores), na.rm = TRUE) ``` -For quantile based forecasts, if `by = NULL`, then scores are summarised across quantiles and instead of one score per forecast_unit and quantile we only get one score per forecast unit. - -```{r} -scores <- example_quantile %>% - set_forecast_unit( - c("location", "target_end_date", "target_type", - "forecast_date", "model", "horizon") - ) %>% - check_forecasts() %>% - score() - -head(scores) - -scores %>% - summarise_scores() %>% - head() -``` - Through the `by` argument we can specify what unit of summary we want. We can also call `summarise_scores()` multiple times, e.g. to round your outputs by specifying e.g. `signif()` as a summary function. ```{r} @@ -225,7 +199,6 @@ For quantile-based forecasts we are often interested in specific coverage-levels ```{r} score(example_quantile) %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores(by = c("model", "target_type")) %>% summarise_scores(fun = signif, digits = 2) ``` @@ -236,9 +209,8 @@ In order to better compare models against each other we can use relative scores ```{r} score(example_quantile) %>% - summarise_scores(by = c("model", "target_type"), - relative_skill = TRUE, - baseline = "EuroCOVIDhub-ensemble") + summarise_scores(by = c("model", "target_type")) %>% + add_pairwise_comparison(baseline = "EuroCOVIDhub-ensemble") ``` @@ -280,7 +252,7 @@ score(example_quantile) %>% ## Calibration -Calibration is a measure statistical consistency between the forecasts and the observed values. The most common way of assessing calibration (more precisely: probabilistic calibration) are PIT histograms. The probability integral transform (PIT) is equal to the cumulative distribution function of a forecast evaluated at the true observed value. Ideally, pit values should be uniformly distributed after the transformation. +Calibration is a measure of statistical consistency between the forecasts and observed values. The most common way of assessing calibration (more precisely: probabilistic calibration) are PIT histograms. The probability integral transform (PIT) is equal to the cumulative distribution function of a forecast evaluated at the observed value. Ideally, pit values should be uniformly distributed after the transformation. We can compute pit values as such: @@ -307,20 +279,20 @@ example_quantile[quantile %in% seq(0.1, 0.9, 0.1), ] %>% facet_grid(model ~ target_type) ``` -Another way to look at calibration are interval coverage and quantile coverage. Interval coverage is the percentage of true values that fall inside a given central prediction interval. Quantile coverage is the percentage of observed values that fall below a given quantile level. + -In order to plot interval coverage, you need to include "range" in the `by` argument to `summarise_scores()`. The green area on the plot marks conservative behaviour, i.e. your empirical coverage is greater than it nominally need be (e.g. 55% of true values covered by all 50% central prediction intervals.)
+ -```{r} +```{r, eval=FALSE, include=FALSE} example_quantile %>% score() %>% summarise_scores(by = c("model", "range")) %>% plot_interval_coverage() ``` -To visualise quantile coverage, you need to include "quantile" in `by`. Again, the green area corresponds to conservative forecasts, where central prediction intervals would cover more than needed. + -```{r} +```{r, eval=FALSE, include=FALSE} example_quantile %>% score() %>% summarise_scores(by = c("model", "quantile")) %>% @@ -334,23 +306,22 @@ Relative scores for different models can be computed using pairwise comparisons, In `scoringutils`, pairwise comparisons can be made in two ways: Through the standalone function `pairwise_comparison()` or from within `summarise_scores()` which simply adds relative scores to an existing set of scores. ```{r} -example_quantile %>% +forecast_quantile %>% score() %>% pairwise_comparison(by = "model", baseline = "EuroCOVIDhub-baseline") ``` ```{r} -example_quantile %>% +forecast_quantile %>% score() %>% - summarise_scores( - by = "model", relative_skill = TRUE, baseline = "EuroCOVIDhub-baseline" - ) + summarise_scores(by = "model") %>% + add_pairwise_comparison(baseline = "EuroCOVIDhub-baseline") ``` If using the `pairwise_comparison()` function, we can also visualise pairwise comparisons by showing the mean score ratios between models. By default, smaller values are better and the model we care about is shown on the y-axis on the left, while the model it is compared against is shown on the x-axis on the bottom. In the example above, the EuroCOVIDhub-ensemble performs best (it only has values smaller than 1), while the EuroCOVIDhub-baseline performs worst (and only has values larger than 1). For cases, the UMass-MechBayes model is of course excluded as there are no case forecasts available and therefore the set of overlapping forecasts is empty. ```{r, fig.width=9, fig.height=7} -example_quantile %>% +forecast_quantile %>% score() %>% pairwise_comparison(by = c("model", "target_type")) %>% plot_pairwise_comparison() + @@ -365,7 +336,7 @@ example_quantile %>% It may sometimes be interesting to see how different scores correlate with each other. We can examine this using the function `correlation()`. When dealing with quantile-based forecasts, it is important to call `summarise_scores()` before `correlation()` to summarise over quantiles before computing correlations. ```{r} -example_quantile %>% +forecast_quantile %>% score() %>% summarise_scores() %>% correlation() @@ -374,19 +345,19 @@ example_quantile %>% Visualising correlations: ```{r} -example_quantile %>% +forecast_quantile %>% score() %>% summarise_scores() %>% - correlation() %>% + correlation(digits = 2) %>% plot_correlation() ``` -### Scores by interval ranges + -If you would like to see how different forecast interval ranges contribute to average scores, you can visualise scores by interval range: + -```{r} -example_quantile %>% +```{r, eval = FALSE, include = FALSE} +forecast_quantile %>% score() %>% summarise_scores(by = c("model", "range", "target_type")) %>% plot_ranges() + @@ -404,8 +375,7 @@ example_integer %>% sample_to_quantile( quantiles = c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) ) %>% - score() %>% - add_coverage(by = c("model", "target_type")) + score() ``` ## Available metrics