From abf946fd5d2ae08b93fabd20cca86a2f64e4bc15 Mon Sep 17 00:00:00 2001 From: Charlton Callender Date: Fri, 17 Sep 2021 14:10:15 -0700 Subject: [PATCH] option to identify all possible overlapping intervals --- DESCRIPTION | 2 +- R/interval_assertions.R | 36 +++++++++++++++-------- R/interval_collapse.R | 2 +- man/overlapping_intervals.Rd | 14 ++++++--- tests/testthat/test-interval_assertions.R | 14 ++++----- 5 files changed, 43 insertions(+), 25 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ff45325..d12882f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,7 @@ Suggests: ggplot2, scales Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 VignetteBuilder: knitr Language: en-US Depends: R (>= 2.10) diff --git a/R/interval_assertions.R b/R/interval_assertions.R index 48506d8..8e827fa 100644 --- a/R/interval_assertions.R +++ b/R/interval_assertions.R @@ -84,6 +84,10 @@ identify_missing_intervals <- function(ints_dt, expected_ints_dt) { #' overlapping intervals. #' #' @inheritParams identify_missing_intervals +#' @param identify_all_possible \[`logical(1)`\]\cr +#' Whether to return all overlapping intervals ('TRUE') or try to identify just +#' the less granular interval ('FALSE'). Default is 'FALSE'. Useful when it may +#' not be clear what is the less granular interval. #' #' @return `identify_overlapping_intervals` returns a \[`data.table()`\] with #' columns for the 'start' and 'end' of the overlapping intervals. If no @@ -93,10 +97,11 @@ identify_missing_intervals <- function(ints_dt, expected_ints_dt) { #' #' @examples #' ints_dt <- data.table::data.table( -#' start = c(seq(0, 95, 5), 0), -#' end = c(seq(5, 95, 5), Inf, Inf) +#' start = c(seq(10, 50, 5), 0), +#' end = c(seq(15, 55, 5), 11) #' ) -#' overlapping_dt <- identify_overlapping_intervals(ints_dt) +#' overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = FALSE) +#' overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = TRUE) #' #' #' @export @@ -115,7 +120,7 @@ assert_no_overlapping_intervals <- function(ints_dt) { #' @export #' @rdname overlapping_intervals -identify_overlapping_intervals <- function(ints_dt) { +identify_overlapping_intervals <- function(ints_dt, identify_all_possible = FALSE) { assertthat::assert_that( assertive::is_data.table(ints_dt), @@ -133,20 +138,27 @@ identify_overlapping_intervals <- function(ints_dt) { overlaps <- intervals::interval_overlap(ints, ints) names(overlaps) <- 1:length(overlaps) + # remove self match only + overlaps <- overlaps[sapply(overlaps, function(i) length(i) > 1)] + # sort by number of intervals that each interval overlaps with so that we can # identify the largest overlapping intervals first overlaps <- overlaps[order(sapply(overlaps, length), decreasing=T)] - overlapping_indices <- c() - for (i in names(overlaps)) { - # remove match to itself - overlaps[[i]] <- overlaps[[i]][overlaps[[i]] != i] + if (identify_all_possible) { + overlapping_indices <- names(overlaps) + } else { + overlapping_indices <- c() + for (i in names(overlaps)) { + # remove match to itself + overlaps[[i]] <- overlaps[[i]][overlaps[[i]] != i] - # remove indices of overlapping intervals that have already been identified - overlaps[[i]] <- overlaps[[i]][!overlaps[[i]] %in% overlapping_indices] + # remove indices of overlapping intervals that have already been identified + overlaps[[i]] <- overlaps[[i]][!overlaps[[i]] %in% overlapping_indices] - if (length(overlaps[[i]]) > 0) { - overlapping_indices <- c(overlapping_indices, i) + if (length(overlaps[[i]]) > 0) { + overlapping_indices <- c(overlapping_indices, i) + } } } overlapping_ints <- ints[as.integer(overlapping_indices)] diff --git a/R/interval_collapse.R b/R/interval_collapse.R index 5386016..760ebf0 100644 --- a/R/interval_collapse.R +++ b/R/interval_collapse.R @@ -137,7 +137,7 @@ collapse_common_intervals <- function(dt, # check for overlapping intervals if (overlapping_dt_severity != "skip") { overlapping_dt <- dt[ - , identify_overlapping_intervals(unique(.SD)), + , identify_overlapping_intervals(unique(.SD), identify_all_possible = overlapping_dt_severity != "none"), .SDcols = cols, by = by_id_cols ] data.table::setnames(overlapping_dt, c("start", "end"), cols) diff --git a/man/overlapping_intervals.Rd b/man/overlapping_intervals.Rd index 39b44cc..1116c56 100644 --- a/man/overlapping_intervals.Rd +++ b/man/overlapping_intervals.Rd @@ -7,12 +7,17 @@ \usage{ assert_no_overlapping_intervals(ints_dt) -identify_overlapping_intervals(ints_dt) +identify_overlapping_intervals(ints_dt, identify_all_possible = FALSE) } \arguments{ \item{ints_dt}{[\code{data.table()}]\cr Unique intervals. The first column represents the start of each interval and the second column represents the end of each interval.} + +\item{identify_all_possible}{[\code{logical(1)}]\cr +Whether to return all overlapping intervals ('TRUE') or try to identify just +the less granular interval ('FALSE'). Default is 'FALSE'. Useful when it may +not be clear what is the less granular interval.} } \value{ \code{identify_overlapping_intervals} returns a [\code{data.table()}] with @@ -27,10 +32,11 @@ overlapping intervals. } \examples{ ints_dt <- data.table::data.table( - start = c(seq(0, 95, 5), 0), - end = c(seq(5, 95, 5), Inf, Inf) + start = c(seq(10, 50, 5), 0), + end = c(seq(15, 55, 5), 11) ) -overlapping_dt <- identify_overlapping_intervals(ints_dt) +overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = FALSE) +overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = TRUE) } diff --git a/tests/testthat/test-interval_assertions.R b/tests/testthat/test-interval_assertions.R index 4f6b6eb..6b09159 100644 --- a/tests/testthat/test-interval_assertions.R +++ b/tests/testthat/test-interval_assertions.R @@ -46,19 +46,19 @@ ints_dt <- data.table( testthat::test_that("missing intervals are identified correctly", { - testthat::expect_silent( assert_no_overlapping_intervals(ints_dt) ) - expected_overlapping_dt <- data.table( - start = c(15), - end = c(60) - ) - + expected_overlapping_dt <- data.table(start = c(15), end = c(60)) ints_dt <- rbind(ints_dt, expected_overlapping_dt) - overlapping_dt <- identify_overlapping_intervals(ints_dt) + overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = FALSE) + setkeyv(expected_overlapping_dt, c("start", "end")) + testthat::expect_equal(overlapping_dt, expected_overlapping_dt) + + overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = TRUE) + expected_overlapping_dt <- ints_dt[start >= 15 & end <= 60] setkeyv(expected_overlapping_dt, c("start", "end")) testthat::expect_equal(overlapping_dt, expected_overlapping_dt)