-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinterval_assertions.R
170 lines (147 loc) · 6.48 KB
/
interval_assertions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#' @title Is the interval variable missing any expected intervals?
#'
#' @description Checks to see if the input interval variable is missing any
#' expected intervals.
#'
#' @param ints_dt \[`data.table()`\]\cr
#' Unique intervals. The first column represents the start of each interval
#' and the second column represents the end of each interval.
#' @param expected_ints_dt \[`data.table()`\]\cr
#' The expected intervals that should be completely included in `ints_dt`.
#' The first column represents the start of each interval and the second
#' column represents the end of each interval.
#'
#' @return `identify_missing_intervals` returns a \[`data.table()`\] with
#' columns for the 'start' and 'end' of the missing intervals. If no intervals
#' are missing then a zero-row \[`data.table()`\] is returned.
#' `assert_no_missing_intervals` returns nothing but throws an error if
#' `identify_missing_intervals` returns a non-empty data.table.
#'
#' @examples
#' ints_dt <- data.table::data.table(
#' start = seq(0, 95, 5),
#' end = c(seq(5, 95, 5), Inf)
#' )
#' missing_dt <- identify_missing_intervals(
#' ints_dt = ints_dt[!start %in% c(0, 10, 95)],
#' expected_ints_dt = data.table::data.table(start = 0, end = Inf)
#' )
#'
#' @export
#' @rdname missing_intervals
assert_no_missing_intervals <- function(ints_dt, expected_ints_dt) {
missing_intervals <- identify_missing_intervals(ints_dt, expected_ints_dt)
no_missing_intervals <- nrow(missing_intervals) == 0
error_msg <-
paste0("There are missing intervals in `ints_dt`",
paste0(capture.output(missing_intervals), collapse = "\n"))
assertthat::assert_that(no_missing_intervals, msg = error_msg)
}
#' @export
#' @rdname missing_intervals
identify_missing_intervals <- function(ints_dt, expected_ints_dt) {
assertthat::assert_that(
assertive::is_data.table(ints_dt),
ncol(ints_dt) == 2,
all(ints_dt[[1]] < ints_dt[[2]]),
msg = paste("`ints_dt` must a 2-column data.table with the first column",
"representing the start of the interval and the second column",
"the end of each interval")
)
assertthat::assert_that(
assertive::is_data.table(expected_ints_dt),
ncol(ints_dt) == 2,
all(ints_dt[[1]] < ints_dt[[2]]),
msg = paste("`expected_ints_dt` must a 2-column data.table with the first",
"column representing the start of the interval and the second",
"column the end of each interval")
)
# create full interval that all sub intervals should span
expected_ints <- intervals::Intervals_full(as.matrix(expected_ints_dt),
closed = c(TRUE, FALSE))
# create left-closed, right-open intervals
ints <- intervals::Intervals_full(as.matrix(ints_dt), closed = c(TRUE, FALSE))
# identify missing intervals
missing_ints <- intervals::interval_difference(expected_ints, ints)
missing_ints_dt <- data.table::as.data.table(missing_ints)
data.table::setnames(missing_ints_dt, c("start", "end"))
data.table::setkeyv(missing_ints_dt, c("start", "end"))
return(missing_ints_dt)
}
#' @title Does the interval variable have any overlapping intervals?
#'
#' @description Checks to see if the input interval variable has any
#' overlapping intervals.
#'
#' @inheritParams identify_missing_intervals
#' @param identify_all_possible \[`logical(1)`\]\cr
#' Whether to return all overlapping intervals ('TRUE') or try to identify just
#' the less granular interval ('FALSE'). Default is 'FALSE'. Useful when it may
#' not be clear what is the less granular interval.
#'
#' @return `identify_overlapping_intervals` returns a \[`data.table()`\] with
#' columns for the 'start' and 'end' of the overlapping intervals. If no
#' intervals are overlapping then a zero-row \[`data.table()`\] is returned.
#' `assert_no_overlapping_intervals` returns nothing but throws an error if
#' `identify_overlapping_intervals` returns a non-empty data.table.
#'
#' @examples
#' ints_dt <- data.table::data.table(
#' start = c(seq(10, 50, 5), 0),
#' end = c(seq(15, 55, 5), 11)
#' )
#' overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = FALSE)
#' overlapping_dt <- identify_overlapping_intervals(ints_dt, identify_all_possible = TRUE)
#'
#'
#' @export
#' @rdname overlapping_intervals
assert_no_overlapping_intervals <- function(ints_dt) {
overlapping_intervals <- identify_overlapping_intervals(ints_dt)
no_overlapping_intervals <- nrow(overlapping_intervals) == 0
error_msg <-
paste0("There are overlapping intervals in `ints_dt`",
paste0(capture.output(overlapping_intervals), collapse = "\n"))
assertthat::assert_that(no_overlapping_intervals, msg = error_msg)
}
#' @export
#' @rdname overlapping_intervals
identify_overlapping_intervals <- function(ints_dt, identify_all_possible = FALSE) {
assertthat::assert_that(
assertive::is_data.table(ints_dt),
ncol(ints_dt) == 2,
all(ints_dt[[1]] < ints_dt[[2]]),
msg = paste("`ints_dt` must a 2-column data.table with the first column",
"representing the start of the interval and the second column",
"the end of each interval")
)
# create left-closed, right-open intervals
ints <- intervals::Intervals_full(as.matrix(ints_dt), closed = c(TRUE, FALSE))
# get list mapping between intervals if they overlap at all
overlaps <- intervals::interval_overlap(ints, ints)
names(overlaps) <- 1:length(overlaps)
# remove self match only
overlaps <- overlaps[sapply(overlaps, function(i) length(i) > 1)]
# sort by number of intervals that each interval overlaps with so that we can
# identify the largest overlapping intervals first
overlaps <- overlaps[order(sapply(overlaps, length), decreasing=T)]
if (identify_all_possible) {
overlapping_indices <- names(overlaps)
} else {
overlapping_indices <- c()
for (i in names(overlaps)) {
# remove match to itself
overlaps[[i]] <- overlaps[[i]][overlaps[[i]] != i]
# remove indices of overlapping intervals that have already been identified
overlaps[[i]] <- overlaps[[i]][!overlaps[[i]] %in% overlapping_indices]
if (length(overlaps[[i]]) > 0) {
overlapping_indices <- c(overlapping_indices, i)
}
}
}
overlapping_ints <- ints[as.integer(overlapping_indices)]
overlapping_ints_dt <- data.table::as.data.table(overlapping_ints)
data.table::setnames(overlapping_ints_dt, c("start", "end"))
data.table::setkeyv(overlapping_ints_dt, c("start", "end"))
return(overlapping_ints_dt)
}