-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #307 from ncss-tech/collapseHz
Add `collapseHz()`
- Loading branch information
Showing
7 changed files
with
695 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,307 @@ | ||
#' Collapse Horizons within Profiles Based on Pattern Matching | ||
#' | ||
#' Combines layers and aggregates data by grouping adjacent horizons which match `pattern` in | ||
#' `hzdesgn` or, alternately, share a common value in `by` argument. Numeric properties are combined | ||
#' using the weighted average, and other properties are derived from the dominant condition based on | ||
#' thickness of layers and values in each group. | ||
#' | ||
#' @param x A _SoilProfileCollection_ | ||
#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column. Default: | ||
#' `NULL`. | ||
#' @param by _character_. A column name specifying horizons that should be combined. Aggregation | ||
#' will be applied to adjacent groups of layers within profiles that have the same value in `by`. | ||
#' Used in lieu of `pattern` and `hzdesgn`. Default: `NULL`. | ||
#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default: | ||
#' `hzdesgnname(x, required = TRUE)`. | ||
#' @param FUN _function_. A function that returns a _logical_ vector equal in length to the number | ||
#' of horizons in `x`. Used only when `pattern` is specified. See details. | ||
#' @param ... Additional arguments passed to the matching function `FUN`. | ||
#' @param AGGFUN _list_. A _named_ list containing custom aggregation functions. List element names | ||
#' should match the column name that they transform. The functions defined should take three | ||
#' arguments: `x` (a vector of horizon property values), `top` (a vector of top depths), and | ||
#' `bottom` (a vector of bottom depths). Default: `NULL` applies `weighted.mean()` to all numeric | ||
#' columns not listed in `ignore_numerics` and takes the dominant condition (value with greatest | ||
#' aggregate thickness sum) for all other columns. See details. | ||
#' @param ignore_numerics _character_. Vector of column names that contain numeric values which | ||
#' should _not_ be aggregated using `weighted.mean()`. For example, soil color "value" and | ||
#' "chroma". | ||
#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for | ||
#' each group and in weighted averages. If `FALSE` `NA` values are propagated to the result. | ||
#' Default: `FALSE`. | ||
#' | ||
#' @details | ||
#' | ||
#' If a custom matching function (`FUN`) is used, it should accept arbitrary additional arguments | ||
#' via an ellipsis (`...`). It is not necessary to do anything with arguments, but the result should | ||
#' match the number of horizons found in the input SoilProfileCollection `x`. | ||
#' | ||
#' Custom aggregation functions defined in the `AGGFUN` argument should either return a single | ||
#' vector value for each group*column combination, or should return a _data.frame_ object with named | ||
#' columns. If the input column name is used as a column name in the result _data.frame_, then the | ||
#' values of that column name in the result _SoilProfileCollection_ will be replaced by the output | ||
#' of the aggregation function. See examples. | ||
#' | ||
#' @return A _SoilProfileCollection_ | ||
#' | ||
#' @author Andrew G. Brown | ||
#' | ||
#' @seealso `hz_dissolve()` | ||
#' | ||
#' @export | ||
#' | ||
#' @examples | ||
#' data(jacobs2000) | ||
#' | ||
#' # calculate a new SPC with genhz column based on patterns | ||
#' new_labels <- c("A", "E", "Bt", "Bh", "C") | ||
#' patterns <- c("A", "E", "B.*t", "B.*h", "C") | ||
#' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns) | ||
#' | ||
#' # use existing generalized horizon labels | ||
#' i <- collapseHz(jacobs2000_gen, by = "genhz") | ||
#' | ||
#' profile_id(i) <- paste0(profile_id(i), "_collapse") | ||
#' | ||
#' plot( | ||
#' c(i, jacobs2000), | ||
#' color = "genhz", | ||
#' name = "name", | ||
#' name.style = "center-center", | ||
#' cex.names = 1 | ||
#' ) | ||
#' | ||
#' # custom pattern argument | ||
#' j <- collapseHz(jacobs2000, | ||
#' c( | ||
#' `A` = "^A", | ||
#' `E` = "E", | ||
#' `Bt` = "[ABC]+t", | ||
#' `C` = "^C", | ||
#' `foo` = "bar" | ||
#' )) | ||
#' profile_id(j) <- paste0(profile_id(j), "_collapse") | ||
#' plot(c(j, jacobs2000), color = "clay") | ||
#' | ||
#' # custom aggregation function for matrix_color_munsell | ||
#' k <- collapseHz(jacobs2000, | ||
#' pattern = c( | ||
#' `A` = "^A", | ||
#' `E` = "E", | ||
#' `Bt` = "[ABC]+t", | ||
#' `C` = "^C", | ||
#' `foo` = "bar" | ||
#' ), | ||
#' AGGFUN = list( | ||
#' matrix_color_munsell = function(x, top, bottom) { | ||
#' thk <- bottom - top | ||
#' if (length(x) > 1) { | ||
#' xord <- order(thk, decreasing = TRUE) | ||
#' paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ") | ||
#' } else | ||
#' x | ||
#' } | ||
#' ) | ||
#' ) | ||
#' profile_id(k) <- paste0(profile_id(k), "_collapse_custom") | ||
#' | ||
#' unique(k$matrix_color_munsell) | ||
#' | ||
#' # custom aggregation function for matrix_color_munsell (returns data.frame) | ||
#' m <- collapseHz(jacobs2000, | ||
#' pattern = c( | ||
#' `A` = "^A", | ||
#' `E` = "E", | ||
#' `Bt` = "[ABC]+t", | ||
#' `C` = "^C", | ||
#' `foo` = "bar" | ||
#' ), | ||
#' AGGFUN = list( | ||
#' matrix_color_munsell = function(x, top, bottom) { | ||
#' thk <- bottom - top | ||
#' if (length(x) > 1) { | ||
#' xord <- order(thk, decreasing = TRUE) | ||
#' data.frame(matrix_color_munsell = paste0(x, collapse = ";"), | ||
#' n_matrix_color = length(x)) | ||
#' } else { | ||
#' data.frame(matrix_color_munsell = x, | ||
#' n_matrix_color = length(x)) | ||
#' } | ||
#' } | ||
#' ) | ||
#' ) | ||
#' profile_id(m) <- paste0(profile_id(m), "_collapse_custom") | ||
#' | ||
#' m$matrix_color_munsell.n_matrix_color | ||
collapseHz <- function(x, | ||
pattern = NULL, | ||
by = NULL, | ||
hzdesgn = hzdesgnname(x, required = TRUE), | ||
FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE), | ||
..., | ||
AGGFUN = NULL, | ||
ignore_numerics = NULL, | ||
na.rm = FALSE) { | ||
idn <- idname(x) | ||
hzd <- horizonDepths(x) | ||
|
||
.screenDepths(x, hzd) | ||
|
||
# use exact match of existing genhz labels as default in lieu of pattern | ||
if (is.null(pattern) & missing(by)) { | ||
by <- GHL(x, required = TRUE) | ||
} | ||
|
||
if (length(pattern) == 0) { | ||
pattern <- NA | ||
} | ||
|
||
# if a named vector of patterns is given, use the names as new labels | ||
if (!is.null(names(pattern))) { | ||
labels <- names(pattern) | ||
pattern <- as.character(pattern) | ||
} else { | ||
# otherwise, the patterns and labels are the same | ||
pattern <- as.character(pattern) | ||
labels <- pattern | ||
} | ||
|
||
h <- data.table::data.table(horizons(x)) | ||
|
||
# iterate over patterns | ||
for (p in seq(pattern)) { | ||
|
||
# calculate matches | ||
if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) { | ||
|
||
if (!by %in% horizonNames(x)) { | ||
stop("Column name `by` (\"", by, ") is not a horizon-level variable.", call. = FALSE) | ||
} | ||
|
||
labels <- h[[by]] | ||
|
||
if (any(is.na(labels))) { | ||
stop("Missing values are not allowed in `by` column argument", call. = FALSE) | ||
} | ||
|
||
r <- rle(paste0(h[[idn]], "-", as.character(labels))) | ||
l <- rep(TRUE, nrow(h)) | ||
} else { | ||
l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...) | ||
r <- rle(l) | ||
} | ||
|
||
# only apply aggregation if there are adjacent horizons that match the target criteria | ||
if (any(r$lengths > 1)) { | ||
g <- unlist(lapply(seq_along(r$lengths), function(i) rep(i, r$lengths[i]))) | ||
hidx <- unlist(lapply(seq_along(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l | ||
gidx <- g %in% unique(g[l]) & !hidx | ||
naf <- names(AGGFUN) | ||
|
||
# iterate over sets of layers needing aggregation within each matching group | ||
if (sum(gidx) > 0){ | ||
res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)), | ||
hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))), | ||
|
||
# process numeric depth weighted averages w/ dominant condition otherwise | ||
sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)], | ||
function(n, top, bottom) { | ||
v <- .SD[[n]] | ||
if (length(v) > 1) { | ||
if (!n %in% ignore_numerics && is.numeric(v)) { | ||
|
||
# weighted average by thickness (numerics not in exclusion list) | ||
v <- weighted.mean(v, bottom - top, na.rm = na.rm) | ||
|
||
} else { | ||
# take thickest value | ||
# v[which.max(bottom - top)[1]] | ||
|
||
# convert factors etc to character | ||
# results may not conform with existing factor levels | ||
v <- as.character(v) | ||
|
||
# replace NA values for use in aggregate() | ||
if (!na.rm) { | ||
v[is.na(v)] <- "<collapseHZ-category-missing>" | ||
} | ||
|
||
# take dominant condition (based on sum of thickness) | ||
cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm) | ||
v <- cond[[1]][which.max(cond[[2]])[1]] | ||
|
||
if (!na.rm) { | ||
v[v == "<collapseHZ-category-missing>"] <- NA | ||
} | ||
} | ||
} | ||
out <- data.frame(v) | ||
colnames(out) <- n | ||
out | ||
}, | ||
top = .SD[[hzd[1]]], | ||
bottom = .SD[[hzd[2]]]), | ||
|
||
# process custom aggregation functions (may return data.frames) | ||
do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf], | ||
function(n, top, bottom) { | ||
out <- AGGFUN[[n]](.SD[[n]], top, bottom) | ||
if (!is.data.frame(out)) { | ||
out <- data.frame(out) | ||
colnames(out) <- n | ||
} else { | ||
colnames(out) <- paste0(n, ".", colnames(out)) | ||
} | ||
out | ||
}, | ||
top = .SD[[hzd[1]]], | ||
bottom = .SD[[hzd[2]]]))), | ||
by = g[gidx]] | ||
# remove grouping ID | ||
res$g <- NULL | ||
} else { | ||
res <- h[0, ] | ||
} | ||
|
||
# allow for replacing values as well as adding new values with data.frame AGGFUN | ||
test1.idx <- na.omit(match(colnames(res), paste0(colnames(h), ".", colnames(h)))) | ||
test2.idx <- na.omit(match(paste0(colnames(h), ".", colnames(h)), colnames(res))) | ||
colnames(res)[test2.idx] <- colnames(h)[test1.idx] | ||
|
||
# determine matches that are only a single layer (no aggregation applied) | ||
res2 <- h[hidx & l, ] | ||
res2$hzdeptnew <- res2[[hzd[1]]] | ||
res2$hzdepbnew <- res2[[hzd[2]]] | ||
res2[[hzd[1]]] <- NULL | ||
res2[[hzd[2]]] <- NULL | ||
|
||
# combine matches | ||
res3 <- data.table::rbindlist(list(res, res2), fill = TRUE) | ||
if (missing(by) && nrow(res3) > 0){ | ||
res3[[hzdesgn]] <- labels[p] | ||
} | ||
|
||
# combine matches with horizons that did not match | ||
agg.idx <- which(g %in% unique(g[l]) | hidx) | ||
if (length(agg.idx) > 0) { | ||
h <- h[-agg.idx, ] | ||
} | ||
h <- data.table::rbindlist(list(h, res3), fill = TRUE) | ||
|
||
# replace depths | ||
hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew) | ||
h[[hzd[1]]][hn] <- h$hzdeptnew[hn] | ||
h[[hzd[2]]][hn] <- h$hzdepbnew[hn] | ||
h$hzdeptnew <- NULL | ||
h$hzdepbnew <- NULL | ||
|
||
# sort horizons by id name and top depth | ||
h <- h[order(h[[idn]], h[[hzd[1]]]),] | ||
|
||
} | ||
|
||
# replace horizons in parent SPC | ||
replaceHorizons(x) <- h | ||
} | ||
x | ||
} | ||
|
Oops, something went wrong.