Skip to content

Commit

Permalink
Merge pull request #307 from ncss-tech/collapseHz
Browse files Browse the repository at this point in the history
Add `collapseHz()`
  • Loading branch information
brownag authored Oct 15, 2024
2 parents 32fd050 + 70fc053 commit 8b31518
Show file tree
Hide file tree
Showing 7 changed files with 695 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export(buntley.westin.index)
export(checkHzDepthLogic)
export(checkSPC)
export(col2Munsell)
export(collapseHz)
export(colorChart)
export(colorContrast)
export(colorContrastPlot)
Expand Down
5 changes: 3 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
* added Munsell values of 8.5 and 9.5 to Munsell look up table and (interpolated) reference spectra (#318)
* `munsell2rgb()` now safely selects the closest Munsell value and chroma to those available in the package LUT
* new function `soilTextureColorPal()` for suggesting a color palette suitable for soil texture class
* **Breaking Change**: `@sp` slot of the `SoilProfileCollection` object, and dependency on sp package, has been removed.
* any `SoilProfileCollection` objects previously written to file (.rda, .rds) with aqp <2.1.x will need to be rebuilt using `rebuildSPC()` due to changes to S4 object structure
* **Breaking Change**: `@sp` slot of the SoilProfileCollection object, and dependency on sp package, has been removed.
* Any SoilProfileCollection objects previously written to file (.rda, .rds) with aqp <2.1.x will need to be rebuilt using `rebuildSPC()` due to changes to S4 object structure
* `estimatePSCS()` gains argument `"lieutex"` for in lieu textures which are used in the new routine for identification of the particle size control section of organic soils
* new function `collapseHz()` combines and aggregates data for adjacent horizons matching a pattern or sharing a common ID

# aqp 2.0.4 (2024-07-30)
* CRAN release
Expand Down
15 changes: 15 additions & 0 deletions R/SoilProfileCollection-setters.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,18 @@ setReplaceMethod("depths", "data.frame",
return(depth)
}

.checkDepthOrder <- function(x, depthcols) {
if (any(x[[depthcols[2]]] < x[[depthcols[1]]], na.rm = TRUE)) {
warning("One or more horizon bottom depths are shallower than top depth. Check depth logic with aqp::checkHzDepthLogic()", call. = FALSE)
}
}

.screenDepths <- function(x, depthcols = horizonDepths(x)) {
.checkNAdepths(x[[depthcols[1]]], "top")
.checkNAdepths(x[[depthcols[2]]], "bottom")
.checkDepthOrder(x, depthcols)
}

# create 0-length spc from id and horizon depth columns (`idn`, `hzd`)
# - allows template horizon (`hz`) and site (`st`) data to be provided (for additional columns)
.prototypeSPC <- function(idn, hzd,
Expand Down Expand Up @@ -178,6 +190,9 @@ setReplaceMethod("depths", "data.frame",
data[[depthcols[1]]] <- .checkNAdepths(data[[depthcols[1]]], "top")
data[[depthcols[2]]] <- .checkNAdepths(data[[depthcols[2]]], "bottom")

# warn if bottom depth shallower than top (old style O horizons, data entry issues, etc.)
.checkDepthOrder(data, depthcols)

tdep <- data[[depthcols[1]]]

# calculate ID-top depth order, re-order input data
Expand Down
307 changes: 307 additions & 0 deletions R/collapseHz.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
#' Collapse Horizons within Profiles Based on Pattern Matching
#'
#' Combines layers and aggregates data by grouping adjacent horizons which match `pattern` in
#' `hzdesgn` or, alternately, share a common value in `by` argument. Numeric properties are combined
#' using the weighted average, and other properties are derived from the dominant condition based on
#' thickness of layers and values in each group.
#'
#' @param x A _SoilProfileCollection_
#' @param pattern _character_. A regular expression pattern to match in `hzdesgn` column. Default:
#' `NULL`.
#' @param by _character_. A column name specifying horizons that should be combined. Aggregation
#' will be applied to adjacent groups of layers within profiles that have the same value in `by`.
#' Used in lieu of `pattern` and `hzdesgn`. Default: `NULL`.
#' @param hzdesgn _character_. Any character column containing horizon-level identifiers. Default:
#' `hzdesgnname(x, required = TRUE)`.
#' @param FUN _function_. A function that returns a _logical_ vector equal in length to the number
#' of horizons in `x`. Used only when `pattern` is specified. See details.
#' @param ... Additional arguments passed to the matching function `FUN`.
#' @param AGGFUN _list_. A _named_ list containing custom aggregation functions. List element names
#' should match the column name that they transform. The functions defined should take three
#' arguments: `x` (a vector of horizon property values), `top` (a vector of top depths), and
#' `bottom` (a vector of bottom depths). Default: `NULL` applies `weighted.mean()` to all numeric
#' columns not listed in `ignore_numerics` and takes the dominant condition (value with greatest
#' aggregate thickness sum) for all other columns. See details.
#' @param ignore_numerics _character_. Vector of column names that contain numeric values which
#' should _not_ be aggregated using `weighted.mean()`. For example, soil color "value" and
#' "chroma".
#' @param na.rm _logical_. If `TRUE` `NA` values are ignored when calculating min/max boundaries for
#' each group and in weighted averages. If `FALSE` `NA` values are propagated to the result.
#' Default: `FALSE`.
#'
#' @details
#'
#' If a custom matching function (`FUN`) is used, it should accept arbitrary additional arguments
#' via an ellipsis (`...`). It is not necessary to do anything with arguments, but the result should
#' match the number of horizons found in the input SoilProfileCollection `x`.
#'
#' Custom aggregation functions defined in the `AGGFUN` argument should either return a single
#' vector value for each group*column combination, or should return a _data.frame_ object with named
#' columns. If the input column name is used as a column name in the result _data.frame_, then the
#' values of that column name in the result _SoilProfileCollection_ will be replaced by the output
#' of the aggregation function. See examples.
#'
#' @return A _SoilProfileCollection_
#'
#' @author Andrew G. Brown
#'
#' @seealso `hz_dissolve()`
#'
#' @export
#'
#' @examples
#' data(jacobs2000)
#'
#' # calculate a new SPC with genhz column based on patterns
#' new_labels <- c("A", "E", "Bt", "Bh", "C")
#' patterns <- c("A", "E", "B.*t", "B.*h", "C")
#' jacobs2000_gen <- generalizeHz(jacobs2000, new = new_labels, pattern = patterns)
#'
#' # use existing generalized horizon labels
#' i <- collapseHz(jacobs2000_gen, by = "genhz")
#'
#' profile_id(i) <- paste0(profile_id(i), "_collapse")
#'
#' plot(
#' c(i, jacobs2000),
#' color = "genhz",
#' name = "name",
#' name.style = "center-center",
#' cex.names = 1
#' )
#'
#' # custom pattern argument
#' j <- collapseHz(jacobs2000,
#' c(
#' `A` = "^A",
#' `E` = "E",
#' `Bt` = "[ABC]+t",
#' `C` = "^C",
#' `foo` = "bar"
#' ))
#' profile_id(j) <- paste0(profile_id(j), "_collapse")
#' plot(c(j, jacobs2000), color = "clay")
#'
#' # custom aggregation function for matrix_color_munsell
#' k <- collapseHz(jacobs2000,
#' pattern = c(
#' `A` = "^A",
#' `E` = "E",
#' `Bt` = "[ABC]+t",
#' `C` = "^C",
#' `foo` = "bar"
#' ),
#' AGGFUN = list(
#' matrix_color_munsell = function(x, top, bottom) {
#' thk <- bottom - top
#' if (length(x) > 1) {
#' xord <- order(thk, decreasing = TRUE)
#' paste0(paste0(x[xord], " (t=", thk[xord], ")"), collapse = ", ")
#' } else
#' x
#' }
#' )
#' )
#' profile_id(k) <- paste0(profile_id(k), "_collapse_custom")
#'
#' unique(k$matrix_color_munsell)
#'
#' # custom aggregation function for matrix_color_munsell (returns data.frame)
#' m <- collapseHz(jacobs2000,
#' pattern = c(
#' `A` = "^A",
#' `E` = "E",
#' `Bt` = "[ABC]+t",
#' `C` = "^C",
#' `foo` = "bar"
#' ),
#' AGGFUN = list(
#' matrix_color_munsell = function(x, top, bottom) {
#' thk <- bottom - top
#' if (length(x) > 1) {
#' xord <- order(thk, decreasing = TRUE)
#' data.frame(matrix_color_munsell = paste0(x, collapse = ";"),
#' n_matrix_color = length(x))
#' } else {
#' data.frame(matrix_color_munsell = x,
#' n_matrix_color = length(x))
#' }
#' }
#' )
#' )
#' profile_id(m) <- paste0(profile_id(m), "_collapse_custom")
#'
#' m$matrix_color_munsell.n_matrix_color
collapseHz <- function(x,
pattern = NULL,
by = NULL,
hzdesgn = hzdesgnname(x, required = TRUE),
FUN = function(x, pattern, hzdesgn, ...) grepl(pattern, x[[hzdesgn]], ignore.case = FALSE),
...,
AGGFUN = NULL,
ignore_numerics = NULL,
na.rm = FALSE) {
idn <- idname(x)
hzd <- horizonDepths(x)

.screenDepths(x, hzd)

# use exact match of existing genhz labels as default in lieu of pattern
if (is.null(pattern) & missing(by)) {
by <- GHL(x, required = TRUE)
}

if (length(pattern) == 0) {
pattern <- NA
}

# if a named vector of patterns is given, use the names as new labels
if (!is.null(names(pattern))) {
labels <- names(pattern)
pattern <- as.character(pattern)
} else {
# otherwise, the patterns and labels are the same
pattern <- as.character(pattern)
labels <- pattern
}

h <- data.table::data.table(horizons(x))

# iterate over patterns
for (p in seq(pattern)) {

# calculate matches
if (!is.null(by) && length(pattern) == 1 && is.na(pattern)) {

if (!by %in% horizonNames(x)) {
stop("Column name `by` (\"", by, ") is not a horizon-level variable.", call. = FALSE)
}

labels <- h[[by]]

if (any(is.na(labels))) {
stop("Missing values are not allowed in `by` column argument", call. = FALSE)
}

r <- rle(paste0(h[[idn]], "-", as.character(labels)))
l <- rep(TRUE, nrow(h))
} else {
l <- FUN(x, pattern = pattern[p], hzdesgn = hzdesgn, na.rm = na.rm, ...)
r <- rle(l)
}

# only apply aggregation if there are adjacent horizons that match the target criteria
if (any(r$lengths > 1)) {
g <- unlist(lapply(seq_along(r$lengths), function(i) rep(i, r$lengths[i])))
hidx <- unlist(lapply(seq_along(r$lengths), function(i) if (r$lengths[i] == 1) TRUE else rep(FALSE, r$lengths[i]))) & l
gidx <- g %in% unique(g[l]) & !hidx
naf <- names(AGGFUN)

# iterate over sets of layers needing aggregation within each matching group
if (sum(gidx) > 0){
res <- h[gidx, c(list(hzdeptnew = suppressWarnings(min(.SD[[hzd[1]]], na.rm = na.rm)),
hzdepbnew = suppressWarnings(max(.SD[[hzd[2]]], na.rm = na.rm))),

# process numeric depth weighted averages w/ dominant condition otherwise
sapply(colnames(.SD)[!colnames(.SD) %in% c(hzd, naf)],
function(n, top, bottom) {
v <- .SD[[n]]
if (length(v) > 1) {
if (!n %in% ignore_numerics && is.numeric(v)) {

# weighted average by thickness (numerics not in exclusion list)
v <- weighted.mean(v, bottom - top, na.rm = na.rm)

} else {
# take thickest value
# v[which.max(bottom - top)[1]]

# convert factors etc to character
# results may not conform with existing factor levels
v <- as.character(v)

# replace NA values for use in aggregate()
if (!na.rm) {
v[is.na(v)] <- "<collapseHZ-category-missing>"
}

# take dominant condition (based on sum of thickness)
cond <- aggregate(bottom - top, by = list(v), sum, na.rm = na.rm)
v <- cond[[1]][which.max(cond[[2]])[1]]

if (!na.rm) {
v[v == "<collapseHZ-category-missing>"] <- NA
}
}
}
out <- data.frame(v)
colnames(out) <- n
out
},
top = .SD[[hzd[1]]],
bottom = .SD[[hzd[2]]]),

# process custom aggregation functions (may return data.frames)
do.call('c', lapply(colnames(.SD)[colnames(.SD) %in% naf],
function(n, top, bottom) {
out <- AGGFUN[[n]](.SD[[n]], top, bottom)
if (!is.data.frame(out)) {
out <- data.frame(out)
colnames(out) <- n
} else {
colnames(out) <- paste0(n, ".", colnames(out))
}
out
},
top = .SD[[hzd[1]]],
bottom = .SD[[hzd[2]]]))),
by = g[gidx]]
# remove grouping ID
res$g <- NULL
} else {
res <- h[0, ]
}

# allow for replacing values as well as adding new values with data.frame AGGFUN
test1.idx <- na.omit(match(colnames(res), paste0(colnames(h), ".", colnames(h))))
test2.idx <- na.omit(match(paste0(colnames(h), ".", colnames(h)), colnames(res)))
colnames(res)[test2.idx] <- colnames(h)[test1.idx]

# determine matches that are only a single layer (no aggregation applied)
res2 <- h[hidx & l, ]
res2$hzdeptnew <- res2[[hzd[1]]]
res2$hzdepbnew <- res2[[hzd[2]]]
res2[[hzd[1]]] <- NULL
res2[[hzd[2]]] <- NULL

# combine matches
res3 <- data.table::rbindlist(list(res, res2), fill = TRUE)
if (missing(by) && nrow(res3) > 0){
res3[[hzdesgn]] <- labels[p]
}

# combine matches with horizons that did not match
agg.idx <- which(g %in% unique(g[l]) | hidx)
if (length(agg.idx) > 0) {
h <- h[-agg.idx, ]
}
h <- data.table::rbindlist(list(h, res3), fill = TRUE)

# replace depths
hn <- !is.na(h$hzdeptnew) & !is.na(h$hzdepbnew)
h[[hzd[1]]][hn] <- h$hzdeptnew[hn]
h[[hzd[2]]][hn] <- h$hzdepbnew[hn]
h$hzdeptnew <- NULL
h$hzdepbnew <- NULL

# sort horizons by id name and top depth
h <- h[order(h[[idn]], h[[hzd[1]]]),]

}

# replace horizons in parent SPC
replaceHorizons(x) <- h
}
x
}

Loading

0 comments on commit 8b31518

Please sign in to comment.