Commit

Add interpret() and cleanup

brownag committed May 13, 2024
1 parent 4c9cacd commit 9552e0b
Showing 16 changed files with 391 additions and 82 deletions.
18 changes: 17 additions & 1 deletion DESCRIPTION
@@ -19,8 +19,24 @@ Description: InterpretationEngine contains tools to develop, test, and apply int
License: GPL (>= 3)
Encoding: UTF-8
Depends: R (>= 3.5.0)
Imports: XML, caret, data.tree, digest, doParallel, dplyr, foreach, raster, soilDB, forcats, tidyr
Imports:
methods,
data.tree,
data.table,
parallel,
digest,
soilDB,
terra,
XML,
caret,
doParallel,
dplyr,
foreach,
raster,
forcats,
tidyr
LazyData: true
LazyDataCompression: xz
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Suggests: testthat (>= 3.0.0)
15 changes: 14 additions & 1 deletion NAMESPACE
@@ -28,6 +28,7 @@ export(getAttributeByEval)
export(getPropertySet)
export(hsg_calc)
export(initRuleset)
export(interpret)
export(linkEvaluationFunctions)
export(linkHedgeOperatorFunctions)
export(linkSubRules)
@@ -42,14 +43,15 @@ export(propByPropname)
export(propdefByPropname)
export(pull_SDA)
export(pull_SDA_compup)
export(ruleByRulename)
export(svi_calc)
export(tree_eval)
export(vf_calc)
export(xmlChunkParse)
exportMethods(interpret)
importFrom(XML,xmlParse)
importFrom(XML,xmlToList)
importFrom(caret,confusionMatrix)
importFrom(data.table,rbindlist)
importFrom(data.tree,FromListExplicit)
importFrom(data.tree,isLeaf)
importFrom(digest,digest)
@@ -65,7 +67,11 @@ importFrom(graphics,abline)
importFrom(graphics,grid)
importFrom(graphics,lines)
importFrom(graphics,points)
importFrom(methods,setGeneric)
importFrom(methods,setMethod)
importFrom(parallel,clusterApply)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,stopCluster)
importFrom(raster,as.data.frame)
importFrom(raster,brick)
@@ -79,4 +85,11 @@ importFrom(soilDB,uncode)
importFrom(stats,approxfun)
importFrom(stats,na.omit)
importFrom(stats,splinefun)
importFrom(terra,ncell)
importFrom(terra,readStart)
importFrom(terra,readStop)
importFrom(terra,readValues)
importFrom(terra,writeStart)
importFrom(terra,writeStop)
importFrom(terra,writeValues)
importFrom(tidyr,replace_na)
6 changes: 2 additions & 4 deletions R/CVIRCurve.R
@@ -231,10 +231,8 @@ CVIRPI <- function(x, xlim) {
#' x <- seq(-1, 10, 0.01)
#' y <- CVIRLinear(x, c(0.5, 3))
#' plot(y ~ x)
CVIRLinear <- function(x, xlim = NULL, ylim = c(0, 1)) {
if (is.null(ylim) && length(ylim) != 2) {
ylim <- c(0, 1)
}
CVIRLinear <- function(x, xlim = NULL) {
ylim <- c(0, 1)
if (!is.null(xlim) && length(xlim) == 2) {
minx <- xlim[1]
maxx <- xlim[2]
1 change: 1 addition & 0 deletions R/InterpretationEngine-package.R
@@ -1,4 +1,5 @@
#' @keywords internal
#' @importFrom methods setMethod setGeneric
"_PACKAGE"

# The following block is used by usethis to automatically manage
6 changes: 4 additions & 2 deletions R/hedge.R
@@ -4,10 +4,12 @@
#' @importFrom stats na.omit
.NULL_HEDGE <- function(x, null.value = NULL, na.rm = FALSE) {
if (na.rm) x <- na.omit(x)
x[is.null(x) | ((is.na(x) | !is.finite(x)) & !is.nan(x))] <- null.value
if (!is.list(x)) {
x[is.null(x) | (is.na(x) & !is.nan(x))] <- null.value
}
x
}

.NULL_NOT_RATED <- function(x, na.rm = FALSE) {
# NULL NOT RATED hedge: if NULL data in `x` then `NaN`, else `x`
.NULL_HEDGE(x, null.value = NaN, na.rm = na.rm)
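A minimal sketch of the revised NULL hedge behavior on a plain numeric vector (illustration only; `.NULL_HEDGE` is internal, and this replicates its new logic rather than calling it):

# NULL NOT RATED: true NA values become NaN; existing NaN and finite values pass through
x <- c(0.2, NA, NaN, 0.8)
x[is.na(x) & !is.nan(x)] <- NaN
x
#> [1] 0.2 NaN NaN 0.8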
226 changes: 226 additions & 0 deletions R/interpret.R
@@ -0,0 +1,226 @@
#' Run a Rule on Custom Property Data
#'
#' Allows for evaluation of a primary rule including subrules, operators, hedges, and evaluations.
#'
#' The user must supply a _data.frame_ or _SpatRaster_ object with property data as input.
#' Columns (or layers) should be named using `make.names(propname)`, where `propname` is the property name from
#' the `[NASIS_properties]` data object in this package.
#'
#' @param x A _data.tree_ object containing a rule tree (e.g. from [initRuleset()])
#' @param propdata A data.frame or SpatRaster object
#' @param ... Additional arguments
#' @details The `cache` argument stores the input property data in `x` along with the evaluation ratings.
#'
#' @return A _data.frame_ containing a `"rating"` column. The _SpatRaster_ method returns a _SpatRaster_ with a `"rating"` layer.
#' @export
#' @rdname interpret
#'
#' @examples
#'
#' r <- initRuleset("Erodibility Factor Maximum")
#' p <- getPropertySet(r)
#'
#' my_data <- data.frame(Kmax = seq(0, 1, 0.01))
#' colnames(my_data) <- make.names(p$propname)
#'
#' res <- interpret(r, my_data)
#'
#' plot(res$rating ~ my_data[[1]],
#' xlab = "K factor (input)",
#' ylab = "Rating [0-1]")
#'
#'
setGeneric("interpret", function(x, propdata, ...) {
standardGeneric("interpret")
})

#' @param cache logical. Save input property data in data.tree object? Default: `FALSE`
#' @export
#' @rdname interpret
setMethod("interpret", signature = c("Node", "data.frame"),
function(x, propdata, cache = FALSE, ...) {
.interpret(x, propdata, cache = cache, ...)
})

#' @param cores integer. Number of cores used to process raster blocks. Default `1`.
#' @param core_thresh integer. Approximate number of cells to process per block. Default `25000`.
#' @param file character. Path to output raster file. Defaults to a temporary GeoTIFF.
#' @param nrows integer. Number of raster rows to process per block. Default `nrow(propdata) / (terra::ncell(propdata) / core_thresh)`
#' @param overwrite logical. Overwrite `file` if it exists? Default `TRUE`.
#' @export
#' @rdname interpret
setMethod("interpret", signature = c("Node", "SpatRaster"),
function(x,
propdata,
cores = 1,
core_thresh = 25000,
file = paste0(tempfile(), ".tif"),
nrows = nrow(propdata) / (terra::ncell(propdata) / core_thresh),
overwrite = TRUE,
...) {

.interpretRast(x, propdata, cores = cores, core_thresh = core_thresh, file = file, nrows = nrows, overwrite = overwrite, ...)
})

# workhorse data.frame method
.interpret <- function(x, propdata, cache = FALSE, ...) {
x$Do(traversal = "post-order", .interpretNode, propdata, cache = cache)
data.frame(rating = x$rating)
}

.interpretNode <- function(x, propdata, cache = FALSE) {

if (x$isRoot) {

# pass value of first child to root node
x$rating <- x$children[[1]]$rating

} else if (!is.null(x$propname)) {
# extract data from `evaldata`
x_data <- propdata[, make.names(x$propname)]

# TODO: generalize methods for naming `evaldata`, use of propiid

# evaluate properties
x$rating <- x$evalFunction(x_data)

# storing input in tree useful, but larger object
if (cache) {
x$data <- x_data
}
} else if (!is.null(x$rule_refid)) {

# rules are an aggregation of their children
x$rating <- sapply(x$children, function(y) y$rating)

} else if (!is.null(x$Type)) {

# evaluate hedges and operators on children
x$rating <- x$evalFunction(sapply(x$children, function(y) y$rating))

}

x$rating <- as.numeric(x$rating)

}

#' @importFrom terra ncell readStart writeStart readValues writeValues readStop writeStop
#' @importFrom parallel makeCluster stopCluster clusterApply
#' @importFrom data.table rbindlist
.interpretRast <- function(x,
propdata,
cores = 1,
core_thresh = 25000L,
file = paste0(tempfile(), ".tif"),
nrows = nrow(propdata) / (terra::ncell(propdata) / core_thresh),
overwrite = TRUE) {

stopifnot(requireNamespace("terra"))
suppressWarnings(terra::readStart(propdata))

# nrows must be an integer
nrows <- floor(nrows)

# create template brick
out <- terra::rast(propdata)
cnm <- c("rating")
terra::nlyr(out) <- length(cnm)
names(out) <- cnm

out_info <- terra::writeStart(out, filename = file, overwrite = overwrite, progress = 0)
outrows <- c(out_info$row, nrow(out))
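# split each writeStart() block into sub-blocks of at most `nrows` raster rows,
# so each batch read with readValues() covers roughly `core_thresh` cells;
# `n_set` is the total number of batches across all blocks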
start_row <- lapply(1:out_info$n, function(i) out_info$row[i] + c(0, (seq_len(floor((out_info$nrows[i]) / nrows)) * nrows)))
n_row <- lapply(seq_along(start_row), function(i) diff(c(start_row[[i]] - 1, outrows[i + 1])))
n_set <- sum(sapply(start_row, length))

if (cores > 1) {
cls <- parallel::makeCluster(cores)
on.exit(parallel::stopCluster(cls))

# TODO: can blocks be parallelized?
count <- 1
for (i in seq_along(n_row)) {
for (j in seq_along(n_row[[i]])) {
if (n_row[[i]][j] > 0) {
st <- Sys.time()
blockdata <- terra::readValues(propdata,
row = start_row[[i]][j],
nrows = n_row[[i]][j],
dataframe = TRUE)
ids <- 1:nrow(blockdata)
skip.idx <- which(is.na(blockdata[[1]]))

if (length(skip.idx) > 0) {
blockcomplete <- blockdata[-skip.idx,]
} else blockcomplete <- blockdata

if (nrow(blockcomplete) > 0) {
# parallel within-block processing
cids <- 1:nrow(blockcomplete)
sz <- round(nrow(blockcomplete) / cores) + 1
X <- split(blockcomplete, f = rep(seq(from = 1, to = floor(length(cids) / sz) + 1),
each = sz)[1:length(cids)])
r <- data.table::rbindlist(
parallel::clusterApply(cls, X, function(y) {
.interpret(x, y)
}),
use.names = TRUE,
fill = TRUE
)
# TODO: why does fill=TRUE need to be used here? it introduces NAs
} else {
r <- data.frame(rating = numeric(0), stringsAsFactors = FALSE)
}

# fill skipped NA cells
r.na <- r[0, , drop = FALSE][seq_len(length(skip.idx)), , drop = FALSE]
r <- rbind(r, r.na)[order(c(ids[!ids %in% skip.idx], skip.idx)),]

st2 <- Sys.time()
terra::writeValues(out, as.matrix(r), start_row[[i]][j], nrows = n_row[[i]][j])
st3 <- Sys.time()

deltat <- signif(difftime(Sys.time(), st, units = "auto"), 2)
message(paste0(
"Batch ", count, " of ", n_set, " (n=",
nrow(blockcomplete), " on ", cores, " cores) done in ",
deltat, " ", attr(deltat, 'units')
))
count <- count + 1
}
}
}
} else {
for (i in seq_along(n_row)) {
for (j in seq_along(n_row[[i]])) {
if (n_row[[i]][j] > 0) {
dataall <- terra::readValues(
propdata,
row = start_row[[i]][j],
nrows = n_row[[i]][j],
dataframe = TRUE
)
ids <- 1:nrow(dataall)

skip.idx <- which(is.na(dataall[[1]]))

if (length(skip.idx) > 0) {
datacomplete <- dataall[-skip.idx,]
} else datacomplete <- dataall

r2 <- .interpret(x, datacomplete)

# fill skipped NA cells
r.na <- r2[0, , drop = FALSE][seq_len(length(skip.idx)), , drop = FALSE]
r2 <- rbind(r2, r.na)[order(c(ids[!ids %in% skip.idx], skip.idx)),]

terra::writeValues(out, as.matrix(r2), start_row[[i]][j], nrows = n_row[[i]][j])
}
}
}
}

out <- terra::writeStop(out)
terra::readStop(propdata)
out
}
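A minimal usage sketch for the new SpatRaster method (illustration only; the synthetic K-factor raster and `cores = 2` are assumptions, and it presumes the rule resolves to a single property as in the help example):

library(terra)
library(InterpretationEngine)

# build the rule tree and look up the property it needs
r <- initRuleset("Erodibility Factor Maximum")
p <- getPropertySet(r)

# synthetic single-layer raster of K-factor values; the layer name must match
# make.names(p$propname) so interpret() can locate the property data
kmax <- rast(nrows = 100, ncols = 100)
values(kmax) <- runif(ncell(kmax))
names(kmax) <- make.names(p$propname)

# rate the raster block-by-block (written to a temporary GeoTIFF by default)
rating <- interpret(r, kmax, cores = 2)
plot(rating)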
4 changes: 1 addition & 3 deletions R/local-functions.R
@@ -61,10 +61,8 @@ initRuleset <- function(rulename) {

# recursively splice-in sub-rules
dt$Do(traversal = 'pre-order', fun = linkSubRules)

## TODO: is this working?
# splice-in evaluation functions, if possible
dt$Do(traversal = 'pre-order', fun = linkEvaluationFunctions)
dt$Do(traversal = 'pre-order', fun = linkHedgeOperatorFunctions)

return(dt)
}
33 changes: 0 additions & 33 deletions R/ruleByRuleName.R

This file was deleted.

4 changes: 2 additions & 2 deletions man/CVIRCurve.Rd

Some generated files are not rendered by default.
