new ideas, still need to work out function defaults and interpretation

ncss-tech · Oct 25, 2023 · a14beaa · a14beaa
1 parent 30736aa
commit a14beaa
Show file tree

Hide file tree

Showing 4 changed files with 205 additions and 221 deletions.
diff --git a/R/profileInformationIndex.R b/R/profileInformationIndex.R
@@ -24,11 +24,54 @@
     x <- as.character(x)
   }
 
+  # collapse to single string
+  # otherwise, memCompress will add '\n' between vector elements
+  x <- paste0(x, collapse = '')
+
   return(x)
 }
 
 
-# x: vector of any type
+.prepareBaseline <- function(x, d, type) {
+
+  if(type == 'numeric') {
+    # replicate simple mean over all horizons
+    # weighted by 1cm slices
+    b <- format(
+      round(
+        rep(
+          mean(x, na.rm = TRUE), 
+          times = length(x)
+        ), 
+        digits = d
+      ),
+      digits = d
+    )
+  } else {
+    # logical and factor -> convert to integer codes
+    if(!inherits(x, 'character')) {
+      x <- as.integer(x)
+    }
+
+    # baseline is the most frequent, 
+    # integer-coded (factor, logical)
+    # frequency weighted by hz thickness, 
+    # non-NA value
+    b <- names(sort(table(x), decreasing = TRUE))[1]
+    b <- rep(b, times = length(x))
+  }
+
+
+  # collapse to single string
+  # otherwise, memCompress will add '\n' between vector elements
+  b <- paste0(b, collapse = '')
+
+  return(b)
+}
+
+
+
+# x: character, length 1 output from .prepareVector()
 # d: number of significant digits to retain for numeric x
 # n: compression method
 .compressedLength <- function(x, d = 10, m = 'gzip') {
@@ -37,140 +80,18 @@
   # otherwise, 1, 10, 100 will have different compressed lengths
 
   # memCompress() requires character data
-  # convert all types to character, preserving 'd' significant digits
-  if(!inherits(x, 'character')) {
+  # this algorithm is based on character vectors flattened to single string
+  # output from .prepareVector() is ideal
+  if(!inherits(x, 'character') || length(x) > 1) {
+    message('flattening character vector')
     x <- .prepareVector(x, d = d)
-  } 
+  }
 
   # length of compressed vector is the metric
   res <- length(memCompress(x, type = m))
   return(res)
 }
 
-# 
-# # x: single profile SPC
-# # vars: columns to consider
-# # method: aggregation over columns
-# # baseline: toggle for baseline comparison
-# # numericDigits: number of digits to retain in numeric -> character conversion
-# # removeNA: do not include NA in gz(data)
-# .pii_by_profile <- function(x, vars, method, baseline, numericDigits, removeNA) {
-#   
-#   # SPC -> DF
-#   h <- as(x, 'data.frame')[, vars, drop = FALSE]
-#   
-#   
-#   ## TODO: computing PII by column removes possible redundancy created by correlation within
-#   ##       -> PII as currently computed is an over-estimate
-#   ##       -> joint PII via: compress(c(v1, v2, v3))
-#   
-#   
-#   # iterate over columns and compute column-wise PII
-#   h <- lapply(
-#     h[, vars, drop = FALSE], 
-#     FUN = .pii, 
-#     baseline = baseline, 
-#     numericDigits = numericDigits,
-#     removeNA = removeNA
-#   )
-#   
-#   # each list element is a 1-length numeric
-#   h <- unlist(h)
-#   
-#   # unless it is all NA
-#   if(all(is.na(h))) {
-#     return(NA)
-#   }
-#   
-#   # aggregate PII by column to single value / profile
-#   res <- switch(method,
-#                 mean = {
-#                   mean(h, na.rm = TRUE)
-#                 },
-#                 median = {
-#                   median(h, na.rm = TRUE)
-#                 },
-#                 sum = {
-#                   sum(h, na.rm = TRUE)
-#                 }
-#   )
-#   
-#   return(res)
-# }
-
-
-
-## TODO: consider "max entropy" base line of runif(length(i))
-##       PII = 1 - I/Ib
-
-# 
-# # main algorithm
-# # i: vector of values
-# # baseline: toggle for baseline comparison
-# # numericDigits: number of digits to retain in numeric -> character conversion
-# # removeNA: do not include NA in gz(data)
-# .pii <- function(i, baseline, numericDigits, removeNA) {
-#   
-#   if(all(is.na(i))) {
-#     return(NA)
-#   }
-#   
-#   # baseline is mean(i)
-#   if(is.numeric(i)) {
-#     # optionally remove NA from length(gz())
-#     if(removeNA) {
-#       i <- na.omit(i)
-#     }
-#     
-#     # baseline for numeric data:
-#     # replicate simple mean over all horizons (no weighting)
-#     b <- as.character(
-#       signif(
-#         rep(
-#           mean(i, na.rm = TRUE), 
-#           times = length(i)
-#         ), 
-#         digits = numericDigits
-#       )
-#     )
-#     
-#     # return compressed length
-#     # any data type
-#     v <- .compressedLength(i, d = numericDigits)
-#     b <- .compressedLength(b)
-#     
-#   } else {
-#     # categorical data
-#     # baseline is most-frequent, non-NA value
-#     
-#     # optionally remove NA from length(gz())
-#     if(removeNA) {
-#       i <- na.omit(i)
-#     }
-#     
-#     # treating all categorical variables as nominal for now
-#     
-#     # baseline is the most frequent, weighted by hz thickness, non-NA value
-#     b <- names(sort(table(i), decreasing = TRUE))[1]
-#     b <- rep(b, times = length(i))
-#     
-#     # return compressed length
-#     # any data type
-#     v <- .compressedLength(i)
-#     b <- .compressedLength(b)
-#   }
-#   
-#   # optionally compare vs. baseline
-#   if(baseline) {
-#     res <- v / b
-#   } else {
-#     # no comparison
-#     res <- v
-#   }
-#   
-#   return(res)
-# }
-
 
 ## TODO: think more about NA handling
 
@@ -193,18 +114,8 @@
       i <- na.omit(i)
     }
 
-    # baseline for numeric data:
-    # replicate simple mean over all horizons (no weighting)
-    b <- format(
-      round(
-        rep(
-          mean(i, na.rm = TRUE), 
-          times = length(i)
-        ), 
-        digits = numericDigits
-      ),
-      digits = numericDigits
-    )
+    # baseline for numeric data: 1cm slice wt. mean
+    b <- .prepareBaseline(i, d = numericDigits, type = 'numeric')
 
     # prepare for compression
     v <- .prepareVector(i, d = numericDigits)
@@ -221,8 +132,8 @@
     # treating all categorical variables as nominal for now
 
     # baseline is the most frequent, weighted by hz thickness, non-NA value
-    b <- names(sort(table(i), decreasing = TRUE))[1]
-    b <- rep(b, times = length(i))
+    b <- .prepareBaseline(i, d = numericDigits, type = 'other')
+
 
     # prepare for compression
     v <- .prepareVector(i, d = numericDigits)
@@ -232,7 +143,7 @@
 }
 
 
-.PII_by_profile <- function(x, vars, baseline, method, numericDigits, removeNA) {
+.PII_by_profile <- function(x, vars, baseline, method, numericDigits, removeNA, compression) {
 
   # diced SPC -> DF of horizon data
   # just variables of interest
@@ -258,28 +169,29 @@
 
       # joint complexity
 
-      # concatenate prepared variables, as formatted text
-      v <- as.vector(do.call('c', v))
-      # concatenate prepared baseline, as formatted text
-      b <- as.vector(do.call('c', b))
+      # concatenate prepared variables, by flattening character vectors
+      v <- as.vector(do.call('paste0', list(v, collapse = '')))
+      # concatenate prepared baseline, by flattening character vectors
+      b <- as.vector(do.call('paste0', list(b, collapse = '')))
 
       # compress entire collection of data
-      v <- .compressedLength(v)
+      v <- .compressedLength(v, m = compression)
 
       # this is the joint baseline complexity
-      b <- .compressedLength(b)
+      b <- .compressedLength(b, m = compression)
 
       # ratio complexity vs. baseline complexity
       if(baseline) {
-       v <- v / b 
+        v <- v / b 
       }
       v
     },
     'individual' = {
 
+      ## TODO: flattening by .compressedLength() or here?
       # sum of individual complexities
-      v <- sum(sapply(v, .compressedLength), na.rm = TRUE)
-      b <- sum(sapply(b, .compressedLength), na.rm = TRUE)
+      v <- suppressMessages(sum(sapply(v, .compressedLength, m = compression), na.rm = TRUE))
+      b <- suppressMessages(sum(sapply(b, .compressedLength, m = compression), na.rm = TRUE))
 
       # ratio complexity vs. baseline complexity
       if(baseline) {
@@ -288,7 +200,7 @@
       v
     }
   )
-
+  
   return(res)
 }
 
@@ -310,19 +222,21 @@
 #' @param removeNA remove NA before compression
 #' 
 #' @param padNA pad all profiles with NA to deepest lower depth in `x` (`max(x)`)
+#' 
+#' @param compression character, compression method as used by [memCompress()]: 'gzip', 'bzip2', 'xz', 'none'
 #'
 #' @return a numeric vector of the same length as `length(x)` and in the same order, suitable for direct assignment to a new site-level attribute 
 #' 
 #' @export
 #'
 #' @author D.E. Beaudette
 #' 
-#' @details Information content via compression (gzip) is the central assumption behind this function: the values associated with a simple soil profile having few horizons and little variation between horizons (isotropic depth-functions) will compress to a much smaller size than a complex profile (many horizons, strong anisotropy). Information content is evaluated a profile at a time, over each site or horizon level attribute specified in `vars`. Values are aggregated to the profile level by `method`: median, mean, or sum. The `baseline` argument invokes a comparison to the simplest possible representation of each depth-function:
+#' @details Information content via compression (gzip) is the central assumption behind this function: the values associated with a simple soil profile having few horizons and little variation between horizons (isotropic depth-functions) will compress to a much smaller size than a complex profile (many horizons, strong anisotropy). Information content is evaluated a profile at a time, over each site or horizon level attribute specified in `vars`. The `baseline` argument invokes a comparison to the simplest possible representation of each depth-function:
 #' 
 #'    * `numeric`: replication of the mean value to match the number of horizons with non-NA values
 #'    * `character` or `factor`: replication of the most frequent value to match the number of horizons with non-NA values
 #'
-#' The ratios computed against a "simple" baseline represent something like "information gain", ranging from 0 to 1. Larger baseline ratios suggest more complexity (more information) associated with a soil profile's depth-functions. Alternatively, the total quantity of information (in bytes) can be determined by setting `baseline = FALSE` and `method = 'sum'`.
+#' The ratios computed against a "simple" baseline represent something like "information gain". Larger baseline ratios suggest more complexity (more information) associated with a soil profile's depth-functions. Alternatively, the total quantity of information (in bytes) can be determined by setting `baseline = FALSE`.
 #'
 #'
 #'
@@ -374,14 +288,14 @@
 #' 
 #' vars <- c('p', 'name')
 #' # result is total bytes
-#' pi <- profileInformationIndex(z, vars = vars, method = 'sum', baseline = FALSE)
+#' pi <- profileInformationIndex(z, vars = vars, method = 'joint', baseline = FALSE)
 #' 
 #' text(x = 1:5, y = 105, labels = pi, cex = 0.85)
 #' mtext('Profile Information Index (bytes)', side = 1, line = -1)
-
 #'
-profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = TRUE, numericDigits = 8, removeNA = FALSE, padNA = TRUE) {
+profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = FALSE, numericDigits = 8, removeNA = !padNA, padNA = FALSE, compression = 'gzip') {
 
+  # sanity check
   method <- match.arg(method)
 
   # scale numeric variables to mean of 0 and SD of 1
@@ -419,7 +333,8 @@ profileInformationIndex <- function(x, vars, method = c('joint', 'individual'),
     baseline = baseline, 
     method = method,
     numericDigits = numericDigits, 
-    removeNA = removeNA
+    removeNA = removeNA, 
+    compression = compression
   )
 
   # done

diff --git a/man/profileInformationIndex.Rd b/man/profileInformationIndex.Rd