conditional scaling, NA always removed

ncss-tech · Nov 18, 2023 · 669a495 · 669a495
1 parent 15f808d
commit 669a495
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 141 deletions.
diff --git a/R/profileInformationIndex.R b/R/profileInformationIndex.R
@@ -1,5 +1,5 @@
 
-# x: vector of any type
+# x: vector of any type, NA removed
 # d: number of significant digits to retain for numeric x
 .prepareVector <- function(x, d) {
 
@@ -31,7 +31,8 @@
   return(x)
 }
 
-
+# x: vector of any type, ignoring NA
+# d: number of significant digits to retain for numeric x
 .prepareBaseline <- function(x, d, type) {
 
   if(type == 'numeric') {
@@ -57,7 +58,7 @@
     # integer-coded (factor, logical)
     # frequency weighted by hz thickness, 
     # non-NA value
-    b <- names(sort(table(x), decreasing = TRUE))[1]
+    b <- names(sort(table(x, useNA = 'no'), decreasing = TRUE))[1]
     b <- rep(b, times = length(x))
   }
 
@@ -87,20 +88,25 @@
     x <- .prepareVector(x, d = d)
   }
 
+  # otherwise using a single string, with NA removed
+
   # length of compressed vector is the metric
   res <- length(memCompress(x, type = m))
+
+  # length of empty string ('') is 8 bits
+  res <- res - 8
+
   return(res)
 }
 
 
-## TODO: think more about NA handling
 
 # i: vector, any type
 # numericDigits: requested precision for numeric data
-# removeNA: remove NA before preparing
-.prepareVariable <- function(i, numericDigits, removeNA) {
+.prepareVariable <- function(i, numericDigits) {
 
-  ## TODO: what is appropriate return value when all NA?
+  # remove all NA
+  i <- as.vector(na.omit(i))
 
   # short-circuit for all NA
   if(all(is.na(i))) {
@@ -109,10 +115,6 @@
 
   # baseline is mean(i)
   if(is.numeric(i)) {
-    # optionally remove NA from length(gz())
-    if(removeNA) {
-      i <- na.omit(i)
-    }
 
     # baseline for numeric data: 1cm slice wt. mean
     b <- .prepareBaseline(i, d = numericDigits, type = 'numeric')
@@ -124,11 +126,6 @@
     # categorical or logical
     # baseline is most-frequent, non-NA value
 
-    # optionally remove NA from length(gz())
-    if(removeNA) {
-      i <- na.omit(i)
-    }
-
     # treating all categorical variables as nominal for now
 
     # baseline is the most frequent, weighted by hz thickness, non-NA value
@@ -143,7 +140,7 @@
 }
 
 
-.PII_by_profile <- function(x, vars, baseline, method, numericDigits, removeNA, compression) {
+.PII_by_profile <- function(x, vars, baseline, method, numericDigits, compression) {
 
   # diced SPC -> DF of horizon data
   # just variables of interest
@@ -153,8 +150,7 @@
   h <- lapply(
     h[, vars, drop = FALSE], 
     FUN = .prepareVariable, 
-    numericDigits = numericDigits,
-    removeNA = removeNA
+    numericDigits = numericDigits
   )
 
   # extract variables
@@ -219,9 +215,9 @@
 #' 
 #' @param numericDigits integer, number of significant digits to retain in numeric -> character conversion
 #' 
-#' @param removeNA remove NA before compression
+#' @param padNA logical, pad depths to `max(x)`, supplied to `dice(fill = padNA)`
 #' 
-#' @param padNA pad all profiles with NA to deepest lower depth in `x` (`max(x)`)
+#' @param scaleNumeric logical, `scale()` each numeric variable, causing "profile information" to vary based on other profiles in the collection
 #' 
 #' @param compression character, compression method as used by [memCompress()]: 'gzip', 'bzip2', 'xz', 'none'
 #'
@@ -293,29 +289,34 @@
 #' text(x = 1:5, y = 105, labels = pi, cex = 0.85)
 #' mtext('Profile Information Index (bytes)', side = 1, line = -1)
 #'
-profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = FALSE, numericDigits = 8, removeNA = !padNA, padNA = FALSE, compression = 'gzip') {
+profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = FALSE, numericDigits = 8, padNA = FALSE, scaleNumeric = FALSE, compression = 'gzip') {
 
   # sanity check
   method <- match.arg(method)
 
-  # scale numeric variables to mean of 0 and SD of 1
-  for(i in vars) {
-    if(inherits(x[[i]], 'numeric')) {
-
-      # scale() will return NaN when SD() is NA or 0
-      .sd_i <- sd(x[[i]], na.rm = TRUE)
-
-      # threshold for very small numbers 
-      if(!is.na(.sd_i)) {
-        if(.sd_i > 0.0001) {
-          x[[i]] <- scale(x[[i]])
+  ## TODO: think more about this
+  # scaling means that PII depends on collection and is not absolute
+  if(scaleNumeric) {
+    # scale numeric variables to mean of 0 and SD of 1
+    for(i in vars) {
+      if(inherits(x[[i]], 'numeric')) {
+
+        # scale() will return NaN when SD() is NA or 0
+        .sd_i <- sd(x[[i]], na.rm = TRUE)
+
+        # threshold for very small numbers 
+        if(!is.na(.sd_i)) {
+          if(.sd_i > 0.0001) {
+            x[[i]] <- scale(x[[i]])
+          }
         }
+
+        # otherwise, use as-is
       }
-
-      # otherwise, use as-is
     }
   }
 
+
   ## TODO: this will error / drop profiles in the presence of bad horizonation
 
   # dice() to 1cm intervals for common baseline
@@ -339,7 +340,6 @@ profileInformationIndex <- function(x, vars, method = c('joint', 'individual'),
     baseline = baseline, 
     method = method,
     numericDigits = numericDigits, 
-    removeNA = removeNA, 
     compression = compression
   )