Skip to content

Commit

Permalink
conditional scaling, NA always removed
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanbeaudette committed Nov 18, 2023
1 parent 15f808d commit 669a495
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 141 deletions.
72 changes: 36 additions & 36 deletions R/profileInformationIndex.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

# x: vector of any type
# x: vector of any type, NA removed
# d: number of significant digits to retain for numeric x
.prepareVector <- function(x, d) {

Expand Down Expand Up @@ -31,7 +31,8 @@
return(x)
}


# x: vector of any type, ignoring NA
# d: number of significant digits to retain for numeric x
.prepareBaseline <- function(x, d, type) {

if(type == 'numeric') {
Expand All @@ -57,7 +58,7 @@
# integer-coded (factor, logical)
# frequency weighted by hz thickness,
# non-NA value
b <- names(sort(table(x), decreasing = TRUE))[1]
b <- names(sort(table(x, useNA = 'no'), decreasing = TRUE))[1]
b <- rep(b, times = length(x))
}

Expand Down Expand Up @@ -87,20 +88,25 @@
x <- .prepareVector(x, d = d)
}

# otherwise using a single string, with NA removed

# length of compressed vector is the metric
res <- length(memCompress(x, type = m))

# length of empty string ('') is 8 bits
res <- res - 8

return(res)
}


## TODO: think more about NA handling

# i: vector, any type
# numericDigits: requested precision for numeric data
# removeNA: remove NA before preparing
.prepareVariable <- function(i, numericDigits, removeNA) {
.prepareVariable <- function(i, numericDigits) {

## TODO: what is appropriate return value when all NA?
# remove all NA
i <- as.vector(na.omit(i))

# short-circuit for all NA
if(all(is.na(i))) {
Expand All @@ -109,10 +115,6 @@

# baseline is mean(i)
if(is.numeric(i)) {
# optionally remove NA from length(gz())
if(removeNA) {
i <- na.omit(i)
}

# baseline for numeric data: 1cm slice wt. mean
b <- .prepareBaseline(i, d = numericDigits, type = 'numeric')
Expand All @@ -124,11 +126,6 @@
# categorical or logical
# baseline is most-frequent, non-NA value

# optionally remove NA from length(gz())
if(removeNA) {
i <- na.omit(i)
}

# treating all categorical variables as nominal for now

# baseline is the most frequent, weighted by hz thickness, non-NA value
Expand All @@ -143,7 +140,7 @@
}


.PII_by_profile <- function(x, vars, baseline, method, numericDigits, removeNA, compression) {
.PII_by_profile <- function(x, vars, baseline, method, numericDigits, compression) {

# diced SPC -> DF of horizon data
# just variables of interest
Expand All @@ -153,8 +150,7 @@
h <- lapply(
h[, vars, drop = FALSE],
FUN = .prepareVariable,
numericDigits = numericDigits,
removeNA = removeNA
numericDigits = numericDigits
)

# extract variables
Expand Down Expand Up @@ -219,9 +215,9 @@
#'
#' @param numericDigits integer, number of significant digits to retain in numeric -> character conversion
#'
#' @param removeNA remove NA before compression
#' @param padNA logical, pad depths to `max(x)`, supplied to `dice(fill = padNA)`
#'
#' @param padNA pad all profiles with NA to deepest lower depth in `x` (`max(x)`)
#' @param scaleNumeric logical, `scale()` each numeric variable, causing "profile information" to vary based on other profiles in the collection
#'
#' @param compression character, compression method as used by [memCompress()]: 'gzip', 'bzip2', 'xz', 'none'
#'
Expand Down Expand Up @@ -293,29 +289,34 @@
#' text(x = 1:5, y = 105, labels = pi, cex = 0.85)
#' mtext('Profile Information Index (bytes)', side = 1, line = -1)
#'
profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = FALSE, numericDigits = 8, removeNA = !padNA, padNA = FALSE, compression = 'gzip') {
profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = FALSE, numericDigits = 8, padNA = FALSE, scaleNumeric = FALSE, compression = 'gzip') {

# sanity check
method <- match.arg(method)

# scale numeric variables to mean of 0 and SD of 1
for(i in vars) {
if(inherits(x[[i]], 'numeric')) {

# scale() will return NaN when SD() is NA or 0
.sd_i <- sd(x[[i]], na.rm = TRUE)

# threshold for very small numbers
if(!is.na(.sd_i)) {
if(.sd_i > 0.0001) {
x[[i]] <- scale(x[[i]])
## TODO: think more about this
# scaling means that PII depends on collection and is not absolute
if(scaleNumeric) {
# scale numeric variables to mean of 0 and SD of 1
for(i in vars) {
if(inherits(x[[i]], 'numeric')) {

# scale() will return NaN when SD() is NA or 0
.sd_i <- sd(x[[i]], na.rm = TRUE)

# threshold for very small numbers
if(!is.na(.sd_i)) {
if(.sd_i > 0.0001) {
x[[i]] <- scale(x[[i]])
}
}

# otherwise, use as-is
}

# otherwise, use as-is
}
}


## TODO: this will error / drop profiles in the presence of bad horizonation

# dice() to 1cm intervals for common baseline
Expand All @@ -339,7 +340,6 @@ profileInformationIndex <- function(x, vars, method = c('joint', 'individual'),
baseline = baseline,
method = method,
numericDigits = numericDigits,
removeNA = removeNA,
compression = compression
)

Expand Down
Loading

0 comments on commit 669a495

Please sign in to comment.