Skip to content

Commit

Permalink
new ideas, still need to work out function defaults and interpretation
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanbeaudette committed Oct 25, 2023
1 parent 30736aa commit a14beaa
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 221 deletions.
233 changes: 74 additions & 159 deletions R/profileInformationIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,54 @@
x <- as.character(x)
}

# collapse to single string
# otherwise, memCompress will add '\n' between vector elements
x <- paste0(x, collapse = '')

return(x)
}


# x: vector of any type
.prepareBaseline <- function(x, d, type) {

if(type == 'numeric') {
# replicate simple mean over all horizons
# weighted by 1cm slices
b <- format(
round(
rep(
mean(x, na.rm = TRUE),
times = length(x)
),
digits = d
),
digits = d
)
} else {
# logical and factor -> convert to integer codes
if(!inherits(x, 'character')) {
x <- as.integer(x)
}

# baseline is the most frequent,
# integer-coded (factor, logical)
# frequency weighted by hz thickness,
# non-NA value
b <- names(sort(table(x), decreasing = TRUE))[1]
b <- rep(b, times = length(x))
}


# collapse to single string
# otherwise, memCompress will add '\n' between vector elements
b <- paste0(b, collapse = '')

return(b)
}



# x: character, length 1 output from .prepareVector()
# d: number of significant digits to retain for numeric x
# n: compression method
.compressedLength <- function(x, d = 10, m = 'gzip') {
Expand All @@ -37,140 +80,18 @@
# otherwise, 1, 10, 100 will have different compressed lengths

# memCompress() requires character data
# convert all types to character, preserving 'd' significant digits
if(!inherits(x, 'character')) {
# this algorithm is based on character vectors flattened to single string
# output from .prepareVector() is ideal
if(!inherits(x, 'character') || length(x) > 1) {
message('flattening character vector')
x <- .prepareVector(x, d = d)
}
}

# length of compressed vector is the metric
res <- length(memCompress(x, type = m))
return(res)
}

#
# # x: single profile SPC
# # vars: columns to consider
# # method: aggregation over columns
# # baseline: toggle for baseline comparison
# # numericDigits: number of digits to retain in numeric -> character conversion
# # removeNA: do not include NA in gz(data)
# .pii_by_profile <- function(x, vars, method, baseline, numericDigits, removeNA) {
#
# # SPC -> DF
# h <- as(x, 'data.frame')[, vars, drop = FALSE]
#
#
# ## TODO: computing PII by column removes possible redundancy created by correlation within
# ## -> PII as currently computed is an over-estimate
# ## -> joint PII via: compress(c(v1, v2, v3))
#
#
# # iterate over columns and compute column-wise PII
# h <- lapply(
# h[, vars, drop = FALSE],
# FUN = .pii,
# baseline = baseline,
# numericDigits = numericDigits,
# removeNA = removeNA
# )
#
# # each list element is a 1-length numeric
# h <- unlist(h)
#
# # unless it is all NA
# if(all(is.na(h))) {
# return(NA)
# }
#
# # aggregate PII by column to single value / profile
# res <- switch(method,
# mean = {
# mean(h, na.rm = TRUE)
# },
# median = {
# median(h, na.rm = TRUE)
# },
# sum = {
# sum(h, na.rm = TRUE)
# }
# )
#
# return(res)
# }



## TODO: consider "max entropy" base line of runif(length(i))
## PII = 1 - I/Ib

#
# # main algorithm
# # i: vector of values
# # baseline: toggle for baseline comparison
# # numericDigits: number of digits to retain in numeric -> character conversion
# # removeNA: do not include NA in gz(data)
# .pii <- function(i, baseline, numericDigits, removeNA) {
#
# if(all(is.na(i))) {
# return(NA)
# }
#
# # baseline is mean(i)
# if(is.numeric(i)) {
# # optionally remove NA from length(gz())
# if(removeNA) {
# i <- na.omit(i)
# }
#
# # baseline for numeric data:
# # replicate simple mean over all horizons (no weighting)
# b <- as.character(
# signif(
# rep(
# mean(i, na.rm = TRUE),
# times = length(i)
# ),
# digits = numericDigits
# )
# )
#
# # return compressed length
# # any data type
# v <- .compressedLength(i, d = numericDigits)
# b <- .compressedLength(b)
#
# } else {
# # categorical data
# # baseline is most-frequent, non-NA value
#
# # optionally remove NA from length(gz())
# if(removeNA) {
# i <- na.omit(i)
# }
#
# # treating all categorical variables as nominal for now
#
# # baseline is the most frequent, weighted by hz thickness, non-NA value
# b <- names(sort(table(i), decreasing = TRUE))[1]
# b <- rep(b, times = length(i))
#
# # return compressed length
# # any data type
# v <- .compressedLength(i)
# b <- .compressedLength(b)
# }
#
# # optionally compare vs. baseline
# if(baseline) {
# res <- v / b
# } else {
# # no comparison
# res <- v
# }
#
# return(res)
# }


## TODO: think more about NA handling

Expand All @@ -193,18 +114,8 @@
i <- na.omit(i)
}

# baseline for numeric data:
# replicate simple mean over all horizons (no weighting)
b <- format(
round(
rep(
mean(i, na.rm = TRUE),
times = length(i)
),
digits = numericDigits
),
digits = numericDigits
)
# baseline for numeric data: 1cm slice wt. mean
b <- .prepareBaseline(i, d = numericDigits, type = 'numeric')

# prepare for compression
v <- .prepareVector(i, d = numericDigits)
Expand All @@ -221,8 +132,8 @@
# treating all categorical variables as nominal for now

# baseline is the most frequent, weighted by hz thickness, non-NA value
b <- names(sort(table(i), decreasing = TRUE))[1]
b <- rep(b, times = length(i))
b <- .prepareBaseline(i, d = numericDigits, type = 'other')


# prepare for compression
v <- .prepareVector(i, d = numericDigits)
Expand All @@ -232,7 +143,7 @@
}


.PII_by_profile <- function(x, vars, baseline, method, numericDigits, removeNA) {
.PII_by_profile <- function(x, vars, baseline, method, numericDigits, removeNA, compression) {

# diced SPC -> DF of horizon data
# just variables of interest
Expand All @@ -258,28 +169,29 @@

# joint complexity

# concatenate prepared variables, as formatted text
v <- as.vector(do.call('c', v))
# concatenate prepared baseline, as formatted text
b <- as.vector(do.call('c', b))
# concatenate prepared variables, by flattening character vectors
v <- as.vector(do.call('paste0', list(v, collapse = '')))
# concatenate prepared baseline, by flattening character vectors
b <- as.vector(do.call('paste0', list(b, collapse = '')))

# compress entire collection of data
v <- .compressedLength(v)
v <- .compressedLength(v, m = compression)

# this is the joint baseline complexity
b <- .compressedLength(b)
b <- .compressedLength(b, m = compression)

# ratio complexity vs. baseline complexity
if(baseline) {
v <- v / b
v <- v / b
}
v
},
'individual' = {

## TODO: flattening by .compressedLength() or here?
# sum of individual complexities
v <- sum(sapply(v, .compressedLength), na.rm = TRUE)
b <- sum(sapply(b, .compressedLength), na.rm = TRUE)
v <- suppressMessages(sum(sapply(v, .compressedLength, m = compression), na.rm = TRUE))
b <- suppressMessages(sum(sapply(b, .compressedLength, m = compression), na.rm = TRUE))

# ratio complexity vs. baseline complexity
if(baseline) {
Expand All @@ -288,7 +200,7 @@
v
}
)

return(res)
}

Expand All @@ -310,19 +222,21 @@
#' @param removeNA remove NA before compression
#'
#' @param padNA pad all profiles with NA to deepest lower depth in `x` (`max(x)`)
#'
#' @param compression character, compression method as used by [memCompress()]: 'gzip', 'bzip2', 'xz', 'none'
#'
#' @return a numeric vector of the same length as `length(x)` and in the same order, suitable for direct assignment to a new site-level attribute
#'
#' @export
#'
#' @author D.E. Beaudette
#'
#' @details Information content via compression (gzip) is the central assumption behind this function: the values associated with a simple soil profile having few horizons and little variation between horizons (isotropic depth-functions) will compress to a much smaller size than a complex profile (many horizons, strong anisotropy). Information content is evaluated a profile at a time, over each site or horizon level attribute specified in `vars`. Values are aggregated to the profile level by `method`: median, mean, or sum. The `baseline` argument invokes a comparison to the simplest possible representation of each depth-function:
#' @details Information content via compression (gzip) is the central assumption behind this function: the values associated with a simple soil profile having few horizons and little variation between horizons (isotropic depth-functions) will compress to a much smaller size than a complex profile (many horizons, strong anisotropy). Information content is evaluated a profile at a time, over each site or horizon level attribute specified in `vars`. The `baseline` argument invokes a comparison to the simplest possible representation of each depth-function:
#'
#' * `numeric`: replication of the mean value to match the number of horizons with non-NA values
#' * `character` or `factor`: replication of the most frequent value to match the number of horizons with non-NA values
#'
#' The ratios computed against a "simple" baseline represent something like "information gain", ranging from 0 to 1. Larger baseline ratios suggest more complexity (more information) associated with a soil profile's depth-functions. Alternatively, the total quantity of information (in bytes) can be determined by setting `baseline = FALSE` and `method = 'sum'`.
#' The ratios computed against a "simple" baseline represent something like "information gain". Larger baseline ratios suggest more complexity (more information) associated with a soil profile's depth-functions. Alternatively, the total quantity of information (in bytes) can be determined by setting `baseline = FALSE`.
#'
#'
#'
Expand Down Expand Up @@ -374,14 +288,14 @@
#'
#' vars <- c('p', 'name')
#' # result is total bytes
#' pi <- profileInformationIndex(z, vars = vars, method = 'sum', baseline = FALSE)
#' pi <- profileInformationIndex(z, vars = vars, method = 'joint', baseline = FALSE)
#'
#' text(x = 1:5, y = 105, labels = pi, cex = 0.85)
#' mtext('Profile Information Index (bytes)', side = 1, line = -1)

#'
profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = TRUE, numericDigits = 8, removeNA = FALSE, padNA = TRUE) {
profileInformationIndex <- function(x, vars, method = c('joint', 'individual'), baseline = FALSE, numericDigits = 8, removeNA = !padNA, padNA = FALSE, compression = 'gzip') {

# sanity check
method <- match.arg(method)

# scale numeric variables to mean of 0 and SD of 1
Expand Down Expand Up @@ -419,7 +333,8 @@ profileInformationIndex <- function(x, vars, method = c('joint', 'individual'),
baseline = baseline,
method = method,
numericDigits = numericDigits,
removeNA = removeNA
removeNA = removeNA,
compression = compression
)

# done
Expand Down
15 changes: 9 additions & 6 deletions man/profileInformationIndex.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit a14beaa

Please sign in to comment.