Skip to content

Commit

Permalink
accounting for various forms of compression
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanbeaudette committed Nov 18, 2023
1 parent 669a495 commit fa840a8
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 8 deletions.
10 changes: 8 additions & 2 deletions R/profileInformationIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,14 @@
# length of compressed vector is the metric
res <- length(memCompress(x, type = m))

# length of empty string ('') is 8 bits
res <- res - 8
# remove length of empty string ('')
res <- res - switch(
m,
'gzip' = 8,
'bzip2' = 14,
'xz' = 32,
'none' = 0
)

return(res)
}
Expand Down
30 changes: 24 additions & 6 deletions misc/sandbox/profile-information-index.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ aqp:::.prepareVector(1:10, d = 4)
aqp:::.prepareVariable(1:10, numericDigits = 4)
aqp:::.prepareVariable(c('A', 'A', 'A', 'B', 'C'), numericDigits = 4)

# empty strings should always be 0
# adjusted for each compression type
aqp:::.compressedLength('', m = 'gzip')
aqp:::.compressedLength('', m = 'bzip2')
aqp:::.compressedLength('', m = 'xz')

# compression types
aqp:::.compressedLength(rep(letters, 100), m = 'gzip')
aqp:::.compressedLength(rep(letters, 100), m = 'bzip2')
aqp:::.compressedLength(rep(letters, 100), m = 'xz')


# note usage
aqp:::.compressedLength(1:10)
aqp:::.compressedLength(aqp:::.prepareVector(1:10))
Expand Down Expand Up @@ -180,8 +192,8 @@ z <- combine(z, a, b)


vars <- c('p', 'name')
pi <- profileInformationIndex(z, vars = vars, method = 'j')
pi.b <- profileInformationIndex(z, vars = vars, method = 'j', baseline = TRUE)
pi <- profileInformationIndex(z, vars = vars, method = 'j', compression = 'gzip')
pi.b <- profileInformationIndex(z, vars = vars, method = 'j', baseline = TRUE, compression = 'gzip')


# visual check
Expand All @@ -200,8 +212,14 @@ mtext('Profile Complexity Ratio', side = 1, line = -0.5)


# effect of aggregation function
profileInformationIndex(z, vars = vars, method = 'i', baseline = TRUE)
profileInformationIndex(z, vars = vars, method = 'j', baseline = TRUE)
profileInformationIndex(z, vars = vars, method = 'i', baseline = FALSE)
profileInformationIndex(z, vars = vars, method = 'j', baseline = FALSE)

# effect of compression
profileInformationIndex(z, vars = vars, method = 'j', baseline = FALSE, compression = 'gzip')
profileInformationIndex(z, vars = vars, method = 'j', baseline = FALSE, compression = 'bzip2')
profileInformationIndex(z, vars = vars, method = 'j', baseline = FALSE, compression = 'xz')
profileInformationIndex(z, vars = vars, method = 'j', baseline = FALSE, compression = 'none')

# effect of baseline
profileInformationIndex(z, vars = vars, method = 'j', baseline = TRUE)
Expand Down Expand Up @@ -275,14 +293,14 @@ x$pi <- profileInformationIndex(x, vars = vars, method = 'j', baseline = FALSE)

plotSPC(x, width = 0.3, name.style = 'center-center', plot.order = order(x$pi), cex.names = 0.66, shrink = TRUE, max.depth = 200)
axis(side = 1, at = 1:length(x), labels = format(x$pi, digits = 3)[order(x$pi)], cex.axis = 0.75, las = 1)
title('baseline = FALSE')
title('Profile Information Index (bytes)')


x$pi <- profileInformationIndex(x, vars = vars, method = 'j', baseline = TRUE)

plotSPC(x, width = 0.3, name.style = 'center-center', plot.order = order(x$pi), cex.names = 0.66, shrink = TRUE, max.depth = 200)
axis(side = 1, at = 1:length(x), labels = format(x$pi, digits = 3)[order(x$pi)], cex.axis = 0.75, las = 1)
title('baseline = TRUE')
title('Profile Complexity Ratio')



Expand Down
147 changes: 147 additions & 0 deletions vignettes/missing-data.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
---
title: "Missing Data"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Missing Data}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r setup, echo=FALSE, results='hide', warning=FALSE}
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
background = '#F7F7F7',
fig.align = 'center',
dev = 'png',
comment = "#>"
)
# keep examples from using more than 2 cores
data.table::setDTthreads(Sys.getenv("OMP_THREAD_LIMIT", unset = 2))
options(width = 100, stringsAsFactors = FALSE, timeout = 600)
```


```{r}
library(aqp)
library(soilDB)
```



```{r}
# example data
data("jacobs2000")
# fully populated
plotSPC(jacobs2000, name.style = 'center-center',
cex.names = 0.8, color = 'time_saturated')
# missing some data
plotSPC(jacobs2000, name.style = 'center-center',
cex.names = 0.8, color = 'concentration_color')
# very nearly complete
plotSPC(jacobs2000, name.style = 'center-center',
cex.names = 0.8, color = 'matrix_color')
# variables to consider
v <- c('time_saturated', 'concentration_color', 'matrix_color')
# compute data completeness by profile
# ignore 2C horizons
jacobs2000$data.complete <- evalMissingData(
jacobs2000,
vars = v,
method = 'relative',
p = '2C'
)
jacobs2000$data.complete.abs <- evalMissingData(
jacobs2000,
vars = v,
method = 'absolute',
p = '2C'
)
# compute data completeness by horizon
# ignore 2C horizons
jacobs2000$hz.data.complete <- evalMissingData(
jacobs2000,
vars = v,
method = 'horizon',
p = '2C'
)
# "fraction complete" by horizon
plotSPC(
jacobs2000, name.style = 'center-center',
cex.names = 0.8, color = 'hz.data.complete'
)
# rank on profile completeness
new.order <- order(jacobs2000$data.complete)
# plot along data completeness ranking
plotSPC(
jacobs2000, name.style = 'center-center',
cex.names = 0.8, color = 'hz.data.complete',
plot.order = new.order
)
# add relative completeness axis
# note re-ordering of axis labels
axis(
side = 1, at = 1:length(jacobs2000),
labels = round(jacobs2000$data.complete[new.order], 2),
line = 0, cex.axis = 0.75
)
# add absolute completeness (cm)
axis(
side = 1, at = 1:length(jacobs2000),
labels = jacobs2000$data.complete.abs[new.order],
line = 2.5, cex.axis=0.75
)
```


```{r}
x <- fetchKSSL(series = 'pierre')
par(mar = c(0, 0, 3, 2))
plotSPC(x, color = 'clay', width = 0.3, name.style = 'center-center', label = 'pedon_completeness_index')
plotSPC(x, color = 'cec7', width = 0.3, name.style = 'center-center', label = 'pedon_completeness_index')
plotSPC(x, color = 'estimated_oc', width = 0.3, name.style = 'center-center', label = 'pedon_completeness_index')
plotSPC(x, color = 'ph_h2o', width = 0.3, name.style = 'center-center', label = 'pedon_completeness_index')
plotSPC(x, color = 'db_13b', width = 0.3, name.style = 'center-center', label = 'pedon_completeness_index')
par(mar = c(1, 0, 3, 2))
plotSPC(x, color = 'ph_h2o', width = 0.3, name.style = 'center-center', label = 'pedon_completeness_index')
.b <- x[, , .LAST, .BOTTOM]
text(x = 1:length(x), y = .b, labels = x$pi, cex = 0.85, pos = 1)
mtext('Profile Information Index (bytes)', side = 1, line = -0.5)
v <- c('clay', 'db_13b', 'cec7', 'ph_h2o')
x$rel.not.missing <- evalMissingData(x, vars = v, method = 'relative')
x$abs.not.missing <- evalMissingData(x, vars = v, method = 'absolute')
x$hz.not.missing <- evalMissingData(x, vars = v, method = 'horizon')
o <- order(x$rel.not.missing)
plotSPC(x, color = 'hz.not.missing', width = 0.33, name.style = 'center-center', label = 'pedon_completeness_index', plot.order = o)
text(x = 1:length(x), y = .b[o], labels = round(x$rel.not.missing[o], 2), cex = 0.85, pos = 1)
mtext('Relative Non-Missing Fraction', side = 1, line = -0.5)
o <- order(x$abs.not.missing)
plotSPC(x, color = 'hz.not.missing', width = 0.33, name.style = 'center-center', label = 'pedon_completeness_index', plot.order = o)
text(x = 1:length(x), y = .b[o], labels = x$abs.not.missing[o], cex = 0.85, pos = 1)
mtext('Absolute Non-Missing (cm)', side = 1, line = -0.5)
```

0 comments on commit fa840a8

Please sign in to comment.