Skip to content

Commit

Permalink
Distinguish between scalars and length-1 vectors when saving/loading …
Browse files Browse the repository at this point in the history
…lists. (#25)

This effectively unboxes all length-1 vectors in a list, by default; this is
probably the more reasonable expectation for other languages that have a
concept of scalars. Users can override this by calling I() on elements that
they want to keep as length-1 vectors, in the same manner as jsonlite.
  • Loading branch information
LTLA authored May 3, 2024
1 parent d3c815a commit 4654222
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 35 deletions.
6 changes: 6 additions & 0 deletions R/readBaseList.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
#'
#' @author Aaron Lun
#'
#' @details
#' The \pkg{uzuki2} specification (see \url{https://github.com/ArtifactDB/uzuki2}) allows length-1 vectors to be stored as-is or as a scalar.
#' If the file stores a length-1 vector as-is, \code{readBaseList} will read the list element as a length-1 vector with the \link{AsIs} class.
#' If the file stores a length-1 vector as a scalar, \code{readBaseList} will read the list element as a length-1 vector without this class.
#' This allows downstream users to distinguish between the storage modes in the rare cases that it is necessary.
#'
#' @seealso
#' \code{"\link{stageObject,list-method}"}, for the staging method.
#'
Expand Down
23 changes: 19 additions & 4 deletions R/saveBaseList.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
#' If \code{list.format="hdf5"}, \code{x} is saved into a HDF5 file instead.
#' This format is most useful for random access and for preserving the precision of numerical data.
#'
#' @section Storing scalars:
#' The \pkg{uzuki2} specification (see \url{https://github.com/ArtifactDB/uzuki2}) allows length-1 vectors to be stored as-is or as a scalar.
#' If a list element is of length 1, \code{saveBaseList} will store it as a scalar on-disk, effectively \dQuote{unboxing} it for languages with a concept of scalars.
#' Users can override this behavior by adding the \link{AsIs} class to the affected list element, which will force storage as a length-1 vector.
#' This reflects the decisions made by \code{\link{readBaseList}} and mimics the behavior of packages like \pkg{jsonlite}.
#'
#' @author Aaron Lun
#'
#' @seealso
Expand Down Expand Up @@ -179,6 +185,7 @@ saveBaseListFormat <- (function() {
h5_write_vector(ghandle, "format", sltype, scalar=TRUE)
}

scalarize <- length(x) == 1L && !is(x, "AsIs")
y <- .sanitize_stringlike(x, sltype)

missing.placeholder <- NULL
Expand All @@ -194,7 +201,7 @@ saveBaseListFormat <- (function() {
}

local({
dhandle <- h5_write_vector(ghandle, "data", y, emit=TRUE)
dhandle <- h5_write_vector(ghandle, "data", y, emit=TRUE, scalar=scalarize)
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
if (!is.null(missing.placeholder)) {
h5_write_attribute(dhandle, missingPlaceholderName, missing.placeholder, scalar=TRUE)
Expand All @@ -207,6 +214,7 @@ saveBaseListFormat <- (function() {

} else if (is.atomic(x)) {
coerced <- .remap_atomic_type(x)
scalarize <- length(x) == 1L && !is(x, "AsIs")
y <- coerced$values

h5_write_attribute(ghandle, "uzuki_object", "vector", scalar=TRUE)
Expand All @@ -224,7 +232,7 @@ saveBaseListFormat <- (function() {
}

local({
dhandle <- h5_write_vector(ghandle, "data", y, emit=TRUE)
dhandle <- h5_write_vector(ghandle, "data", y, emit=TRUE, scalar=scalarize)
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
if (!is.null(missing.placeholder)) {
h5_write_attribute(dhandle, missingPlaceholderName, missing.placeholder, scalar=TRUE)
Expand Down Expand Up @@ -335,9 +343,13 @@ saveBaseListFormat <- (function() {
} else if (!is.null(sltype <- .is_stringlike(x))) {
formatted <- list(
type=if (.version == 1) sltype else "string",
values=I(.sanitize_stringlike(x, sltype))
values=.sanitize_stringlike(x, sltype)
)

if (length(x) == 1L && is(x, "AsIs")) {
formatted$values <- I(formatted$values)
}

if (.version > 1 && sltype != "string") {
formatted$format <- sltype
}
Expand Down Expand Up @@ -369,7 +381,10 @@ saveBaseListFormat <- (function() {
}
}

formatted$values <- I(formatted$values)
if (length(x) == 1L && is(x, "AsIs")) {
formatted$values <- I(formatted$values)
}

formatted <- .add_json_names(x, formatted)
return(formatted)
}
Expand Down
6 changes: 6 additions & 0 deletions man/readBaseList.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions man/saveBaseList.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

81 changes: 53 additions & 28 deletions src/load_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,30 @@

#include "uzuki2/uzuki2.hpp"

template<class Input_>
void scalarize(Input_& object, bool needs_marker) {
if (needs_marker) {
Rcpp::RObject raw_classes = object.attr("class");
if (raw_classes.sexp_type() != STRSXP) {
object.attr("class") = "AsIs";
return;
}

Rcpp::CharacterVector classes(raw_classes);
Rcpp::CharacterVector new_classes(classes.size() + 1);
std::copy(classes.begin(), classes.end(), new_classes.begin() + 1);
new_classes[0] = "AsIs";
object.attr("class") = new_classes;
}
}

template<class Input_>
void nameify(Input_& object, bool named, const Rcpp::CharacterVector& names) {
if (named) {
object.names() = names;
}
}

/** Defining the simple vectors first. **/

struct RBase {
Expand All @@ -10,7 +34,7 @@ struct RBase {
};

struct RIntegerVector : public uzuki2::IntegerVector, public RBase {
RIntegerVector(size_t l, bool n, bool) : vec(l), named(n), names(n ? l : 0) {}
RIntegerVector(size_t l, bool n, bool s) : vec(l), named(n), names(n ? l : 0), scalar(s) {}

size_t size() const {
return vec.size();
Expand All @@ -33,10 +57,9 @@ struct RIntegerVector : public uzuki2::IntegerVector, public RBase {
}

Rcpp::RObject extract_object() {
if (named) {
vec.names() = names;
}
if (!has_placeholder) {
nameify(vec, named, names);
scalarize(vec, !scalar && vec.size() == 1);
return vec;
}

Expand All @@ -47,9 +70,9 @@ struct RIntegerVector : public uzuki2::IntegerVector, public RBase {
for (auto i : true_missing) {
alt[i] = R_NaReal;
}
if (named) {
alt.names() = names;
}

nameify(alt, named, names);
scalarize(alt, !scalar && vec.size() == 1);
return alt;
}

Expand All @@ -58,10 +81,11 @@ struct RIntegerVector : public uzuki2::IntegerVector, public RBase {
bool has_placeholder = false;
bool named = false;
Rcpp::CharacterVector names;
bool scalar = false;
};

struct RNumberVector : public uzuki2::NumberVector, public RBase {
RNumberVector(size_t l, bool n, bool) : vec(l), named(n), names(n ? l : 0) {}
RNumberVector(size_t l, bool n, bool s) : vec(l), named(n), names(n ? l : 0), scalar(s) {}

size_t size() const {
return vec.size();
Expand All @@ -80,19 +104,19 @@ struct RNumberVector : public uzuki2::NumberVector, public RBase {
}

Rcpp::RObject extract_object() {
if (named) {
vec.names() = names;
}
nameify(vec, named, names);
scalarize(vec, !scalar && vec.size() == 1);
return vec;
}

Rcpp::NumericVector vec;
bool named = false;
Rcpp::CharacterVector names;
bool scalar = false;
};

struct RBooleanVector : public uzuki2::BooleanVector, public RBase {
RBooleanVector(size_t l, bool n, bool) : vec(l), named(n), names(n ? l : 0) {}
RBooleanVector(size_t l, bool n, bool s) : vec(l), named(n), names(n ? l : 0), scalar(s) {}

size_t size() const {
return vec.size();
Expand All @@ -111,19 +135,19 @@ struct RBooleanVector : public uzuki2::BooleanVector, public RBase {
}

Rcpp::RObject extract_object() {
if (named) {
vec.names() = names;
}
nameify(vec, named, names);
scalarize(vec, !scalar && vec.size() == 1);
return vec;
}

Rcpp::LogicalVector vec;
bool named = false;
Rcpp::CharacterVector names;
bool scalar = false;
};

struct RStringVector : public uzuki2::StringVector, public RBase {
RStringVector(size_t l, bool n, bool) : vec(l), named(n), names(n ? l : 0) {}
RStringVector(size_t l, bool n, bool s) : vec(l), named(n), names(n ? l : 0), scalar(s) {}

size_t size() const {
return vec.size();
Expand All @@ -142,19 +166,19 @@ struct RStringVector : public uzuki2::StringVector, public RBase {
}

Rcpp::RObject extract_object() {
if (named) {
vec.names() = names;
}
nameify(vec, named, names);
scalarize(vec, !scalar && vec.size() == 1);
return vec;
}

Rcpp::StringVector vec;
bool named = false;
Rcpp::CharacterVector names;
bool scalar = false;
};

struct RDateVector : public uzuki2::StringVector, public RBase {
RDateVector(size_t l, bool n, bool) : vec(l), named(n), names(n ? l : 0) {}
RDateVector(size_t l, bool n, bool s) : vec(l), named(n), names(n ? l : 0), scalar(s) {}

size_t size() const {
return vec.size();
Expand All @@ -173,19 +197,19 @@ struct RDateVector : public uzuki2::StringVector, public RBase {
}

Rcpp::RObject extract_object() {
if (named) {
vec.names() = names;
}
nameify(vec, named, names);
scalarize(vec, !scalar && vec.size() == 1);
return vec;
}

Rcpp::DateVector vec;
bool named = false;
Rcpp::CharacterVector names;
bool scalar = false;
};

struct RDateTimeVector : public uzuki2::StringVector, public RBase {
RDateTimeVector(size_t l, bool n, bool) : vec(l), named(n), names(n ? l : 0) {}
RDateTimeVector(size_t l, bool n, bool s) : vec(l), named(n), names(n ? l : 0), scalar(s) {}

size_t size() const {
return vec.size();
Expand All @@ -206,17 +230,18 @@ struct RDateTimeVector : public uzuki2::StringVector, public RBase {
}

Rcpp::RObject extract_object() {
if (named) {
vec.names() = names;
}
nameify(vec, named, names);
Rcpp::Environment ns = Rcpp::Environment::namespace_env("alabaster.base");
Rcpp::Function f = ns["as.Rfc3339"];
return f(vec);
Rcpp::RObject output = f(vec);
scalarize(output, !scalar && vec.size() == 1);
return output;
}

Rcpp::StringVector vec;
bool named = false;
Rcpp::CharacterVector names;
bool scalar = false;
};

/** As do factors. **/
Expand Down
Loading

0 comments on commit 4654222

Please sign in to comment.