Skip to content

Commit

Permalink
Added optional 'length' property to simple_lists for fast height() ca…
Browse files Browse the repository at this point in the history
…lls.

This is specifically relevant for JSON-based formats that would
otherwise need to read through the entire file to get the length.
  • Loading branch information
LTLA committed Feb 6, 2024
1 parent 91bd498 commit 2e33d72
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 7 deletions.
7 changes: 6 additions & 1 deletion docs/specifications/simple_list.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ cat(" This should be set to \"", as.character(.version), "\".", sep="")
- (optional) `format`, a string specifying the format to use.
This should be one of `"hdf5"` or `"json.gz"`.
If not provided, it is assumed to be `"hdf5"`.
- (optional) `length`, a non-negative integer specifying the length of the list.

## Files

Expand All @@ -34,14 +35,18 @@ This is a HDF5 file that follows the [**uzuki2**](https://github.com/ArtifactDB/
If `format = "json.gz"`, the directory should contain a `list_contents.json.gz` file.
This is a Gzip-compressed JSON file that follows the [**uzuki2**](https://github.com/ArtifactDB/uzuki2) JSON specification for R lists.

If the `simple_list.length` property is present, this should be equal to the length of the list (see [below](#height)).
This property is provided as a convenience for JSON-formatted files, so that readers do not need to parse the entire file to determine the list length.

The directory may contain an `other_contents` subdirectory, containing external list items as further subdirectories.
Each subdirectory in `other_contents` represents a child object and should be named after its external list index; these indices are consecutive and zero-based, i.e., `"0"`, `"1"`, and so on.
Every external list index should be used at least once by the list representation in `list_contents.h5` or `list_contents.json.gz`.

## Height

The height of the simple list is defined as the length of the list in `list_contents.h5` or `list_contents.json.gz`.
Only the length of the top-level list is considered here.
The length of the list is equal to its number of elements, without any consideration for the size/shape of nested lists or other objects in each element.
The length can also be taken from the `simple_list.length` property, if available.

## Interfaces

Expand Down
36 changes: 32 additions & 4 deletions include/takane/simple_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ inline std::string extract_format(const internal_json::JsonObjectMap& map) {
return reinterpret_cast<millijson::String*>(val.get())->value;
}

/**
 * Fetch the optional 'length' property from a JSON object map.
 * Callers in this file pass the contents of the 'simple_list' metadata object.
 *
 * @param map JSON object map to search for a "length" entry.
 * @return Pair of (found, length).
 * If "length" is absent, this is (false, 0).
 * Otherwise, it is (true, value) where the value is the reported list length.
 * @throws std::runtime_error if "length" is present but is not a JSON number,
 * or if it is not a non-negative integer that can be stored in a size_t.
 */
inline std::pair<bool, size_t> extract_length(const internal_json::JsonObjectMap& map) {
    auto lIt = map.find("length");
    if (lIt == map.end()) {
        return std::pair<bool, size_t>(false, 0);
    }

    const auto& val = lIt->second;
    if (val->type() != millijson::NUMBER) {
        throw std::runtime_error("'simple_list.length' in the object metadata should be a JSON number");
    }

    // JSON numbers are held as doubles, so we need to check the value before
    // converting to size_t; the conversion is undefined for negative or
    // out-of-range values and would silently truncate fractional ones.
    const double raw = reinterpret_cast<millijson::Number*>(val.get())->value;

    // The largest size_t may round up when converted to double, so anything
    // at or above this bound is conservatively rejected. The negated
    // comparison also rejects NaN.
    const double upper = static_cast<double>(static_cast<size_t>(-1));
    if (!(raw >= 0) || raw >= upper) {
        throw std::runtime_error("'simple_list.length' in the object metadata should be a non-negative integer");
    }

    // The conversion is now well-defined; a round trip detects fractional values.
    const size_t len = static_cast<size_t>(raw);
    if (static_cast<double>(len) != raw) {
        throw std::runtime_error("'simple_list.length' in the object metadata should be a non-negative integer");
    }

    return std::pair<bool, size_t>(true, len);
}

}
/**
* @endcond
Expand Down Expand Up @@ -93,18 +105,30 @@ inline void validate(const std::filesystem::path& path, const ObjectMetadata& me
}
}

size_t len;
if (format == "json.gz") {
uzuki2::json::Options opt;
opt.parallel = options.parallel_reads;
auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path / "list_contents.json.gz");
uzuki2::json::validate(gzreader, num_external, opt);
auto loaded = uzuki2::json::parse<uzuki2::DummyProvisioner>(gzreader, uzuki2::DummyExternals(num_external), std::move(opt));
len = reinterpret_cast<const uzuki2::List*>(loaded.get())->size();

} else if (format == "hdf5") {
auto handle = ritsuko::hdf5::open_file(path / "list_contents.h5");
auto ghandle = ritsuko::hdf5::open_group(handle, "simple_list");
uzuki2::hdf5::validate(ghandle, num_external);
auto loaded = uzuki2::hdf5::parse<uzuki2::DummyProvisioner>(ghandle, uzuki2::DummyExternals(num_external));
len = reinterpret_cast<const uzuki2::List*>(loaded.get())->size();

} else {
throw std::runtime_error("unknown format '" + format + "'");
}

auto len_info = internal::extract_length(metamap);
if (len_info.first) {
if (len_info.second != len) {
throw std::runtime_error("'simple_list.length' differs from the length of the list");
}
}
}

/**
Expand All @@ -117,6 +141,11 @@ inline size_t height(const std::filesystem::path& path, const ObjectMetadata& me
const auto& metamap = internal_json::extract_typed_object_from_metadata(metadata.other, "simple_list");
std::string format = internal::extract_format(metamap);

auto len_info = internal::extract_length(metamap);
if (len_info.first) {
return len_info.second;
}

if (format == "hdf5") {
auto handle = ritsuko::hdf5::open_file(path / "list_contents.h5");
auto lhandle = handle.openGroup("simple_list");
Expand All @@ -135,8 +164,7 @@ inline size_t height(const std::filesystem::path& path, const ObjectMetadata& me
uzuki2::json::Options opt;
opt.parallel = options.parallel_reads;
auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path / "list_contents.json.gz");
uzuki2::DummyExternals ext(num_external);
auto ptr = uzuki2::json::parse<uzuki2::DummyProvisioner>(gzreader, std::move(ext), std::move(opt));
auto ptr = uzuki2::json::parse<uzuki2::DummyProvisioner>(gzreader, uzuki2::DummyExternals(num_external), std::move(opt));
return reinterpret_cast<const uzuki2::List*>(ptr.get())->size();
}
}
Expand Down
62 changes: 60 additions & 2 deletions tests/src/simple_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ TEST_F(SimpleListTest, Basics) {
expect_error("should be a JSON string");
}

TEST_F(SimpleListTest, Json) {
TEST_F(SimpleListTest, JsonBasic) {
simple_list::initialize_with_metadata(dir, "1.0", "json.gz");

// Success!
Expand Down Expand Up @@ -92,7 +92,23 @@ TEST_F(SimpleListTest, Json) {
EXPECT_EQ(test_height(dir), 1);
}

TEST_F(SimpleListTest, Hdf5) {
// Checks that the optional 'length' metadata property is cross-checked
// against the JSON-formatted list contents, and that it is reported as the height.
TEST_F(SimpleListTest, JsonLength) {
    // Metadata declares two elements, but the list written below only has one.
    initialize_directory(dir);
    {
        std::ofstream meta(dir / "OBJECT");
        meta << "{ \"type\": \"simple_list\", \"simple_list\": { \"version\": \"1.0\", \"format\": \"json.gz\", \"length\": 2 } }";
        dump_json("{ \"type\": \"list\", \"values\": [ { \"type\": \"nothing\" } ] }");
    }
    expect_error("length of the list");

    // Rewriting the contents with two elements should agree with the declared length.
    dump_json("{ \"type\": \"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"nothing\" } ] }");
    test_validate(dir);
    EXPECT_EQ(test_height(dir), 2);
}

TEST_F(SimpleListTest, Hdf5Basic) {
// Success!
{
simple_list::initialize_with_metadata(dir, "1.0", "hdf5");
Expand Down Expand Up @@ -141,3 +157,45 @@ TEST_F(SimpleListTest, Hdf5) {
test_validate(dir);
EXPECT_EQ(test_height(dir), 1);
}

// Checks that the optional 'length' metadata property is cross-checked
// against HDF5-formatted list contents, is reported as the height,
// and is rejected when it is not a JSON number.
TEST_F(SimpleListTest, Hdf5Length) {
    H5::StrType vstr(0, H5T_VARIABLE);

    // Metadata declares two elements (no 'format' is given, so the documented
    // default of HDF5 applies), but the file below only holds one element.
    initialize_directory(dir);
    {
        std::ofstream meta(dir / "OBJECT");
        meta << "{ \"type\": \"simple_list\", \"simple_list\": { \"version\": \"1.0\", \"length\": 2 } }";

        H5::H5File file(dir / "list_contents.h5", H5F_ACC_TRUNC);
        auto list_group = file.createGroup("simple_list");
        auto list_attr = list_group.createAttribute("uzuki_object", vstr, H5S_SCALAR);
        list_attr.write(vstr, std::string("list"));

        auto data_group = list_group.createGroup("data");
        auto first_group = data_group.createGroup("0");
        auto first_attr = first_group.createAttribute("uzuki_object", vstr, H5S_SCALAR);
        first_attr.write(vstr, std::string("nothing"));
    }
    expect_error("length of the list");

    // Append a second element so that the contents match the declared length.
    {
        H5::H5File file(dir / "list_contents.h5", H5F_ACC_RDWR);
        auto list_group = file.openGroup("simple_list");
        auto data_group = list_group.openGroup("data");
        auto second_group = data_group.createGroup("1");
        auto second_attr = second_group.createAttribute("uzuki_object", vstr, H5S_SCALAR);
        second_attr.write(vstr, std::string("nothing"));
    }
    test_validate(dir);
    EXPECT_EQ(test_height(dir), 2);

    // A boolean 'length' is not a JSON number and should be rejected.
    {
        std::ofstream meta(dir / "OBJECT");
        meta << "{ \"type\": \"simple_list\", \"simple_list\": { \"version\": \"1.0\", \"length\": true } }";
    }
    expect_error("should be a JSON number");
}

0 comments on commit 2e33d72

Please sign in to comment.