diff --git a/R/saveBaseFactor.R b/R/saveBaseFactor.R index 64a391b..379931d 100644 --- a/R/saveBaseFactor.R +++ b/R/saveBaseFactor.R @@ -35,39 +35,47 @@ setMethod("saveObject", "factor", function(x, path, ...) { fhandle <- H5Fopen(ofile) on.exit(H5Fclose(fhandle), add=TRUE) + ghandle <- H5Gopen(fhandle, host) + on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE) (function (){ - ghandle <- H5Gopen(fhandle, host) - on.exit(H5Gclose(ghandle), add=TRUE) h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE) if (is.ordered(x)) { h5writeAttribute(1L, ghandle, "ordered", asScalar=TRUE) } })() - .simple_save_codes(fhandle, host, x) - h5write(levels(x), fhandle, paste0(host, "/levels")) + .simple_save_codes(ghandle, x) + h5write(levels(x), ghandle, "levels") write("string_factor", file=file.path(path, "OBJECT")) invisible(NULL) }) -.simple_save_codes <- function(fhandle, host, x, save.names=TRUE) { +.simple_save_codes <- function(ghandle, x, save.names=TRUE) { codes <- as.integer(x) - 1L missing.placeholder <- NULL if (anyNA(codes)) { - missing.placeholder <- -1L + missing.placeholder <- nlevels(x) codes[is.na(codes)] <- missing.placeholder } - full.data.name <- paste0(host, "/codes") - h5write(codes, fhandle, full.data.name) + shandle <- H5Screate_simple(length(x)) + on.exit(H5Sclose(shandle), add=TRUE) + dhandle <- H5Dcreate(ghandle, "codes", dtype_id="H5T_NATIVE_UINT32", h5space=shandle) + on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE) + H5Dwrite(dhandle, codes) + if (!is.null(missing.placeholder)) { - addMissingPlaceholderAttributeForHdf5(fhandle, full.data.name, missing.placeholder) + ashandle <- H5Screate("H5S_SCALAR") + on.exit(H5Sclose(ashandle), add=TRUE, after=FALSE) + ahandle <- H5Acreate(dhandle, "missing-value-placeholder", dtype_id="H5T_NATIVE_UINT32", h5space=ashandle) + on.exit(H5Aclose(ahandle), add=TRUE, after=FALSE) + H5Awrite(ahandle, missing.placeholder) } if (save.names && !is.null(names(x))) { - h5write(names(x), fhandle, paste0(host, "/names")) 
+ h5write(names(x), ghandle, "names") } } diff --git a/R/saveDataFrame.R b/R/saveDataFrame.R index 0440b90..e0b5c6c 100644 --- a/R/saveDataFrame.R +++ b/R/saveDataFrame.R @@ -91,11 +91,10 @@ setMethod("saveObject", "DataFrame", function(x, path, ...) { if (is.ordered(col)) { h5writeAttribute(1L, ghandle, "ordered", asScalar=TRUE) } + .simple_save_codes(ghandle, col, save.names=FALSE) + h5write(levels(col), ghandle, "levels"); })() - .simple_save_codes(fhandle, full.data.name, col, save.names=FALSE) - h5write(levels(col), fhandle, paste0(full.data.name, "/levels")); - } else if (.is_datetime(col)) { coltype <- "string" colformat <- "date-time" diff --git a/R/saveDataFrameFactor.R b/R/saveDataFrameFactor.R index 8cd6622..a79479b 100644 --- a/R/saveDataFrameFactor.R +++ b/R/saveDataFrameFactor.R @@ -32,13 +32,11 @@ setMethod("saveObject", "DataFrameFactor", function(x, path, ...) { fhandle <- H5Fopen(ofile) on.exit(H5Fclose(fhandle), add=TRUE) - (function (){ - ghandle <- H5Gopen(fhandle, host) - on.exit(H5Gclose(ghandle), add=TRUE) - h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE) - })() + ghandle <- H5Gopen(fhandle, host) + on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE) + h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE) - .simple_save_codes(fhandle, host, x) + .simple_save_codes(ghandle, x) stuff <- levels(x) altSaveObject(stuff, paste0(path, "/levels"), ...) 
diff --git a/inst/include/fetch.sh b/inst/include/fetch.sh index 57e43e2..3e105d1 100755 --- a/inst/include/fetch.sh +++ b/inst/include/fetch.sh @@ -31,7 +31,7 @@ harvester() { harvester millijson https://github.com/ArtifactDB/millijson v1.0.0 harvester byteme https://github.com/LTLA/byteme v1.1.0 -harvester uzuki2 https://github.com/ArtifactDB/uzuki2 v1.3.0 harvester comservatory https://github.com/ArtifactDB/comservatory v2.0.1 -harvester ritsuko https://github.com/ArtifactDB/ritsuko v0.3.3 +harvester uzuki2 https://github.com/ArtifactDB/uzuki2 master +harvester ritsuko https://github.com/ArtifactDB/ritsuko master harvester takane https://github.com/ArtifactDB/takane master diff --git a/inst/include/ritsuko/choose_missing_placeholder.hpp b/inst/include/ritsuko/choose_missing_placeholder.hpp index 599857a..01d8014 100644 --- a/inst/include/ritsuko/choose_missing_placeholder.hpp +++ b/inst/include/ritsuko/choose_missing_placeholder.hpp @@ -15,20 +15,58 @@ namespace ritsuko { /** - * Choose an appropriate placeholder for missing values in an integer dataset. + * @cond + */ +template +bool found(Iterator start, Iterator end, Mask mask, Type candidate) { + if constexpr(std::is_same::value) { + return (std::find(start, end, candidate) != end); + } else { + for (; start != end; ++start, ++mask) { + if (!*mask && candidate == *start) { + return true; + } + } + return false; + } +} + +template()))>::type>::type> +std::set create_unique_set(Iterator start, Iterator end, Mask mask) { + if constexpr(std::is_same::value) { + return std::set(start, end); + } else { + std::set output; + for (; start != end; ++start, ++mask) { + if (!*mask) { + output.insert(*start); + } + } + return output; + } +} +/** + * @endcond + */ + +/** + * Choose an appropriate placeholder for missing values in an integer dataset, after ignoring all the masked values. 
* This will try the various special values (the minimum, the maximum, and for signed types, 0) * before sorting the dataset and searching for an unused integer value. * * @tparam Iterator_ Forward iterator for integer values. + * @tparam Mask_ Random access iterator for mask values. * @tparam Type_ Integer type pointed to by `Iterator_`. * * @param start Start of the dataset. * @param end End of the dataset. + * @param mask Start of the mask vector. + * This should have the same length as `end - start`; each entry is true if the corresponding value of the integer dataset is masked, and false otherwise. * * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. */ -template()))>::type>::type> -std::pair choose_missing_integer_placeholder(Iterator start, Iterator end) { +template()))>::type>::type> +std::pair choose_missing_integer_placeholder(Iterator start, Iterator end, Mask mask) { static_assert(std::numeric_limits::is_integer); // Trying important points first; minima and maxima, and 0. @@ -42,7 +80,7 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat } else { candidate = 0; } - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } @@ -55,14 +93,14 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat } else { candidate = 0; } - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } } // Well... going through it in order. 
- std::set uniq_sort(start, end); + auto uniq_sort = create_unique_set(start, end, mask); Type_ last = std::numeric_limits::min(); for (auto x : uniq_sort) { if (last + 1 < x) { @@ -75,7 +113,23 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat } /** - * Choose an appropriate placeholder for missing values in a floating-point dataset. + * Overload of `choose_missing_integer_placeholder()` where no values are masked. + * + * @tparam Iterator_ Forward iterator for integer values. + * @tparam Type_ Integer type pointed to by `Iterator_`. + * + * @param start Start of the dataset. + * @param end End of the dataset. + * + * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. + */ +template()))>::type>::type> +std::pair choose_missing_integer_placeholder(Iterator start, Iterator end) { + return choose_missing_integer_placeholder(start, end, false); +} + +/** + * Choose an appropriate placeholder for missing values in a floating-point dataset, after ignoring all masked values. * This will try the various IEEE special values (NaN, Inf, -Inf) and then some type-specific boundaries (the minimum, the maximum, and for signed types, 0) * before sorting the dataset and searching for an unused float. * @@ -84,22 +138,35 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat * * @param start Start of the dataset. * @param end End of the dataset. + * @param mask Start of the mask vector. * @param skip_nan Whether to skip NaN as a potential placeholder. * Useful in frameworks like R that need special consideration of NaN payloads. * * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. 
*/ -template()))>::type>::type> -std::pair choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) { +template()))>::type>::type> +std::pair choose_missing_float_placeholder(Iterator start, Iterator end, Mask mask, bool skip_nan) { if constexpr(std::numeric_limits::is_iec559) { if (!skip_nan) { bool has_nan = false; - for (auto x = start; x != end; ++x) { - if (std::isnan(*x)) { - has_nan = true; - break; + + if constexpr(std::is_same::value) { + for (auto x = start; x != end; ++x) { + if (std::isnan(*x)) { + has_nan = true; + break; + } + } + } else { + auto sIt = mask; + for (auto x = start; x != end; ++x, ++sIt) { + if (!*sIt && std::isnan(*x)) { + has_nan = true; + break; + } } } + if (!has_nan) { return std::make_pair(true, std::numeric_limits::quiet_NaN()); } @@ -107,7 +174,7 @@ std::pair choose_missing_float_placeholder(Iterator start, Iterator for (int i = 0; i < 2; ++i) { Type_ candidate = std::numeric_limits::infinity() * (i == 0 ? 1 : -1); - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } @@ -123,13 +190,13 @@ std::pair choose_missing_float_placeholder(Iterator start, Iterator } else { candidate = 0; } - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } // Well... going through it in order. - std::set uniq_sort(start, end); + auto uniq_sort = create_unique_set(start, end, mask); Type_ last = std::numeric_limits::lowest(); for (auto x : uniq_sort) { if (std::isfinite(x)) { @@ -144,6 +211,23 @@ std::pair choose_missing_float_placeholder(Iterator start, Iterator return std::make_pair(false, 0); } +/** + * Overload of `choose_missing_float_placeholder()` where no values are masked. + * + * @tparam Iterator_ Forward iterator for floating-point values. + * @tparam Type_ Integer type pointed to by `Iterator_`. + * + * @param start Start of the dataset. 
+ * @param end End of the dataset. + * @param skip_nan Whether to skip NaN as a potential placeholder. + * + * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. + */ +template()))>::type>::type> +std::pair choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) { + return choose_missing_float_placeholder(start, end, false, skip_nan); +} + } #endif diff --git a/inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp b/inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp new file mode 100644 index 0000000..bbe866f --- /dev/null +++ b/inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp @@ -0,0 +1,134 @@ +#ifndef RITSUKO_HDF5_STREAM_1D_NUMERIC_DATASET_HPP +#define RITSUKO_HDF5_STREAM_1D_NUMERIC_DATASET_HPP + +#include "H5Cpp.h" + +#include +#include + +#include "pick_1d_block_size.hpp" +#include "get_1d_length.hpp" +#include "get_name.hpp" +#include "as_numeric_datatype.hpp" + +/** + * @file Stream1dNumericDataset.hpp + * @brief Stream a numeric 1D HDF5 dataset into memory. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @brief Stream a numeric 1D HDF5 dataset into memory. + * @tparam Type_ Type to represent the data in memory. + * + * This streams in a HDF5 dataset in contiguous blocks, using block sizes defined by `pick_1d_block_size()`. + * Callers can then extract one value at a time or they can acquire the entire block. + */ +template +class Stream1dNumericDataset { +public: + /** + * @param ptr Pointer to a HDF5 dataset handle. + * @param size Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding streamed blocks of values. + * Larger buffers improve speed at the cost of some memory efficiency. 
+ */ + Stream1dNumericDataset(const H5::DataSet* ptr, hsize_t length, hsize_t buffer_size) : + ptr(ptr), + full_length(length), + block_size(pick_1d_block_size(ptr->getCreatePlist(), full_length, buffer_size)), + mspace(1, &block_size), + dspace(1, &full_length), + buffer(block_size) + {} + + /** + * Overloaded constructor where the length is automatically determined. + * + * @param ptr Pointer to a HDF5 dataset handle. + * @param buffer_size Size of the buffer for holding streamed blocks of values. + */ + Stream1dNumericDataset(const H5::DataSet* ptr, hsize_t buffer_size) : + Stream1dNumericDataset(ptr, get_1d_length(ptr->getSpace(), false), buffer_size) + {} + +public: + /** + * @return Value at the current position of the stream. + */ + Type_ get() { + while (consumed >= available) { + consumed -= available; + load(); + } + return buffer[consumed]; + } + + /** + * @return Pair containing a pointer to and the length of an array. + * The array holds all loaded values of the stream at its current position, up to the specified length. + * Note that the pointer is only valid until the next invocation of `next()`. + */ + std::pair get_many() { + while (consumed >= available) { + consumed -= available; + load(); + } + return std::make_pair(buffer.data() + consumed, available - consumed); + } + + /** + * Advance the position of the stream by `jump`. + * + * @param jump Number of positions by which to advance the stream. + */ + void next(size_t jump = 1) { + consumed += jump; + } + + /** + * @return Length of the dataset. + */ + hsize_t length() const { + return full_length; + } + + /** + * @return Current position on the stream. 
+ */ + hsize_t position() const { + return consumed + last_loaded; + } + +private: + const H5::DataSet* ptr; + hsize_t full_length, block_size; + H5::DataSpace mspace; + H5::DataSpace dspace; + std::vector buffer; + + hsize_t last_loaded = 0; + hsize_t consumed = 0; + hsize_t available = 0; + + void load() { + if (last_loaded >= full_length) { + throw std::runtime_error("requesting data beyond the end of the dataset at '" + get_name(*ptr) + "'"); + } + available = std::min(full_length - last_loaded, block_size); + constexpr hsize_t zero = 0; + mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero); + dspace.selectHyperslab(H5S_SELECT_SET, &available, &last_loaded); + ptr->read(buffer.data(), as_numeric_datatype(), mspace, dspace); + last_loaded += available; + } +}; + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp b/inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp new file mode 100644 index 0000000..69c7515 --- /dev/null +++ b/inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp @@ -0,0 +1,171 @@ +#ifndef RITSUKO_HDF5_STREAM_1D_STRING_DATASET_HPP +#define RITSUKO_HDF5_STREAM_1D_STRING_DATASET_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +#include "pick_1d_block_size.hpp" +#include "get_1d_length.hpp" +#include "get_name.hpp" +#include "as_numeric_datatype.hpp" +#include "_strings.hpp" + +/** + * @file Stream1dStringDataset.hpp + * @brief Stream a numeric 1D HDF5 dataset into memory. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @brief Stream a 1D HDF5 string dataset into memory. + * + * This streams in a HDF5 dataset in contiguous blocks, using block sizes defined by `pick_1d_block_size()`. + * Callers can then extract one C-style string at a time. + */ +class Stream1dStringDataset { +public: + /** + * @param ptr Pointer to a HDF5 dataset handle. + * @param length Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding streamed blocks of values. 
+ * Larger buffers improve speed at the cost of some memory efficiency. + */ + Stream1dStringDataset(const H5::DataSet* ptr, hsize_t length, hsize_t buffer_size) : + ptr(ptr), + full_length(length), + block_size(pick_1d_block_size(ptr->getCreatePlist(), full_length, buffer_size)), + mspace(1, &block_size), + dspace(1, &full_length), + dtype(ptr->getDataType()), + is_variable(dtype.isVariableStr()) + { + if (is_variable) { + var_buffer.resize(block_size); + } else { + fixed_length = dtype.getSize(); + fix_buffer.resize(fixed_length * block_size); + } + final_buffer.resize(block_size); + } + + /** + * Overloaded constructor where the length is automatically determined. + * + * @param ptr Pointer to a HDF5 dataset handle. + * @param buffer_size Size of the buffer for holding streamed blocks of values. + */ + Stream1dStringDataset(const H5::DataSet* ptr, hsize_t buffer_size) : + Stream1dStringDataset(ptr, get_1d_length(ptr->getSpace(), false), buffer_size) + {} + +public: + /** + * @return String at the current position of the stream. + */ + std::string get() { + while (consumed >= available) { + consumed -= available; + load(); + } + return final_buffer[consumed]; + } + + /** + * @return String at the current position of the stream. + * Unlike `get()`, this avoids a copy by directly acquiring the string, + * but it invalidates all subsequent `get()` and `steal()` requests until `next()` is called. + */ + std::string steal() { + while (consumed >= available) { + consumed -= available; + load(); + } + return std::move(final_buffer[consumed]); + } + + /** + * Advance to the next position of the stream. + * + * @param jump Number of positions by which to advance the stream. + */ + void next(size_t jump = 1) { + consumed += jump; + } + + /** + * @return Length of the dataset. + */ + hsize_t length() const { + return full_length; + } + + /** + * @return Current position on the stream. 
+ */ + hsize_t position() const { + return consumed + last_loaded; + } + +private: + const H5::DataSet* ptr; + hsize_t full_length, block_size; + H5::DataSpace mspace; + H5::DataSpace dspace; + + H5::DataType dtype; + bool is_variable; + std::vector var_buffer; + size_t fixed_length = 0; + std::vector fix_buffer; + std::vector final_buffer; + + hsize_t last_loaded = 0; + hsize_t consumed = 0; + hsize_t available = 0; + + void load() { + if (last_loaded >= full_length) { + throw std::runtime_error("requesting data beyond the end of the dataset at '" + get_name(*ptr) + "'"); + } + available = std::min(full_length - last_loaded, block_size); + constexpr hsize_t zero = 0; + mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero); + dspace.selectHyperslab(H5S_SELECT_SET, &available, &last_loaded); + + if (is_variable) { + ptr->read(var_buffer.data(), dtype, mspace, dspace); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), var_buffer.data()); + for (hsize_t i = 0; i < block_size; ++i) { + if (var_buffer[i] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(*ptr) + "'"); + } + auto& curstr = final_buffer[i]; + curstr.clear(); + curstr.insert(0, var_buffer[i]); + } + + } else { + auto bptr = fix_buffer.data(); + ptr->read(bptr, dtype, mspace, dspace); + for (size_t i = 0; i < available; ++i, bptr += fixed_length) { + auto& curstr = final_buffer[i]; + curstr.clear(); + curstr.insert(curstr.end(), bptr, bptr + find_string_length(bptr, fixed_length)); + } + } + + last_loaded += available; + } +}; + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/_strings.hpp b/inst/include/ritsuko/hdf5/_strings.hpp new file mode 100644 index 0000000..fe4e07f --- /dev/null +++ b/inst/include/ritsuko/hdf5/_strings.hpp @@ -0,0 +1,29 @@ +#ifndef RITSUKO_HDF5_STRINGS_HPP +#define RITSUKO_HDF5_STRINGS_HPP + +#include "H5Cpp.h" + +namespace ritsuko { + +namespace hdf5 { + +inline size_t 
find_string_length(const char* ptr, size_t max) { + size_t j = 0; + for (; j < max && ptr[j] != '\0'; ++j) {} + return j; +} + +struct VariableStringCleaner { + VariableStringCleaner(hid_t did, hid_t mid, char** buffer) : did(did), mid(mid), buffer(buffer) {} + ~VariableStringCleaner() { + H5Dvlen_reclaim(did, mid, H5P_DEFAULT, buffer); + } + hid_t did, mid; + char** buffer; +}; + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/as_numeric_datatype.hpp b/inst/include/ritsuko/hdf5/as_numeric_datatype.hpp new file mode 100644 index 0000000..ad02af3 --- /dev/null +++ b/inst/include/ritsuko/hdf5/as_numeric_datatype.hpp @@ -0,0 +1,55 @@ +#ifndef RITSUKO_AS_NUMERIC_DATATYPE_HPP +#define RITSUKO_AS_NUMERIC_DATATYPE_HPP + +#include +#include +#include "H5Cpp.h" + +/** + * @file as_numeric_datatype.hpp + * @brief Choose a HDF5 datatype. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Choose the HDF5 datatype object corresponding to a particular C++ numeric type. + * Currently, only fixed-width integer types (e.g., `uint16_t`, `int32_t`) and the usual floating-point types are supported. + * + * @tparam Type_ A numeric C++ type of fixed width. + * This can be any of the fixed-width integers or a floating-point number of known precision. + * @returns A HDF5 datatype object. 
+ */ +template +H5::PredType as_numeric_datatype() { + if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT8; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT8; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT16; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT16; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT32; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT32; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT64; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT64; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_FLOAT; + } else { + static_assert(std::is_same::value, "specified type is not yet supported"); + return H5::PredType::NATIVE_DOUBLE; + } +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/get_1d_length.hpp b/inst/include/ritsuko/hdf5/get_1d_length.hpp index a3ac9d5..3345b8c 100644 --- a/inst/include/ritsuko/hdf5/get_1d_length.hpp +++ b/inst/include/ritsuko/hdf5/get_1d_length.hpp @@ -39,6 +39,56 @@ inline hsize_t get_1d_length(const H5::DataSpace& space, bool allow_scalar) { return dims; } +/** + * Overload of `get_1d_length()` that accepts a dataset handle. + * + * @param handle Handle to a HDF5 dataset. + * @param allow_scalar Whether to allow scalars. + * + * @return Length of the dataset, i.e., the extent of its single dimension. + */ +inline hsize_t get_1d_length(const H5::DataSet& handle, bool allow_scalar) { + return get_1d_length(handle.getSpace(), allow_scalar); +} + +/** + * Overload of `get_1d_length()` that accepts an attribute handle. + * + * @param handle Handle to a HDF5 attribute. + * @param allow_scalar Whether to allow scalars. + * + * @return Length of the attribute, i.e., the extent of its single dimension. 
+ */ +inline hsize_t get_1d_length(const H5::Attribute& handle, bool allow_scalar) { + return get_1d_length(handle.getSpace(), allow_scalar); +} + +/** + * @param space The data space of the dataset. + * @return Whether `space` represents a scalar dataset. + */ +inline bool is_scalar(const H5::DataSpace& space) { + return space.getSimpleExtentNdims() == 0; +} + +/** + * Overload of `is_scalar()` that accepts a dataset handle. + * @param handle Handle to a HDF5 dataset. + * @return Whether `space` represents a scalar dataset. + */ +inline bool is_scalar(const H5::DataSet& handle) { + return is_scalar(handle.getSpace()); +} + +/** + * Overload of `is_scalar()` that accepts an attribute handle. + * @param handle Handle to a HDF5 attribute. + * @return Whether `space` represents a scalar dataset. + */ +inline bool is_scalar(const H5::Attribute& handle) { + return is_scalar(handle.getSpace()); +} + } } diff --git a/inst/include/ritsuko/hdf5/get_dataset.hpp b/inst/include/ritsuko/hdf5/get_dataset.hpp deleted file mode 100644 index dd64dd5..0000000 --- a/inst/include/ritsuko/hdf5/get_dataset.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef RITSUKO_HDF5_GET_DATASET_HPP -#define RITSUKO_HDF5_GET_DATASET_HPP - -#include "H5Cpp.h" -#include - -/** - * @file get_dataset.hpp - * @brief Quick functions to get a dataset handle. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * @param handle Group containing the dataset. - * @param name Name of the dataset inside the group. - * @return Handle to the dataset. - * An error is raised if `name` does not refer to a dataset. - */ -inline H5::DataSet get_dataset(const H5::Group& handle, const char* name) { - if (!handle.exists(name) || handle.childObjType(name) != H5O_TYPE_DATASET) { - throw std::runtime_error("expected a dataset at '" + std::string(name) + "'"); - } - return handle.openDataSet(name); -} - -/** - * @param handle Group containing the scalar dataset. - * @param name Name of the dataset inside the group. 
- * @return Handle to a scalar dataset. - * An error is raised if `name` does not refer to a scalar dataset. - */ -inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const char* name) { - auto dhandle = get_dataset(handle, name); - auto dspace = dhandle.getSpace(); - int ndims = dspace.getSimpleExtentNdims(); - if (ndims != 0) { - throw std::runtime_error("expected a scalar dataset at '" + std::string(name) + "'"); - } - return dhandle; -} - -} - -} - -#endif - - diff --git a/inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp b/inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp deleted file mode 100644 index 029051a..0000000 --- a/inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef RITSUKO_HDF5_GET_MISSING_PLACEHOLDER_ATTRIBUTE_HPP -#define RITSUKO_HDF5_GET_MISSING_PLACEHOLDER_ATTRIBUTE_HPP - -#include "H5Cpp.h" -#include - -/** - * @file get_missing_placeholder_attribute.hpp - * @brief Get the missing placeholder attribute. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * @param handle Dataset handle. - * @param attr_name Name of the attribute containing the missing value placeholder. - * @param type_class_only Whether to only require identical type classes for the placeholder. - * By default, we require identity in the types themselves. - * - * - * @return Handle to the attribute. - * An error is raised if the attribute is not a scalar or has a different type (or type class, if `type_class_only_ = true`) to the dataset. 
- */ -inline H5::Attribute get_missing_placeholder_attribute(const H5::DataSet& handle, const char* attr_name, bool type_class_only = false) { - auto attr = handle.openAttribute(attr_name); - if (attr.getSpace().getSimpleExtentNdims() != 0) { - throw std::runtime_error("expected the '" + std::string(attr_name) + "' attribute to be a scalar"); - } - - if (type_class_only) { - if (attr.getTypeClass() != handle.getTypeClass()) { - throw std::runtime_error("expected the '" + std::string(attr_name) + "' attribute to have the same type class as its dataset"); - } - } else { - if (attr.getDataType() != handle.getDataType()) { - throw std::runtime_error("expected the '" + std::string(attr_name) + "' attribute to have the same type as its dataset"); - } - } - - return attr; -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/get_name.hpp b/inst/include/ritsuko/hdf5/get_name.hpp index 8dd5ab1..f907117 100644 --- a/inst/include/ritsuko/hdf5/get_name.hpp +++ b/inst/include/ritsuko/hdf5/get_name.hpp @@ -16,16 +16,22 @@ namespace hdf5 { /** * Get the name of a HDF5 object from its handle, usually for printing informative error messages. - * @tparam Handle_ Type of HDF5 handle, usually a `Group` or a `DataSet`. + * @tparam Handle_ Type of HDF5 handle, usually a `Group`, `DataSet` or `Attribute`. * @param handle Handle to a HDF5 object. * @return Name of the HDF5 object inside the file. 
*/ template std::string get_name(const Handle_& handle) { - size_t len = H5Iget_name(handle.getId(), NULL, 0); - std::vector buffer(len); - H5Iget_name(handle.getId(), buffer.data(), len+1); - return std::string(buffer.begin(), buffer.end()); + if constexpr(std::is_same::value) { + std::string name; + handle.getName(name); + return name; + } else { + size_t len = H5Iget_name(handle.getId(), NULL, 0); + std::vector buffer(len + 1); + H5Iget_name(handle.getId(), buffer.data(), buffer.size()); + return std::string(buffer.begin(), buffer.begin() + len); + } } } diff --git a/inst/include/ritsuko/hdf5/get_scalar_attribute.hpp b/inst/include/ritsuko/hdf5/get_scalar_attribute.hpp deleted file mode 100644 index d5f4877..0000000 --- a/inst/include/ritsuko/hdf5/get_scalar_attribute.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef RITSUKO_HDF5_GET_ATTRIBUTE_HPP -#define RITSUKO_HDF5_GET_ATTRIBUTE_HPP - -#include "H5Cpp.h" -#include - -/** - * @file get_scalar_attribute.hpp - * @brief Helper to get a scalar attribute handle. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * Convenient wrapper to get a scalar attribute with all of the usual error checks. - * - * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. - * @param handle HDF5 dataset or group handle. - * @param name Name of the attribute. - * - * @return Attribute handle. 
- */ -template -H5::Attribute get_scalar_attribute(const Object_& handle, const char* name) { - if (!handle.attrExists(name)) { - throw std::runtime_error("expected an attribute at '" + std::string(name) + "'"); - } - - auto attr = handle.openAttribute(name); - if (attr.getSpace().getSimpleExtentNdims() != 0) { - throw std::runtime_error("expected a scalar attribute at '" + std::string(name) + "'"); - } - - return attr; -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/hdf5.hpp b/inst/include/ritsuko/hdf5/hdf5.hpp index 778b370..7b09b7c 100644 --- a/inst/include/ritsuko/hdf5/hdf5.hpp +++ b/inst/include/ritsuko/hdf5/hdf5.hpp @@ -1,15 +1,19 @@ #ifndef RITSUKO_HDF5_HPP #define RITSUKO_HDF5_HPP +#include "Stream1dNumericDataset.hpp" +#include "Stream1dStringDataset.hpp" +#include "as_numeric_datatype.hpp" #include "exceeds_limit.hpp" #include "get_1d_length.hpp" -#include "iterate_1d_blocks.hpp" -#include "load_1d_string_dataset.hpp" -#include "load_scalar_string_attribute.hpp" -#include "get_missing_placeholder_attribute.hpp" -#include "get_dataset.hpp" -#include "get_scalar_attribute.hpp" #include "get_name.hpp" +#include "load_attribute.hpp" +#include "load_dataset.hpp" +#include "missing_placeholder.hpp" +#include "miscellaneous.hpp" +#include "open.hpp" +#include "pick_1d_block_size.hpp" +#include "validate_string.hpp" /** * @file hdf5.hpp diff --git a/inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp b/inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp deleted file mode 100644 index 2680a09..0000000 --- a/inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef RITSUKO_HDF5_ITERATE_1D_BLOCKS_HPP -#define RITSUKO_HDF5_ITERATE_1D_BLOCKS_HPP - -#include "H5Cpp.h" -#include - -/** - * @file iterate_1d_blocks.hpp - * @brief Blockwise iteration through a 1-dimensional HDF5 dataset. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * Iterate through a 1-dimensional dataset via contiguous blocks. 
- * - * @param full_length Length of the dataset, usually obtained from `get_1d_length()`. - * @param block_size Size of the blocks, usually calculated by `pick_1d_block_size()`. - * @param fun Function that accepts `(hsize_t start, hsize_t len, H5::DataSpace& memspace, H5::DataSpace& dataspace)` and is called on each block. - * In each call, the block contains elements from `[start, start + len)`. - * `dataspace` is configured to extract that block from the dataset, while `memspace` is configured to deposit the block contents in a buffer from `[0, len)`. - * It can be assumed that consecutive calls to `fun` will operate on consecutive contiguous blocks. - */ -template -void iterate_1d_blocks(hsize_t full_length, hsize_t block_size, Function_ fun) { - H5::DataSpace mspace(1, &block_size); - H5::DataSpace dspace(1, &full_length); - hsize_t start = 0; - - for (hsize_t counter = 0; counter < full_length; counter += block_size) { - hsize_t limit = std::min(full_length - counter, block_size); - mspace.selectHyperslab(H5S_SELECT_SET, &limit, &start); - dspace.selectHyperslab(H5S_SELECT_SET, &limit, &counter); - fun(counter, limit, mspace, dspace); - } -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp b/inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp deleted file mode 100644 index 2be553b..0000000 --- a/inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef RITSUKO_HDF5_LOAD_1D_STRING_DATASET_HPP -#define RITSUKO_HDF5_LOAD_1D_STRING_DATASET_HPP - -#include "H5Cpp.h" -#include -#include - -#include "pick_1d_block_size.hpp" -#include "iterate_1d_blocks.hpp" - -/** - * @file load_1d_string_dataset.hpp - * @brief Load and iterate over a 1-dimensional HDF5 string dataset. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * Iterate across a string dataset, extracting each string and running a user-specified function. 
- * This works for both variable- and fixed-length strings, and performs iteration via `iterate_1d_blocks()` to avoid loading everything into memory at once. - * - * @tparam Function_ Function class that accepts `(hsize_t i, const char* start, size_t len)` - * where `i` is the index of the string from `[start, start + len)`. - * - * @param handle Handle to a string dataset. - * @param full_length Length of the dataset in `handle`, usually obtained by `get_1d_length()`. - * @param buffer_size Buffer size to use for iteration in `iterate_1d_blocks()`. - * @param fun Function to be called on each string. - * It can be assumed that the consecutive calls to `fun` will operate on consecutive `i`. - */ -template -void load_1d_string_dataset(const H5::DataSet& handle, hsize_t full_length, hsize_t buffer_size, Function_ fun) { - auto block_size = pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size); - auto dtype = handle.getDataType(); - - if (dtype.isVariableStr()) { - std::vector buffer(block_size); - iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t start, hsize_t len, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), dtype, mspace, dspace); - for (hsize_t i = 0; i < len; ++i) { - fun(start + i, buffer[i], std::strlen(buffer[i])); - } - H5Dvlen_reclaim(dtype.getId(), mspace.getId(), H5P_DEFAULT, buffer.data()); - } - ); - - } else { - size_t len = dtype.getSize(); - std::vector buffer(len * block_size); - iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t start, hsize_t length, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), dtype, mspace, dspace); - auto ptr = buffer.data(); - for (size_t i = 0; i < length; ++i, ptr += len) { - size_t j = 0; - for (; j < len && ptr[j] != '\0'; ++j) {} - fun(start + i, ptr, j); - } - } - ); - } -} - -/** - * Iterate across a string attribute, extracting each string and running a user-specified function. 
- * This works for both variable- and fixed-length strings. - * - * @tparam Function_ Function class that accepts `(hsize_t i, const char* start, size_t len)` - * where `i` is the index of the string from `[start, start + len)`. - * - * @param handle Handle to a string attribute. - * @param full_length Length of the attribute in `handle`, usually obtained by `get_1d_length()`. - * @param fun Function to be called on each string. - * It can be assumed that the consecutive calls to `fun` will operate on consecutive `i`. - */ -template -void load_1d_string_attribute(const H5::Attribute& handle, hsize_t full_length, Function_ fun) { - auto dtype = handle.getDataType(); - - if (dtype.isVariableStr()) { - std::vector buffer(full_length); - handle.read(dtype, buffer.data()); - for (hsize_t i = 0; i < full_length; ++i) { - fun(i, buffer[i], std::strlen(buffer[i])); - } - auto mspace = handle.getSpace(); - H5Dvlen_reclaim(dtype.getId(), mspace.getId(), H5P_DEFAULT, buffer.data()); - - } else { - size_t len = dtype.getSize(); - std::vector buffer(len * full_length); - handle.read(dtype, buffer.data()); - auto ptr = buffer.data(); - for (size_t i = 0; i < full_length; ++i, ptr += len) { - size_t j = 0; - for (; j < len && ptr[j] != '\0'; ++j) {} - fun(i, ptr, j); - } - } -} - -} - -} - -#endif - diff --git a/inst/include/ritsuko/hdf5/load_attribute.hpp b/inst/include/ritsuko/hdf5/load_attribute.hpp new file mode 100644 index 0000000..e2f4a5e --- /dev/null +++ b/inst/include/ritsuko/hdf5/load_attribute.hpp @@ -0,0 +1,144 @@ +#ifndef RITSUKO_HDF5_LOAD_ATTRIBUTE_HPP +#define RITSUKO_HDF5_LOAD_ATTRIBUTE_HPP + +#include "H5Cpp.h" + +#include +#include + +#include "get_1d_length.hpp" +#include "as_numeric_datatype.hpp" +#include "_strings.hpp" + +/** + * @file load_scalar_string_attribute.hpp + * @brief Load a scalar string HDF5 attribute. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @param attr Handle to a scalar string attribute. 
+ * Callers are responsible for checking that `attr` contains a string datatype class. + * @return The attribute as a string. + */ +inline std::string load_scalar_string_attribute(const H5::Attribute& attr) { + auto dtype = attr.getDataType(); + + // Unfortunately, we can't just do 'std::string output; attr.read(dtype, output);', + // as we need to catch NULL pointers in the variable case. + + if (dtype.isVariableStr()) { + auto mspace = attr.getSpace(); + char* buffer; + attr.read(dtype, &buffer); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), &buffer); + if (buffer == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string attribute"); + } + return std::string(buffer); + + } else { + size_t len = dtype.getSize(); + std::vector buffer(len); + attr.read(dtype, buffer.data()); + auto ptr = buffer.data(); + return std::string(ptr, ptr + find_string_length(ptr, len)); + } +} + +/** + * @tparam check_ Whether to check that `attr` is a 1-dimensional string attribute. + * @param attr Handle to a 1-dimensional string attribute. + * Callers are responsible for checking that `attr` contains a string datatype class. + * @param full_length Length of the attribute in `attr`, usually obtained by `get_1d_length()`. + * @return Vector of strings. 
+ */ +inline std::vector load_1d_string_attribute(const H5::Attribute& attr, hsize_t full_length) { + auto dtype = attr.getDataType(); + auto mspace = attr.getSpace(); + std::vector output; + output.reserve(full_length); + + if (dtype.isVariableStr()) { + std::vector buffer(full_length); + attr.read(dtype, buffer.data()); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), buffer.data()); + for (hsize_t i = 0; i < full_length; ++i) { + if (buffer[i] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string attribute"); + } + output.emplace_back(buffer[i]); + } + + } else { + size_t len = dtype.getSize(); + std::vector buffer(len * full_length); + attr.read(dtype, buffer.data()); + auto ptr = buffer.data(); + for (size_t i = 0; i < full_length; ++i, ptr += len) { + output.emplace_back(ptr, ptr + find_string_length(ptr, len)); + } + } + + return output; +} + +/** + * Overload of `load_1d_string_attribute()` that determines the length of the attribute via `get_1d_length()`. + * @param attr Handle to a 1-dimensional string attribute. + * Callers are responsible for checking that `attr` contains a string datatype class. + * @return Vector of strings. + */ +inline std::vector load_1d_string_attribute(const H5::Attribute& attr) { + return load_1d_string_attribute(attr, get_1d_length(attr.getSpace(), false)); +} + +/** + * @tparam Type_ Type for holding the data in memory, see `as_numeric_datatype()` for supported types. + * @param attr Handle to a scalar numeric attribute. + * Callers are responsible for checking that the datatype of `attr` is appropriate for `Type_`, e.g., with `exceeds_integer_limit()`. + * @return The value of the attribute. 
+ */ +template +Type_ load_scalar_numeric_attribute(const H5::Attribute& attr) { + Type_ val; + auto mtype = as_numeric_datatype(); + attr.read(mtype, &val); + return val; +} + +/** + * @tparam Type_ Type for holding the data in memory, see `as_numeric_datatype()` for supported types. + * @param attr Handle to a numeric attribute. + * Callers are responsible for checking that the datatype of `attr` is appropriate for `Type_`, e.g., with `exceeds_integer_limit()`. + * @param full_length Length of the attribute in `attr`, usually obtained by `get_1d_length()`. + * @return Vector containing the contents of the attribute. + */ +template +std::vector load_1d_numeric_attribute(const H5::Attribute& attr, hsize_t full_length) { + auto mtype = as_numeric_datatype(); + std::vector buffer(full_length); + attr.read(mtype, buffer.data()); + return buffer; +} + +/** + * Overload of `load_1d_numeric_attribute()` that determines the length of the attribute via `get_1d_length()`. + * @tparam Type_ Type for holding the data in memory, see `as_numeric_datatype()` for supported types. + * @param attr Handle to a numeric attribute. + * Callers are responsible for checking that the datatype of `attr` is appropriate for `Type_`, e.g., with `exceeds_integer_limit()`. + * @return Vector containing the contents of the attribute. 
+ */ +template +std::vector load_1d_numeric_attribute(const H5::Attribute& attr) { + return load_1d_numeric_attribute(attr, get_1d_length(attr.getSpace(), false)); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/load_dataset.hpp b/inst/include/ritsuko/hdf5/load_dataset.hpp new file mode 100644 index 0000000..3c0fe14 --- /dev/null +++ b/inst/include/ritsuko/hdf5/load_dataset.hpp @@ -0,0 +1,79 @@ +#ifndef RITSUKO_HDF5_LOAD_DATASET_HPP +#define RITSUKO_HDF5_LOAD_DATASET_HPP + +#include +#include +#include + +#include "H5Cpp.h" + +#include "get_name.hpp" +#include "Stream1dStringDataset.hpp" +#include "_strings.hpp" + +/** + * @file load_dataset.hpp + * @brief Helper functions to load datasets. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Load a scalar string dataset into a single string. + * @param handle Handle to the HDF5 scalar dataset. + * @return String containing the contents of the sole dataset entry. + */ +inline std::string load_scalar_string_dataset(const H5::DataSet& handle) { + auto dtype = handle.getDataType(); + if (dtype.isVariableStr()) { + char* vptr; + handle.read(&vptr, dtype); + auto dspace = handle.getSpace(); // don't set as temporary in constructor below, otherwise it gets destroyed and the ID invalidated. + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), dspace.getId(), &vptr); + if (vptr == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(handle) + "'"); + } + std::string output(vptr); + return output; + } else { + size_t fixed_length = dtype.getSize(); + std::vector buffer(fixed_length); + handle.read(buffer.data(), dtype); + return std::string(buffer.begin(), buffer.begin() + find_string_length(buffer.data(), fixed_length)); + } +} + +/** + * Load a 1-dimensional string dataset into a vector of strings. + * @param handle Handle to the HDF5 scalar dataset. + * @param full_length Length of the dataset as a 1-dimensional vector. 
+ * @param buffer_size Size of the buffer for holding loaded strings. + * @return Vector of strings. + */ +inline std::vector load_1d_string_dataset(const H5::DataSet& handle, hsize_t full_length, hsize_t buffer_size) { + Stream1dStringDataset stream(&handle, full_length, buffer_size); + std::vector output; + output.reserve(full_length); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + output.emplace_back(stream.steal()); + } + return output; +} + +/** + * Overload of `load_1d_string_dataset()` that determines the length via `get_1d_length()`. + * @param handle Handle to the HDF5 scalar dataset. + * @param buffer_size Size of the buffer for holding loaded strings. + * @return Vector of strings. + */ +inline std::vector load_1d_string_dataset(const H5::DataSet& handle, hsize_t buffer_size) { + return load_1d_string_dataset(handle, get_1d_length(handle, false), buffer_size); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp b/inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp deleted file mode 100644 index 12d4cc8..0000000 --- a/inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef RITSUKO_HDF5_LOAD_SCALAR_STRING_ATTRIBUTE_HPP -#define RITSUKO_HDF5_LOAD_SCALAR_STRING_ATTRIBUTE_HPP - -#include "H5Cpp.h" -#include - -#include "get_name.hpp" - -/** - * @file load_scalar_string_attribute.hpp - * @brief Load a scalar string HDF5 attribute. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * @param attr An ``Attribute`` handle. - * @return The attribute as a string. 
- */ -inline std::string load_scalar_string_attribute(const H5::Attribute& attr) { - if (attr.getTypeClass() != H5T_STRING || attr.getSpace().getSimpleExtentNdims() != 0) { - std::string name; - attr.getName(name); - throw std::runtime_error("expected attribute '" + name + "' to be a scalar string"); - } - std::string output; - attr.read(attr.getStrType(), output); - return output; -} - -/** - * @tparam Object_ HDF5 object class, usually a ``DataSet`` or a ``Group``. - * - * @param handle Handle to a HDF5 object that can contain attributes. - * @param field Name of the attribute. - * - * @return The attribute as a string. - */ -template -std::string load_scalar_string_attribute(const Object_& handle, const char* field) { - if (!handle.attrExists(field)) { - throw std::runtime_error("expected a '" + std::string(field) + "' attribute to be present"); - } - return load_scalar_string_attribute(handle.openAttribute(field)); -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/miscellaneous.hpp b/inst/include/ritsuko/hdf5/miscellaneous.hpp new file mode 100644 index 0000000..91b8d90 --- /dev/null +++ b/inst/include/ritsuko/hdf5/miscellaneous.hpp @@ -0,0 +1,56 @@ +#ifndef RITSUKO_MISCELLANEOUS_HPP +#define RITSUKO_MISCELLANEOUS_HPP + +#include +#include "H5Cpp.h" + +#include "open.hpp" +#include "load_attribute.hpp" + +/** + * @file miscellaneous.hpp + * @brief Miscellaneous functions for user convenience. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. + * @param handle HDF5 dataset or group handle. + * @param name Name of the attribute. + * + * @return Attribute handle. + * An error is raised if `name` does not refer to a scalar attribute. 
+ */ +template +inline H5::Attribute open_scalar_attribute(const H5Object_& handle, const char* name) { + auto attr = open_attribute(handle, name); + if (!is_scalar(attr)) { + throw std::runtime_error("expected '" + std::string(name) + "' attribute to be a scalar"); + } + return attr; +} + +/** + * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. + * @param handle HDF5 dataset or group handle. + * @param name Name of the attribute. + * + * @return A string containing the attribute value. + */ +template +std::string open_and_load_scalar_string_attribute(const H5Object_& handle, const char* name) { + auto attr = open_scalar_attribute(handle, name); + if (attr.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected '" + std::string(name) + "' attribute to be a string"); + } + return load_scalar_string_attribute(attr); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/missing_placeholder.hpp b/inst/include/ritsuko/hdf5/missing_placeholder.hpp new file mode 100644 index 0000000..824ed03 --- /dev/null +++ b/inst/include/ritsuko/hdf5/missing_placeholder.hpp @@ -0,0 +1,98 @@ +#ifndef RITSUKO_HDF5_MISSING_PLACEHOLDER_HPP +#define RITSUKO_HDF5_MISSING_PLACEHOLDER_HPP + +#include "H5Cpp.h" +#include + +#include "as_numeric_datatype.hpp" +#include "load_attribute.hpp" +#include "get_1d_length.hpp" +#include "get_name.hpp" + +/** + * @file missing_placeholder.hpp + * @brief Get the missing placeholder attribute. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Check the validity of a missing placeholder attribute on a dataset. + * An error is raised if the attribute is not a scalar or has a different type (or type class, if `type_class_only_ = true`) to the dataset. + * + * @param dset Dataset handle. + * @param attr Handle for the missing placeholder, typically as an attribute on `dset`. + * @param type_class_only Whether to only require identical type classes for the placeholder. 
+ * If 0, this is false, and the types between `dset` and `attr` must be identical. + * If 1, this is true, and `dset` and `attr` just need to have the same type class. + * If -1 (default), this is true for all string types and false for all numeric types. + */ +inline void check_missing_placeholder_attribute(const H5::DataSet& dset, const H5::Attribute& attr, int type_class_only = -1) { + if (!is_scalar(attr)) { + throw std::runtime_error("expected the '" + get_name(attr) + "' attribute to be a scalar"); + } + + if (type_class_only == -1) { + type_class_only = (dset.getTypeClass() == H5T_STRING); + } + + if (type_class_only == 1) { + if (attr.getTypeClass() != dset.getTypeClass()) { + throw std::runtime_error("expected the '" + get_name(attr) + "' attribute to have the same type class as its dataset"); + } + } else { + if (attr.getDataType() != dset.getDataType()) { + throw std::runtime_error("expected the '" + get_name(attr) + "' attribute to have the same type as its dataset"); + } + } +} + +/** + * Check if a missing numeric placeholder attribute is present, and if so, open it and loads it value. + * This will also call `check_missing_placeholder_attribute()` to validate the placeholder's properties. + * + * @tparam Type_ Type to use to store the data in memory, see `as_numeric_datatype()` for supported types. + * @param handle Dataset handle. + * @param attr_name Name of the attribute containing the missing value placeholder. + * @return Pair containing (i) a boolean indicating whether the placeholder attribute was present, and (ii) the value of the placeholder if the first element is `true`. 
+ */ +template +std::pair open_and_load_optional_numeric_missing_placeholder(const H5::DataSet& handle, const char* attr_name) { + std::pair output(false, 0); + if (!handle.attrExists(attr_name)) { + return output; + } + output.first = true; + auto ahandle = handle.openAttribute(attr_name); + check_missing_placeholder_attribute(handle, ahandle); + ahandle.read(as_numeric_datatype(), &(output.second)); + return output; +} + +/** + * Check if a missing string placeholder attribute is present, and if so, open it and loads it value. + * This will also call `check_missing_placeholder_attribute()` to validate the placeholder's properties. + * + * @param handle Dataset handle. + * @param attr_name Name of the attribute containing the missing value placeholder. + * @return Pair containing (i) a boolean indicating whether the placeholder attribute was present, and (ii) the value of the placeholder if the first element is `true`. + */ +inline std::pair open_and_load_optional_string_missing_placeholder(const H5::DataSet& handle, const char* attr_name) { + std::pair output(false, ""); + if (!handle.attrExists(attr_name)) { + return output; + } + output.first = true; + auto ahandle = handle.openAttribute(attr_name); + check_missing_placeholder_attribute(handle, ahandle); + output.second = load_scalar_string_attribute(ahandle); + return output; +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/open.hpp b/inst/include/ritsuko/hdf5/open.hpp new file mode 100644 index 0000000..58c6c7b --- /dev/null +++ b/inst/include/ritsuko/hdf5/open.hpp @@ -0,0 +1,79 @@ +#ifndef RITSUKO_HDF5_OPEN_HPP +#define RITSUKO_HDF5_OPEN_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +/** + * @file open.hpp + * @brief Convenience functions to safely open HDF5 handles. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @param path Path to a HDF5 file. + * @return Handle to the file. + * An error is raised if `path` does not exist. 
+ */ +inline H5::H5File open_file(const std::filesystem::path& path) try { + if (!std::filesystem::exists(path)) { + throw std::runtime_error("no file is present at '" + path.string() + "'"); + } + return H5::H5File(path, H5F_ACC_RDONLY); +} catch (H5::Exception& e) { + throw std::runtime_error("failed to open the HDF5 file at '" + path.string() + "'; " + e.getDetailMsg()); +} + +/** + * @param handle Parent group (or file). + * @param name Name of the group. + * @return Handle to the group. + * An error is raised if `name` does not refer to a dataset. + */ +inline H5::Group open_group(const H5::Group& handle, const char* name) { + if (!handle.exists(name) || handle.childObjType(name) != H5O_TYPE_GROUP) { + throw std::runtime_error("expected a group at '" + std::string(name) + "'"); + } + return handle.openGroup(name); +} + +/** + * @param handle Group containing the dataset. + * @param name Name of the dataset inside the group. + * @return Handle to the dataset. + * An error is raised if `name` does not refer to a dataset. + */ +inline H5::DataSet open_dataset(const H5::Group& handle, const char* name) { + if (!handle.exists(name) || handle.childObjType(name) != H5O_TYPE_DATASET) { + throw std::runtime_error("expected a dataset at '" + std::string(name) + "'"); + } + return handle.openDataSet(name); +} + +/** + * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. + * @param handle HDF5 dataset or group handle. + * @param name Name of the attribute. + * + * @return Attribute handle. + * An error is raised if `name` does not refer to an attribute. 
+ */ +template +H5::Attribute open_attribute(const Object_& handle, const char* name) { + if (!handle.attrExists(name)) { + throw std::runtime_error("expected an attribute at '" + std::string(name) + "'"); + } + return handle.openAttribute(name); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/validate_string.hpp b/inst/include/ritsuko/hdf5/validate_string.hpp new file mode 100644 index 0000000..7cf3573 --- /dev/null +++ b/inst/include/ritsuko/hdf5/validate_string.hpp @@ -0,0 +1,148 @@ +#ifndef RITSUKO_HDF5_VALIDATE_STRING_HPP +#define RITSUKO_HDF5_VALIDATE_STRING_HPP + +#include +#include +#include + +#include "H5Cpp.h" + +#include "get_name.hpp" +#include "pick_1d_block_size.hpp" +#include "_strings.hpp" + +/** + * @file validate_string.hpp + * @brief Helper functions to validate strings. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Check that a scalar string dataset is valid. + * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes. + * For fixed-width string datasets, this function is a no-op. + * + * @param handle Handle to the HDF5 string dataset. + */ +inline void validate_scalar_string_dataset(const H5::DataSet& handle) { + auto dtype = handle.getDataType(); + if (!dtype.isVariableStr()) { + return; + } + + char* vptr; + handle.read(&vptr, dtype); + auto dspace = handle.getSpace(); // don't set as temporary in constructor below, otherwise it gets destroyed and the ID invalidated. + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), dspace.getId(), &vptr); + if (vptr == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(handle) + "'"); + } +} + +/** + * Check that a 1-dimensional string dataset is valid. + * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes. + * For fixed-width string datasets, this function is a no-op. 
+ * + * @param handle Handle to the HDF5 string dataset. + * @param full_length Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding loaded strings. + */ +inline void validate_1d_string_dataset(const H5::DataSet& handle, hsize_t full_length, hsize_t buffer_size) { + auto dtype = handle.getDataType(); + if (!dtype.isVariableStr()) { + return; + } + + hsize_t block_size = pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size); + H5::DataSpace mspace(1, &block_size), dspace(1, &full_length); + std::vector buffer(block_size); + + for (hsize_t i = 0; i < full_length; i += block_size) { + auto available = std::min(full_length - i, block_size); + constexpr hsize_t zero = 0; + mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero); + dspace.selectHyperslab(H5S_SELECT_SET, &available, &i); + + handle.read(buffer.data(), dtype, mspace, dspace); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), buffer.data()); + for (hsize_t j = 0; j < available; ++j) { + if (buffer[j] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(handle) + "'"); + } + } + } +} + +/** + * Overload for `validate_1d_string_dataset()` that automatically determines its length via `get_1d_length()`. + * @param handle Handle to the HDF5 string dataset. + * @param buffer_size Size of the buffer for holding loaded strings. + */ +inline void validate_1d_string_dataset(const H5::DataSet& handle, hsize_t buffer_size) { + validate_1d_string_dataset(handle, get_1d_length(handle, false), buffer_size); +} + +/** + * Check that a scalar string attribute is valid. + * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes. + * For fixed-width string attributes, this function is a no-op. + * + * @param handle Handle to the HDF5 string attribute. 
+ */ +inline void validate_scalar_string_attribute(const H5::Attribute& attr) { + auto dtype = attr.getDataType(); + if (!dtype.isVariableStr()) { + return; + } + + auto mspace = attr.getSpace(); + char* buffer; + attr.read(dtype, &buffer); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), &buffer); + if (buffer == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string attribute"); + } +} + +/** + * Check that a 1-dimensional string attribute is valid. + * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes. + * For fixed-width string attributes, this function is a no-op. + * + * @param handle Handle to the HDF5 string attribute. + * @param full_length Length of the attribute as a 1-dimensional vector. + */ +inline void validate_1d_string_attribute(const H5::Attribute& attr, hsize_t full_length) { + auto dtype = attr.getDataType(); + if (!dtype.isVariableStr()) { + return; + } + + auto mspace = attr.getSpace(); + std::vector buffer(full_length); + attr.read(dtype, buffer.data()); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), buffer.data()); + for (hsize_t i = 0; i < full_length; ++i) { + if (buffer[i] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string attribute"); + } + } +} + +/** + * Overload for `validate_1d_string_attribute()` that automatically determines its length via `get_1d_length()`. + * @param handle Handle to the HDF5 string attribute. 
+ */ +inline void validate_1d_string_attribute(const H5::Attribute& attr) { + validate_1d_string_attribute(attr, get_1d_length(attr, false)); +} + +} + +} + +#endif diff --git a/inst/include/takane/_height.hpp b/inst/include/takane/_height.hpp index 218baec..7223214 100644 --- a/inst/include/takane/_height.hpp +++ b/inst/include/takane/_height.hpp @@ -12,6 +12,10 @@ #include "simple_list.hpp" #include "data_frame.hpp" #include "data_frame_factor.hpp" +#include "genomic_ranges.hpp" +#include "atomic_vector_list.hpp" +#include "data_frame_list.hpp" +#include "genomic_ranges_list.hpp" /** * @file _height.hpp @@ -32,6 +36,10 @@ inline auto default_registry() { registry["simple_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return simple_list::height(p, o); }; registry["data_frame"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame::height(p, o); }; registry["data_frame_factor"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame_factor::height(p, o); }; + registry["genomic_ranges"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return genomic_ranges::height(p, o); }; + registry["atomic_vector_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return atomic_vector_list::height(p, o); }; + registry["data_frame_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame_list::height(p, o); }; + registry["genomic_ranges_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return genomic_ranges_list::height(p, o); }; return registry; } diff --git a/inst/include/takane/_satisfies_interface.hpp b/inst/include/takane/_satisfies_interface.hpp new file mode 100644 index 0000000..5fb49f4 --- /dev/null +++ b/inst/include/takane/_satisfies_interface.hpp @@ -0,0 +1,53 @@ +#ifndef TAKANE_SATISFIES_INTERFACE_HPP +#define TAKANE_SATISFIES_INTERFACE_HPP + +#include +#include +#include + +namespace 
takane { + +/** + * @cond + */ +namespace internal_satisfies_interface { + +inline auto default_registry() { + std::unordered_map > registry; + registry["SIMPLE_LIST"] = { "simple_list" }; + registry["DATA_FRAME"] = { "data_frame" }; + return registry; +} + +} +/** + * @endcond + */ + +/** + * Registry of object types that satisfy a particular object interface. + * Each key is the interface and each value is the set of all types that satisfy it. + * Applications can extend the **takane** framework by adding custom types to each set. + */ +inline std::unordered_map > satisfies_interface_registry = internal_satisfies_interface::default_registry(); + +/** + * Check whether a particular object type satisfies a particular object interface. + * This can be used by specifications to check that child components satisfy certain expectations. + * + * @param type Object type. + * @param interface Interface type. + * @returns Whether `type` satisfies `interface`. + */ +inline bool satisfies_interface(const std::string& type, const std::string& interface) { + auto it = satisfies_interface_registry.find(interface); + if (it == satisfies_interface_registry.end()) { + return false; + } + const auto& listing = it->second; + return listing.find(type) != listing.end(); +} + +} + +#endif diff --git a/inst/include/takane/_validate.hpp b/inst/include/takane/_validate.hpp index c960282..b3ae6a9 100644 --- a/inst/include/takane/_validate.hpp +++ b/inst/include/takane/_validate.hpp @@ -12,6 +12,11 @@ #include "simple_list.hpp" #include "data_frame.hpp" #include "data_frame_factor.hpp" +#include "sequence_information.hpp" +#include "genomic_ranges.hpp" +#include "atomic_vector_list.hpp" +#include "data_frame_list.hpp" +#include "genomic_ranges_list.hpp" /** * @file _validate.hpp @@ -32,6 +37,11 @@ inline auto default_registry() { registry["simple_list"] = [](const std::filesystem::path& p, const Options& o) { simple_list::validate(p, o); }; registry["data_frame"] = [](const 
std::filesystem::path& p, const Options& o) { data_frame::validate(p, o); }; registry["data_frame_factor"] = [](const std::filesystem::path& p, const Options& o) { data_frame_factor::validate(p, o); }; + registry["sequence_information"] = [](const std::filesystem::path& p, const Options& o) { sequence_information::validate(p, o); }; + registry["genomic_ranges"] = [](const std::filesystem::path& p, const Options& o) { genomic_ranges::validate(p, o); }; + registry["atomic_vector_list"] = [](const std::filesystem::path& p, const Options& o) { atomic_vector_list::validate(p, o); }; + registry["data_frame_list"] = [](const std::filesystem::path& p, const Options& o) { data_frame_list::validate(p, o); }; + registry["genomic_ranges_list"] = [](const std::filesystem::path& p, const Options& o) { genomic_ranges_list::validate(p, o); }; return registry; } diff --git a/inst/include/takane/atomic_vector.hpp b/inst/include/takane/atomic_vector.hpp index 5278317..44cfd9d 100644 --- a/inst/include/takane/atomic_vector.hpp +++ b/inst/include/takane/atomic_vector.hpp @@ -8,7 +8,7 @@ #include "ritsuko/hdf5/hdf5.hpp" #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" /** * @file atomic_vector.hpp @@ -28,42 +28,28 @@ namespace atomic_vector { * @param options Validation options, typically for reading performance. 
*/ inline void validate(const std::filesystem::path& path, const Options& options) try { - H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); - - const char* parent = "atomic_vector"; - if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected an 'atomic_vector' group"); - } - auto ghandle = handle.openGroup(parent); + auto handle = ritsuko::hdf5::open_file(path / "contents.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "atomic_vector"); - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); if (version.major != 1) { throw std::runtime_error("unsupported version string '" + vstring + "'"); } - auto dhandle = ritsuko::hdf5::get_dataset(ghandle, "values"); + auto dhandle = ritsuko::hdf5::open_dataset(ghandle, "values"); auto vlen = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false); - auto type = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "type"); + auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "type"); const char* missing_attr_name = "missing-value-placeholder"; - bool has_missing = dhandle.attrExists(missing_attr_name); if (type == "string") { if (dhandle.getTypeClass() != H5T_STRING) { throw std::runtime_error("expected a string datatype for 'values'"); } - - std::string missing_value; - if (has_missing) { - auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(dhandle, missing_attr_name, /* type_class_only = */ true); - missing_value = ritsuko::hdf5::load_scalar_string_attribute(missing_attr); - } - - if (ghandle.attrExists("format")) { - auto format = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "format"); - internal_hdf5::validate_string_format(dhandle, vlen, format, has_missing, missing_value, 
options.hdf5_buffer_size); - } + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, missing_attr_name); + std::string format = internal_string::fetch_format_attribute(ghandle); + internal_string::validate_string_format(dhandle, vlen, format, missingness.first, missingness.second, options.hdf5_buffer_size); } else { if (type == "integer") { @@ -82,21 +68,13 @@ inline void validate(const std::filesystem::path& path, const Options& options) throw std::runtime_error("unsupported type '" + type + "'"); } - if (has_missing) { - ritsuko::hdf5::get_missing_placeholder_attribute(dhandle, missing_attr_name); + if (dhandle.attrExists(missing_attr_name)) { + auto missing_attr = dhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(dhandle, missing_attr); } } - if (ghandle.exists("names")) { - auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); - if (nhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("'names' should be a string datatype class"); - } - auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); - if (vlen != nlen) { - throw std::runtime_error("'names' and 'values' should have the same length"); - } - } + internal_string::validate_names(ghandle, "names", vlen, options.hdf5_buffer_size); } catch (std::exception& e) { throw std::runtime_error("failed to validate an 'atomic_vector' at '" + path.string() + "'; " + std::string(e.what())); diff --git a/inst/include/takane/atomic_vector_list.hpp b/inst/include/takane/atomic_vector_list.hpp new file mode 100644 index 0000000..22a577a --- /dev/null +++ b/inst/include/takane/atomic_vector_list.hpp @@ -0,0 +1,45 @@ +#ifndef TAKANE_ATOMIC_VECTOR_LIST_HPP +#define TAKANE_ATOMIC_VECTOR_LIST_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +#include "utils_public.hpp" +#include "utils_compressed_list.hpp" + +/** + * @file atomic_vector_list.hpp + * @brief Validation for atomic vector lists. 
+ */ + +namespace takane { + +namespace atomic_vector_list { + +/** + * @param path Path to the directory containing the atomic vector list. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + internal_compressed_list::validate_directory(path, "atomic_vector_list", "atomic_vector", options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'atomic_vector_list' object at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to a directory containing an atomic vector list. + * @param options Validation options, mostly for input performance. + * @return The length of the list. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return internal_compressed_list::height(path, "atomic_vector_list", options); +} + +} + +} + +#endif diff --git a/inst/include/takane/compressed_list.hpp b/inst/include/takane/compressed_list.hpp deleted file mode 100644 index 192653a..0000000 --- a/inst/include/takane/compressed_list.hpp +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef TAKANE_COMPRESSED_LIST_HPP -#define TAKANE_COMPRESSED_LIST_HPP - -#include "comservatory/comservatory.hpp" - -#include "utils_csv.hpp" - -#include - -/** - * @file compressed_list.hpp - * @brief Validation for compressed lists. - */ - -namespace takane { - -/** - * @namespace takane::compressed_list - * @brief Definitions for compressed lists. - */ -namespace compressed_list { - -/** - * @brief Parameters for validating the compressed list file. - */ -struct Parameters { - /** - * Length of the compressed list. - */ - size_t length = 0; - - /** - * Total length of the concatenated elements. - */ - size_t concatenated = 0; - - /** - * Whether the compressed list is named. - */ - bool has_names = false; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. 
- */ - bool parallel = false; - - /** - * Version of the `compressed_list` format. - */ - int version = 1; -}; - -/** - * @cond - */ -template -CsvContents validate_base(ParseCommand parse, const Parameters& params, CsvFieldCreator* creator = NULL) { - DummyCsvFieldCreator default_creator; - if (creator == NULL) { - creator = &default_creator; - } - - comservatory::Contents contents; - CsvContents output; - if (params.has_names) { - auto ptr = creator->string(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvNameField(false, ptr)); - } - - auto ptr0 = creator->integer(); - output.fields.emplace_back(ptr0); - auto ptr = new CsvCompressedLengthField(static_cast(params.has_names), ptr0); - contents.fields.emplace_back(ptr); - - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - if (contents.num_records() != params.length) { - throw std::runtime_error("number of records in the CSV file does not match the expected length"); - } - - if (params.concatenated != ptr->total) { - throw std::runtime_error("sum of lengths in the compressed list did not equal the expected concatenated total"); - } - - if (contents.names.back() != "number") { - throw std::runtime_error("column containing the compressed list lengths should be named 'number'"); - } - - return output; -} -/** - * @endcond - */ - -/** - * Checks if a CSV is correctly formatted for the `compressed_list` format. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - * Whether the `fields` member actually contains the CSV data depends on `creator`. 
- * If `params.has_names = true`, an additional field containing the names is present at the start. - */ -template -CsvContents validate(Reader& reader, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read(reader, contents, opts); }, - params, - creator - ); -} - -/** - * Overload of `compressed_list::validate()` that accepts a file path. - * - * @param path Path to the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - */ -inline CsvContents validate(const char* path, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read_file(path, contents, opts); }, - params, - creator - ); -} - -} - -} - -#endif diff --git a/inst/include/takane/data_frame.hpp b/inst/include/takane/data_frame.hpp index 1e3729e..f227397 100644 --- a/inst/include/takane/data_frame.hpp +++ b/inst/include/takane/data_frame.hpp @@ -13,7 +13,8 @@ #include #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" +#include "utils_factor.hpp" #include "utils_other.hpp" /** @@ -37,7 +38,7 @@ namespace data_frame { /** * @cond */ -inline void validate_row_names(const H5::Group& handle, hsize_t num_rows) try { +inline void validate_row_names(const H5::Group& handle, hsize_t num_rows, const Options& options) try { if (handle.childObjType("row_names") != H5O_TYPE_DATASET) { throw std::runtime_error("expected a 'row_names' dataset when row names are present"); } @@ -48,16 +49,13 @@ inline void validate_row_names(const H5::Group& handle, hsize_t num_rows) try { if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(), 
false) != num_rows) { throw std::runtime_error("expected 'row_names' to have length equal to the number of rows"); } + ritsuko::hdf5::validate_1d_string_dataset(rnhandle, num_rows, options.hdf5_buffer_size); } catch (std::exception& e) { throw std::runtime_error("failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& options) try { - if (!ghandle.exists("column_names") || ghandle.childObjType("column_names") != H5O_TYPE_DATASET) { - throw std::runtime_error("expected a 'column_names' dataset"); - } - - auto cnhandle = ghandle.openDataSet("column_names"); + auto cnhandle = ritsuko::hdf5::open_dataset(ghandle, "column_names"); if (cnhandle.getTypeClass() != H5T_STRING) { throw std::runtime_error("expected 'column_names' to be a string dataset"); } @@ -65,21 +63,17 @@ inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& op auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(), false); std::unordered_set column_names; - ritsuko::hdf5::load_1d_string_dataset( - cnhandle, - num_cols, - options.hdf5_buffer_size, - [&](size_t, const char* p, size_t l) { - if (l == 0) { - throw std::runtime_error("column names should not be empty strings"); - } - std::string col_name(p, p + l); - if (column_names.find(col_name) != column_names.end()) { - throw std::runtime_error("duplicated column name '" + col_name + "'"); - } - column_names.insert(std::move(col_name)); + ritsuko::hdf5::Stream1dStringDataset stream(&cnhandle, num_cols, options.hdf5_buffer_size); + for (size_t c = 0; c < num_cols; ++c, stream.next()) { + auto x = stream.steal(); + if (x.empty()) { + throw std::runtime_error("column names should not be empty strings"); } - ); + if (column_names.find(x) != column_names.end()) { + throw std::runtime_error("duplicated column name '" + x + "'"); + } + column_names.insert(std::move(x)); + } return num_cols; @@ 
-88,52 +82,38 @@ inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& op } inline void validate_column(const H5::Group& dhandle, const std::string& dset_name, hsize_t num_rows, const Options& options) try { - if (dhandle.childObjType(dset_name) == H5O_TYPE_GROUP) { + auto dtype = dhandle.childObjType(dset_name); + if (dtype == H5O_TYPE_GROUP) { auto fhandle = dhandle.openGroup(dset_name); - auto type = ritsuko::hdf5::load_scalar_string_attribute(fhandle, "type"); + auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(fhandle, "type"); if (type != "factor") { throw std::runtime_error("expected HDF5 groups to have a 'type' attribute set to 'factor'"); } - if (fhandle.attrExists("ordered")) { - auto attr = ritsuko::hdf5::get_scalar_attribute(fhandle, "ordered"); - if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) { - throw std::runtime_error("an 'ordered' attribute on a factor column should have a datatype that fits in a 32-bit signed integer"); - } - } + internal_factor::check_ordered_attribute(fhandle); - auto num_levels = internal_hdf5::validate_factor_levels(fhandle, "levels", options.hdf5_buffer_size); - auto num_codes = internal_hdf5::validate_factor_codes(fhandle, "codes", num_levels, options.hdf5_buffer_size); + auto num_levels = internal_factor::validate_factor_levels(fhandle, "levels", options.hdf5_buffer_size); + auto num_codes = internal_factor::validate_factor_codes(fhandle, "codes", num_levels, options.hdf5_buffer_size); if (num_codes != num_rows) { throw std::runtime_error("expected column to have length equal to the number of rows"); } - - } else { - auto xhandle = ritsuko::hdf5::get_dataset(dhandle, dset_name.c_str()); + } else if (dtype == H5O_TYPE_DATASET) { + auto xhandle = dhandle.openDataSet(dset_name); if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) { throw std::runtime_error("expected column to have length equal to the number of rows"); } const char* missing_attr_name = 
"missing-value-placeholder"; - bool has_missing = xhandle.attrExists(missing_attr_name); - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); + auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(xhandle, "type"); if (type == "string") { if (xhandle.getTypeClass() != H5T_STRING) { throw std::runtime_error("expected column " + dset_name + " to be a string dataset"); } - - std::string missing_value; - if (has_missing) { - auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr_name, /* type_class_only = */ true); - missing_value = ritsuko::hdf5::load_scalar_string_attribute(missing_attr); - } - - if (xhandle.attrExists("format")) { - auto format = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "format"); - internal_hdf5::validate_string_format(xhandle, num_rows, format, has_missing, missing_value, options.hdf5_buffer_size); - } + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(xhandle, missing_attr_name); + std::string format = internal_string::fetch_format_attribute(xhandle); + internal_string::validate_string_format(xhandle, num_rows, format, missingness.first, missingness.second, options.hdf5_buffer_size); } else { if (type == "integer") { @@ -152,10 +132,14 @@ inline void validate_column(const H5::Group& dhandle, const std::string& dset_na throw std::runtime_error("unknown column type '" + type + "'"); } - if (has_missing) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr_name); + if (xhandle.attrExists(missing_attr_name)) { + auto ahandle = xhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(xhandle, ahandle); } } + + } else { + throw std::runtime_error("unknown HDF5 object type"); } } catch (std::exception& e) { @@ -170,22 +154,17 @@ inline void validate_column(const H5::Group& dhandle, const std::string& dset_na * @param options Validation options, typically for reading performance. 
*/ inline void validate(const std::filesystem::path& path, const Options& options) { - auto h5path = path / "basic_columns.h5"; - - H5::H5File handle(h5path, H5F_ACC_RDONLY); - if (!handle.exists("data_frame") || handle.childObjType("data_frame") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a 'data_frame' group"); - } - auto ghandle = handle.openGroup("data_frame"); + auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "data_frame"); - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); if (version.major != 1) { throw std::runtime_error("unsupported version '" + vstring + "'"); } // Checking the number of rows. - auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); + auto attr = ritsuko::hdf5::open_scalar_attribute(ghandle, "row-count"); if (ritsuko::hdf5::exceeds_integer_limit(attr, 64, false)) { throw std::runtime_error("'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer"); } @@ -194,15 +173,12 @@ inline void validate(const std::filesystem::path& path, const Options& options) // Checking row and column names. if (ghandle.exists("row_names")) { - validate_row_names(ghandle, num_rows); + validate_row_names(ghandle, num_rows, options); } size_t NC = validate_column_names(ghandle, options); // Finally iterating through the columns. 
- if (!ghandle.exists("data") || ghandle.childObjType("data") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a 'data_frame/data' group"); - } - auto dhandle = ghandle.openGroup("data"); + auto dhandle = ritsuko::hdf5::open_group(ghandle, "data"); hsize_t found = 0; for (size_t c = 0; c < NC; ++c) { @@ -229,18 +205,8 @@ inline void validate(const std::filesystem::path& path, const Options& options) throw std::runtime_error("more objects present in the 'data_frame/data' group than expected"); } - // Checking the metadata. - try { - internal_other::validate_mcols(path / "column_annotations", NC, options); - } catch (std::exception& e) { - throw std::runtime_error("failed to validate 'column_annotations'; " + std::string(e.what())); - } - - try { - internal_other::validate_metadata(path / "other_annotations", options); - } catch (std::exception& e) { - throw std::runtime_error("failed to validate 'other_annotations'; " + std::string(e.what())); - } + internal_other::validate_mcols(path, "column_annotations", NC, options); + internal_other::validate_metadata(path, "other_annotations", options); } /** @@ -254,10 +220,7 @@ inline size_t height(const std::filesystem::path& path, const Options&) { // Assume it's all valid already. 
H5::H5File handle(h5path, H5F_ACC_RDONLY); auto ghandle = handle.openGroup("data_frame"); - auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); - uint64_t num_rows = 0; - attr.read(H5::PredType::NATIVE_UINT64, &num_rows); - return num_rows; + return ritsuko::hdf5::load_scalar_numeric_attribute(ghandle.openAttribute("row-count")); } } diff --git a/inst/include/takane/data_frame_factor.hpp b/inst/include/takane/data_frame_factor.hpp index 266c473..c4e8a04 100644 --- a/inst/include/takane/data_frame_factor.hpp +++ b/inst/include/takane/data_frame_factor.hpp @@ -8,7 +8,8 @@ #include "ritsuko/hdf5/hdf5.hpp" #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" +#include "utils_factor.hpp" /** * @file data_frame_factor.hpp @@ -22,6 +23,7 @@ namespace takane { */ void validate(const std::filesystem::path&, const std::string&, const Options&); size_t height(const std::filesystem::path&, const std::string&, const Options&); +bool satisfies_interface(const std::string&, const std::string&); /** * @endcond */ @@ -48,15 +50,10 @@ inline std::function +#include +#include + +#include "utils_public.hpp" +#include "utils_compressed_list.hpp" + +/** + * @file data_frame_list.hpp + * @brief Validation for data frame lists. + */ + +namespace takane { + +namespace data_frame_list { + +/** + * @param path Path to the directory containing the data frame list. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + internal_compressed_list::validate_directory(path, "data_frame_list", "DATA_FRAME", options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'data_frame_list' object at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to a directory containing a data frame list. + * @param options Validation options, mostly for input performance.
+ * @return The length of the list. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return internal_compressed_list::height(path, "data_frame_list", options); +} + +} + +} + +#endif diff --git a/inst/include/takane/genomic_ranges.hpp b/inst/include/takane/genomic_ranges.hpp index 511bb3f..b07cfd8 100644 --- a/inst/include/takane/genomic_ranges.hpp +++ b/inst/include/takane/genomic_ranges.hpp @@ -4,11 +4,16 @@ #include "ritsuko/ritsuko.hpp" #include "comservatory/comservatory.hpp" -#include "WrappedOption.hpp" - -#include #include +#include #include +#include +#include +#include + +#include "utils_string.hpp" +#include "utils_public.hpp" +#include "utils_other.hpp" /** * @file genomic_ranges.hpp @@ -18,200 +23,203 @@ namespace takane { /** - * @namespace takane::genomic_ranges - * @brief Definitions for genomic ranges. + * @cond + */ +void validate(const std::filesystem::path&, const std::string&, const Options& options); +/** + * @endcond */ -namespace genomic_ranges { /** - * @brief Parameters for validating the genomic ranges file. + * @namespace takane::genomic_ranges + * @brief Definitions for genomic ranges. */ -struct Parameters { - /** - * Number of genomic ranges in this object. - */ - size_t num_ranges; - - /** - * Whether the ranges are named. - */ - bool has_names; - - /** - * Universe of sequence names for this object. - */ - WrappedOption > seqnames; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. - */ - bool parallel = false; - - /** - * Version of the `genomic_ranges` format. 
- */ - int version = 1; -}; +namespace genomic_ranges { /** * @cond */ -struct NamesField : public comservatory::DummyStringField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the names column"); - } -}; +namespace internal { -struct SeqnamesField : public comservatory::DummyStringField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the seqnames column"); - } +struct SequenceLimits { + SequenceLimits(size_t n) : restricted(n), seqlen(n) {} + std::vector restricted; + std::vector seqlen; +}; - void push_back(std::string x) { - if (all_seqnames->find(x) == all_seqnames->end()) { - throw std::runtime_error("unknown sequence name '" + x + "'"); - } - comservatory::DummyStringField::push_back(std::move(x)); +inline SequenceLimits find_sequence_limits(const std::filesystem::path& path, const Options& options) { + auto xtype = read_object_type(path); + if (xtype != "sequence_information") { + throw std::runtime_error("'sequence_information' directory should contain a 'sequence_information' object"); } + ::takane::validate(path, xtype, options); - const std::unordered_set* all_seqnames = NULL; -}; + auto fpath = path / "info.h5"; + H5::H5File handle(fpath, H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("sequence_information"); -struct StartField : public comservatory::DummyNumberField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the start column"); - } + auto lhandle = ghandle.openDataSet("length"); + auto num_seq = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + ritsuko::hdf5::Stream1dNumericDataset lstream(&lhandle, num_seq, options.hdf5_buffer_size); + auto lmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder(lhandle, "missing-value-placeholder"); - void push_back(double x) { - if (x < -2147483648 || x > 2147483647) { // constrain within limits. 
- throw std::runtime_error("start position does not fit inside a 32-bit signed integer"); - } - if (x != std::floor(x)) { - throw std::runtime_error("start position is not an integer"); - } - last = x; - comservatory::DummyNumberField::push_back(x); - } + auto chandle = ghandle.openDataSet("circular"); + ritsuko::hdf5::Stream1dNumericDataset cstream(&chandle, num_seq, options.hdf5_buffer_size); + auto cmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder(chandle, "missing-value-placeholder"); - int32_t last = 0; -}; + SequenceLimits output(num_seq); + auto& restricted = output.restricted; + auto& seqlen = output.seqlen; -struct EndField : public comservatory::DummyNumberField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the strand column"); - } + for (size_t i = 0; i < num_seq; ++i, lstream.next(), cstream.next()) { + auto slen = lstream.get(); + auto circ = cstream.get(); + seqlen[i] = slen; - void push_back(double x) { - if (x < -2147483648 || x > 2147483647) { // constrain within limits. - throw std::runtime_error("end position does not fit inside a 32-bit signed integer"); + // Skipping restriction if the sequence length is missing OR the sequence is circular. 
+ if (lmissing.first && lmissing.second == slen) { + continue; } - if (x != std::floor(x)) { - throw std::runtime_error("end position is not an integer"); + if (circ && !(cmissing.first && cmissing.second == circ)) { + continue; } - comservatory::DummyNumberField::push_back(x); - if (start->size() != size()) { - throw std::runtime_error("'start' and 'end' validator fields are out of sync"); - } - if (x + 1 < start->last) { - throw std::runtime_error("'end' coordinate must be greater than or equal to 'start - 1'"); - } + restricted[i] = true; } - const StartField* start = NULL; -}; + return output; +} -struct StrandField : public comservatory::DummyStringField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the strand column"); - } +} +/** + * @endcond + */ - void push_back(std::string x) { - if (x.size() != 1 || (x[0] != '+' && x[0] != '-' && x[0] != '*')) { - throw std::runtime_error("invalid strand '" + x + "'"); - } - comservatory::DummyStringField::push_back(std::move(x)); +/** + * @param path Path to the directory containing the genomic ranges. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + // Figuring out the sequence length constraints. + auto limits = internal::find_sequence_limits(path / "sequence_information", options); + const auto& restricted = limits.restricted; + const auto& seqlen = limits.seqlen; + size_t num_sequences = restricted.size(); + + // Now loading all three components. 
+ auto handle = ritsuko::hdf5::open_file(path / "ranges.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "genomic_ranges"); + + auto id_handle = ritsuko::hdf5::open_dataset(ghandle, "sequence"); + auto num_ranges = ritsuko::hdf5::get_1d_length(id_handle, false); + if (ritsuko::hdf5::exceeds_integer_limit(id_handle, 64, false)) { + throw std::runtime_error("expected 'sequence' to have a datatype that fits into a 64-bit unsigned integer"); } -}; + ritsuko::hdf5::Stream1dNumericDataset id_stream(&id_handle, num_ranges, options.hdf5_buffer_size); -template -void validate_base(ParseCommand parse, const Parameters& params) { - comservatory::Contents contents; - if (params.has_names) { - contents.fields.emplace_back(new NamesField); + auto start_handle = ritsuko::hdf5::open_dataset(ghandle, "start"); + if (num_ranges != ritsuko::hdf5::get_1d_length(start_handle, false)) { + throw std::runtime_error("'start' and 'sequence' should have the same length"); + } + if (ritsuko::hdf5::exceeds_integer_limit(start_handle, 64, true)) { + throw std::runtime_error("expected 'start' to have a datatype that fits into a 64-bit signed integer"); } + ritsuko::hdf5::Stream1dNumericDataset start_stream(&start_handle, num_ranges, options.hdf5_buffer_size); - { - auto ptr = new SeqnamesField; - ptr->all_seqnames = params.seqnames.get(); - contents.fields.emplace_back(ptr); + auto width_handle = ritsuko::hdf5::open_dataset(ghandle, "width"); + if (num_ranges != ritsuko::hdf5::get_1d_length(width_handle, false)) { + throw std::runtime_error("'width' and 'sequence' should have the same length"); } - - { - auto sptr = new StartField; - contents.fields.emplace_back(sptr); - auto eptr = new EndField; - eptr->start = sptr; - contents.fields.emplace_back(eptr); + if (ritsuko::hdf5::exceeds_integer_limit(width_handle, 64, false)) { + throw std::runtime_error("expected 'width' to have a datatype that fits into a 64-bit unsigned integer"); } + ritsuko::hdf5::Stream1dNumericDataset 
width_stream(&width_handle, num_ranges, options.hdf5_buffer_size); - contents.fields.emplace_back(new StrandField); + constexpr uint64_t end_limit = std::numeric_limits::max(); + for (size_t i = 0; i < num_ranges; ++i, id_stream.next(), start_stream.next(), width_stream.next()) { + auto id = id_stream.get(); + if (id >= num_sequences) { + throw std::runtime_error("'sequence' must be less than the number of sequences (got " + std::to_string(id) + ")"); + } - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - if (contents.num_records() != params.num_ranges) { - throw std::runtime_error("number of records in the CSV file does not match the expected number of ranges"); - } + auto start = start_stream.get(); + auto width = width_stream.get(); + + if (restricted[id]) { + if (start < 1) { + throw std::runtime_error("non-positive start position (" + std::to_string(start) + ") for non-circular sequence"); + } + + auto spos = static_cast(start); + auto limit = seqlen[id]; + if (spos > limit) { + throw std::runtime_error("start position beyond sequence length (" + std::to_string(start) + " > " + std::to_string(limit) + ") for non-circular sequence"); + } + + // The LHS should not overflow as 'spos >= 1' so 'limit - spos + 1' should still be no greater than 'limit'. 
+ if (limit - spos + 1 < width) { + throw std::runtime_error("end position beyond sequence length (" + + std::to_string(start) + " + " + std::to_string(width) + " > " + std::to_string(limit) + + ") for non-circular sequence"); + } + } - if (contents.names[0 + params.has_names] != "seqnames") { - throw std::runtime_error("expected the first (non-name) column to be 'seqnames'"); - } - if (contents.names[1 + params.has_names] != "start") { - throw std::runtime_error("expected the second (non-name) column to be 'start'"); - } - if (contents.names[2 + params.has_names] != "end") { - throw std::runtime_error("expected the third (non-name) column to be 'end'"); + bool exceeded = false; + if (start > 0) { + // 'end_limit - start' is always non-negative as 'end_limit' is the largest value of an int64_t and 'start' is also int64_t. + exceeded = (end_limit - static_cast(start) < width); + } else { + // 'end_limit - start' will not overflow a uint64_t, because 'end_limit' is the largest value of an int64_t and 'start' is also 'int64_t'.
+ exceeded = (end_limit + static_cast(-start) < width); + } + if (exceeded) { + throw std::runtime_error("end position beyond the range of a 64-bit integer (" + std::to_string(start) + " + " + std::to_string(width) + ")"); + } } - if (contents.names[3 + params.has_names] != "strand") { - throw std::runtime_error("expected the fourth (non-name) column to be 'strand'"); + + { + auto strand_handle = ritsuko::hdf5::open_dataset(ghandle, "strand"); + if (num_ranges != ritsuko::hdf5::get_1d_length(strand_handle, false)) { + throw std::runtime_error("'strand' and 'sequence' should have the same length"); + } + if (ritsuko::hdf5::exceeds_integer_limit(strand_handle, 32, true)) { + throw std::runtime_error("expected 'strand' to have a datatype that fits into a 32-bit signed integer"); + } + + ritsuko::hdf5::Stream1dNumericDataset strand_stream(&strand_handle, num_ranges, options.hdf5_buffer_size); + for (hsize_t i = 0; i < num_ranges; ++i, strand_stream.next()) { + auto x = strand_stream.get(); + if (x < -1 || x > 1) { + throw std::runtime_error("values of 'strand' should be one of 0, -1, or 1 (got " + std::to_string(x) + ")"); + } + } } -} -/** - * @endcond - */ -/** - * Checks if a CSV data frame is correctly formatted for genomic ranges. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. 
- */ -template -void validate(Reader& reader, const Parameters& params) { - validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opt) -> void { comservatory::read(reader, contents, opt); }, - params - ); + internal_other::validate_mcols(path, "range_annotations", num_ranges, options); + internal_other::validate_metadata(path, "other_annotations", options); + + internal_string::validate_names(ghandle, "name", num_ranges, options.hdf5_buffer_size); + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate 'genomic_ranges' object at '" + path.string() + "'; " + std::string(e.what())); } /** - * Checks if a CSV data frame is correctly formatted for genomic ranges. - * An error is raised if the file does not meet the specifications. - * - * @param path Path to the CSV file. - * @param params Validation parameters. + * @param path Path to a directory containing genomic ranges. + * @param options Validation options, mostly for input performance. + * @return The number of ranges. */ -inline void validate(const char* path, const Parameters& params) { - validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opt) -> void { comservatory::read_file(path, contents, opt); }, - params - ); +inline size_t height(const std::filesystem::path& path, const Options&) { + auto h5path = path / "ranges.h5"; + + // Assume it's all valid already. 
+ H5::H5File handle(h5path, H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("genomic_ranges"); + auto dhandle = ghandle.openDataSet("sequence"); + return ritsuko::hdf5::get_1d_length(dhandle, false); } + } } diff --git a/inst/include/takane/genomic_ranges_list.hpp b/inst/include/takane/genomic_ranges_list.hpp new file mode 100644 index 0000000..4ec64ce --- /dev/null +++ b/inst/include/takane/genomic_ranges_list.hpp @@ -0,0 +1,45 @@ +#ifndef TAKANE_GENOMIC_RANGES_LIST_HPP +#define TAKANE_GENOMIC_RANGES_LIST_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +#include "utils_public.hpp" +#include "utils_compressed_list.hpp" + +/** + * @file genomic_ranges_list.hpp + * @brief Validation for genomic ranges lists. + */ + +namespace takane { + +namespace genomic_ranges_list { + +/** + * @param path Path to the directory containing the genomic ranges list. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + internal_compressed_list::validate_directory(path, "genomic_ranges_list", "genomic_ranges", options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'genomic_ranges_list' object at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to a directory containing an genomic ranges list. + * @param options Validation options, mostly for input performance. + * @return The length of the list. 
+ */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return internal_compressed_list::height(path, "genomic_ranges_list", options); +} + +} + +} + +#endif diff --git a/inst/include/takane/sequence_information.hpp b/inst/include/takane/sequence_information.hpp index c9cfc18..537f55a 100644 --- a/inst/include/takane/sequence_information.hpp +++ b/inst/include/takane/sequence_information.hpp @@ -1,12 +1,14 @@ #ifndef TAKANE_SEQUENCE_INFORMATION_HPP #define TAKANE_SEQUENCE_INFORMATION_HPP -#include "comservatory/comservatory.hpp" - -#include "data_frame.hpp" -#include "utils_csv.hpp" +#include "ritsuko/hdf5/hdf5.hpp" +#include #include +#include +#include + +#include "utils_public.hpp" /** * @file sequence_information.hpp @@ -22,112 +24,77 @@ namespace takane { namespace sequence_information { /** - * @brief Parameters for validating the sequence information file. + * @param path Path to the directory containing the data frame. + * @param options Validation options, typically for reading performance. */ -struct Parameters { - /** - * Expected number of sequences. - */ - size_t num_sequences = 0; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. - */ - bool parallel = false; - - /** - * Version of the `sequence_information` format. 
- */ - int version = 1; -}; +inline void validate(const std::filesystem::path& path, const Options& options) try { + auto handle = ritsuko::hdf5::open_file(path / "info.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "sequence_information"); -/** - * @cond - */ -template -CsvContents validate_base(ParseCommand parse, const Parameters& params, CsvFieldCreator* creator) { - DummyCsvFieldCreator default_creator; - if (creator == NULL) { - creator = &default_creator; + size_t nseq = 0; + { + auto nhandle = ritsuko::hdf5::open_dataset(ghandle, "name"); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype class for 'name'"); + } + + nseq = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + std::unordered_set collected; + ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nseq, options.hdf5_buffer_size); + for (size_t s = 0; s < nseq; ++s, stream.next()) { + auto x = stream.steal(); + if (collected.find(x) != collected.end()) { + throw std::runtime_error("detected duplicated sequence name '" + x + "'"); + } + collected.insert(std::move(x)); + } } - comservatory::Contents contents; - CsvContents output; - contents.names.push_back("seqnames"); + const char* missing_attr_name = "missing-value-placeholder"; + { - auto ptr = creator->string(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvUniqueStringField(0, ptr)); + auto lhandle = ritsuko::hdf5::open_dataset(ghandle, "length"); + if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) { + throw std::runtime_error("expected a datatype for 'length' that fits in a 64-bit unsigned integer"); + } + if (ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false) != nseq) { + throw std::runtime_error("expected lengths of 'length' and 'name' to be equal"); + } + if (lhandle.attrExists(missing_attr_name)) { + auto ahandle = lhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(lhandle, ahandle); + 
} } - contents.names.push_back("seqlengths"); { - auto ptr = creator->integer(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvNonNegativeIntegerField(1, ptr)); + auto chandle = ritsuko::hdf5::open_dataset(ghandle, "circular"); + if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) { + throw std::runtime_error("expected a datatype for 'circular' that fits in a 32-bit signed integer"); + } + if (ritsuko::hdf5::get_1d_length(chandle.getSpace(), false) != nseq) { + throw std::runtime_error("expected lengths of 'length' and 'circular' to be equal"); + } + if (chandle.attrExists(missing_attr_name)) { + auto ahandle = chandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(chandle, ahandle); + } } - contents.names.push_back("isCircular"); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(creator->boolean()); - - contents.names.push_back("genome"); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(creator->string()); - - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - if (contents.num_records() != params.num_sequences) { - throw std::runtime_error("number of records in the CSV file does not match the expected number of ranges"); + { + auto gnhandle = ritsuko::hdf5::open_dataset(ghandle, "genome"); + if (gnhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype class for 'genome'"); + } + if (ritsuko::hdf5::get_1d_length(gnhandle.getSpace(), false) != nseq) { + throw std::runtime_error("expected lengths of 'length' and 'genome' to be equal"); + } + if (gnhandle.attrExists(missing_attr_name)) { + auto ahandle = gnhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(gnhandle, ahandle); + } } - - output.reconstitute(contents.fields); - return output; -} -/** - * @endcond - */ - -/** - * Checks if a CSV data frame is correctly formatted for 
sequence information. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - * Whether the `fields` member actually contains the CSV data depends on `creator`. - */ -template -CsvContents validate(Reader& reader, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read(reader, contents, opts); }, - params, - creator - ); -} - -/** - * Overload of `sequence_information::validate()` that accepts a file path. - * - * @param path Path to the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. 
- */ -inline CsvContents validate(const char* path, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read_file(path, contents, opts); }, - params, - creator - ); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate 'sequence_information' object at '" + path.string() + "'; " + std::string(e.what())); } } diff --git a/inst/include/takane/string_factor.hpp b/inst/include/takane/string_factor.hpp index 0b979e7..e8cc512 100644 --- a/inst/include/takane/string_factor.hpp +++ b/inst/include/takane/string_factor.hpp @@ -8,7 +8,8 @@ #include "ritsuko/hdf5/hdf5.hpp" #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" +#include "utils_factor.hpp" /** * @file string_factor.hpp @@ -28,41 +29,21 @@ namespace string_factor { * @param options Validation options, typically for reading performance. */ inline void validate(const std::filesystem::path& path, const Options& options) try { - H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); - - const char* parent = "string_factor"; - if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a 'string_factor' group"); - } - auto ghandle = handle.openGroup(parent); + auto handle = ritsuko::hdf5::open_file(path / "contents.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "string_factor"); - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); if (version.major != 1) { throw std::runtime_error("unsupported version string '" + vstring + "'"); } - if (ghandle.attrExists("ordered")) { - auto oattr = ritsuko::hdf5::get_scalar_attribute(ghandle, 
"ordered"); - if (ritsuko::hdf5::exceeds_integer_limit(oattr, 32, true)) { - throw std::runtime_error("expected a datatype for the 'ordered' attribute that fits in a 32-bit signed integer"); - } - } + internal_factor::check_ordered_attribute(ghandle); - // Number of levels. - size_t num_levels = internal_hdf5::validate_factor_levels(ghandle, "levels", options.hdf5_buffer_size); - size_t num_codes = internal_hdf5::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size); - - if (ghandle.exists("names")) { - auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); - if (nhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("'names' should be a string datatype class"); - } - auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); - if (num_codes != nlen) { - throw std::runtime_error("'names' and 'codes' should have the same length"); - } - } + size_t num_levels = internal_factor::validate_factor_levels(ghandle, "levels", options.hdf5_buffer_size); + size_t num_codes = internal_factor::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size); + + internal_string::validate_names(ghandle, "names", num_codes, options.hdf5_buffer_size); } catch (std::exception& e) { throw std::runtime_error("failed to validate a 'string_factor' at '" + path.string() + "'; " + std::string(e.what())); diff --git a/inst/include/takane/takane.hpp b/inst/include/takane/takane.hpp index f2e11a7..4fede0e 100644 --- a/inst/include/takane/takane.hpp +++ b/inst/include/takane/takane.hpp @@ -3,6 +3,7 @@ #include "_validate.hpp" #include "_height.hpp" +#include "_satisfies_interface.hpp" /** * @namespace takane diff --git a/inst/include/takane/utils_compressed_list.hpp b/inst/include/takane/utils_compressed_list.hpp new file mode 100644 index 0000000..8f1ab24 --- /dev/null +++ b/inst/include/takane/utils_compressed_list.hpp @@ -0,0 +1,96 @@ +#ifndef TAKANE_UTILS_COMPRESSED_LIST_HPP +#define TAKANE_UTILS_COMPRESSED_LIST_HPP + 
+#include "H5Cpp.h" +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +#include +#include +#include +#include +#include + +#include "utils_public.hpp" +#include "utils_string.hpp" +#include "utils_other.hpp" + +namespace takane { + +void validate(const std::filesystem::path&, const std::string&, const Options&); +size_t height(const std::filesystem::path&, const std::string&, const Options&); +bool satisfies_interface(const std::string&, const std::string&); + +namespace internal_compressed_list { + +inline hsize_t validate_group(const H5::Group& handle, size_t concatenated_length, hsize_t buffer_size) { + auto lhandle = ritsuko::hdf5::open_dataset(handle, "lengths"); + if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) { + throw std::runtime_error("expected 'lengths' to have a datatype that fits in a 64-bit unsigned integer"); + } + + size_t len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + ritsuko::hdf5::Stream1dNumericDataset stream(&lhandle, len, buffer_size); + size_t total = 0; + for (size_t i = 0; i < len; ++i, stream.next()) { + total += stream.get(); + } + if (total != concatenated_length) { + throw std::runtime_error("sum of 'lengths' does not equal the height of the concatenated object (got " + std::to_string(total) + ", expected " + std::to_string(concatenated_length) + ")"); + } + + return len; +} + +template +void validate_directory(const std::filesystem::path& path, const std::string& object_type, const std::string& concatenated_type, const Options& options) try { + auto handle = ritsuko::hdf5::open_file(path / "partitions.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, object_type.c_str()); + + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); + auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); + if (version.major != 1) { + throw std::runtime_error("unsupported version string '" + vstring + "'"); + } + + 
auto catdir = path / "concatenated"; + auto cattype = read_object_type(catdir); + if constexpr(satisfies_interface_) { + if (!satisfies_interface(cattype, concatenated_type)) { + throw std::runtime_error("'concatenated' should satisfy the '" + concatenated_type + "' interface"); + } + } else { + if (cattype != concatenated_type) { + throw std::runtime_error("'concatenated' should contain an '" + concatenated_type + "' object"); + } + } + + try { + ::takane::validate(catdir, cattype, options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate the 'concatenated' object; " + std::string(e.what())); + } + size_t catheight = ::takane::height(catdir, cattype, options); + + size_t len = validate_group(ghandle, catheight, options.hdf5_buffer_size); + + internal_string::validate_names(ghandle, "names", len, options.hdf5_buffer_size); + internal_other::validate_mcols(path, "element_annotations", len, options); + internal_other::validate_metadata(path, "other_annotations", options); + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an '" + object_type + "' object at '" + path.string() + "'; " + std::string(e.what())); +} + +inline size_t height(const std::filesystem::path& path, const std::string& name, [[maybe_unused]] const Options& options) { + H5::H5File handle(path / "partitions.h5", H5F_ACC_RDONLY); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openDataSet("lengths"); + return ritsuko::hdf5::get_1d_length(dhandle, false); +} + +} + +} + +#endif diff --git a/inst/include/takane/utils_factor.hpp b/inst/include/takane/utils_factor.hpp new file mode 100644 index 0000000..868a05d --- /dev/null +++ b/inst/include/takane/utils_factor.hpp @@ -0,0 +1,86 @@ +#ifndef TAKANE_UTILS_FACTOR_HPP +#define TAKANE_UTILS_FACTOR_HPP + +#include +#include +#include +#include +#include + +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +namespace takane { + +namespace internal_factor { + 
+template +void check_ordered_attribute(const H5Object_& handle) { + if (!handle.attrExists("ordered")) { + return; + } + + auto attr = handle.openAttribute("ordered"); + if (!ritsuko::hdf5::is_scalar(attr)) { + throw std::runtime_error("expected 'ordered' attribute to be a scalar"); + } + if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) { + throw std::runtime_error("expected 'ordered' attribute to have a datatype that fits in a 32-bit signed integer"); + } +} + +inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) { + auto lhandle = ritsuko::hdf5::open_dataset(handle, name.c_str()); + if (lhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype for '" + name + "'"); + } + + auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + std::unordered_set present; + + ritsuko::hdf5::Stream1dStringDataset stream(&lhandle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (present.find(x) != present.end()) { + throw std::runtime_error("'" + name + "' contains duplicated factor level '" + x + "'"); + } + present.insert(std::move(x)); + } + + return len; +} + +inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) { + auto chandle = ritsuko::hdf5::open_dataset(handle, name.c_str()); + if (ritsuko::hdf5::exceeds_integer_limit(chandle, 64, false)) { + throw std::runtime_error("expected a datatype for '" + name + "' that fits in a 64-bit unsigned integer"); + } + + bool has_missing = false; + int32_t missing_placeholder = 0; + if (allow_missing) { + auto missingness = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder(chandle, "missing-value-placeholder"); + has_missing = missingness.first; + missing_placeholder = missingness.second; + } + + auto len = 
ritsuko::hdf5::get_1d_length(chandle.getSpace(), false); + ritsuko::hdf5::Stream1dNumericDataset stream(&chandle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.get(); + if (has_missing && x == missing_placeholder) { + continue; + } + if (static_cast(x) >= num_levels) { + throw std::runtime_error("expected factor codes to be less than the number of levels"); + } + } + + return len; +} + +} + +} + +#endif diff --git a/inst/include/takane/utils_hdf5.hpp b/inst/include/takane/utils_hdf5.hpp deleted file mode 100644 index 5ae35b8..0000000 --- a/inst/include/takane/utils_hdf5.hpp +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef TAKANE_UTILS_HDF5_HPP -#define TAKANE_UTILS_HDF5_HPP - -#include -#include -#include -#include -#include - -#include "ritsuko/ritsuko.hpp" -#include "ritsuko/hdf5/hdf5.hpp" - -namespace takane { - -namespace internal_hdf5 { - -inline void validate_string_format(const H5::DataSet& handle, hsize_t len, const std::string& format, bool has_missing, const std::string& missing_value, hsize_t buffer_size) { - if (format == "date") { - ritsuko::hdf5::load_1d_string_dataset( - handle, - len, - buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_date(p, l)) { - throw std::runtime_error("expected a date-formatted string (got '" + x + "')"); - } - } - ); - - } else if (format == "date-time") { - ritsuko::hdf5::load_1d_string_dataset( - handle, - len, - buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_rfc3339(p, l)) { - throw std::runtime_error("expected a date/time-formatted string (got '" + x + "')"); - } - } - ); - - } else if (format != "none") { - throw std::runtime_error("unsupported format '" + format + "'"); - } -} - -inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, 
hsize_t buffer_size) { - auto lhandle = ritsuko::hdf5::get_dataset(handle, name.c_str()); - if (lhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected a string datatype for '" + name + "'"); - } - - auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); - std::unordered_set present; - - ritsuko::hdf5::load_1d_string_dataset( - lhandle, - len, - buffer_size, - [&](hsize_t, const char* p, size_t len) { - std::string x(p, p + len); - if (present.find(x) != present.end()) { - throw std::runtime_error("'" + name + "' contains duplicated factor level '" + x + "'"); - } - present.insert(std::move(x)); - } - ); - - return len; -} - -inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) { - auto chandle = ritsuko::hdf5::get_dataset(handle, name.c_str()); - if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) { - throw std::runtime_error("expected a datatype for '" + name + "' that fits in a 32-bit signed integer"); - } - - auto len = ritsuko::hdf5::get_1d_length(chandle.getSpace(), false); - auto block_size = ritsuko::hdf5::pick_1d_block_size(chandle.getCreatePlist(), len, buffer_size); - std::vector buffer(block_size); - - bool has_missing = false; - int32_t missing_placeholder = 0; - if (allow_missing) { - const char* missing_attr_name = "missing-value-placeholder"; - has_missing = chandle.attrExists(missing_attr_name); - if (has_missing) { - auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(chandle, missing_attr_name); - missing_attr.read(H5::PredType::NATIVE_INT32, &missing_placeholder); - } - } - - ritsuko::hdf5::iterate_1d_blocks( - len, - block_size, - [&](hsize_t, hsize_t len, const H5::DataSpace& memspace, const H5::DataSpace& dataspace) { - chandle.read(buffer.data(), H5::PredType::NATIVE_INT32, memspace, dataspace); - for (hsize_t i = 0; i < len; ++i) { - if (has_missing && buffer[i] == 
missing_placeholder) { - continue; - } - if (buffer[i] < 0) { - throw std::runtime_error("expected factor codes to be non-negative"); - } - if (static_cast(buffer[i]) >= num_levels) { - throw std::runtime_error("expected factor codes to be less than the number of levels"); - } - } - } - ); - - return len; -} - -} - -} - -#endif diff --git a/inst/include/takane/utils_other.hpp b/inst/include/takane/utils_other.hpp index 81084ce..bf995a4 100644 --- a/inst/include/takane/utils_other.hpp +++ b/inst/include/takane/utils_other.hpp @@ -13,42 +13,45 @@ namespace takane { */ void validate(const std::filesystem::path&, const std::string&, const Options&); size_t height(const std::filesystem::path&, const std::string&, const Options&); +bool satisfies_interface(const std::string&, const std::string&); /** * @endcond */ namespace internal_other { -inline bool ends_with(const std::string& full, const std::string& sub) { - return (full.size() >= sub.size() && full.find(sub) == full.size() - sub.size()); -} - -inline void validate_mcols(const std::filesystem::path& path, size_t expected, const Options& options) { +inline void validate_mcols(const std::filesystem::path& parent, const std::string& name, size_t expected, const Options& options) try { + auto path = parent / name; if (!std::filesystem::exists(path)) { return; } auto xtype = read_object_type(path); - if (!ends_with(xtype, "data_frame")) { - throw std::runtime_error("expected a 'data_frame' or one of its derivatives"); + if (!satisfies_interface(xtype, "DATA_FRAME")) { + throw std::runtime_error("expected an object that satisfies the 'DATA_FRAME' interface"); } ::takane::validate(path, xtype, options); if (::takane::height(path, xtype, options) != expected) { throw std::runtime_error("unexpected number of rows"); } +} catch (std::exception& e) { + throw std::runtime_error("failed to validate '" + name + "'; " + std::string(e.what())); } -inline void validate_metadata(const std::filesystem::path& path, const Options& 
options) { +inline void validate_metadata(const std::filesystem::path& parent, const std::string& name, const Options& options) try { + auto path = parent / name; if (!std::filesystem::exists(path)) { return; } auto xtype = read_object_type(path); - if (!ends_with(xtype, "simple_list")) { - throw std::runtime_error("expected a 'simple_list' or one of its derivatives"); + if (!satisfies_interface(xtype, "SIMPLE_LIST")) { + throw std::runtime_error("expected an object that satisfies the 'SIMPLE_LIST' interface'"); } ::takane::validate(path, xtype, options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate '" + name + "'; " + std::string(e.what())); } } diff --git a/inst/include/takane/utils_string.hpp b/inst/include/takane/utils_string.hpp new file mode 100644 index 0000000..ba862f8 --- /dev/null +++ b/inst/include/takane/utils_string.hpp @@ -0,0 +1,88 @@ +#ifndef TAKANE_UTILS_STRING_HPP +#define TAKANE_UTILS_STRING_HPP + +#include +#include +#include +#include +#include + +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +namespace takane { + +namespace internal_string { + +template +std::string fetch_format_attribute(const H5Object_& handle) { + if (!handle.attrExists("format")) { + return "none"; + } + + auto attr = handle.openAttribute("format"); + if (!ritsuko::hdf5::is_scalar(attr)) { + throw std::runtime_error("expected 'format' attribute to be a scalar"); + } + if (attr.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected 'format' attribute to be a string"); + } + return ritsuko::hdf5::load_scalar_string_attribute(attr); +} + +inline void validate_string_format(const H5::DataSet& handle, hsize_t len, const std::string& format, bool has_missing, const std::string& missing_value, hsize_t buffer_size) { + if (format == "date") { + ritsuko::hdf5::Stream1dStringDataset stream(&handle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (has_missing && 
missing_value == x) { + continue; + } + if (!ritsuko::is_date(x.c_str(), x.size())) { + throw std::runtime_error("expected a date-formatted string (got '" + x + "')"); + } + } + + } else if (format == "date-time") { + ritsuko::hdf5::Stream1dStringDataset stream(&handle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (has_missing && missing_value == x) { + continue; + } + if (!ritsuko::is_rfc3339(x.c_str(), x.size())) { + throw std::runtime_error("expected a date/time-formatted string (got '" + x + "')"); + } + } + + } else if (format == "none") { + ritsuko::hdf5::validate_1d_string_dataset(handle, len, buffer_size); + + } else { + throw std::runtime_error("unsupported format '" + format + "'"); + } +} + +inline void validate_names(const H5::Group& handle, const std::string& name, size_t len, hsize_t buffer_size) { + if (!handle.exists(name)) { + return; + } + + auto nhandle = ritsuko::hdf5::open_dataset(handle, name.c_str()); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'" + name + "' should be a string datatype class"); + } + + auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + if (len != nlen) { + throw std::runtime_error("'" + name + "' should have the same length as the parent object (got " + std::to_string(nlen) + ", expected " + std::to_string(len) + ")"); + } + + ritsuko::hdf5::validate_1d_string_dataset(nhandle, len, buffer_size); +} + +} + +} + +#endif diff --git a/inst/include/uzuki2/Version.hpp b/inst/include/uzuki2/Version.hpp index 841185a..55ab9dc 100644 --- a/inst/include/uzuki2/Version.hpp +++ b/inst/include/uzuki2/Version.hpp @@ -1,9 +1,6 @@ #ifndef UZUKI2_VERSIONED_BASE_HPP #define UZUKI2_VERSIONED_BASE_HPP -#include -#include - /** * @file Version.hpp * @brief Version-related definitions. 
@@ -63,54 +60,6 @@ struct Version { } }; -/** - * @cond - */ -inline Version parse_version_string(const std::string& version_string) { - int major = 0, minor = 0; - size_t i = 0, end = version_string.size(); - - if (version_string.empty()) { - throw std::runtime_error("version string is empty"); - } - if (version_string[i] == '0') { - throw std::runtime_error("invalid version string '" + version_string + "' has leading zeros in its major version"); - } - while (i < end && version_string[i] != '.') { - if (!std::isdigit(version_string[i])) { - throw std::runtime_error("invalid version string '" + version_string + "' contains non-digit characters"); - } - major *= 10; - major += version_string[i] - '0'; - ++i; - } - - if (i == end) { - throw std::runtime_error("version string '" + version_string + "' is missing a minor version"); - } - ++i; // get past the period and check again. - if (i == end) { - throw std::runtime_error("version string '" + version_string + "' is missing a minor version"); - } - - if (version_string[i] == '0' && i + 1 < end) { - throw std::runtime_error("invalid version string '" + version_string + "' has leading zeros in its minor version"); - } - while (i < end) { - if (!std::isdigit(version_string[i])) { - throw std::runtime_error("invalid version string '" + version_string + "' contains non-digit characters"); - } - minor *= 10; - minor += version_string[i] - '0'; - ++i; - } - - return Version(major, minor); -} -/** - * @cond - */ - } #endif diff --git a/inst/include/uzuki2/parse_hdf5.hpp b/inst/include/uzuki2/parse_hdf5.hpp index 200c7b7..80aa040 100644 --- a/inst/include/uzuki2/parse_hdf5.hpp +++ b/inst/include/uzuki2/parse_hdf5.hpp @@ -41,18 +41,19 @@ namespace hdf5 { /** * @cond */ -inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const std::string& name, H5T_class_t type_class) try { - auto dhandle = ritsuko::hdf5::get_scalar_dataset(handle, name.c_str()); - if (dhandle.getTypeClass() != type_class) { - throw 
std::runtime_error("dataset has the wrong datatype class"); +inline H5::DataSet check_scalar_dataset(const H5::Group& handle, const char* name) { + if (handle.childObjType(name) != H5O_TYPE_DATASET) { + throw std::runtime_error("expected '" + std::string(name) + "' to be a dataset"); + } + auto dhandle = handle.openDataSet(name); + if (!ritsuko::hdf5::is_scalar(dhandle)) { + throw std::runtime_error("expected '" + std::string(name) + "'to be a scalar dataset"); } return dhandle; -} catch (std::exception& e) { - throw std::runtime_error("failed to load scalar dataset at '" + ritsuko::hdf5::get_name(handle) + "/" + name + "'; " + std::string(e.what())); } template -void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version) try { +void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try { if (ritsuko::hdf5::exceeds_integer_limit(handle, 32, true)) { throw std::runtime_error("dataset cannot be represented by 32-bit signed integers"); } @@ -65,69 +66,57 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co const char* placeholder_name = "missing-value-placeholder"; has_missing = handle.attrExists(placeholder_name); if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2)); + auto attr = handle.openAttribute(placeholder_name); + ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2)); attr.read(H5::PredType::NATIVE_INT32, &missing_value); } } hsize_t full_length = ptr->size(); - auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, /* buffer_size = */ 10000); - std::vector buffer(block_size); - ritsuko::hdf5::iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> 
void { - handle.read(buffer.data(), H5::PredType::NATIVE_INT32, mspace, dspace); - for (hsize_t i = 0; i < limit; ++i) { - auto current = buffer[i]; - if (has_missing && current == missing_value) { - ptr->set_missing(counter + i); - } else { - check(current); - ptr->set(counter + i, current); - } - } + ritsuko::hdf5::Stream1dNumericDataset stream(&handle, full_length, buffer_size); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + auto current = stream.get(); + if (has_missing && current == missing_value) { + ptr->set_missing(i); + } else { + check(current); + ptr->set(i, current); } - ); + } + } catch (std::exception& e) { throw std::runtime_error("failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check) try { +void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsize_t buffer_size) try { auto dtype = handle.getDataType(); if (dtype.getClass() != H5T_STRING) { throw std::runtime_error("expected a string dataset"); } - const char* placeholder_name = "missing-value-placeholder"; - bool has_missing = handle.attrExists(placeholder_name); - std::string missing_val; - if (has_missing) { - auto ahandle = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ true); - missing_val = ritsuko::hdf5::load_scalar_string_attribute(ahandle); - } + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle, "missing-value-placeholder"); + bool has_missing = missingness.first; + std::string missing_val = missingness.second; - ritsuko::hdf5::load_1d_string_dataset( - handle, - ptr->size(), - /* buffer_size = */ 10000, - [&](size_t i, const char* str, size_t len) -> void { - std::string x(str, str + len); - if (has_missing && x == missing_val) { - ptr->set_missing(i); - } else { - check(x); - ptr->set(i, std::move(x)); - } + hsize_t 
full_length = ptr->size(); + ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + auto x = stream.steal(); + if (has_missing && x == missing_val) { + ptr->set_missing(i); + } else { + check(x); + ptr->set(i, std::move(x)); } - ); + } + } catch (std::exception& e) { throw std::runtime_error("failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version) try { +void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try { if (version.lt(1, 3)) { if (handle.getTypeClass() != H5T_FLOAT) { throw std::runtime_error("expected a floating-point dataset"); @@ -148,7 +137,8 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V const char* placeholder_name = "missing-value-placeholder"; has_missing = handle.attrExists(placeholder_name); if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2)); + auto attr = handle.openAttribute(placeholder_name); + ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2)); attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value); } } @@ -166,30 +156,23 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V }; hsize_t full_length = ptr->size(); - auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, /* buffer_size = */ 10000); - std::vector buffer(block_size); - ritsuko::hdf5::iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), H5::PredType::NATIVE_DOUBLE, mspace, dspace); - for (hsize_t i 
= 0; i < limit; ++i) { - auto current = buffer[i]; - if (has_missing && is_missing_value(current)) { - ptr->set_missing(counter + i); - } else { - check(current); - ptr->set(counter + i, current); - } - } + ritsuko::hdf5::Stream1dNumericDataset stream(&handle, full_length, buffer_size); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + auto current = stream.get(); + if (has_missing && is_missing_value(current)) { + ptr->set_missing(i); + } else { + check(current); + ptr->set(i, current); } - ); + } + } catch (std::exception& e) { throw std::runtime_error("failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -void extract_names(const H5::Group& handle, Host* ptr) try { +void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try { if (handle.childObjType("names") != H5O_TYPE_DATASET) { throw std::runtime_error("expected a dataset"); } @@ -206,52 +189,46 @@ void extract_names(const H5::Group& handle, Host* ptr) try { throw std::runtime_error("number of names should be equal to the object length"); } - ritsuko::hdf5::load_1d_string_dataset( - nhandle, - nlen, - /* buffer_size = */ 10000, - [&](size_t i, const char* val, size_t len) -> void { - ptr->set_name(i, std::string(val, val + len)); - } - ); + ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size); + for (size_t i = 0; i < nlen; ++i, stream.next()) { + ptr->set_name(i, stream.steal()); + } } catch (std::exception& e) { throw std::runtime_error("failed to load names at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const Version& version) try { +std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const Version& version, hsize_t buffer_size) try { // Deciding what type we're dealing with. 
- auto object_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_object"); + auto object_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_object"); std::shared_ptr output; if (object_type == "list") { - if (!handle.exists("data") || handle.childObjType("data") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a group at 'data'"); - } - auto dhandle = handle.openGroup("data"); + auto dhandle = ritsuko::hdf5::open_group(handle, "data"); size_t len = dhandle.getNumObjs(); bool named = handle.exists("names"); auto lptr = Provisioner::new_List(len, named); output.reset(lptr); - for (size_t i = 0; i < len; ++i) { - auto istr = std::to_string(i); - if (!dhandle.exists(istr) || dhandle.childObjType(istr) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a group at 'data/" + istr + "'"); + try { + for (size_t i = 0; i < len; ++i) { + auto istr = std::to_string(i); + auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str()); + lptr->set(i, parse_inner(lhandle, ext, version, buffer_size)); } - auto lhandle = dhandle.openGroup(istr); - lptr->set(i, parse_inner(lhandle, ext, version)); + } catch (std::exception& e) { + throw std::runtime_error("failed to parse list contents in 'data'; " + std::string(e.what())); } if (named) { - extract_names(handle, lptr); + extract_names(handle, lptr, buffer_size); } } else if (object_type == "vector") { - auto vector_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_type"); + auto vector_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_type"); - auto dhandle = ritsuko::hdf5::get_dataset(handle, "data"); + auto dhandle = ritsuko::hdf5::open_dataset(handle, "data"); size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), true); bool is_scalar = (len == 0); if (is_scalar) { @@ -263,7 +240,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (vector_type == "integer") { auto iptr = 
Provisioner::new_Integer(len, named, is_scalar); output.reset(iptr); - parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version); + parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version, buffer_size); } else if (vector_type == "boolean") { auto bptr = Provisioner::new_Boolean(len, named, is_scalar); @@ -272,10 +249,10 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (x != 0 && x != 1) { throw std::runtime_error("boolean values should be 0 or 1"); } - }, version); + }, version, buffer_size); } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) { - auto levhandle = ritsuko::hdf5::get_dataset(handle, "levels"); + auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels"); auto levtype = levhandle.getDataType(); if (levtype.getClass() != H5T_STRING) { throw std::runtime_error("expected a string dataset for the levels at 'levels'"); @@ -286,9 +263,12 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (vector_type == "ordered") { ordered = true; } else if (handle.exists("ordered")) { - auto ohandle = get_scalar_dataset(handle, "ordered", H5T_INTEGER); - int tmp_ordered = 0; - ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT); + auto ohandle = check_scalar_dataset(handle, "ordered"); + if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32, true)) { + throw std::runtime_error("'ordered' value cannot be represented by a 32-bit integer"); + } + int32_t tmp_ordered = 0; + ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32); ordered = tmp_ordered > 0; } @@ -298,22 +278,18 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (x < 0 || x >= levlen) { throw std::runtime_error("factor codes should be non-negative and less than the number of levels"); } - }, version); + }, version, buffer_size); std::unordered_set present; - ritsuko::hdf5::load_1d_string_dataset( - levhandle, - levlen, - /* buffer_size = */ 10000, - [&](size_t 
i, const char* val, size_t len) -> void { - std::string x(val, val + len); - if (present.find(x) != present.end()) { - throw std::runtime_error("levels should be unique"); - } - fptr->set_level(i, x); - present.insert(std::move(x)); + ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size); + for (int32_t i = 0; i < levlen; ++i, stream.next()) { + auto x = stream.steal(); + if (present.find(x) != present.end()) { + throw std::runtime_error("levels should be unique"); } - ); + fptr->set_level(i, x); + present.insert(std::move(x)); + } } else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) { StringVector::Format format = StringVector::NONE; @@ -323,49 +299,46 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const } else if (vector_type == "date-time") { format = StringVector::DATETIME; } + } else if (handle.exists("format")) { - auto fhandle = get_scalar_dataset(handle, "format", H5T_STRING); - ritsuko::hdf5::load_1d_string_dataset( - fhandle, - 1, - /* buffer_size = */ 10000, - [&](size_t, const char* val, size_t len) -> void { - std::string x(val, val + len); - if (x == "date") { - format = StringVector::DATE; - } else if (x == "date-time") { - format = StringVector::DATETIME; - } else { - throw std::runtime_error("unsupported format '" + x + "'"); - } - } - ); + auto fhandle = check_scalar_dataset(handle, "format"); + if (fhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'format' dataset should have a string datatype class"); + } + auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle); + if (x == "date") { + format = StringVector::DATE; + } else if (x == "date-time") { + format = StringVector::DATETIME; + } else { + throw std::runtime_error("unsupported format '" + x + "'"); + } } auto sptr = Provisioner::new_String(len, named, is_scalar, format); output.reset(sptr); if (format == StringVector::NONE) { - parse_string_like(dhandle, 
sptr, [](const std::string&) -> void {}); + parse_string_like(dhandle, sptr, [](const std::string&) -> void {}, buffer_size); } else if (format == StringVector::DATE) { parse_string_like(dhandle, sptr, [&](const std::string& x) -> void { if (!ritsuko::is_date(x.c_str(), x.size())) { throw std::runtime_error("dates should follow YYYY-MM-DD formatting"); } - }); + }, buffer_size); } else if (format == StringVector::DATETIME) { parse_string_like(dhandle, sptr, [&](const std::string& x) -> void { if (!ritsuko::is_rfc3339(x.c_str(), x.size())) { throw std::runtime_error("date-times should follow the Internet Date/Time format"); } - }); + }, buffer_size); } } else if (vector_type == "number") { auto dptr = Provisioner::new_Number(len, named, is_scalar); output.reset(dptr); - parse_numbers(dhandle, dptr, [](double) -> void {}, version); + parse_numbers(dhandle, dptr, [](double) -> void {}, version, buffer_size); } else { throw std::runtime_error("unknown vector type '" + vector_type + "'"); @@ -373,14 +346,14 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (named) { auto vptr = static_cast(output.get()); - extract_names(handle, vptr); + extract_names(handle, vptr, buffer_size); } } else if (object_type == "nothing") { output.reset(Provisioner::new_Nothing()); } else if (object_type == "external") { - auto ihandle = ritsuko::hdf5::get_dataset(handle, "index"); + auto ihandle = ritsuko::hdf5::open_dataset(handle, "index"); if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32, true)) { throw std::runtime_error("external index at 'index' cannot be represented by a 32-bit signed integer"); } @@ -411,12 +384,28 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const * @endcond */ +/** + * @brief Options for HDF5 file parsing. + */ +struct Options { + /** + * Buffer size, in terms of the number of elements, to use for reading data from HDF5 datasets. 
+ */ + hsize_t buffer_size = 10000; + + /** + * Whether to throw an error if the top-level R object is not an R list. + */ + bool strict_list = true; +}; + /** * @tparam Provisioner A class namespace defining static methods for creating new `Base` objects. * @tparam Externals Class describing how to resolve external references for type `EXTERNAL`. * * @param handle Handle for a HDF5 group corresponding to the list. * @param ext Instance of an external reference resolver class. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -456,16 +445,23 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const * - `size_t size()`, which returns the number of available external references. */ template -ParsedList parse(const H5::Group& handle, Externals ext) { +ParsedList parse(const H5::Group& handle, Externals ext, Options options = Options()) { Version version; if (handle.attrExists("uzuki_version")) { - auto ver_str = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_version"); - version = parse_version_string(ver_str); + auto ver_str = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_version"); + auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(), /* skip_patch = */ true); + version.major = vraw.major; + version.minor = vraw.minor; } ExternalTracker etrack(std::move(ext)); - auto ptr = parse_inner(handle, etrack, version); + auto ptr = parse_inner(handle, etrack, version, options.buffer_size); + + if (options.strict_list && ptr->type() != LIST) { + throw std::runtime_error("top-level object should represent an R list"); + } etrack.validate(); + return ParsedList(std::move(ptr), std::move(version)); } @@ -476,6 +472,7 @@ ParsedList parse(const H5::Group& handle, Externals ext) { * @tparam Provisioner A class namespace defining static methods for creating new 
`Base` objects. * * @param handle Handle for a HDF5 group corresponding to the list. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -483,8 +480,8 @@ ParsedList parse(const H5::Group& handle, Externals ext) { * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const H5::Group& handle) { - return parse(handle, uzuki2::DummyExternals(0)); +ParsedList parse(const H5::Group& handle, Options options = Options()) { + return parse(handle, uzuki2::DummyExternals(0), std::move(options)); } /** @@ -496,6 +493,7 @@ ParsedList parse(const H5::Group& handle) { * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. * @param ext Instance of an external reference resolver class. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -503,9 +501,9 @@ ParsedList parse(const H5::Group& handle) { * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const std::string& file, const std::string& name, Externals ext) { +ParsedList parse(const std::string& file, const std::string& name, Externals ext, Options options = Options()) { H5::H5File handle(file, H5F_ACC_RDONLY); - return parse(handle.openGroup(name), std::move(ext)); + return parse(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), std::move(options)); } /** @@ -516,6 +514,7 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext * * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. + * @param options Optional parameters. 
* * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -523,9 +522,9 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const std::string& file, const std::string& name) { +ParsedList parse(const std::string& file, const std::string& name, Options options = Options()) { H5::H5File handle(file, H5F_ACC_RDONLY); - return parse(handle.openGroup(name), uzuki2::DummyExternals(0)); + return parse(ritsuko::hdf5::open_group(handle, name.c_str()), uzuki2::DummyExternals(0), std::move(options)); } /** @@ -536,10 +535,11 @@ ParsedList parse(const std::string& file, const std::string& name) { * @param name Name of the HDF5 group corresponding to `handle`. * Only used for error messages. * @param num_external Expected number of external references. + * @param options Optional parameters. */ -inline void validate(const H5::Group& handle, int num_external = 0) { +inline void validate(const H5::Group& handle, int num_external = 0, Options options = Options()) { DummyExternals ext(num_external); - parse(handle, ext); + parse(handle, ext, std::move(options)); return; } @@ -550,10 +550,11 @@ inline void validate(const H5::Group& handle, int num_external = 0) { * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. * @param num_external Expected number of external references. + * @param options Optional parameters. 
*/ -inline void validate(const std::string& file, const std::string& name, int num_external = 0) { +inline void validate(const std::string& file, const std::string& name, int num_external = 0, Options options = Options()) { DummyExternals ext(num_external); - parse(file, name, ext); + parse(file, name, ext, std::move(options)); return; } diff --git a/inst/include/uzuki2/parse_json.hpp b/inst/include/uzuki2/parse_json.hpp index b380c92..b7e2fbb 100644 --- a/inst/include/uzuki2/parse_json.hpp +++ b/inst/include/uzuki2/parse_json.hpp @@ -395,6 +395,11 @@ struct Options { * If true, an extra thread is used to avoid blocking I/O operations. */ bool parallel = false; + + /** + * Whether to throw an error if the top-level R object is not an R list. + */ + bool strict_list = true; }; /** @@ -434,14 +439,21 @@ ParsedList parse(byteme::Reader& reader, Externals ext, Options options = Option if (vIt->second->type() != millijson::STRING) { throw std::runtime_error("expected a string in 'version'"); } - auto vptr = static_cast(vIt->second.get()); - version = parse_version_string(vptr->value); + const auto& vstr = static_cast(vIt->second.get())->value; + auto vraw = ritsuko::parse_version_string(vstr.c_str(), vstr.size(), /* skip_patch = */ true); + version.major = vraw.major; + version.minor = vraw.minor; } } ExternalTracker etrack(std::move(ext)); auto output = parse_object(contents.get(), etrack, "", version); + + if (options.strict_list && output->type() != LIST) { + throw std::runtime_error("top-level object should represent an R list"); + } etrack.validate(); + return ParsedList(std::move(output), std::move(version)); } diff --git a/tests/testthat/test-DataFrame.R b/tests/testthat/test-DataFrame.R index b0b9aeb..03b69c8 100644 --- a/tests/testthat/test-DataFrame.R +++ b/tests/testthat/test-DataFrame.R @@ -295,7 +295,7 @@ test_that("handling of NAs works correctly", { fpath <- file.path(tmp2, "basic_columns.h5") attrs <- rhdf5::h5readAttributes(fpath, 
"data_frame/data/2/codes") - expect_identical(attrs[["missing-value-placeholder"]], -1L) + expect_identical(attrs[["missing-value-placeholder"]], 2L) attrs <- rhdf5::h5readAttributes(fpath, "data_frame/data/3/codes") expect_null(attrs[["missing-value-placeholder"]])