From 742f18633dc324bb681bc8e9d1e2b847dc5ce3a5 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 23 Nov 2023 13:07:27 -0800 Subject: [PATCH 1/2] Transition to unsigned integers for the factor codes. --- inst/include/fetch.sh | 4 +- .../ritsuko/choose_missing_placeholder.hpp | 116 ++++++- .../ritsuko/hdf5/Stream1dNumericDataset.hpp | 134 ++++++++ .../ritsuko/hdf5/Stream1dStringDataset.hpp | 171 ++++++++++ inst/include/ritsuko/hdf5/_strings.hpp | 29 ++ .../ritsuko/hdf5/as_numeric_datatype.hpp | 55 +++ inst/include/ritsuko/hdf5/get_1d_length.hpp | 50 +++ inst/include/ritsuko/hdf5/get_dataset.hpp | 51 --- .../get_missing_placeholder_attribute.hpp | 49 --- inst/include/ritsuko/hdf5/get_name.hpp | 16 +- .../ritsuko/hdf5/get_scalar_attribute.hpp | 43 --- inst/include/ritsuko/hdf5/hdf5.hpp | 16 +- .../ritsuko/hdf5/iterate_1d_blocks.hpp | 44 --- .../ritsuko/hdf5/load_1d_string_dataset.hpp | 114 ------- inst/include/ritsuko/hdf5/load_attribute.hpp | 144 ++++++++ inst/include/ritsuko/hdf5/load_dataset.hpp | 79 +++++ .../hdf5/load_scalar_string_attribute.hpp | 53 --- inst/include/ritsuko/hdf5/miscellaneous.hpp | 56 +++ .../ritsuko/hdf5/missing_placeholder.hpp | 98 ++++++ inst/include/ritsuko/hdf5/open.hpp | 79 +++++ inst/include/ritsuko/hdf5/validate_string.hpp | 148 ++++++++ inst/include/takane/_height.hpp | 8 + inst/include/takane/_satisfies_interface.hpp | 53 +++ inst/include/takane/_validate.hpp | 10 + inst/include/takane/atomic_vector.hpp | 48 +-- inst/include/takane/atomic_vector_list.hpp | 45 +++ inst/include/takane/compressed_list.hpp | 143 -------- inst/include/takane/data_frame.hpp | 121 +++---- inst/include/takane/data_frame_factor.hpp | 46 +-- inst/include/takane/data_frame_list.hpp | 45 +++ inst/include/takane/genomic_ranges.hpp | 318 +++++++++--------- inst/include/takane/genomic_ranges_list.hpp | 45 +++ inst/include/takane/sequence_information.hpp | 165 ++++----- inst/include/takane/string_factor.hpp | 39 +-- inst/include/takane/takane.hpp | 1 + inst/include/takane/utils_compressed_list.hpp | 96 ++++++ inst/include/takane/utils_factor.hpp | 86 +++++ inst/include/takane/utils_hdf5.hpp | 127 ------- inst/include/takane/utils_other.hpp | 23 +- inst/include/takane/utils_string.hpp | 88 +++++ inst/include/uzuki2/Version.hpp | 51 --- inst/include/uzuki2/parse_hdf5.hpp | 289 ++++++++-------- inst/include/uzuki2/parse_json.hpp | 16 +- 43 files changed, 2121 insertions(+), 1291 deletions(-) create mode 100644 inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp create mode 100644 inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp create mode 100644 inst/include/ritsuko/hdf5/_strings.hpp create mode 100644 inst/include/ritsuko/hdf5/as_numeric_datatype.hpp delete mode 100644 inst/include/ritsuko/hdf5/get_dataset.hpp delete mode 100644 inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp delete mode 100644 inst/include/ritsuko/hdf5/get_scalar_attribute.hpp delete mode 100644 inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp delete mode 100644 inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp create mode 100644 inst/include/ritsuko/hdf5/load_attribute.hpp create mode 100644 inst/include/ritsuko/hdf5/load_dataset.hpp delete mode 100644 inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp create mode 100644 inst/include/ritsuko/hdf5/miscellaneous.hpp create mode 100644 inst/include/ritsuko/hdf5/missing_placeholder.hpp create mode 100644 inst/include/ritsuko/hdf5/open.hpp create mode 100644 inst/include/ritsuko/hdf5/validate_string.hpp create mode 100644 
inst/include/takane/_satisfies_interface.hpp create mode 100644 inst/include/takane/atomic_vector_list.hpp delete mode 100644 inst/include/takane/compressed_list.hpp create mode 100644 inst/include/takane/data_frame_list.hpp create mode 100644 inst/include/takane/genomic_ranges_list.hpp create mode 100644 inst/include/takane/utils_compressed_list.hpp create mode 100644 inst/include/takane/utils_factor.hpp delete mode 100644 inst/include/takane/utils_hdf5.hpp create mode 100644 inst/include/takane/utils_string.hpp diff --git a/inst/include/fetch.sh b/inst/include/fetch.sh index 57e43e2..3e105d1 100755 --- a/inst/include/fetch.sh +++ b/inst/include/fetch.sh @@ -31,7 +31,7 @@ harvester() { harvester millijson https://github.com/ArtifactDB/millijson v1.0.0 harvester byteme https://github.com/LTLA/byteme v1.1.0 -harvester uzuki2 https://github.com/ArtifactDB/uzuki2 v1.3.0 harvester comservatory https://github.com/ArtifactDB/comservatory v2.0.1 -harvester ritsuko https://github.com/ArtifactDB/ritsuko v0.3.3 +harvester uzuki2 https://github.com/ArtifactDB/uzuki2 master +harvester ritsuko https://github.com/ArtifactDB/ritsuko master harvester takane https://github.com/ArtifactDB/takane master diff --git a/inst/include/ritsuko/choose_missing_placeholder.hpp b/inst/include/ritsuko/choose_missing_placeholder.hpp index 599857a..01d8014 100644 --- a/inst/include/ritsuko/choose_missing_placeholder.hpp +++ b/inst/include/ritsuko/choose_missing_placeholder.hpp @@ -15,20 +15,58 @@ namespace ritsuko { /** - * Choose an appropriate placeholder for missing values in an integer dataset. + * @cond + */ +template +bool found(Iterator start, Iterator end, Mask mask, Type candidate) { + if constexpr(std::is_same::value) { + return (std::find(start, end, candidate) != end); + } else { + for (; start != end; ++start, ++mask) { + if (!*mask && candidate == *start) { + return true; + } + } + return false; + } +} + +template()))>::type>::type> +std::set create_unique_set(Iterator start, Iterator end, Mask mask) { + if constexpr(std::is_same::value) { + return std::set(start, end); + } else { + std::set output; + for (; start != end; ++start, ++mask) { + if (!*mask) { + output.insert(*start); + } + } + return output; + } +} +/** + * @endcond + */ + +/** + * Choose an appropriate placeholder for missing values in an integer dataset, after ignoring all the masked values. * This will try the various special values (the minimum, the maximum, and for signed types, 0) * before sorting the dataset and searching for an unused integer value. * * @tparam Iterator_ Forward iterator for integer values. + * @tparam Mask_ Random access iterator for mask values. * @tparam Type_ Integer type pointed to by `Iterator_`. * * @param start Start of the dataset. * @param end End of the dataset. + * @param mask Start of the mask vector. + * This should have the same length as `end - start`; each entry is true if the corresponding value of the integer dataset is masked, and false otherwise. * * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. */ -template()))>::type>::type> -std::pair choose_missing_integer_placeholder(Iterator start, Iterator end) { +template()))>::type>::type> +std::pair choose_missing_integer_placeholder(Iterator start, Iterator end, Mask mask) { static_assert(std::numeric_limits::is_integer); // Trying important points first; minima and maxima, and 0. 
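For illustration, a minimal sketch of calling the masked overload documented above, assuming a vector of 32-bit integers and a byte-valued mask (both hypothetical); the mask iterator is walked alongside the data, and masked entries are ignored when searching for an unused value:

    #include "ritsuko/choose_missing_placeholder.hpp"
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Returns a placeholder that does not collide with any unmasked value,
    // or throws if no suitable value could be found.
    int32_t pick_placeholder(const std::vector<int32_t>& values, const std::vector<uint8_t>& mask) {
        auto chosen = ritsuko::choose_missing_integer_placeholder(values.begin(), values.end(), mask.begin());
        if (!chosen.first) {
            throw std::runtime_error("no unused placeholder value is available");
        }
        return chosen.second;
    }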
@@ -42,7 +80,7 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat } else { candidate = 0; } - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } @@ -55,14 +93,14 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat } else { candidate = 0; } - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } } // Well... going through it in order. - std::set uniq_sort(start, end); + auto uniq_sort = create_unique_set(start, end, mask); Type_ last = std::numeric_limits::min(); for (auto x : uniq_sort) { if (last + 1 < x) { @@ -75,7 +113,23 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat } /** - * Choose an appropriate placeholder for missing values in a floating-point dataset. + * Overload of `choose_missing_integer_placeholder()` where no values are masked. + * + * @tparam Iterator_ Forward iterator for integer values. + * @tparam Type_ Integer type pointed to by `Iterator_`. + * + * @param start Start of the dataset. + * @param end End of the dataset. + * + * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. + */ +template()))>::type>::type> +std::pair choose_missing_integer_placeholder(Iterator start, Iterator end) { + return choose_missing_integer_placeholder(start, end, false); +} + +/** + * Choose an appropriate placeholder for missing values in a floating-point dataset, after ignoring all masked values. * This will try the various IEEE special values (NaN, Inf, -Inf) and then some type-specific boundaries (the minimum, the maximum, and for signed types, 0) * before sorting the dataset and searching for an unused float. * @@ -84,22 +138,35 @@ std::pair choose_missing_integer_placeholder(Iterator start, Iterat * * @param start Start of the dataset. * @param end End of the dataset. + * @param mask Start of the mask vector. * @param skip_nan Whether to skip NaN as a potential placeholder. * Useful in frameworks like R that need special consideration of NaN payloads. * * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. */ -template()))>::type>::type> -std::pair choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) { +template()))>::type>::type> +std::pair choose_missing_float_placeholder(Iterator start, Iterator end, Mask mask, bool skip_nan) { if constexpr(std::numeric_limits::is_iec559) { if (!skip_nan) { bool has_nan = false; - for (auto x = start; x != end; ++x) { - if (std::isnan(*x)) { - has_nan = true; - break; + + if constexpr(std::is_same::value) { + for (auto x = start; x != end; ++x) { + if (std::isnan(*x)) { + has_nan = true; + break; + } + } + } else { + auto sIt = mask; + for (auto x = start; x != end; ++x, ++sIt) { + if (!*sIt && std::isnan(*x)) { + has_nan = true; + break; + } } } + if (!has_nan) { return std::make_pair(true, std::numeric_limits::quiet_NaN()); } @@ -107,7 +174,7 @@ std::pair choose_missing_float_placeholder(Iterator start, Iterator for (int i = 0; i < 2; ++i) { Type_ candidate = std::numeric_limits::infinity() * (i == 0 ? 
1 : -1); - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } @@ -123,13 +190,13 @@ std::pair choose_missing_float_placeholder(Iterator start, Iterator } else { candidate = 0; } - if (std::find(start, end, candidate) == end) { + if (!found(start, end, mask, candidate)) { return std::make_pair(true, candidate); } } // Well... going through it in order. - std::set uniq_sort(start, end); + auto uniq_sort = create_unique_set(start, end, mask); Type_ last = std::numeric_limits::lowest(); for (auto x : uniq_sort) { if (std::isfinite(x)) { @@ -144,6 +211,23 @@ std::pair choose_missing_float_placeholder(Iterator start, Iterator return std::make_pair(false, 0); } +/** + * Overload of `choose_missing_float_placeholder()` where no values are masked. + * + * @tparam Iterator_ Forward iterator for floating-point values. + * @tparam Type_ Integer type pointed to by `Iterator_`. + * + * @param start Start of the dataset. + * @param end End of the dataset. + * @param skip_nan Whether to skip NaN as a potential placeholder. + * + * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true. + */ +template()))>::type>::type> +std::pair choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) { + return choose_missing_float_placeholder(start, end, false, skip_nan); +} + } #endif diff --git a/inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp b/inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp new file mode 100644 index 0000000..bbe866f --- /dev/null +++ b/inst/include/ritsuko/hdf5/Stream1dNumericDataset.hpp @@ -0,0 +1,134 @@ +#ifndef RITSUKO_HDF5_STREAM_1D_NUMERIC_DATASET_HPP +#define RITSUKO_HDF5_STREAM_1D_NUMERIC_DATASET_HPP + +#include "H5Cpp.h" + +#include +#include + +#include "pick_1d_block_size.hpp" +#include "get_1d_length.hpp" +#include "get_name.hpp" +#include "as_numeric_datatype.hpp" + +/** + * @file Stream1dNumericDataset.hpp + * @brief Stream a numeric 1D HDF5 dataset into memory. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @brief Stream a numeric 1D HDF5 dataset into memory. + * @tparam Type_ Type to represent the data in memory. + * + * This streams in a HDF5 dataset in contiguous blocks, using block sizes defined by `pick_1d_block_size()`. + * Callers can then extract one value at a time or they can acquire the entire block. + */ +template +class Stream1dNumericDataset { +public: + /** + * @param ptr Pointer to a HDF5 dataset handle. + * @param size Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding streamed blocks of values. + * Larger buffers improve speed at the cost of some memory efficiency. + */ + Stream1dNumericDataset(const H5::DataSet* ptr, hsize_t length, hsize_t buffer_size) : + ptr(ptr), + full_length(length), + block_size(pick_1d_block_size(ptr->getCreatePlist(), full_length, buffer_size)), + mspace(1, &block_size), + dspace(1, &full_length), + buffer(block_size) + {} + + /** + * Overloaded constructor where the length is automatically determined. + * + * @param ptr Pointer to a HDF5 dataset handle. + * @param buffer_size Size of the buffer for holding streamed blocks of values. 
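As a usage sketch for the numeric streaming class described above (the file name, dataset name and include path are assumptions, not part of this patch):

    #include "H5Cpp.h"
    #include "ritsuko/hdf5/Stream1dNumericDataset.hpp"
    #include <cstdint>

    // Sum a 1-dimensional integer dataset one streamed value at a time.
    int64_t sum_dataset() {
        H5::H5File fhandle("example.h5", H5F_ACC_RDONLY); // hypothetical file
        auto dhandle = fhandle.openDataSet("values");     // hypothetical 1-D integer dataset
        ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&dhandle, /* buffer_size = */ 10000);
        int64_t sum = 0;
        for (hsize_t i = 0, n = stream.length(); i < n; ++i, stream.next()) {
            sum += stream.get(); // get_many() would fetch an entire block instead
        }
        return sum;
    }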
+ */ + Stream1dNumericDataset(const H5::DataSet* ptr, hsize_t buffer_size) : + Stream1dNumericDataset(ptr, get_1d_length(ptr->getSpace(), false), buffer_size) + {} + +public: + /** + * @return Value at the current position of the stream. + */ + Type_ get() { + while (consumed >= available) { + consumed -= available; + load(); + } + return buffer[consumed]; + } + + /** + * @return Pair containing a pointer to and the length of an array. + * The array holds all loaded values of the stream at its current position, up to the specified length. + * Note that the pointer is only valid until the next invocation of `next()`. + */ + std::pair get_many() { + while (consumed >= available) { + consumed -= available; + load(); + } + return std::make_pair(buffer.data() + consumed, available - consumed); + } + + /** + * Advance the position of the stream by `jump`. + * + * @param jump Number of positions by which to advance the stream. + */ + void next(size_t jump = 1) { + consumed += jump; + } + + /** + * @return Length of the dataset. + */ + hsize_t length() const { + return full_length; + } + + /** + * @return Current position on the stream. + */ + hsize_t position() const { + return consumed + last_loaded; + } + +private: + const H5::DataSet* ptr; + hsize_t full_length, block_size; + H5::DataSpace mspace; + H5::DataSpace dspace; + std::vector buffer; + + hsize_t last_loaded = 0; + hsize_t consumed = 0; + hsize_t available = 0; + + void load() { + if (last_loaded >= full_length) { + throw std::runtime_error("requesting data beyond the end of the dataset at '" + get_name(*ptr) + "'"); + } + available = std::min(full_length - last_loaded, block_size); + constexpr hsize_t zero = 0; + mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero); + dspace.selectHyperslab(H5S_SELECT_SET, &available, &last_loaded); + ptr->read(buffer.data(), as_numeric_datatype(), mspace, dspace); + last_loaded += available; + } +}; + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp b/inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp new file mode 100644 index 0000000..69c7515 --- /dev/null +++ b/inst/include/ritsuko/hdf5/Stream1dStringDataset.hpp @@ -0,0 +1,171 @@ +#ifndef RITSUKO_HDF5_STREAM_1D_STRING_DATASET_HPP +#define RITSUKO_HDF5_STREAM_1D_STRING_DATASET_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +#include "pick_1d_block_size.hpp" +#include "get_1d_length.hpp" +#include "get_name.hpp" +#include "as_numeric_datatype.hpp" +#include "_strings.hpp" + +/** + * @file Stream1dStringDataset.hpp + * @brief Stream a numeric 1D HDF5 dataset into memory. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @brief Stream a 1D HDF5 string dataset into memory. + * + * This streams in a HDF5 dataset in contiguous blocks, using block sizes defined by `pick_1d_block_size()`. + * Callers can then extract one C-style string at a time. + */ +class Stream1dStringDataset { +public: + /** + * @param ptr Pointer to a HDF5 dataset handle. + * @param length Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding streamed blocks of values. + * Larger buffers improve speed at the cost of some memory efficiency. 
+ */ + Stream1dStringDataset(const H5::DataSet* ptr, hsize_t length, hsize_t buffer_size) : + ptr(ptr), + full_length(length), + block_size(pick_1d_block_size(ptr->getCreatePlist(), full_length, buffer_size)), + mspace(1, &block_size), + dspace(1, &full_length), + dtype(ptr->getDataType()), + is_variable(dtype.isVariableStr()) + { + if (is_variable) { + var_buffer.resize(block_size); + } else { + fixed_length = dtype.getSize(); + fix_buffer.resize(fixed_length * block_size); + } + final_buffer.resize(block_size); + } + + /** + * Overloaded constructor where the length is automatically determined. + * + * @param ptr Pointer to a HDF5 dataset handle. + * @param buffer_size Size of the buffer for holding streamed blocks of values. + */ + Stream1dStringDataset(const H5::DataSet* ptr, hsize_t buffer_size) : + Stream1dStringDataset(ptr, get_1d_length(ptr->getSpace(), false), buffer_size) + {} + +public: + /** + * @return String at the current position of the stream. + */ + std::string get() { + while (consumed >= available) { + consumed -= available; + load(); + } + return final_buffer[consumed]; + } + + /** + * @return String at the current position of the stream. + * Unlike `get()`, this avoids a copy by directly acquiring the string, + * but it invalidates all subsequent `get()` and `steal()` requests until `next()` is called. + */ + std::string steal() { + while (consumed >= available) { + consumed -= available; + load(); + } + return std::move(final_buffer[consumed]); + } + + /** + * Advance to the next position of the stream. + * + * @param jump Number of positions by which to advance the stream. + */ + void next(size_t jump = 1) { + consumed += jump; + } + + /** + * @return Length of the dataset. + */ + hsize_t length() const { + return full_length; + } + + /** + * @return Current position on the stream. 
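A corresponding sketch for the string streaming class, assuming the caller has already opened a 1-dimensional string dataset:

    #include "H5Cpp.h"
    #include "ritsuko/hdf5/Stream1dStringDataset.hpp"
    #include <string>
    #include <vector>

    // Collect all strings from a 1-D string dataset.
    std::vector<std::string> read_strings(const H5::DataSet& dhandle) {
        ritsuko::hdf5::Stream1dStringDataset stream(&dhandle, /* buffer_size = */ 10000);
        std::vector<std::string> output;
        output.reserve(stream.length());
        for (hsize_t i = 0, n = stream.length(); i < n; ++i, stream.next()) {
            output.push_back(stream.steal()); // steal() moves the string out; get() would copy
        }
        return output;
    }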
+ */ + hsize_t position() const { + return consumed + last_loaded; + } + +private: + const H5::DataSet* ptr; + hsize_t full_length, block_size; + H5::DataSpace mspace; + H5::DataSpace dspace; + + H5::DataType dtype; + bool is_variable; + std::vector var_buffer; + size_t fixed_length = 0; + std::vector fix_buffer; + std::vector final_buffer; + + hsize_t last_loaded = 0; + hsize_t consumed = 0; + hsize_t available = 0; + + void load() { + if (last_loaded >= full_length) { + throw std::runtime_error("requesting data beyond the end of the dataset at '" + get_name(*ptr) + "'"); + } + available = std::min(full_length - last_loaded, block_size); + constexpr hsize_t zero = 0; + mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero); + dspace.selectHyperslab(H5S_SELECT_SET, &available, &last_loaded); + + if (is_variable) { + ptr->read(var_buffer.data(), dtype, mspace, dspace); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), var_buffer.data()); + for (hsize_t i = 0; i < block_size; ++i) { + if (var_buffer[i] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(*ptr) + "'"); + } + auto& curstr = final_buffer[i]; + curstr.clear(); + curstr.insert(0, var_buffer[i]); + } + + } else { + auto bptr = fix_buffer.data(); + ptr->read(bptr, dtype, mspace, dspace); + for (size_t i = 0; i < available; ++i, bptr += fixed_length) { + auto& curstr = final_buffer[i]; + curstr.clear(); + curstr.insert(curstr.end(), bptr, bptr + find_string_length(bptr, fixed_length)); + } + } + + last_loaded += available; + } +}; + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/_strings.hpp b/inst/include/ritsuko/hdf5/_strings.hpp new file mode 100644 index 0000000..fe4e07f --- /dev/null +++ b/inst/include/ritsuko/hdf5/_strings.hpp @@ -0,0 +1,29 @@ +#ifndef RITSUKO_HDF5_STRINGS_HPP +#define RITSUKO_HDF5_STRINGS_HPP + +#include "H5Cpp.h" + +namespace ritsuko { + +namespace hdf5 { + +inline size_t find_string_length(const char* ptr, size_t max) { + size_t j = 0; + for (; j < max && ptr[j] != '\0'; ++j) {} + return j; +} + +struct VariableStringCleaner { + VariableStringCleaner(hid_t did, hid_t mid, char** buffer) : did(did), mid(mid), buffer(buffer) {} + ~VariableStringCleaner() { + H5Dvlen_reclaim(did, mid, H5P_DEFAULT, buffer); + } + hid_t did, mid; + char** buffer; +}; + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/as_numeric_datatype.hpp b/inst/include/ritsuko/hdf5/as_numeric_datatype.hpp new file mode 100644 index 0000000..ad02af3 --- /dev/null +++ b/inst/include/ritsuko/hdf5/as_numeric_datatype.hpp @@ -0,0 +1,55 @@ +#ifndef RITSUKO_AS_NUMERIC_DATATYPE_HPP +#define RITSUKO_AS_NUMERIC_DATATYPE_HPP + +#include +#include +#include "H5Cpp.h" + +/** + * @file as_numeric_datatype.hpp + * @brief Choose a HDF5 datatype. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Choose the HDF5 datatype object corresponding to a particular C++ numeric type. + * Currently, only fixed-width integer types (e.g., `uint16_t`, `int32_t`) and the usual floating-point types are supported. + * + * @tparam Type_ A numeric C++ type of fixed width. + * This can be any of the fixed-width integers or a floating-point number of known precision. + * @returns A HDF5 datatype object. 
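A small sketch of how `as_numeric_datatype()` might be combined with the plain HDF5 C++ API, assuming a 1-dimensional dataset of known length:

    #include "H5Cpp.h"
    #include "ritsuko/hdf5/as_numeric_datatype.hpp"
    #include <cstdint>
    #include <vector>

    // Read an entire 1-D dataset into memory as 32-bit integers.
    std::vector<int32_t> read_all(const H5::DataSet& dhandle, hsize_t length) {
        std::vector<int32_t> buffer(length);
        dhandle.read(buffer.data(), ritsuko::hdf5::as_numeric_datatype<int32_t>());
        return buffer;
    }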
+ */ +template +H5::PredType as_numeric_datatype() { + if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT8; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT8; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT16; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT16; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT32; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT32; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_UINT64; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_INT64; + } else if constexpr(std::is_same::value) { + return H5::PredType::NATIVE_FLOAT; + } else { + static_assert(std::is_same::value, "specified type is not yet supported"); + return H5::PredType::NATIVE_DOUBLE; + } +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/get_1d_length.hpp b/inst/include/ritsuko/hdf5/get_1d_length.hpp index a3ac9d5..3345b8c 100644 --- a/inst/include/ritsuko/hdf5/get_1d_length.hpp +++ b/inst/include/ritsuko/hdf5/get_1d_length.hpp @@ -39,6 +39,56 @@ inline hsize_t get_1d_length(const H5::DataSpace& space, bool allow_scalar) { return dims; } +/** + * Overload of `get_1d_length()` that accepts a dataset handle. + * + * @param handle Handle to a HDF5 dataset. + * @param allow_scalar Whether to allow scalars. + * + * @return Length of the dataset, i.e., the extent of its single dimension. + */ +inline hsize_t get_1d_length(const H5::DataSet& handle, bool allow_scalar) { + return get_1d_length(handle.getSpace(), allow_scalar); +} + +/** + * Overload of `get_1d_length()` that accepts an attribute handle. + * + * @param handle Handle to a HDF5 attribute. + * @param allow_scalar Whether to allow scalars. + * + * @return Length of the attribute, i.e., the extent of its single dimension. + */ +inline hsize_t get_1d_length(const H5::Attribute& handle, bool allow_scalar) { + return get_1d_length(handle.getSpace(), allow_scalar); +} + +/** + * @param space The data space of the dataset. + * @return Whether `space` represents a scalar dataset. + */ +inline bool is_scalar(const H5::DataSpace& space) { + return space.getSimpleExtentNdims() == 0; +} + +/** + * Overload of `is_scalar()` that accepts a dataset handle. + * @param handle Handle to a HDF5 dataset. + * @return Whether `space` represents a scalar dataset. + */ +inline bool is_scalar(const H5::DataSet& handle) { + return is_scalar(handle.getSpace()); +} + +/** + * Overload of `is_scalar()` that accepts an attribute handle. + * @param handle Handle to a HDF5 attribute. + * @return Whether `space` represents a scalar dataset. + */ +inline bool is_scalar(const H5::Attribute& handle) { + return is_scalar(handle.getSpace()); +} + } } diff --git a/inst/include/ritsuko/hdf5/get_dataset.hpp b/inst/include/ritsuko/hdf5/get_dataset.hpp deleted file mode 100644 index dd64dd5..0000000 --- a/inst/include/ritsuko/hdf5/get_dataset.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef RITSUKO_HDF5_GET_DATASET_HPP -#define RITSUKO_HDF5_GET_DATASET_HPP - -#include "H5Cpp.h" -#include - -/** - * @file get_dataset.hpp - * @brief Quick functions to get a dataset handle. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * @param handle Group containing the dataset. - * @param name Name of the dataset inside the group. - * @return Handle to the dataset. - * An error is raised if `name` does not refer to a dataset. 
- */ -inline H5::DataSet get_dataset(const H5::Group& handle, const char* name) { - if (!handle.exists(name) || handle.childObjType(name) != H5O_TYPE_DATASET) { - throw std::runtime_error("expected a dataset at '" + std::string(name) + "'"); - } - return handle.openDataSet(name); -} - -/** - * @param handle Group containing the scalar dataset. - * @param name Name of the dataset inside the group. - * @return Handle to a scalar dataset. - * An error is raised if `name` does not refer to a scalar dataset. - */ -inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const char* name) { - auto dhandle = get_dataset(handle, name); - auto dspace = dhandle.getSpace(); - int ndims = dspace.getSimpleExtentNdims(); - if (ndims != 0) { - throw std::runtime_error("expected a scalar dataset at '" + std::string(name) + "'"); - } - return dhandle; -} - -} - -} - -#endif - - diff --git a/inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp b/inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp deleted file mode 100644 index 029051a..0000000 --- a/inst/include/ritsuko/hdf5/get_missing_placeholder_attribute.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef RITSUKO_HDF5_GET_MISSING_PLACEHOLDER_ATTRIBUTE_HPP -#define RITSUKO_HDF5_GET_MISSING_PLACEHOLDER_ATTRIBUTE_HPP - -#include "H5Cpp.h" -#include - -/** - * @file get_missing_placeholder_attribute.hpp - * @brief Get the missing placeholder attribute. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * @param handle Dataset handle. - * @param attr_name Name of the attribute containing the missing value placeholder. - * @param type_class_only Whether to only require identical type classes for the placeholder. - * By default, we require identity in the types themselves. - * - * - * @return Handle to the attribute. - * An error is raised if the attribute is not a scalar or has a different type (or type class, if `type_class_only_ = true`) to the dataset. - */ -inline H5::Attribute get_missing_placeholder_attribute(const H5::DataSet& handle, const char* attr_name, bool type_class_only = false) { - auto attr = handle.openAttribute(attr_name); - if (attr.getSpace().getSimpleExtentNdims() != 0) { - throw std::runtime_error("expected the '" + std::string(attr_name) + "' attribute to be a scalar"); - } - - if (type_class_only) { - if (attr.getTypeClass() != handle.getTypeClass()) { - throw std::runtime_error("expected the '" + std::string(attr_name) + "' attribute to have the same type class as its dataset"); - } - } else { - if (attr.getDataType() != handle.getDataType()) { - throw std::runtime_error("expected the '" + std::string(attr_name) + "' attribute to have the same type as its dataset"); - } - } - - return attr; -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/get_name.hpp b/inst/include/ritsuko/hdf5/get_name.hpp index 8dd5ab1..f907117 100644 --- a/inst/include/ritsuko/hdf5/get_name.hpp +++ b/inst/include/ritsuko/hdf5/get_name.hpp @@ -16,16 +16,22 @@ namespace hdf5 { /** * Get the name of a HDF5 object from its handle, usually for printing informative error messages. - * @tparam Handle_ Type of HDF5 handle, usually a `Group` or a `DataSet`. + * @tparam Handle_ Type of HDF5 handle, usually a `Group`, `DataSet` or `Attribute`. * @param handle Handle to a HDF5 object. * @return Name of the HDF5 object inside the file. 
*/ template std::string get_name(const Handle_& handle) { - size_t len = H5Iget_name(handle.getId(), NULL, 0); - std::vector buffer(len); - H5Iget_name(handle.getId(), buffer.data(), len+1); - return std::string(buffer.begin(), buffer.end()); + if constexpr(std::is_same::value) { + std::string name; + handle.getName(name); + return name; + } else { + size_t len = H5Iget_name(handle.getId(), NULL, 0); + std::vector buffer(len + 1); + H5Iget_name(handle.getId(), buffer.data(), buffer.size()); + return std::string(buffer.begin(), buffer.begin() + len); + } } } diff --git a/inst/include/ritsuko/hdf5/get_scalar_attribute.hpp b/inst/include/ritsuko/hdf5/get_scalar_attribute.hpp deleted file mode 100644 index d5f4877..0000000 --- a/inst/include/ritsuko/hdf5/get_scalar_attribute.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef RITSUKO_HDF5_GET_ATTRIBUTE_HPP -#define RITSUKO_HDF5_GET_ATTRIBUTE_HPP - -#include "H5Cpp.h" -#include - -/** - * @file get_scalar_attribute.hpp - * @brief Helper to get a scalar attribute handle. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * Convenient wrapper to get a scalar attribute with all of the usual error checks. - * - * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. - * @param handle HDF5 dataset or group handle. - * @param name Name of the attribute. - * - * @return Attribute handle. - */ -template -H5::Attribute get_scalar_attribute(const Object_& handle, const char* name) { - if (!handle.attrExists(name)) { - throw std::runtime_error("expected an attribute at '" + std::string(name) + "'"); - } - - auto attr = handle.openAttribute(name); - if (attr.getSpace().getSimpleExtentNdims() != 0) { - throw std::runtime_error("expected a scalar attribute at '" + std::string(name) + "'"); - } - - return attr; -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/hdf5.hpp b/inst/include/ritsuko/hdf5/hdf5.hpp index 778b370..7b09b7c 100644 --- a/inst/include/ritsuko/hdf5/hdf5.hpp +++ b/inst/include/ritsuko/hdf5/hdf5.hpp @@ -1,15 +1,19 @@ #ifndef RITSUKO_HDF5_HPP #define RITSUKO_HDF5_HPP +#include "Stream1dNumericDataset.hpp" +#include "Stream1dStringDataset.hpp" +#include "as_numeric_datatype.hpp" #include "exceeds_limit.hpp" #include "get_1d_length.hpp" -#include "iterate_1d_blocks.hpp" -#include "load_1d_string_dataset.hpp" -#include "load_scalar_string_attribute.hpp" -#include "get_missing_placeholder_attribute.hpp" -#include "get_dataset.hpp" -#include "get_scalar_attribute.hpp" #include "get_name.hpp" +#include "load_attribute.hpp" +#include "load_dataset.hpp" +#include "missing_placeholder.hpp" +#include "miscellaneous.hpp" +#include "open.hpp" +#include "pick_1d_block_size.hpp" +#include "validate_string.hpp" /** * @file hdf5.hpp diff --git a/inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp b/inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp deleted file mode 100644 index 2680a09..0000000 --- a/inst/include/ritsuko/hdf5/iterate_1d_blocks.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef RITSUKO_HDF5_ITERATE_1D_BLOCKS_HPP -#define RITSUKO_HDF5_ITERATE_1D_BLOCKS_HPP - -#include "H5Cpp.h" -#include - -/** - * @file iterate_1d_blocks.hpp - * @brief Blockwise iteration through a 1-dimensional HDF5 dataset. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * Iterate through a 1-dimensional dataset via contiguous blocks. - * - * @param full_length Length of the dataset, usually obtained from `get_1d_length()`. - * @param block_size Size of the blocks, usually calculated by `pick_1d_block_size()`. 
- * @param fun Function that accepts `(hsize_t start, hsize_t len, H5::DataSpace& memspace, H5::DataSpace& dataspace)` and is called on each block. - * In each call, the block contains elements from `[start, start + len)`. - * `dataspace` is configured to extract that block from the dataset, while `memspace` is configured to deposit the block contents in a buffer from `[0, len)`. - * It can be assumed that consecutive calls to `fun` will operate on consecutive contiguous blocks. - */ -template -void iterate_1d_blocks(hsize_t full_length, hsize_t block_size, Function_ fun) { - H5::DataSpace mspace(1, &block_size); - H5::DataSpace dspace(1, &full_length); - hsize_t start = 0; - - for (hsize_t counter = 0; counter < full_length; counter += block_size) { - hsize_t limit = std::min(full_length - counter, block_size); - mspace.selectHyperslab(H5S_SELECT_SET, &limit, &start); - dspace.selectHyperslab(H5S_SELECT_SET, &limit, &counter); - fun(counter, limit, mspace, dspace); - } -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp b/inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp deleted file mode 100644 index 2be553b..0000000 --- a/inst/include/ritsuko/hdf5/load_1d_string_dataset.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef RITSUKO_HDF5_LOAD_1D_STRING_DATASET_HPP -#define RITSUKO_HDF5_LOAD_1D_STRING_DATASET_HPP - -#include "H5Cpp.h" -#include -#include - -#include "pick_1d_block_size.hpp" -#include "iterate_1d_blocks.hpp" - -/** - * @file load_1d_string_dataset.hpp - * @brief Load and iterate over a 1-dimensional HDF5 string dataset. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * Iterate across a string dataset, extracting each string and running a user-specified function. - * This works for both variable- and fixed-length strings, and performs iteration via `iterate_1d_blocks()` to avoid loading everything into memory at once. - * - * @tparam Function_ Function class that accepts `(hsize_t i, const char* start, size_t len)` - * where `i` is the index of the string from `[start, start + len)`. - * - * @param handle Handle to a string dataset. - * @param full_length Length of the dataset in `handle`, usually obtained by `get_1d_length()`. - * @param buffer_size Buffer size to use for iteration in `iterate_1d_blocks()`. - * @param fun Function to be called on each string. - * It can be assumed that the consecutive calls to `fun` will operate on consecutive `i`. 
- */ -template -void load_1d_string_dataset(const H5::DataSet& handle, hsize_t full_length, hsize_t buffer_size, Function_ fun) { - auto block_size = pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size); - auto dtype = handle.getDataType(); - - if (dtype.isVariableStr()) { - std::vector buffer(block_size); - iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t start, hsize_t len, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), dtype, mspace, dspace); - for (hsize_t i = 0; i < len; ++i) { - fun(start + i, buffer[i], std::strlen(buffer[i])); - } - H5Dvlen_reclaim(dtype.getId(), mspace.getId(), H5P_DEFAULT, buffer.data()); - } - ); - - } else { - size_t len = dtype.getSize(); - std::vector buffer(len * block_size); - iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t start, hsize_t length, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), dtype, mspace, dspace); - auto ptr = buffer.data(); - for (size_t i = 0; i < length; ++i, ptr += len) { - size_t j = 0; - for (; j < len && ptr[j] != '\0'; ++j) {} - fun(start + i, ptr, j); - } - } - ); - } -} - -/** - * Iterate across a string attribute, extracting each string and running a user-specified function. - * This works for both variable- and fixed-length strings. - * - * @tparam Function_ Function class that accepts `(hsize_t i, const char* start, size_t len)` - * where `i` is the index of the string from `[start, start + len)`. - * - * @param handle Handle to a string attribute. - * @param full_length Length of the attribute in `handle`, usually obtained by `get_1d_length()`. - * @param fun Function to be called on each string. - * It can be assumed that the consecutive calls to `fun` will operate on consecutive `i`. - */ -template -void load_1d_string_attribute(const H5::Attribute& handle, hsize_t full_length, Function_ fun) { - auto dtype = handle.getDataType(); - - if (dtype.isVariableStr()) { - std::vector buffer(full_length); - handle.read(dtype, buffer.data()); - for (hsize_t i = 0; i < full_length; ++i) { - fun(i, buffer[i], std::strlen(buffer[i])); - } - auto mspace = handle.getSpace(); - H5Dvlen_reclaim(dtype.getId(), mspace.getId(), H5P_DEFAULT, buffer.data()); - - } else { - size_t len = dtype.getSize(); - std::vector buffer(len * full_length); - handle.read(dtype, buffer.data()); - auto ptr = buffer.data(); - for (size_t i = 0; i < full_length; ++i, ptr += len) { - size_t j = 0; - for (; j < len && ptr[j] != '\0'; ++j) {} - fun(i, ptr, j); - } - } -} - -} - -} - -#endif - diff --git a/inst/include/ritsuko/hdf5/load_attribute.hpp b/inst/include/ritsuko/hdf5/load_attribute.hpp new file mode 100644 index 0000000..e2f4a5e --- /dev/null +++ b/inst/include/ritsuko/hdf5/load_attribute.hpp @@ -0,0 +1,144 @@ +#ifndef RITSUKO_HDF5_LOAD_ATTRIBUTE_HPP +#define RITSUKO_HDF5_LOAD_ATTRIBUTE_HPP + +#include "H5Cpp.h" + +#include +#include + +#include "get_1d_length.hpp" +#include "as_numeric_datatype.hpp" +#include "_strings.hpp" + +/** + * @file load_scalar_string_attribute.hpp + * @brief Load a scalar string HDF5 attribute. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @param attr Handle to a scalar string attribute. + * Callers are responsible for checking that `attr` contains a string datatype class. + * @return The attribute as a string. 
+ */ +inline std::string load_scalar_string_attribute(const H5::Attribute& attr) { + auto dtype = attr.getDataType(); + + // Unfortunately, we can't just do 'std::string output; attr.read(dtype, output);', + // as we need to catch NULL pointers in the variable case. + + if (dtype.isVariableStr()) { + auto mspace = attr.getSpace(); + char* buffer; + attr.read(dtype, &buffer); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), &buffer); + if (buffer == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string attribute"); + } + return std::string(buffer); + + } else { + size_t len = dtype.getSize(); + std::vector buffer(len); + attr.read(dtype, buffer.data()); + auto ptr = buffer.data(); + return std::string(ptr, ptr + find_string_length(ptr, len)); + } +} + +/** + * @tparam check_ Whether to check that `attr` is a 1-dimensional string attribute. + * @param attr Handle to a 1-dimensional string attribute. + * Callers are responsible for checking that `attr` contains a string datatype class. + * @param full_length Length of the attribute in `attr`, usually obtained by `get_1d_length()`. + * @return Vector of strings. + */ +inline std::vector load_1d_string_attribute(const H5::Attribute& attr, hsize_t full_length) { + auto dtype = attr.getDataType(); + auto mspace = attr.getSpace(); + std::vector output; + output.reserve(full_length); + + if (dtype.isVariableStr()) { + std::vector buffer(full_length); + attr.read(dtype, buffer.data()); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), buffer.data()); + for (hsize_t i = 0; i < full_length; ++i) { + if (buffer[i] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string attribute"); + } + output.emplace_back(buffer[i]); + } + + } else { + size_t len = dtype.getSize(); + std::vector buffer(len * full_length); + attr.read(dtype, buffer.data()); + auto ptr = buffer.data(); + for (size_t i = 0; i < full_length; ++i, ptr += len) { + output.emplace_back(ptr, ptr + find_string_length(ptr, len)); + } + } + + return output; +} + +/** + * Overload of `load_1d_string_attribute()` that determines the length of the attribute via `get_1d_length()`. + * @param attr Handle to a 1-dimensional string attribute. + * Callers are responsible for checking that `attr` contains a string datatype class. + * @return Vector of strings. + */ +inline std::vector load_1d_string_attribute(const H5::Attribute& attr) { + return load_1d_string_attribute(attr, get_1d_length(attr.getSpace(), false)); +} + +/** + * @tparam Type_ Type for holding the data in memory, see `as_numeric_datatype()` for supported types. + * @param attr Handle to a scalar numeric attribute. + * Callers are responsible for checking that the datatype of `attr` is appropriate for `Type_`, e.g., with `exceeds_integer_limit()`. + * @return The value of the attribute. + */ +template +Type_ load_scalar_numeric_attribute(const H5::Attribute& attr) { + Type_ val; + auto mtype = as_numeric_datatype(); + attr.read(mtype, &val); + return val; +} + +/** + * @tparam Type_ Type for holding the data in memory, see `as_numeric_datatype()` for supported types. + * @param attr Handle to a numeric attribute. + * Callers are responsible for checking that the datatype of `attr` is appropriate for `Type_`, e.g., with `exceeds_integer_limit()`. + * @param full_length Length of the attribute in `attr`, usually obtained by `get_1d_length()`. + * @return Vector containing the contents of the attribute. 
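A usage sketch for these attribute loaders, assuming hypothetical attribute names ("version", "codes") on an already-opened dataset:

    #include "H5Cpp.h"
    #include "ritsuko/hdf5/load_attribute.hpp"
    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Load a scalar string attribute and a 1-D integer attribute from a dataset.
    std::pair<std::string, std::vector<int32_t> > load_metadata(const H5::DataSet& dhandle) {
        auto version = ritsuko::hdf5::load_scalar_string_attribute(dhandle.openAttribute("version"));
        auto codes = ritsuko::hdf5::load_1d_numeric_attribute<int32_t>(dhandle.openAttribute("codes"));
        return std::make_pair(std::move(version), std::move(codes));
    }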
+ */ +template +std::vector load_1d_numeric_attribute(const H5::Attribute& attr, hsize_t full_length) { + auto mtype = as_numeric_datatype(); + std::vector buffer(full_length); + attr.read(mtype, buffer.data()); + return buffer; +} + +/** + * Overload of `load_1d_numeric_attribute()` that determines the length of the attribute via `get_1d_length()`. + * @tparam Type_ Type for holding the data in memory, see `as_numeric_datatype()` for supported types. + * @param attr Handle to a numeric attribute. + * Callers are responsible for checking that the datatype of `attr` is appropriate for `Type_`, e.g., with `exceeds_integer_limit()`. + * @return Vector containing the contents of the attribute. + */ +template +std::vector load_1d_numeric_attribute(const H5::Attribute& attr) { + return load_1d_numeric_attribute(attr, get_1d_length(attr.getSpace(), false)); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/load_dataset.hpp b/inst/include/ritsuko/hdf5/load_dataset.hpp new file mode 100644 index 0000000..3c0fe14 --- /dev/null +++ b/inst/include/ritsuko/hdf5/load_dataset.hpp @@ -0,0 +1,79 @@ +#ifndef RITSUKO_HDF5_LOAD_DATASET_HPP +#define RITSUKO_HDF5_LOAD_DATASET_HPP + +#include +#include +#include + +#include "H5Cpp.h" + +#include "get_name.hpp" +#include "Stream1dStringDataset.hpp" +#include "_strings.hpp" + +/** + * @file load_dataset.hpp + * @brief Helper functions to load datasets. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Load a scalar string dataset into a single string. + * @param handle Handle to the HDF5 scalar dataset. + * @return String containing the contents of the sole dataset entry. + */ +inline std::string load_scalar_string_dataset(const H5::DataSet& handle) { + auto dtype = handle.getDataType(); + if (dtype.isVariableStr()) { + char* vptr; + handle.read(&vptr, dtype); + auto dspace = handle.getSpace(); // don't set as temporary in constructor below, otherwise it gets destroyed and the ID invalidated. + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), dspace.getId(), &vptr); + if (vptr == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(handle) + "'"); + } + std::string output(vptr); + return output; + } else { + size_t fixed_length = dtype.getSize(); + std::vector buffer(fixed_length); + handle.read(buffer.data(), dtype); + return std::string(buffer.begin(), buffer.begin() + find_string_length(buffer.data(), fixed_length)); + } +} + +/** + * Load a 1-dimensional string dataset into a vector of strings. + * @param handle Handle to the HDF5 scalar dataset. + * @param full_length Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding loaded strings. + * @return Vector of strings. + */ +inline std::vector load_1d_string_dataset(const H5::DataSet& handle, hsize_t full_length, hsize_t buffer_size) { + Stream1dStringDataset stream(&handle, full_length, buffer_size); + std::vector output; + output.reserve(full_length); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + output.emplace_back(stream.steal()); + } + return output; +} + +/** + * Overload of `load_1d_string_dataset()` that determines the length via `get_1d_length()`. + * @param handle Handle to the HDF5 scalar dataset. + * @param buffer_size Size of the buffer for holding loaded strings. + * @return Vector of strings. 
+ */ +inline std::vector load_1d_string_dataset(const H5::DataSet& handle, hsize_t buffer_size) { + return load_1d_string_dataset(handle, get_1d_length(handle, false), buffer_size); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp b/inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp deleted file mode 100644 index 12d4cc8..0000000 --- a/inst/include/ritsuko/hdf5/load_scalar_string_attribute.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef RITSUKO_HDF5_LOAD_SCALAR_STRING_ATTRIBUTE_HPP -#define RITSUKO_HDF5_LOAD_SCALAR_STRING_ATTRIBUTE_HPP - -#include "H5Cpp.h" -#include - -#include "get_name.hpp" - -/** - * @file load_scalar_string_attribute.hpp - * @brief Load a scalar string HDF5 attribute. - */ - -namespace ritsuko { - -namespace hdf5 { - -/** - * @param attr An ``Attribute`` handle. - * @return The attribute as a string. - */ -inline std::string load_scalar_string_attribute(const H5::Attribute& attr) { - if (attr.getTypeClass() != H5T_STRING || attr.getSpace().getSimpleExtentNdims() != 0) { - std::string name; - attr.getName(name); - throw std::runtime_error("expected attribute '" + name + "' to be a scalar string"); - } - std::string output; - attr.read(attr.getStrType(), output); - return output; -} - -/** - * @tparam Object_ HDF5 object class, usually a ``DataSet`` or a ``Group``. - * - * @param handle Handle to a HDF5 object that can contain attributes. - * @param field Name of the attribute. - * - * @return The attribute as a string. - */ -template -std::string load_scalar_string_attribute(const Object_& handle, const char* field) { - if (!handle.attrExists(field)) { - throw std::runtime_error("expected a '" + std::string(field) + "' attribute to be present"); - } - return load_scalar_string_attribute(handle.openAttribute(field)); -} - -} - -} - -#endif diff --git a/inst/include/ritsuko/hdf5/miscellaneous.hpp b/inst/include/ritsuko/hdf5/miscellaneous.hpp new file mode 100644 index 0000000..91b8d90 --- /dev/null +++ b/inst/include/ritsuko/hdf5/miscellaneous.hpp @@ -0,0 +1,56 @@ +#ifndef RITSUKO_MISCELLANEOUS_HPP +#define RITSUKO_MISCELLANEOUS_HPP + +#include +#include "H5Cpp.h" + +#include "open.hpp" +#include "load_attribute.hpp" + +/** + * @file miscellaneous.hpp + * @brief Miscellaneous functions for user convenience. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. + * @param handle HDF5 dataset or group handle. + * @param name Name of the attribute. + * + * @return Attribute handle. + * An error is raised if `name` does not refer to a scalar attribute. + */ +template +inline H5::Attribute open_scalar_attribute(const H5Object_& handle, const char* name) { + auto attr = open_attribute(handle, name); + if (!is_scalar(attr)) { + throw std::runtime_error("expected '" + std::string(name) + "' attribute to be a scalar"); + } + return attr; +} + +/** + * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. + * @param handle HDF5 dataset or group handle. + * @param name Name of the attribute. + * + * @return A string containing the attribute value. 
+ */ +template +std::string open_and_load_scalar_string_attribute(const H5Object_& handle, const char* name) { + auto attr = open_scalar_attribute(handle, name); + if (attr.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected '" + std::string(name) + "' attribute to be a string"); + } + return load_scalar_string_attribute(attr); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/missing_placeholder.hpp b/inst/include/ritsuko/hdf5/missing_placeholder.hpp new file mode 100644 index 0000000..824ed03 --- /dev/null +++ b/inst/include/ritsuko/hdf5/missing_placeholder.hpp @@ -0,0 +1,98 @@ +#ifndef RITSUKO_HDF5_MISSING_PLACEHOLDER_HPP +#define RITSUKO_HDF5_MISSING_PLACEHOLDER_HPP + +#include "H5Cpp.h" +#include + +#include "as_numeric_datatype.hpp" +#include "load_attribute.hpp" +#include "get_1d_length.hpp" +#include "get_name.hpp" + +/** + * @file missing_placeholder.hpp + * @brief Get the missing placeholder attribute. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Check the validity of a missing placeholder attribute on a dataset. + * An error is raised if the attribute is not a scalar or has a different type (or type class, if `type_class_only_ = true`) to the dataset. + * + * @param dset Dataset handle. + * @param attr Handle for the missing placeholder, typically as an attribute on `dset`. + * @param type_class_only Whether to only require identical type classes for the placeholder. + * If 0, this is false, and the types between `dset` and `attr` must be identical. + * If 1, this is true, and `dset` and `attr` just need to have the same type class. + * If -1 (default), this is true for all string types and false for all numeric types. + */ +inline void check_missing_placeholder_attribute(const H5::DataSet& dset, const H5::Attribute& attr, int type_class_only = -1) { + if (!is_scalar(attr)) { + throw std::runtime_error("expected the '" + get_name(attr) + "' attribute to be a scalar"); + } + + if (type_class_only == -1) { + type_class_only = (dset.getTypeClass() == H5T_STRING); + } + + if (type_class_only == 1) { + if (attr.getTypeClass() != dset.getTypeClass()) { + throw std::runtime_error("expected the '" + get_name(attr) + "' attribute to have the same type class as its dataset"); + } + } else { + if (attr.getDataType() != dset.getDataType()) { + throw std::runtime_error("expected the '" + get_name(attr) + "' attribute to have the same type as its dataset"); + } + } +} + +/** + * Check if a missing numeric placeholder attribute is present, and if so, open it and loads it value. + * This will also call `check_missing_placeholder_attribute()` to validate the placeholder's properties. + * + * @tparam Type_ Type to use to store the data in memory, see `as_numeric_datatype()` for supported types. + * @param handle Dataset handle. + * @param attr_name Name of the attribute containing the missing value placeholder. + * @return Pair containing (i) a boolean indicating whether the placeholder attribute was present, and (ii) the value of the placeholder if the first element is `true`. 
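A sketch of how the optional numeric placeholder might be consumed downstream; the "missing-value-placeholder" attribute name and the caller-supplied data buffer are assumptions for this example:

    #include "H5Cpp.h"
    #include "ritsuko/hdf5/missing_placeholder.hpp"
    #include <cstdint>

    // Count entries that differ from the dataset's missing placeholder, if one exists.
    hsize_t count_non_missing(const H5::DataSet& dhandle, const int32_t* data, hsize_t n) {
        auto placeholder = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder<int32_t>(dhandle, "missing-value-placeholder"); // attribute name assumed
        hsize_t count = 0;
        for (hsize_t i = 0; i < n; ++i) {
            if (!placeholder.first || data[i] != placeholder.second) {
                ++count;
            }
        }
        return count;
    }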
+ */ +template +std::pair open_and_load_optional_numeric_missing_placeholder(const H5::DataSet& handle, const char* attr_name) { + std::pair output(false, 0); + if (!handle.attrExists(attr_name)) { + return output; + } + output.first = true; + auto ahandle = handle.openAttribute(attr_name); + check_missing_placeholder_attribute(handle, ahandle); + ahandle.read(as_numeric_datatype(), &(output.second)); + return output; +} + +/** + * Check if a missing string placeholder attribute is present, and if so, open it and loads it value. + * This will also call `check_missing_placeholder_attribute()` to validate the placeholder's properties. + * + * @param handle Dataset handle. + * @param attr_name Name of the attribute containing the missing value placeholder. + * @return Pair containing (i) a boolean indicating whether the placeholder attribute was present, and (ii) the value of the placeholder if the first element is `true`. + */ +inline std::pair open_and_load_optional_string_missing_placeholder(const H5::DataSet& handle, const char* attr_name) { + std::pair output(false, ""); + if (!handle.attrExists(attr_name)) { + return output; + } + output.first = true; + auto ahandle = handle.openAttribute(attr_name); + check_missing_placeholder_attribute(handle, ahandle); + output.second = load_scalar_string_attribute(ahandle); + return output; +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/open.hpp b/inst/include/ritsuko/hdf5/open.hpp new file mode 100644 index 0000000..58c6c7b --- /dev/null +++ b/inst/include/ritsuko/hdf5/open.hpp @@ -0,0 +1,79 @@ +#ifndef RITSUKO_HDF5_OPEN_HPP +#define RITSUKO_HDF5_OPEN_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +/** + * @file open.hpp + * @brief Convenience functions to safely open HDF5 handles. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * @param path Path to a HDF5 file. + * @return Handle to the file. + * An error is raised if `path` does not exist. + */ +inline H5::H5File open_file(const std::filesystem::path& path) try { + if (!std::filesystem::exists(path)) { + throw std::runtime_error("no file is present at '" + path.string() + "'"); + } + return H5::H5File(path, H5F_ACC_RDONLY); +} catch (H5::Exception& e) { + throw std::runtime_error("failed to open the HDF5 file at '" + path.string() + "'; " + e.getDetailMsg()); +} + +/** + * @param handle Parent group (or file). + * @param name Name of the group. + * @return Handle to the group. + * An error is raised if `name` does not refer to a dataset. + */ +inline H5::Group open_group(const H5::Group& handle, const char* name) { + if (!handle.exists(name) || handle.childObjType(name) != H5O_TYPE_GROUP) { + throw std::runtime_error("expected a group at '" + std::string(name) + "'"); + } + return handle.openGroup(name); +} + +/** + * @param handle Group containing the dataset. + * @param name Name of the dataset inside the group. + * @return Handle to the dataset. + * An error is raised if `name` does not refer to a dataset. + */ +inline H5::DataSet open_dataset(const H5::Group& handle, const char* name) { + if (!handle.exists(name) || handle.childObjType(name) != H5O_TYPE_DATASET) { + throw std::runtime_error("expected a dataset at '" + std::string(name) + "'"); + } + return handle.openDataSet(name); +} + +/** + * @tparam Object_ Type of the HDF5 handle, usually a `DataSet` or `Group`. + * @param handle HDF5 dataset or group handle. + * @param name Name of the attribute. + * + * @return Attribute handle. + * An error is raised if `name` does not refer to an attribute. 
+ */ +template +H5::Attribute open_attribute(const Object_& handle, const char* name) { + if (!handle.attrExists(name)) { + throw std::runtime_error("expected an attribute at '" + std::string(name) + "'"); + } + return handle.openAttribute(name); +} + +} + +} + +#endif diff --git a/inst/include/ritsuko/hdf5/validate_string.hpp b/inst/include/ritsuko/hdf5/validate_string.hpp new file mode 100644 index 0000000..7cf3573 --- /dev/null +++ b/inst/include/ritsuko/hdf5/validate_string.hpp @@ -0,0 +1,148 @@ +#ifndef RITSUKO_HDF5_VALIDATE_STRING_HPP +#define RITSUKO_HDF5_VALIDATE_STRING_HPP + +#include +#include +#include + +#include "H5Cpp.h" + +#include "get_name.hpp" +#include "pick_1d_block_size.hpp" +#include "_strings.hpp" + +/** + * @file validate_string.hpp + * @brief Helper functions to validate strings. + */ + +namespace ritsuko { + +namespace hdf5 { + +/** + * Check that a scalar string dataset is valid. + * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes. + * For fixed-width string datasets, this function is a no-op. + * + * @param handle Handle to the HDF5 string dataset. + */ +inline void validate_scalar_string_dataset(const H5::DataSet& handle) { + auto dtype = handle.getDataType(); + if (!dtype.isVariableStr()) { + return; + } + + char* vptr; + handle.read(&vptr, dtype); + auto dspace = handle.getSpace(); // don't set as temporary in constructor below, otherwise it gets destroyed and the ID invalidated. + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), dspace.getId(), &vptr); + if (vptr == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(handle) + "'"); + } +} + +/** + * Check that a 1-dimensional string dataset is valid. + * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes. + * For fixed-width string datasets, this function is a no-op. + * + * @param handle Handle to the HDF5 string dataset. + * @param full_length Length of the dataset as a 1-dimensional vector. + * @param buffer_size Size of the buffer for holding loaded strings. + */ +inline void validate_1d_string_dataset(const H5::DataSet& handle, hsize_t full_length, hsize_t buffer_size) { + auto dtype = handle.getDataType(); + if (!dtype.isVariableStr()) { + return; + } + + hsize_t block_size = pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size); + H5::DataSpace mspace(1, &block_size), dspace(1, &full_length); + std::vector buffer(block_size); + + for (hsize_t i = 0; i < full_length; i += block_size) { + auto available = std::min(full_length - i, block_size); + constexpr hsize_t zero = 0; + mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero); + dspace.selectHyperslab(H5S_SELECT_SET, &available, &i); + + handle.read(buffer.data(), dtype, mspace, dspace); + [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), buffer.data()); + for (hsize_t j = 0; j < available; ++j) { + if (buffer[j] == NULL) { + throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(handle) + "'"); + } + } + } +} + +/** + * Overload for `validate_1d_string_dataset()` that automatically determines its length via `get_1d_length()`. + * @param handle Handle to the HDF5 string dataset. + * @param buffer_size Size of the buffer for holding loaded strings. 
+ */
+inline void validate_1d_string_dataset(const H5::DataSet& handle, hsize_t buffer_size) {
+    validate_1d_string_dataset(handle, get_1d_length(handle, false), buffer_size);
+}
+
+/**
+ * Check that a scalar string attribute is valid.
+ * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes.
+ * For fixed-width string attributes, this function is a no-op.
+ *
+ * @param attr Handle to the HDF5 string attribute.
+ */
+inline void validate_scalar_string_attribute(const H5::Attribute& attr) {
+    auto dtype = attr.getDataType();
+    if (!dtype.isVariableStr()) {
+        return;
+    }
+
+    auto mspace = attr.getSpace();
+    char* buffer;
+    attr.read(dtype, &buffer);
+    [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), &buffer);
+    if (buffer == NULL) {
+        throw std::runtime_error("detected a NULL pointer for a variable length string attribute");
+    }
+}
+
+/**
+ * Check that a 1-dimensional string attribute is valid.
+ * Currently, this involves checking that there are no `NULL` entries for variable-length string datatypes.
+ * For fixed-width string attributes, this function is a no-op.
+ *
+ * @param attr Handle to the HDF5 string attribute.
+ * @param full_length Length of the attribute as a 1-dimensional vector.
+ */
+inline void validate_1d_string_attribute(const H5::Attribute& attr, hsize_t full_length) {
+    auto dtype = attr.getDataType();
+    if (!dtype.isVariableStr()) {
+        return;
+    }
+
+    auto mspace = attr.getSpace();
+    std::vector<char*> buffer(full_length);
+    attr.read(dtype, buffer.data());
+    [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), buffer.data());
+    for (hsize_t i = 0; i < full_length; ++i) {
+        if (buffer[i] == NULL) {
+            throw std::runtime_error("detected a NULL pointer for a variable length string attribute");
+        }
+    }
+}
+
+/**
+ * Overload for `validate_1d_string_attribute()` that automatically determines its length via `get_1d_length()`.
+ * @param attr Handle to the HDF5 string attribute.
+ */ +inline void validate_1d_string_attribute(const H5::Attribute& attr) { + validate_1d_string_attribute(attr, get_1d_length(attr, false)); +} + +} + +} + +#endif diff --git a/inst/include/takane/_height.hpp b/inst/include/takane/_height.hpp index 218baec..7223214 100644 --- a/inst/include/takane/_height.hpp +++ b/inst/include/takane/_height.hpp @@ -12,6 +12,10 @@ #include "simple_list.hpp" #include "data_frame.hpp" #include "data_frame_factor.hpp" +#include "genomic_ranges.hpp" +#include "atomic_vector_list.hpp" +#include "data_frame_list.hpp" +#include "genomic_ranges_list.hpp" /** * @file _height.hpp @@ -32,6 +36,10 @@ inline auto default_registry() { registry["simple_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return simple_list::height(p, o); }; registry["data_frame"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame::height(p, o); }; registry["data_frame_factor"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame_factor::height(p, o); }; + registry["genomic_ranges"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return genomic_ranges::height(p, o); }; + registry["atomic_vector_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return atomic_vector_list::height(p, o); }; + registry["data_frame_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame_list::height(p, o); }; + registry["genomic_ranges_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return genomic_ranges_list::height(p, o); }; return registry; } diff --git a/inst/include/takane/_satisfies_interface.hpp b/inst/include/takane/_satisfies_interface.hpp new file mode 100644 index 0000000..5fb49f4 --- /dev/null +++ b/inst/include/takane/_satisfies_interface.hpp @@ -0,0 +1,53 @@ +#ifndef TAKANE_SATISFIES_INTERFACE_HPP +#define TAKANE_SATISFIES_INTERFACE_HPP + +#include +#include +#include + +namespace takane { + +/** + * @cond + */ +namespace internal_satisfies_interface { + +inline auto default_registry() { + std::unordered_map > registry; + registry["SIMPLE_LIST"] = { "simple_list" }; + registry["DATA_FRAME"] = { "data_frame" }; + return registry; +} + +} +/** + * @endcond + */ + +/** + * Registry of object types that satisfy a particular object interface. + * Each key is the interface and each value is the set of all types that satisfy it. + * Applications can extend the **takane** framework by adding custom types to each set. + */ +inline std::unordered_map > satisfies_interface_registry = internal_satisfies_interface::default_registry(); + +/** + * Check whether a particular object type satisfies a particular object interface. + * This can be used by specifications to check that child components satisfy certain expectations. + * + * @param type Object type. + * @param interface Interface type. + * @returns Whether `type` satisfies `interface`. 
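+ *
+ * For example, an application defining its own data-frame-like object could
+ * register it against the `DATA_FRAME` interface before running validation;
+ * the type name below is hypothetical:
+ *
+ * @code
+ * takane::satisfies_interface_registry["DATA_FRAME"].insert("my_custom_data_frame");
+ * bool ok = takane::satisfies_interface("my_custom_data_frame", "DATA_FRAME");
+ * @endcode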
+ */ +inline bool satisfies_interface(const std::string& type, const std::string& interface) { + auto it = satisfies_interface_registry.find(interface); + if (it == satisfies_interface_registry.end()) { + return false; + } + const auto& listing = it->second; + return listing.find(type) != listing.end(); +} + +} + +#endif diff --git a/inst/include/takane/_validate.hpp b/inst/include/takane/_validate.hpp index c960282..b3ae6a9 100644 --- a/inst/include/takane/_validate.hpp +++ b/inst/include/takane/_validate.hpp @@ -12,6 +12,11 @@ #include "simple_list.hpp" #include "data_frame.hpp" #include "data_frame_factor.hpp" +#include "sequence_information.hpp" +#include "genomic_ranges.hpp" +#include "atomic_vector_list.hpp" +#include "data_frame_list.hpp" +#include "genomic_ranges_list.hpp" /** * @file _validate.hpp @@ -32,6 +37,11 @@ inline auto default_registry() { registry["simple_list"] = [](const std::filesystem::path& p, const Options& o) { simple_list::validate(p, o); }; registry["data_frame"] = [](const std::filesystem::path& p, const Options& o) { data_frame::validate(p, o); }; registry["data_frame_factor"] = [](const std::filesystem::path& p, const Options& o) { data_frame_factor::validate(p, o); }; + registry["sequence_information"] = [](const std::filesystem::path& p, const Options& o) { sequence_information::validate(p, o); }; + registry["genomic_ranges"] = [](const std::filesystem::path& p, const Options& o) { genomic_ranges::validate(p, o); }; + registry["atomic_vector_list"] = [](const std::filesystem::path& p, const Options& o) { atomic_vector_list::validate(p, o); }; + registry["data_frame_list"] = [](const std::filesystem::path& p, const Options& o) { data_frame_list::validate(p, o); }; + registry["genomic_ranges_list"] = [](const std::filesystem::path& p, const Options& o) { genomic_ranges_list::validate(p, o); }; return registry; } diff --git a/inst/include/takane/atomic_vector.hpp b/inst/include/takane/atomic_vector.hpp index 5278317..44cfd9d 100644 --- a/inst/include/takane/atomic_vector.hpp +++ b/inst/include/takane/atomic_vector.hpp @@ -8,7 +8,7 @@ #include "ritsuko/hdf5/hdf5.hpp" #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" /** * @file atomic_vector.hpp @@ -28,42 +28,28 @@ namespace atomic_vector { * @param options Validation options, typically for reading performance. 
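+ *
+ * As a non-normative sketch, a minimal `atomic_vector` HDF5 file that this
+ * validator would accept could be written with the HDF5 C++ API as below; the
+ * `1.0` version string and the integer values are assumptions made for the example:
+ *
+ * @code
+ * H5::H5File handle("contents.h5", H5F_ACC_TRUNC);
+ * auto ghandle = handle.createGroup("atomic_vector");
+ *
+ * H5::StrType stype(H5::PredType::C_S1, H5T_VARIABLE);
+ * H5::DataSpace scalar_space(H5S_SCALAR);
+ * ghandle.createAttribute("version", stype, scalar_space).write(stype, std::string("1.0"));
+ * ghandle.createAttribute("type", stype, scalar_space).write(stype, std::string("integer"));
+ *
+ * std::vector<int32_t> values { 1, 2, 3 };
+ * hsize_t len = values.size();
+ * H5::DataSpace vspace(1, &len);
+ * auto dhandle = ghandle.createDataSet("values", H5::PredType::NATIVE_INT32, vspace);
+ * dhandle.write(values.data(), H5::PredType::NATIVE_INT32);
+ * @endcode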
*/ inline void validate(const std::filesystem::path& path, const Options& options) try { - H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); - - const char* parent = "atomic_vector"; - if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected an 'atomic_vector' group"); - } - auto ghandle = handle.openGroup(parent); + auto handle = ritsuko::hdf5::open_file(path / "contents.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "atomic_vector"); - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); if (version.major != 1) { throw std::runtime_error("unsupported version string '" + vstring + "'"); } - auto dhandle = ritsuko::hdf5::get_dataset(ghandle, "values"); + auto dhandle = ritsuko::hdf5::open_dataset(ghandle, "values"); auto vlen = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false); - auto type = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "type"); + auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "type"); const char* missing_attr_name = "missing-value-placeholder"; - bool has_missing = dhandle.attrExists(missing_attr_name); if (type == "string") { if (dhandle.getTypeClass() != H5T_STRING) { throw std::runtime_error("expected a string datatype for 'values'"); } - - std::string missing_value; - if (has_missing) { - auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(dhandle, missing_attr_name, /* type_class_only = */ true); - missing_value = ritsuko::hdf5::load_scalar_string_attribute(missing_attr); - } - - if (ghandle.attrExists("format")) { - auto format = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "format"); - internal_hdf5::validate_string_format(dhandle, vlen, format, has_missing, missing_value, options.hdf5_buffer_size); - } + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, missing_attr_name); + std::string format = internal_string::fetch_format_attribute(ghandle); + internal_string::validate_string_format(dhandle, vlen, format, missingness.first, missingness.second, options.hdf5_buffer_size); } else { if (type == "integer") { @@ -82,21 +68,13 @@ inline void validate(const std::filesystem::path& path, const Options& options) throw std::runtime_error("unsupported type '" + type + "'"); } - if (has_missing) { - ritsuko::hdf5::get_missing_placeholder_attribute(dhandle, missing_attr_name); + if (dhandle.attrExists(missing_attr_name)) { + auto missing_attr = dhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(dhandle, missing_attr); } } - if (ghandle.exists("names")) { - auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); - if (nhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("'names' should be a string datatype class"); - } - auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); - if (vlen != nlen) { - throw std::runtime_error("'names' and 'values' should have the same length"); - } - } + internal_string::validate_names(ghandle, "names", vlen, options.hdf5_buffer_size); } catch (std::exception& e) { throw std::runtime_error("failed to validate an 'atomic_vector' at '" + path.string() + "'; " + std::string(e.what())); diff --git a/inst/include/takane/atomic_vector_list.hpp 
b/inst/include/takane/atomic_vector_list.hpp new file mode 100644 index 0000000..22a577a --- /dev/null +++ b/inst/include/takane/atomic_vector_list.hpp @@ -0,0 +1,45 @@ +#ifndef TAKANE_ATOMIC_VECTOR_LIST_HPP +#define TAKANE_ATOMIC_VECTOR_LIST_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +#include "utils_public.hpp" +#include "utils_compressed_list.hpp" + +/** + * @file atomic_vector_list.hpp + * @brief Validation for atomic vector lists. + */ + +namespace takane { + +namespace atomic_vector_list { + +/** + * @param path Path to the directory containing the atomic vector list. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + internal_compressed_list::validate_directory(path, "atomic_vector_list", "atomic_vector", options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'atomic_vector_list' object at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to a directory containing an atomic vector list. + * @param options Validation options, mostly for input performance. + * @return The length of the list. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return internal_compressed_list::height(path, "atomic_vector_list", options); +} + +} + +} + +#endif diff --git a/inst/include/takane/compressed_list.hpp b/inst/include/takane/compressed_list.hpp deleted file mode 100644 index 192653a..0000000 --- a/inst/include/takane/compressed_list.hpp +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef TAKANE_COMPRESSED_LIST_HPP -#define TAKANE_COMPRESSED_LIST_HPP - -#include "comservatory/comservatory.hpp" - -#include "utils_csv.hpp" - -#include - -/** - * @file compressed_list.hpp - * @brief Validation for compressed lists. - */ - -namespace takane { - -/** - * @namespace takane::compressed_list - * @brief Definitions for compressed lists. - */ -namespace compressed_list { - -/** - * @brief Parameters for validating the compressed list file. - */ -struct Parameters { - /** - * Length of the compressed list. - */ - size_t length = 0; - - /** - * Total length of the concatenated elements. - */ - size_t concatenated = 0; - - /** - * Whether the compressed list is named. - */ - bool has_names = false; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. - */ - bool parallel = false; - - /** - * Version of the `compressed_list` format. 
- */ - int version = 1; -}; - -/** - * @cond - */ -template -CsvContents validate_base(ParseCommand parse, const Parameters& params, CsvFieldCreator* creator = NULL) { - DummyCsvFieldCreator default_creator; - if (creator == NULL) { - creator = &default_creator; - } - - comservatory::Contents contents; - CsvContents output; - if (params.has_names) { - auto ptr = creator->string(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvNameField(false, ptr)); - } - - auto ptr0 = creator->integer(); - output.fields.emplace_back(ptr0); - auto ptr = new CsvCompressedLengthField(static_cast(params.has_names), ptr0); - contents.fields.emplace_back(ptr); - - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - if (contents.num_records() != params.length) { - throw std::runtime_error("number of records in the CSV file does not match the expected length"); - } - - if (params.concatenated != ptr->total) { - throw std::runtime_error("sum of lengths in the compressed list did not equal the expected concatenated total"); - } - - if (contents.names.back() != "number") { - throw std::runtime_error("column containing the compressed list lengths should be named 'number'"); - } - - return output; -} -/** - * @endcond - */ - -/** - * Checks if a CSV is correctly formatted for the `compressed_list` format. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - * Whether the `fields` member actually contains the CSV data depends on `creator`. - * If `params.has_names = true`, an additional field containing the names is present at the start. - */ -template -CsvContents validate(Reader& reader, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read(reader, contents, opts); }, - params, - creator - ); -} - -/** - * Overload of `compressed_list::validate()` that accepts a file path. - * - * @param path Path to the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. 
- */ -inline CsvContents validate(const char* path, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read_file(path, contents, opts); }, - params, - creator - ); -} - -} - -} - -#endif diff --git a/inst/include/takane/data_frame.hpp b/inst/include/takane/data_frame.hpp index 1e3729e..f227397 100644 --- a/inst/include/takane/data_frame.hpp +++ b/inst/include/takane/data_frame.hpp @@ -13,7 +13,8 @@ #include #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" +#include "utils_factor.hpp" #include "utils_other.hpp" /** @@ -37,7 +38,7 @@ namespace data_frame { /** * @cond */ -inline void validate_row_names(const H5::Group& handle, hsize_t num_rows) try { +inline void validate_row_names(const H5::Group& handle, hsize_t num_rows, const Options& options) try { if (handle.childObjType("row_names") != H5O_TYPE_DATASET) { throw std::runtime_error("expected a 'row_names' dataset when row names are present"); } @@ -48,16 +49,13 @@ inline void validate_row_names(const H5::Group& handle, hsize_t num_rows) try { if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(), false) != num_rows) { throw std::runtime_error("expected 'row_names' to have length equal to the number of rows"); } + ritsuko::hdf5::validate_1d_string_dataset(rnhandle, num_rows, options.hdf5_buffer_size); } catch (std::exception& e) { throw std::runtime_error("failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& options) try { - if (!ghandle.exists("column_names") || ghandle.childObjType("column_names") != H5O_TYPE_DATASET) { - throw std::runtime_error("expected a 'column_names' dataset"); - } - - auto cnhandle = ghandle.openDataSet("column_names"); + auto cnhandle = ritsuko::hdf5::open_dataset(ghandle, "column_names"); if (cnhandle.getTypeClass() != H5T_STRING) { throw std::runtime_error("expected 'column_names' to be a string dataset"); } @@ -65,21 +63,17 @@ inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& op auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(), false); std::unordered_set column_names; - ritsuko::hdf5::load_1d_string_dataset( - cnhandle, - num_cols, - options.hdf5_buffer_size, - [&](size_t, const char* p, size_t l) { - if (l == 0) { - throw std::runtime_error("column names should not be empty strings"); - } - std::string col_name(p, p + l); - if (column_names.find(col_name) != column_names.end()) { - throw std::runtime_error("duplicated column name '" + col_name + "'"); - } - column_names.insert(std::move(col_name)); + ritsuko::hdf5::Stream1dStringDataset stream(&cnhandle, num_cols, options.hdf5_buffer_size); + for (size_t c = 0; c < num_cols; ++c, stream.next()) { + auto x = stream.steal(); + if (x.empty()) { + throw std::runtime_error("column names should not be empty strings"); } - ); + if (column_names.find(x) != column_names.end()) { + throw std::runtime_error("duplicated column name '" + x + "'"); + } + column_names.insert(std::move(x)); + } return num_cols; @@ -88,52 +82,38 @@ inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& op } inline void validate_column(const H5::Group& dhandle, const std::string& dset_name, hsize_t num_rows, const Options& options) try { - if (dhandle.childObjType(dset_name) == H5O_TYPE_GROUP) { + auto dtype = 
dhandle.childObjType(dset_name); + if (dtype == H5O_TYPE_GROUP) { auto fhandle = dhandle.openGroup(dset_name); - auto type = ritsuko::hdf5::load_scalar_string_attribute(fhandle, "type"); + auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(fhandle, "type"); if (type != "factor") { throw std::runtime_error("expected HDF5 groups to have a 'type' attribute set to 'factor'"); } - if (fhandle.attrExists("ordered")) { - auto attr = ritsuko::hdf5::get_scalar_attribute(fhandle, "ordered"); - if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) { - throw std::runtime_error("an 'ordered' attribute on a factor column should have a datatype that fits in a 32-bit signed integer"); - } - } + internal_factor::check_ordered_attribute(fhandle); - auto num_levels = internal_hdf5::validate_factor_levels(fhandle, "levels", options.hdf5_buffer_size); - auto num_codes = internal_hdf5::validate_factor_codes(fhandle, "codes", num_levels, options.hdf5_buffer_size); + auto num_levels = internal_factor::validate_factor_levels(fhandle, "levels", options.hdf5_buffer_size); + auto num_codes = internal_factor::validate_factor_codes(fhandle, "codes", num_levels, options.hdf5_buffer_size); if (num_codes != num_rows) { throw std::runtime_error("expected column to have length equal to the number of rows"); } - - } else { - auto xhandle = ritsuko::hdf5::get_dataset(dhandle, dset_name.c_str()); + } else if (dtype == H5O_TYPE_DATASET) { + auto xhandle = dhandle.openDataSet(dset_name); if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) { throw std::runtime_error("expected column to have length equal to the number of rows"); } const char* missing_attr_name = "missing-value-placeholder"; - bool has_missing = xhandle.attrExists(missing_attr_name); - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); + auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(xhandle, "type"); if (type == "string") { if (xhandle.getTypeClass() != H5T_STRING) { throw std::runtime_error("expected column " + dset_name + " to be a string dataset"); } - - std::string missing_value; - if (has_missing) { - auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr_name, /* type_class_only = */ true); - missing_value = ritsuko::hdf5::load_scalar_string_attribute(missing_attr); - } - - if (xhandle.attrExists("format")) { - auto format = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "format"); - internal_hdf5::validate_string_format(xhandle, num_rows, format, has_missing, missing_value, options.hdf5_buffer_size); - } + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(xhandle, missing_attr_name); + std::string format = internal_string::fetch_format_attribute(xhandle); + internal_string::validate_string_format(xhandle, num_rows, format, missingness.first, missingness.second, options.hdf5_buffer_size); } else { if (type == "integer") { @@ -152,10 +132,14 @@ inline void validate_column(const H5::Group& dhandle, const std::string& dset_na throw std::runtime_error("unknown column type '" + type + "'"); } - if (has_missing) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr_name); + if (xhandle.attrExists(missing_attr_name)) { + auto ahandle = xhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(xhandle, ahandle); } } + + } else { + throw std::runtime_error("unknown HDF5 object type"); } } catch (std::exception& e) { @@ -170,22 +154,17 @@ inline void 
validate_column(const H5::Group& dhandle, const std::string& dset_na * @param options Validation options, typically for reading performance. */ inline void validate(const std::filesystem::path& path, const Options& options) { - auto h5path = path / "basic_columns.h5"; - - H5::H5File handle(h5path, H5F_ACC_RDONLY); - if (!handle.exists("data_frame") || handle.childObjType("data_frame") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a 'data_frame' group"); - } - auto ghandle = handle.openGroup("data_frame"); + auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "data_frame"); - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); if (version.major != 1) { throw std::runtime_error("unsupported version '" + vstring + "'"); } // Checking the number of rows. - auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); + auto attr = ritsuko::hdf5::open_scalar_attribute(ghandle, "row-count"); if (ritsuko::hdf5::exceeds_integer_limit(attr, 64, false)) { throw std::runtime_error("'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer"); } @@ -194,15 +173,12 @@ inline void validate(const std::filesystem::path& path, const Options& options) // Checking row and column names. if (ghandle.exists("row_names")) { - validate_row_names(ghandle, num_rows); + validate_row_names(ghandle, num_rows, options); } size_t NC = validate_column_names(ghandle, options); // Finally iterating through the columns. - if (!ghandle.exists("data") || ghandle.childObjType("data") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a 'data_frame/data' group"); - } - auto dhandle = ghandle.openGroup("data"); + auto dhandle = ritsuko::hdf5::open_group(ghandle, "data"); hsize_t found = 0; for (size_t c = 0; c < NC; ++c) { @@ -229,18 +205,8 @@ inline void validate(const std::filesystem::path& path, const Options& options) throw std::runtime_error("more objects present in the 'data_frame/data' group than expected"); } - // Checking the metadata. - try { - internal_other::validate_mcols(path / "column_annotations", NC, options); - } catch (std::exception& e) { - throw std::runtime_error("failed to validate 'column_annotations'; " + std::string(e.what())); - } - - try { - internal_other::validate_metadata(path / "other_annotations", options); - } catch (std::exception& e) { - throw std::runtime_error("failed to validate 'other_annotations'; " + std::string(e.what())); - } + internal_other::validate_mcols(path, "column_annotations", NC, options); + internal_other::validate_metadata(path, "other_annotations", options); } /** @@ -254,10 +220,7 @@ inline size_t height(const std::filesystem::path& path, const Options&) { // Assume it's all valid already. 
H5::H5File handle(h5path, H5F_ACC_RDONLY); auto ghandle = handle.openGroup("data_frame"); - auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); - uint64_t num_rows = 0; - attr.read(H5::PredType::NATIVE_UINT64, &num_rows); - return num_rows; + return ritsuko::hdf5::load_scalar_numeric_attribute(ghandle.openAttribute("row-count")); } } diff --git a/inst/include/takane/data_frame_factor.hpp b/inst/include/takane/data_frame_factor.hpp index 266c473..c4e8a04 100644 --- a/inst/include/takane/data_frame_factor.hpp +++ b/inst/include/takane/data_frame_factor.hpp @@ -8,7 +8,8 @@ #include "ritsuko/hdf5/hdf5.hpp" #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" +#include "utils_factor.hpp" /** * @file data_frame_factor.hpp @@ -22,6 +23,7 @@ namespace takane { */ void validate(const std::filesystem::path&, const std::string&, const Options&); size_t height(const std::filesystem::path&, const std::string&, const Options&); +bool satisfies_interface(const std::string&, const std::string&); /** * @endcond */ @@ -48,15 +50,10 @@ inline std::function +#include +#include + +#include "utils_public.hpp" +#include "utils_compressed_list.hpp" + +/** + * @file data_frame_list.hpp + * @brief Validation for data frame lists. + */ + +namespace takane { + +namespace data_frame_list { + +/** + * @param path Path to the directory containing the data frame list. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + internal_compressed_list::validate_directory(path, "data_frame_list", "DATA_FRAME", options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'data_frame_list' object at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to a directory containing an data frame list. + * @param options Validation options, mostly for input performance. + * @return The length of the list. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return internal_compressed_list::height(path, "data_frame_list", options); +} + +} + +} + +#endif diff --git a/inst/include/takane/genomic_ranges.hpp b/inst/include/takane/genomic_ranges.hpp index 511bb3f..b07cfd8 100644 --- a/inst/include/takane/genomic_ranges.hpp +++ b/inst/include/takane/genomic_ranges.hpp @@ -4,11 +4,16 @@ #include "ritsuko/ritsuko.hpp" #include "comservatory/comservatory.hpp" -#include "WrappedOption.hpp" - -#include #include +#include #include +#include +#include +#include + +#include "utils_string.hpp" +#include "utils_public.hpp" +#include "utils_other.hpp" /** * @file genomic_ranges.hpp @@ -18,200 +23,203 @@ namespace takane { /** - * @namespace takane::genomic_ranges - * @brief Definitions for genomic ranges. + * @cond + */ +void validate(const std::filesystem::path&, const std::string&, const Options& options); +/** + * @endcond */ -namespace genomic_ranges { /** - * @brief Parameters for validating the genomic ranges file. + * @namespace takane::genomic_ranges + * @brief Definitions for genomic ranges. */ -struct Parameters { - /** - * Number of genomic ranges in this object. - */ - size_t num_ranges; - - /** - * Whether the ranges are named. - */ - bool has_names; - - /** - * Universe of sequence names for this object. - */ - WrappedOption > seqnames; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. 
- */ - bool parallel = false; - - /** - * Version of the `genomic_ranges` format. - */ - int version = 1; -}; +namespace genomic_ranges { /** * @cond */ -struct NamesField : public comservatory::DummyStringField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the names column"); - } -}; +namespace internal { -struct SeqnamesField : public comservatory::DummyStringField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the seqnames column"); - } +struct SequenceLimits { + SequenceLimits(size_t n) : restricted(n), seqlen(n) {} + std::vector restricted; + std::vector seqlen; +}; - void push_back(std::string x) { - if (all_seqnames->find(x) == all_seqnames->end()) { - throw std::runtime_error("unknown sequence name '" + x + "'"); - } - comservatory::DummyStringField::push_back(std::move(x)); +inline SequenceLimits find_sequence_limits(const std::filesystem::path& path, const Options& options) { + auto xtype = read_object_type(path); + if (xtype != "sequence_information") { + throw std::runtime_error("'sequence_information' directory should contain a 'sequence_information' object"); } + ::takane::validate(path, xtype, options); - const std::unordered_set* all_seqnames = NULL; -}; + auto fpath = path / "info.h5"; + H5::H5File handle(fpath, H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("sequence_information"); -struct StartField : public comservatory::DummyNumberField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the start column"); - } + auto lhandle = ghandle.openDataSet("length"); + auto num_seq = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + ritsuko::hdf5::Stream1dNumericDataset lstream(&lhandle, num_seq, options.hdf5_buffer_size); + auto lmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder(lhandle, "missing-value-placeholder"); - void push_back(double x) { - if (x < -2147483648 || x > 2147483647) { // constrain within limits. - throw std::runtime_error("start position does not fit inside a 32-bit signed integer"); - } - if (x != std::floor(x)) { - throw std::runtime_error("start position is not an integer"); - } - last = x; - comservatory::DummyNumberField::push_back(x); - } + auto chandle = ghandle.openDataSet("circular"); + ritsuko::hdf5::Stream1dNumericDataset cstream(&chandle, num_seq, options.hdf5_buffer_size); + auto cmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder(chandle, "missing-value-placeholder"); - int32_t last = 0; -}; + SequenceLimits output(num_seq); + auto& restricted = output.restricted; + auto& seqlen = output.seqlen; -struct EndField : public comservatory::DummyNumberField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the strand column"); - } + for (size_t i = 0; i < num_seq; ++i, lstream.next(), cstream.next()) { + auto slen = lstream.get(); + auto circ = cstream.get(); + seqlen[i] = slen; - void push_back(double x) { - if (x < -2147483648 || x > 2147483647) { // constrain within limits. - throw std::runtime_error("end position does not fit inside a 32-bit signed integer"); + // Skipping restriction if the sequence length is missing OR the sequence is circular. 
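+        // For instance (hypothetical values): a linear chromosome of length 1000 gives
+        // restricted[i] = true, so any range on it must later lie within [1, 1000]; a
+        // circular plasmid, or a sequence whose length equals the missing placeholder,
+        // is left unrestricted, and validate() only checks that its ranges stay within
+        // the 64-bit integer limit.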
+ if (lmissing.first && lmissing.second == slen) { + continue; } - if (x != std::floor(x)) { - throw std::runtime_error("end position is not an integer"); + if (circ && !(cmissing.first && cmissing.second == circ)) { + continue; } - comservatory::DummyNumberField::push_back(x); - if (start->size() != size()) { - throw std::runtime_error("'start' and 'end' validator fields are out of sync"); - } - if (x + 1 < start->last) { - throw std::runtime_error("'end' coordinate must be greater than or equal to 'start - 1'"); - } + restricted[i] = true; } - const StartField* start = NULL; -}; + return output; +} -struct StrandField : public comservatory::DummyStringField { - void add_missing() { - throw std::runtime_error("missing values should not be present in the strand column"); - } +} +/** + * @endcond + */ - void push_back(std::string x) { - if (x.size() != 1 || (x[0] != '+' && x[0] != '-' && x[0] != '*')) { - throw std::runtime_error("invalid strand '" + x + "'"); - } - comservatory::DummyStringField::push_back(std::move(x)); +/** + * @param path Path to the directory containing the genomic ranges. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + // Figuring out the sequence length constraints. + auto limits = internal::find_sequence_limits(path / "sequence_information", options); + const auto& restricted = limits.restricted; + const auto& seqlen = limits.seqlen; + size_t num_sequences = restricted.size(); + + // Now loading all three components. + auto handle = ritsuko::hdf5::open_file(path / "ranges.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "genomic_ranges"); + + auto id_handle = ritsuko::hdf5::open_dataset(ghandle, "sequence"); + auto num_ranges = ritsuko::hdf5::get_1d_length(id_handle, false); + if (ritsuko::hdf5::exceeds_integer_limit(id_handle, 64, false)) { + throw std::runtime_error("expected 'sequence' to have a datatype that fits into a 64-bit unsigned integer"); } -}; + ritsuko::hdf5::Stream1dNumericDataset id_stream(&id_handle, num_ranges, options.hdf5_buffer_size); -template -void validate_base(ParseCommand parse, const Parameters& params) { - comservatory::Contents contents; - if (params.has_names) { - contents.fields.emplace_back(new NamesField); + auto start_handle = ritsuko::hdf5::open_dataset(ghandle, "start"); + if (num_ranges != ritsuko::hdf5::get_1d_length(start_handle, false)) { + throw std::runtime_error("'start' and 'sequence' should have the same length"); + } + if (ritsuko::hdf5::exceeds_integer_limit(start_handle, 64, true)) { + throw std::runtime_error("expected 'start' to have a datatype that fits into a 64-bit signed integer"); } + ritsuko::hdf5::Stream1dNumericDataset start_stream(&start_handle, num_ranges, options.hdf5_buffer_size); - { - auto ptr = new SeqnamesField; - ptr->all_seqnames = params.seqnames.get(); - contents.fields.emplace_back(ptr); + auto width_handle = ritsuko::hdf5::open_dataset(ghandle, "width"); + if (num_ranges != ritsuko::hdf5::get_1d_length(width_handle, false)) { + throw std::runtime_error("'width' and 'sequence' should have the same length"); } - - { - auto sptr = new StartField; - contents.fields.emplace_back(sptr); - auto eptr = new EndField; - eptr->start = sptr; - contents.fields.emplace_back(eptr); + if (ritsuko::hdf5::exceeds_integer_limit(width_handle, 64, false)) { + throw std::runtime_error("expected 'width' to have a datatype that fits into a 64-bit unsigned integer"); } + 
ritsuko::hdf5::Stream1dNumericDataset width_stream(&width_handle, num_ranges, options.hdf5_buffer_size); - contents.fields.emplace_back(new StrandField); + constexpr uint64_t end_limit = std::numeric_limits::max(); + for (size_t i = 0; i < num_ranges; ++i, id_stream.next(), start_stream.next(), width_stream.next()) { + auto id = id_stream.get(); + if (id >= num_sequences) { + throw std::runtime_error("'sequence' must be less than the number of sequences (got " + std::to_string(id) + ")"); + } - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - if (contents.num_records() != params.num_ranges) { - throw std::runtime_error("number of records in the CSV file does not match the expected number of ranges"); - } + auto start = start_stream.get(); + auto width = width_stream.get(); + + if (restricted[id]) { + if (start < 1) { + throw std::runtime_error("non-positive start position (" + std::to_string(start) + ") for non-circular sequence"); + } + + auto spos = static_cast(start); + auto limit = seqlen[id]; + if (spos > limit) { + throw std::runtime_error("start position beyond sequence length (" + std::to_string(start) + " > " + std::to_string(limit) + ") for non-circular sequence"); + } + + // The LHS should not overflow as 'spos >= 1' so 'limit - spos + 1' should still be no greater than 'limit'. + if (limit - spos + 1 < width) { + throw std::runtime_error("end position beyond sequence length (" + + std::to_string(start) + " + " + std::to_string(width) + " > " + std::to_string(limit) + + ") for non-circular sequence"); + } + } - if (contents.names[0 + params.has_names] != "seqnames") { - throw std::runtime_error("expected the first (non-name) column to be 'seqnames'"); - } - if (contents.names[1 + params.has_names] != "start") { - throw std::runtime_error("expected the second (non-name) column to be 'start'"); - } - if (contents.names[2 + params.has_names] != "end") { - throw std::runtime_error("expected the third (non-name) column to be 'end'"); + bool exceeded = false; + if (start > 0) { + // 'end_limit - start' is always non-negative as 'end_limit' is the largest value of an int64_t and 'start' is also int64_t. + exceeded = (end_limit - static_cast(start) < width); + } else { + // 'end_limit - start' will not overflow a uint64_t, because 'end_limit' is the largest value of an int64_t and 'start' as also 'int64_t'. 
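+            // Worked example with hypothetical values: for start = -5 the guard requires
+            // width <= end_limit + 5, i.e. 'start + width' must still fit in an int64_t;
+            // a width of 2^63 + 4 passes, whereas 2^63 + 5 triggers the error below.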
+ exceeded = (end_limit + static_cast(-start) < width); + } + if (exceeded) { + throw std::runtime_error("end position beyond the range of a 64-bit integer (" + std::to_string(start) + " + " + std::to_string(width) + ")"); + } } - if (contents.names[3 + params.has_names] != "strand") { - throw std::runtime_error("expected the fourth (non-name) column to be 'strand'"); + + { + auto strand_handle = ritsuko::hdf5::open_dataset(ghandle, "strand"); + if (num_ranges != ritsuko::hdf5::get_1d_length(strand_handle, false)) { + throw std::runtime_error("'strand' and 'sequence' should have the same length"); + } + if (ritsuko::hdf5::exceeds_integer_limit(strand_handle, 32, true)) { + throw std::runtime_error("expected 'strand' to have a datatype that fits into a 32-bit signed integer"); + } + + ritsuko::hdf5::Stream1dNumericDataset strand_stream(&strand_handle, num_ranges, options.hdf5_buffer_size); + for (hsize_t i = 0; i < num_ranges; ++i, strand_stream.next()) { + auto x = strand_stream.get(); + if (x < -1 || x > 1) { + throw std::runtime_error("values of 'strand' should be one of 0, -1, or 1 (got " + std::to_string(x) + ")"); + } + } } -} -/** - * @endcond - */ -/** - * Checks if a CSV data frame is correctly formatted for genomic ranges. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. - */ -template -void validate(Reader& reader, const Parameters& params) { - validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opt) -> void { comservatory::read(reader, contents, opt); }, - params - ); + internal_other::validate_mcols(path, "range_annotations", num_ranges, options); + internal_other::validate_metadata(path, "other_annotations", options); + + internal_string::validate_names(ghandle, "name", num_ranges, options.hdf5_buffer_size); + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate 'genomic_ranges' object at '" + path.string() + "'; " + std::string(e.what())); } /** - * Checks if a CSV data frame is correctly formatted for genomic ranges. - * An error is raised if the file does not meet the specifications. - * - * @param path Path to the CSV file. - * @param params Validation parameters. + * @param path Path to a directory containing genomic ranges. + * @param options Validation options, mostly for input performance. + * @return The number of ranges. */ -inline void validate(const char* path, const Parameters& params) { - validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opt) -> void { comservatory::read_file(path, contents, opt); }, - params - ); +inline size_t height(const std::filesystem::path& path, const Options&) { + auto h5path = path / "ranges.h5"; + + // Assume it's all valid already. 
+ H5::H5File handle(h5path, H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("genomic_ranges"); + auto dhandle = ghandle.openDataSet("sequence"); + return ritsuko::hdf5::get_1d_length(dhandle, false); } + } } diff --git a/inst/include/takane/genomic_ranges_list.hpp b/inst/include/takane/genomic_ranges_list.hpp new file mode 100644 index 0000000..4ec64ce --- /dev/null +++ b/inst/include/takane/genomic_ranges_list.hpp @@ -0,0 +1,45 @@ +#ifndef TAKANE_GENOMIC_RANGES_LIST_HPP +#define TAKANE_GENOMIC_RANGES_LIST_HPP + +#include "H5Cpp.h" + +#include +#include +#include + +#include "utils_public.hpp" +#include "utils_compressed_list.hpp" + +/** + * @file genomic_ranges_list.hpp + * @brief Validation for genomic ranges lists. + */ + +namespace takane { + +namespace genomic_ranges_list { + +/** + * @param path Path to the directory containing the genomic ranges list. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + internal_compressed_list::validate_directory(path, "genomic_ranges_list", "genomic_ranges", options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'genomic_ranges_list' object at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to a directory containing an genomic ranges list. + * @param options Validation options, mostly for input performance. + * @return The length of the list. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return internal_compressed_list::height(path, "genomic_ranges_list", options); +} + +} + +} + +#endif diff --git a/inst/include/takane/sequence_information.hpp b/inst/include/takane/sequence_information.hpp index c9cfc18..537f55a 100644 --- a/inst/include/takane/sequence_information.hpp +++ b/inst/include/takane/sequence_information.hpp @@ -1,12 +1,14 @@ #ifndef TAKANE_SEQUENCE_INFORMATION_HPP #define TAKANE_SEQUENCE_INFORMATION_HPP -#include "comservatory/comservatory.hpp" - -#include "data_frame.hpp" -#include "utils_csv.hpp" +#include "ritsuko/hdf5/hdf5.hpp" +#include #include +#include +#include + +#include "utils_public.hpp" /** * @file sequence_information.hpp @@ -22,112 +24,77 @@ namespace takane { namespace sequence_information { /** - * @brief Parameters for validating the sequence information file. + * @param path Path to the directory containing the data frame. + * @param options Validation options, typically for reading performance. */ -struct Parameters { - /** - * Expected number of sequences. - */ - size_t num_sequences = 0; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. - */ - bool parallel = false; - - /** - * Version of the `sequence_information` format. 
- */ - int version = 1; -}; +inline void validate(const std::filesystem::path& path, const Options& options) try { + auto handle = ritsuko::hdf5::open_file(path / "info.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "sequence_information"); -/** - * @cond - */ -template -CsvContents validate_base(ParseCommand parse, const Parameters& params, CsvFieldCreator* creator) { - DummyCsvFieldCreator default_creator; - if (creator == NULL) { - creator = &default_creator; + size_t nseq = 0; + { + auto nhandle = ritsuko::hdf5::open_dataset(ghandle, "name"); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype class for 'name'"); + } + + nseq = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + std::unordered_set collected; + ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nseq, options.hdf5_buffer_size); + for (size_t s = 0; s < nseq; ++s, stream.next()) { + auto x = stream.steal(); + if (collected.find(x) != collected.end()) { + throw std::runtime_error("detected duplicated sequence name '" + x + "'"); + } + collected.insert(std::move(x)); + } } - comservatory::Contents contents; - CsvContents output; - contents.names.push_back("seqnames"); + const char* missing_attr_name = "missing-value-placeholder"; + { - auto ptr = creator->string(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvUniqueStringField(0, ptr)); + auto lhandle = ritsuko::hdf5::open_dataset(ghandle, "length"); + if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) { + throw std::runtime_error("expected a datatype for 'length' that fits in a 64-bit unsigned integer"); + } + if (ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false) != nseq) { + throw std::runtime_error("expected lengths of 'length' and 'name' to be equal"); + } + if (lhandle.attrExists(missing_attr_name)) { + auto ahandle = lhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(lhandle, ahandle); + } } - contents.names.push_back("seqlengths"); { - auto ptr = creator->integer(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvNonNegativeIntegerField(1, ptr)); + auto chandle = ritsuko::hdf5::open_dataset(ghandle, "circular"); + if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) { + throw std::runtime_error("expected a datatype for 'circular' that fits in a 32-bit signed integer"); + } + if (ritsuko::hdf5::get_1d_length(chandle.getSpace(), false) != nseq) { + throw std::runtime_error("expected lengths of 'length' and 'circular' to be equal"); + } + if (chandle.attrExists(missing_attr_name)) { + auto ahandle = chandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(chandle, ahandle); + } } - contents.names.push_back("isCircular"); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(creator->boolean()); - - contents.names.push_back("genome"); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(creator->string()); - - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - if (contents.num_records() != params.num_sequences) { - throw std::runtime_error("number of records in the CSV file does not match the expected number of ranges"); + { + auto gnhandle = ritsuko::hdf5::open_dataset(ghandle, "genome"); + if (gnhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype class for 'genome'"); + } + if (ritsuko::hdf5::get_1d_length(gnhandle.getSpace(), 
false) != nseq) { + throw std::runtime_error("expected lengths of 'length' and 'genome' to be equal"); + } + if (gnhandle.attrExists(missing_attr_name)) { + auto ahandle = gnhandle.openAttribute(missing_attr_name); + ritsuko::hdf5::check_missing_placeholder_attribute(gnhandle, ahandle); + } } - - output.reconstitute(contents.fields); - return output; -} -/** - * @endcond - */ - -/** - * Checks if a CSV data frame is correctly formatted for sequence information. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - * Whether the `fields` member actually contains the CSV data depends on `creator`. - */ -template -CsvContents validate(Reader& reader, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read(reader, contents, opts); }, - params, - creator - ); -} - -/** - * Overload of `sequence_information::validate()` that accepts a file path. - * - * @param path Path to the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - */ -inline CsvContents validate(const char* path, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read_file(path, contents, opts); }, - params, - creator - ); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate 'sequence_information' object at '" + path.string() + "'; " + std::string(e.what())); } } diff --git a/inst/include/takane/string_factor.hpp b/inst/include/takane/string_factor.hpp index 0b979e7..e8cc512 100644 --- a/inst/include/takane/string_factor.hpp +++ b/inst/include/takane/string_factor.hpp @@ -8,7 +8,8 @@ #include "ritsuko/hdf5/hdf5.hpp" #include "utils_public.hpp" -#include "utils_hdf5.hpp" +#include "utils_string.hpp" +#include "utils_factor.hpp" /** * @file string_factor.hpp @@ -28,41 +29,21 @@ namespace string_factor { * @param options Validation options, typically for reading performance. 
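+ *
+ * As a non-normative sketch (the version string and level values are assumed for
+ * illustration), a minimal `string_factor` file accepted by this validator could
+ * be created as follows, with the codes stored in an unsigned integer datatype:
+ *
+ * @code
+ * H5::H5File handle("contents.h5", H5F_ACC_TRUNC);
+ * auto ghandle = handle.createGroup("string_factor");
+ *
+ * H5::StrType stype(H5::PredType::C_S1, H5T_VARIABLE);
+ * ghandle.createAttribute("version", stype, H5::DataSpace(H5S_SCALAR)).write(stype, std::string("1.0"));
+ *
+ * std::vector<const char*> levels { "low", "high" };
+ * hsize_t nlevels = levels.size();
+ * H5::DataSpace lspace(1, &nlevels);
+ * ghandle.createDataSet("levels", stype, lspace).write(levels.data(), stype);
+ *
+ * std::vector<uint16_t> codes { 0, 1, 1, 0 };
+ * hsize_t ncodes = codes.size();
+ * H5::DataSpace cspace(1, &ncodes);
+ * ghandle.createDataSet("codes", H5::PredType::NATIVE_UINT16, cspace).write(codes.data(), H5::PredType::NATIVE_UINT16);
+ * @endcode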
*/ inline void validate(const std::filesystem::path& path, const Options& options) try { - H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); - - const char* parent = "string_factor"; - if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a 'string_factor' group"); - } - auto ghandle = handle.openGroup(parent); + auto handle = ritsuko::hdf5::open_file(path / "contents.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, "string_factor"); - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); if (version.major != 1) { throw std::runtime_error("unsupported version string '" + vstring + "'"); } - if (ghandle.attrExists("ordered")) { - auto oattr = ritsuko::hdf5::get_scalar_attribute(ghandle, "ordered"); - if (ritsuko::hdf5::exceeds_integer_limit(oattr, 32, true)) { - throw std::runtime_error("expected a datatype for the 'ordered' attribute that fits in a 32-bit signed integer"); - } - } + internal_factor::check_ordered_attribute(ghandle); - // Number of levels. - size_t num_levels = internal_hdf5::validate_factor_levels(ghandle, "levels", options.hdf5_buffer_size); - size_t num_codes = internal_hdf5::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size); - - if (ghandle.exists("names")) { - auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); - if (nhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("'names' should be a string datatype class"); - } - auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); - if (num_codes != nlen) { - throw std::runtime_error("'names' and 'codes' should have the same length"); - } - } + size_t num_levels = internal_factor::validate_factor_levels(ghandle, "levels", options.hdf5_buffer_size); + size_t num_codes = internal_factor::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size); + + internal_string::validate_names(ghandle, "names", num_codes, options.hdf5_buffer_size); } catch (std::exception& e) { throw std::runtime_error("failed to validate a 'string_factor' at '" + path.string() + "'; " + std::string(e.what())); diff --git a/inst/include/takane/takane.hpp b/inst/include/takane/takane.hpp index f2e11a7..4fede0e 100644 --- a/inst/include/takane/takane.hpp +++ b/inst/include/takane/takane.hpp @@ -3,6 +3,7 @@ #include "_validate.hpp" #include "_height.hpp" +#include "_satisfies_interface.hpp" /** * @namespace takane diff --git a/inst/include/takane/utils_compressed_list.hpp b/inst/include/takane/utils_compressed_list.hpp new file mode 100644 index 0000000..8f1ab24 --- /dev/null +++ b/inst/include/takane/utils_compressed_list.hpp @@ -0,0 +1,96 @@ +#ifndef TAKANE_UTILS_COMPRESSED_LIST_HPP +#define TAKANE_UTILS_COMPRESSED_LIST_HPP + +#include "H5Cpp.h" +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +#include +#include +#include +#include +#include + +#include "utils_public.hpp" +#include "utils_string.hpp" +#include "utils_other.hpp" + +namespace takane { + +void validate(const std::filesystem::path&, const std::string&, const Options&); +size_t height(const std::filesystem::path&, const std::string&, const Options&); +bool satisfies_interface(const std::string&, const std::string&); + +namespace internal_compressed_list { + +inline hsize_t 
validate_group(const H5::Group& handle, size_t concatenated_length, hsize_t buffer_size) { + auto lhandle = ritsuko::hdf5::open_dataset(handle, "lengths"); + if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) { + throw std::runtime_error("expected 'lengths' to have a datatype that fits in a 64-bit unsigned integer"); + } + + size_t len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + ritsuko::hdf5::Stream1dNumericDataset stream(&lhandle, len, buffer_size); + size_t total = 0; + for (size_t i = 0; i < len; ++i, stream.next()) { + total += stream.get(); + } + if (total != concatenated_length) { + throw std::runtime_error("sum of 'lengths' does not equal the height of the concatenated object (got " + std::to_string(total) + ", expected " + std::to_string(concatenated_length) + ")"); + } + + return len; +} + +template +void validate_directory(const std::filesystem::path& path, const std::string& object_type, const std::string& concatenated_type, const Options& options) try { + auto handle = ritsuko::hdf5::open_file(path / "partitions.h5"); + auto ghandle = ritsuko::hdf5::open_group(handle, object_type.c_str()); + + auto vstring = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "version"); + auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); + if (version.major != 1) { + throw std::runtime_error("unsupported version string '" + vstring + "'"); + } + + auto catdir = path / "concatenated"; + auto cattype = read_object_type(catdir); + if constexpr(satisfies_interface_) { + if (!satisfies_interface(cattype, concatenated_type)) { + throw std::runtime_error("'concatenated' should satisfy the '" + concatenated_type + "' interface"); + } + } else { + if (cattype != concatenated_type) { + throw std::runtime_error("'concatenated' should contain an '" + concatenated_type + "' object"); + } + } + + try { + ::takane::validate(catdir, cattype, options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate the 'concatenated' object; " + std::string(e.what())); + } + size_t catheight = ::takane::height(catdir, cattype, options); + + size_t len = validate_group(ghandle, catheight, options.hdf5_buffer_size); + + internal_string::validate_names(ghandle, "names", len, options.hdf5_buffer_size); + internal_other::validate_mcols(path, "element_annotations", len, options); + internal_other::validate_metadata(path, "other_annotations", options); + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an '" + object_type + "' object at '" + path.string() + "'; " + std::string(e.what())); +} + +inline size_t height(const std::filesystem::path& path, const std::string& name, [[maybe_unused]] const Options& options) { + H5::H5File handle(path / "partitions.h5", H5F_ACC_RDONLY); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openDataSet("lengths"); + return ritsuko::hdf5::get_1d_length(dhandle, false); +} + +} + +} + +#endif diff --git a/inst/include/takane/utils_factor.hpp b/inst/include/takane/utils_factor.hpp new file mode 100644 index 0000000..868a05d --- /dev/null +++ b/inst/include/takane/utils_factor.hpp @@ -0,0 +1,86 @@ +#ifndef TAKANE_UTILS_FACTOR_HPP +#define TAKANE_UTILS_FACTOR_HPP + +#include +#include +#include +#include +#include + +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +namespace takane { + +namespace internal_factor { + +template +void check_ordered_attribute(const H5Object_& handle) { + if (!handle.attrExists("ordered")) 
{ + return; + } + + auto attr = handle.openAttribute("ordered"); + if (!ritsuko::hdf5::is_scalar(attr)) { + throw std::runtime_error("expected 'ordered' attribute to be a scalar"); + } + if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) { + throw std::runtime_error("expected 'ordered' attribute to have a datatype that fits in a 32-bit signed integer"); + } +} + +inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) { + auto lhandle = ritsuko::hdf5::open_dataset(handle, name.c_str()); + if (lhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype for '" + name + "'"); + } + + auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + std::unordered_set present; + + ritsuko::hdf5::Stream1dStringDataset stream(&lhandle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (present.find(x) != present.end()) { + throw std::runtime_error("'" + name + "' contains duplicated factor level '" + x + "'"); + } + present.insert(std::move(x)); + } + + return len; +} + +inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) { + auto chandle = ritsuko::hdf5::open_dataset(handle, name.c_str()); + if (ritsuko::hdf5::exceeds_integer_limit(chandle, 64, false)) { + throw std::runtime_error("expected a datatype for '" + name + "' that fits in a 64-bit unsigned integer"); + } + + bool has_missing = false; + int32_t missing_placeholder = 0; + if (allow_missing) { + auto missingness = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder(chandle, "missing-value-placeholder"); + has_missing = missingness.first; + missing_placeholder = missingness.second; + } + + auto len = ritsuko::hdf5::get_1d_length(chandle.getSpace(), false); + ritsuko::hdf5::Stream1dNumericDataset stream(&chandle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.get(); + if (has_missing && x == missing_placeholder) { + continue; + } + if (static_cast(x) >= num_levels) { + throw std::runtime_error("expected factor codes to be less than the number of levels"); + } + } + + return len; +} + +} + +} + +#endif diff --git a/inst/include/takane/utils_hdf5.hpp b/inst/include/takane/utils_hdf5.hpp deleted file mode 100644 index 5ae35b8..0000000 --- a/inst/include/takane/utils_hdf5.hpp +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef TAKANE_UTILS_HDF5_HPP -#define TAKANE_UTILS_HDF5_HPP - -#include -#include -#include -#include -#include - -#include "ritsuko/ritsuko.hpp" -#include "ritsuko/hdf5/hdf5.hpp" - -namespace takane { - -namespace internal_hdf5 { - -inline void validate_string_format(const H5::DataSet& handle, hsize_t len, const std::string& format, bool has_missing, const std::string& missing_value, hsize_t buffer_size) { - if (format == "date") { - ritsuko::hdf5::load_1d_string_dataset( - handle, - len, - buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_date(p, l)) { - throw std::runtime_error("expected a date-formatted string (got '" + x + "')"); - } - } - ); - - } else if (format == "date-time") { - ritsuko::hdf5::load_1d_string_dataset( - handle, - len, - buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_rfc3339(p, l)) { - throw 
std::runtime_error("expected a date/time-formatted string (got '" + x + "')"); - } - } - ); - - } else if (format != "none") { - throw std::runtime_error("unsupported format '" + format + "'"); - } -} - -inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) { - auto lhandle = ritsuko::hdf5::get_dataset(handle, name.c_str()); - if (lhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected a string datatype for '" + name + "'"); - } - - auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); - std::unordered_set present; - - ritsuko::hdf5::load_1d_string_dataset( - lhandle, - len, - buffer_size, - [&](hsize_t, const char* p, size_t len) { - std::string x(p, p + len); - if (present.find(x) != present.end()) { - throw std::runtime_error("'" + name + "' contains duplicated factor level '" + x + "'"); - } - present.insert(std::move(x)); - } - ); - - return len; -} - -inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) { - auto chandle = ritsuko::hdf5::get_dataset(handle, name.c_str()); - if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) { - throw std::runtime_error("expected a datatype for '" + name + "' that fits in a 32-bit signed integer"); - } - - auto len = ritsuko::hdf5::get_1d_length(chandle.getSpace(), false); - auto block_size = ritsuko::hdf5::pick_1d_block_size(chandle.getCreatePlist(), len, buffer_size); - std::vector buffer(block_size); - - bool has_missing = false; - int32_t missing_placeholder = 0; - if (allow_missing) { - const char* missing_attr_name = "missing-value-placeholder"; - has_missing = chandle.attrExists(missing_attr_name); - if (has_missing) { - auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(chandle, missing_attr_name); - missing_attr.read(H5::PredType::NATIVE_INT32, &missing_placeholder); - } - } - - ritsuko::hdf5::iterate_1d_blocks( - len, - block_size, - [&](hsize_t, hsize_t len, const H5::DataSpace& memspace, const H5::DataSpace& dataspace) { - chandle.read(buffer.data(), H5::PredType::NATIVE_INT32, memspace, dataspace); - for (hsize_t i = 0; i < len; ++i) { - if (has_missing && buffer[i] == missing_placeholder) { - continue; - } - if (buffer[i] < 0) { - throw std::runtime_error("expected factor codes to be non-negative"); - } - if (static_cast(buffer[i]) >= num_levels) { - throw std::runtime_error("expected factor codes to be less than the number of levels"); - } - } - } - ); - - return len; -} - -} - -} - -#endif diff --git a/inst/include/takane/utils_other.hpp b/inst/include/takane/utils_other.hpp index 81084ce..bf995a4 100644 --- a/inst/include/takane/utils_other.hpp +++ b/inst/include/takane/utils_other.hpp @@ -13,42 +13,45 @@ namespace takane { */ void validate(const std::filesystem::path&, const std::string&, const Options&); size_t height(const std::filesystem::path&, const std::string&, const Options&); +bool satisfies_interface(const std::string&, const std::string&); /** * @endcond */ namespace internal_other { -inline bool ends_with(const std::string& full, const std::string& sub) { - return (full.size() >= sub.size() && full.find(sub) == full.size() - sub.size()); -} - -inline void validate_mcols(const std::filesystem::path& path, size_t expected, const Options& options) { +inline void validate_mcols(const std::filesystem::path& parent, const std::string& name, size_t expected, const Options& options) try { + auto path = parent / 
name; if (!std::filesystem::exists(path)) { return; } auto xtype = read_object_type(path); - if (!ends_with(xtype, "data_frame")) { - throw std::runtime_error("expected a 'data_frame' or one of its derivatives"); + if (!satisfies_interface(xtype, "DATA_FRAME")) { + throw std::runtime_error("expected an object that satisfies the 'DATA_FRAME' interface"); } ::takane::validate(path, xtype, options); if (::takane::height(path, xtype, options) != expected) { throw std::runtime_error("unexpected number of rows"); } +} catch (std::exception& e) { + throw std::runtime_error("failed to validate '" + name + "'; " + std::string(e.what())); } -inline void validate_metadata(const std::filesystem::path& path, const Options& options) { +inline void validate_metadata(const std::filesystem::path& parent, const std::string& name, const Options& options) try { + auto path = parent / name; if (!std::filesystem::exists(path)) { return; } auto xtype = read_object_type(path); - if (!ends_with(xtype, "simple_list")) { - throw std::runtime_error("expected a 'simple_list' or one of its derivatives"); + if (!satisfies_interface(xtype, "SIMPLE_LIST")) { + throw std::runtime_error("expected an object that satisfies the 'SIMPLE_LIST' interface"); } ::takane::validate(path, xtype, options); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate '" + name + "'; " + std::string(e.what())); } } diff --git a/inst/include/takane/utils_string.hpp b/inst/include/takane/utils_string.hpp new file mode 100644 index 0000000..ba862f8 --- /dev/null +++ b/inst/include/takane/utils_string.hpp @@ -0,0 +1,88 @@ +#ifndef TAKANE_UTILS_STRING_HPP +#define TAKANE_UTILS_STRING_HPP + +#include +#include +#include +#include +#include + +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +namespace takane { + +namespace internal_string { + +template<class H5Object_> +std::string fetch_format_attribute(const H5Object_& handle) { + if (!handle.attrExists("format")) { + return "none"; + } + + auto attr = handle.openAttribute("format"); + if (!ritsuko::hdf5::is_scalar(attr)) { + throw std::runtime_error("expected 'format' attribute to be a scalar"); + } + if (attr.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected 'format' attribute to be a string"); + } + return ritsuko::hdf5::load_scalar_string_attribute(attr); +} + +inline void validate_string_format(const H5::DataSet& handle, hsize_t len, const std::string& format, bool has_missing, const std::string& missing_value, hsize_t buffer_size) { + if (format == "date") { + ritsuko::hdf5::Stream1dStringDataset stream(&handle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (has_missing && missing_value == x) { + continue; + } + if (!ritsuko::is_date(x.c_str(), x.size())) { + throw std::runtime_error("expected a date-formatted string (got '" + x + "')"); + } + } + + } else if (format == "date-time") { + ritsuko::hdf5::Stream1dStringDataset stream(&handle, len, buffer_size); + for (hsize_t i = 0; i < len; ++i, stream.next()) { + auto x = stream.steal(); + if (has_missing && missing_value == x) { + continue; + } + if (!ritsuko::is_rfc3339(x.c_str(), x.size())) { + throw std::runtime_error("expected a date/time-formatted string (got '" + x + "')"); + } + } + + } else if (format == "none") { + ritsuko::hdf5::validate_1d_string_dataset(handle, len, buffer_size); + + } else { + throw std::runtime_error("unsupported format '" + format + "'"); + } +} + +inline void validate_names(const H5::Group& handle, const
std::string& name, size_t len, hsize_t buffer_size) { + if (!handle.exists(name)) { + return; + } + + auto nhandle = ritsuko::hdf5::open_dataset(handle, name.c_str()); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'" + name + "' should be a string datatype class"); + } + + auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + if (len != nlen) { + throw std::runtime_error("'" + name + "' should have the same length as the parent object (got " + std::to_string(nlen) + ", expected " + std::to_string(len) + ")"); + } + + ritsuko::hdf5::validate_1d_string_dataset(nhandle, len, buffer_size); +} + +} + +} + +#endif diff --git a/inst/include/uzuki2/Version.hpp b/inst/include/uzuki2/Version.hpp index 841185a..55ab9dc 100644 --- a/inst/include/uzuki2/Version.hpp +++ b/inst/include/uzuki2/Version.hpp @@ -1,9 +1,6 @@ #ifndef UZUKI2_VERSIONED_BASE_HPP #define UZUKI2_VERSIONED_BASE_HPP -#include -#include - /** * @file Version.hpp * @brief Version-related definitions. @@ -63,54 +60,6 @@ struct Version { } }; -/** - * @cond - */ -inline Version parse_version_string(const std::string& version_string) { - int major = 0, minor = 0; - size_t i = 0, end = version_string.size(); - - if (version_string.empty()) { - throw std::runtime_error("version string is empty"); - } - if (version_string[i] == '0') { - throw std::runtime_error("invalid version string '" + version_string + "' has leading zeros in its major version"); - } - while (i < end && version_string[i] != '.') { - if (!std::isdigit(version_string[i])) { - throw std::runtime_error("invalid version string '" + version_string + "' contains non-digit characters"); - } - major *= 10; - major += version_string[i] - '0'; - ++i; - } - - if (i == end) { - throw std::runtime_error("version string '" + version_string + "' is missing a minor version"); - } - ++i; // get past the period and check again. 
- if (i == end) { - throw std::runtime_error("version string '" + version_string + "' is missing a minor version"); - } - - if (version_string[i] == '0' && i + 1 < end) { - throw std::runtime_error("invalid version string '" + version_string + "' has leading zeros in its minor version"); - } - while (i < end) { - if (!std::isdigit(version_string[i])) { - throw std::runtime_error("invalid version string '" + version_string + "' contains non-digit characters"); - } - minor *= 10; - minor += version_string[i] - '0'; - ++i; - } - - return Version(major, minor); -} -/** - * @cond - */ - } #endif diff --git a/inst/include/uzuki2/parse_hdf5.hpp b/inst/include/uzuki2/parse_hdf5.hpp index 200c7b7..80aa040 100644 --- a/inst/include/uzuki2/parse_hdf5.hpp +++ b/inst/include/uzuki2/parse_hdf5.hpp @@ -41,18 +41,19 @@ namespace hdf5 { /** * @cond */ -inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const std::string& name, H5T_class_t type_class) try { - auto dhandle = ritsuko::hdf5::get_scalar_dataset(handle, name.c_str()); - if (dhandle.getTypeClass() != type_class) { - throw std::runtime_error("dataset has the wrong datatype class"); +inline H5::DataSet check_scalar_dataset(const H5::Group& handle, const char* name) { + if (handle.childObjType(name) != H5O_TYPE_DATASET) { + throw std::runtime_error("expected '" + std::string(name) + "' to be a dataset"); + } + auto dhandle = handle.openDataSet(name); + if (!ritsuko::hdf5::is_scalar(dhandle)) { + throw std::runtime_error("expected '" + std::string(name) + "' to be a scalar dataset"); } return dhandle; -} catch (std::exception& e) { - throw std::runtime_error("failed to load scalar dataset at '" + ritsuko::hdf5::get_name(handle) + "/" + name + "'; " + std::string(e.what())); } template<class Host, class Function> -void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version) try { +void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try { if (ritsuko::hdf5::exceeds_integer_limit(handle, 32, true)) { throw std::runtime_error("dataset cannot be represented by 32-bit signed integers"); } @@ -65,69 +66,57 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co const char* placeholder_name = "missing-value-placeholder"; has_missing = handle.attrExists(placeholder_name); if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2)); + auto attr = handle.openAttribute(placeholder_name); + ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2)); attr.read(H5::PredType::NATIVE_INT32, &missing_value); } } hsize_t full_length = ptr->size(); - auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, /* buffer_size = */ 10000); - std::vector<int32_t> buffer(block_size); - ritsuko::hdf5::iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), H5::PredType::NATIVE_INT32, mspace, dspace); - for (hsize_t i = 0; i < limit; ++i) { - auto current = buffer[i]; - if (has_missing && current == missing_value) { - ptr->set_missing(counter + i); - } else { - check(current); - ptr->set(counter + i, current); - } - } + ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + auto current =
stream.get(); + if (has_missing && current == missing_value) { + ptr->set_missing(i); + } else { + check(current); + ptr->set(i, current); } - ); + } + } catch (std::exception& e) { throw std::runtime_error("failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check) try { +void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsize_t buffer_size) try { auto dtype = handle.getDataType(); if (dtype.getClass() != H5T_STRING) { throw std::runtime_error("expected a string dataset"); } - const char* placeholder_name = "missing-value-placeholder"; - bool has_missing = handle.attrExists(placeholder_name); - std::string missing_val; - if (has_missing) { - auto ahandle = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ true); - missing_val = ritsuko::hdf5::load_scalar_string_attribute(ahandle); - } + auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle, "missing-value-placeholder"); + bool has_missing = missingness.first; + std::string missing_val = missingness.second; - ritsuko::hdf5::load_1d_string_dataset( - handle, - ptr->size(), - /* buffer_size = */ 10000, - [&](size_t i, const char* str, size_t len) -> void { - std::string x(str, str + len); - if (has_missing && x == missing_val) { - ptr->set_missing(i); - } else { - check(x); - ptr->set(i, std::move(x)); - } + hsize_t full_length = ptr->size(); + ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + auto x = stream.steal(); + if (has_missing && x == missing_val) { + ptr->set_missing(i); + } else { + check(x); + ptr->set(i, std::move(x)); } - ); + } + } catch (std::exception& e) { throw std::runtime_error("failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version) try { +void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try { if (version.lt(1, 3)) { if (handle.getTypeClass() != H5T_FLOAT) { throw std::runtime_error("expected a floating-point dataset"); @@ -148,7 +137,8 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V const char* placeholder_name = "missing-value-placeholder"; has_missing = handle.attrExists(placeholder_name); if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2)); + auto attr = handle.openAttribute(placeholder_name); + ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2)); attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value); } } @@ -166,30 +156,23 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V }; hsize_t full_length = ptr->size(); - auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, /* buffer_size = */ 10000); - std::vector buffer(block_size); - ritsuko::hdf5::iterate_1d_blocks( - full_length, - block_size, - [&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void { - handle.read(buffer.data(), H5::PredType::NATIVE_DOUBLE, mspace, dspace); - for (hsize_t i = 0; i < limit; ++i) 
{ - auto current = buffer[i]; - if (has_missing && is_missing_value(current)) { - ptr->set_missing(counter + i); - } else { - check(current); - ptr->set(counter + i, current); - } - } + ritsuko::hdf5::Stream1dNumericDataset stream(&handle, full_length, buffer_size); + for (hsize_t i = 0; i < full_length; ++i, stream.next()) { + auto current = stream.get(); + if (has_missing && is_missing_value(current)) { + ptr->set_missing(i); + } else { + check(current); + ptr->set(i, current); } - ); + } + } catch (std::exception& e) { throw std::runtime_error("failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -void extract_names(const H5::Group& handle, Host* ptr) try { +void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try { if (handle.childObjType("names") != H5O_TYPE_DATASET) { throw std::runtime_error("expected a dataset"); } @@ -206,52 +189,46 @@ void extract_names(const H5::Group& handle, Host* ptr) try { throw std::runtime_error("number of names should be equal to the object length"); } - ritsuko::hdf5::load_1d_string_dataset( - nhandle, - nlen, - /* buffer_size = */ 10000, - [&](size_t i, const char* val, size_t len) -> void { - ptr->set_name(i, std::string(val, val + len)); - } - ); + ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size); + for (size_t i = 0; i < nlen; ++i, stream.next()) { + ptr->set_name(i, stream.steal()); + } } catch (std::exception& e) { throw std::runtime_error("failed to load names at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); } template -std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const Version& version) try { +std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const Version& version, hsize_t buffer_size) try { // Deciding what type we're dealing with. 
- auto object_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_object"); + auto object_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_object"); std::shared_ptr output; if (object_type == "list") { - if (!handle.exists("data") || handle.childObjType("data") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a group at 'data'"); - } - auto dhandle = handle.openGroup("data"); + auto dhandle = ritsuko::hdf5::open_group(handle, "data"); size_t len = dhandle.getNumObjs(); bool named = handle.exists("names"); auto lptr = Provisioner::new_List(len, named); output.reset(lptr); - for (size_t i = 0; i < len; ++i) { - auto istr = std::to_string(i); - if (!dhandle.exists(istr) || dhandle.childObjType(istr) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a group at 'data/" + istr + "'"); + try { + for (size_t i = 0; i < len; ++i) { + auto istr = std::to_string(i); + auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str()); + lptr->set(i, parse_inner(lhandle, ext, version, buffer_size)); } - auto lhandle = dhandle.openGroup(istr); - lptr->set(i, parse_inner(lhandle, ext, version)); + } catch (std::exception& e) { + throw std::runtime_error("failed to parse list contents in 'data'; " + std::string(e.what())); } if (named) { - extract_names(handle, lptr); + extract_names(handle, lptr, buffer_size); } } else if (object_type == "vector") { - auto vector_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_type"); + auto vector_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_type"); - auto dhandle = ritsuko::hdf5::get_dataset(handle, "data"); + auto dhandle = ritsuko::hdf5::open_dataset(handle, "data"); size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), true); bool is_scalar = (len == 0); if (is_scalar) { @@ -263,7 +240,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (vector_type == "integer") { auto iptr = Provisioner::new_Integer(len, named, is_scalar); output.reset(iptr); - parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version); + parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version, buffer_size); } else if (vector_type == "boolean") { auto bptr = Provisioner::new_Boolean(len, named, is_scalar); @@ -272,10 +249,10 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (x != 0 && x != 1) { throw std::runtime_error("boolean values should be 0 or 1"); } - }, version); + }, version, buffer_size); } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) { - auto levhandle = ritsuko::hdf5::get_dataset(handle, "levels"); + auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels"); auto levtype = levhandle.getDataType(); if (levtype.getClass() != H5T_STRING) { throw std::runtime_error("expected a string dataset for the levels at 'levels'"); @@ -286,9 +263,12 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (vector_type == "ordered") { ordered = true; } else if (handle.exists("ordered")) { - auto ohandle = get_scalar_dataset(handle, "ordered", H5T_INTEGER); - int tmp_ordered = 0; - ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT); + auto ohandle = check_scalar_dataset(handle, "ordered"); + if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32, true)) { + throw std::runtime_error("'ordered' value cannot be represented by a 32-bit integer"); + } + int32_t tmp_ordered = 0; + ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32); ordered = 
tmp_ordered > 0; } @@ -298,22 +278,18 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (x < 0 || x >= levlen) { throw std::runtime_error("factor codes should be non-negative and less than the number of levels"); } - }, version); + }, version, buffer_size); std::unordered_set present; - ritsuko::hdf5::load_1d_string_dataset( - levhandle, - levlen, - /* buffer_size = */ 10000, - [&](size_t i, const char* val, size_t len) -> void { - std::string x(val, val + len); - if (present.find(x) != present.end()) { - throw std::runtime_error("levels should be unique"); - } - fptr->set_level(i, x); - present.insert(std::move(x)); + ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size); + for (int32_t i = 0; i < levlen; ++i, stream.next()) { + auto x = stream.steal(); + if (present.find(x) != present.end()) { + throw std::runtime_error("levels should be unique"); } - ); + fptr->set_level(i, x); + present.insert(std::move(x)); + } } else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) { StringVector::Format format = StringVector::NONE; @@ -323,49 +299,46 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const } else if (vector_type == "date-time") { format = StringVector::DATETIME; } + } else if (handle.exists("format")) { - auto fhandle = get_scalar_dataset(handle, "format", H5T_STRING); - ritsuko::hdf5::load_1d_string_dataset( - fhandle, - 1, - /* buffer_size = */ 10000, - [&](size_t, const char* val, size_t len) -> void { - std::string x(val, val + len); - if (x == "date") { - format = StringVector::DATE; - } else if (x == "date-time") { - format = StringVector::DATETIME; - } else { - throw std::runtime_error("unsupported format '" + x + "'"); - } - } - ); + auto fhandle = check_scalar_dataset(handle, "format"); + if (fhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'format' dataset should have a string datatype class"); + } + auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle); + if (x == "date") { + format = StringVector::DATE; + } else if (x == "date-time") { + format = StringVector::DATETIME; + } else { + throw std::runtime_error("unsupported format '" + x + "'"); + } } auto sptr = Provisioner::new_String(len, named, is_scalar, format); output.reset(sptr); if (format == StringVector::NONE) { - parse_string_like(dhandle, sptr, [](const std::string&) -> void {}); + parse_string_like(dhandle, sptr, [](const std::string&) -> void {}, buffer_size); } else if (format == StringVector::DATE) { parse_string_like(dhandle, sptr, [&](const std::string& x) -> void { if (!ritsuko::is_date(x.c_str(), x.size())) { throw std::runtime_error("dates should follow YYYY-MM-DD formatting"); } - }); + }, buffer_size); } else if (format == StringVector::DATETIME) { parse_string_like(dhandle, sptr, [&](const std::string& x) -> void { if (!ritsuko::is_rfc3339(x.c_str(), x.size())) { throw std::runtime_error("date-times should follow the Internet Date/Time format"); } - }); + }, buffer_size); } } else if (vector_type == "number") { auto dptr = Provisioner::new_Number(len, named, is_scalar); output.reset(dptr); - parse_numbers(dhandle, dptr, [](double) -> void {}, version); + parse_numbers(dhandle, dptr, [](double) -> void {}, version, buffer_size); } else { throw std::runtime_error("unknown vector type '" + vector_type + "'"); @@ -373,14 +346,14 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (named) { auto vptr = 
static_cast(output.get()); - extract_names(handle, vptr); + extract_names(handle, vptr, buffer_size); } } else if (object_type == "nothing") { output.reset(Provisioner::new_Nothing()); } else if (object_type == "external") { - auto ihandle = ritsuko::hdf5::get_dataset(handle, "index"); + auto ihandle = ritsuko::hdf5::open_dataset(handle, "index"); if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32, true)) { throw std::runtime_error("external index at 'index' cannot be represented by a 32-bit signed integer"); } @@ -411,12 +384,28 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const * @endcond */ +/** + * @brief Options for HDF5 file parsing. + */ +struct Options { + /** + * Buffer size, in terms of the number of elements, to use for reading data from HDF5 datasets. + */ + hsize_t buffer_size = 10000; + + /** + * Whether to throw an error if the top-level R object is not an R list. + */ + bool strict_list = true; +}; + /** * @tparam Provisioner A class namespace defining static methods for creating new `Base` objects. * @tparam Externals Class describing how to resolve external references for type `EXTERNAL`. * * @param handle Handle for a HDF5 group corresponding to the list. * @param ext Instance of an external reference resolver class. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -456,16 +445,23 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const * - `size_t size()`, which returns the number of available external references. */ template -ParsedList parse(const H5::Group& handle, Externals ext) { +ParsedList parse(const H5::Group& handle, Externals ext, Options options = Options()) { Version version; if (handle.attrExists("uzuki_version")) { - auto ver_str = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_version"); - version = parse_version_string(ver_str); + auto ver_str = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_version"); + auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(), /* skip_patch = */ true); + version.major = vraw.major; + version.minor = vraw.minor; } ExternalTracker etrack(std::move(ext)); - auto ptr = parse_inner(handle, etrack, version); + auto ptr = parse_inner(handle, etrack, version, options.buffer_size); + + if (options.strict_list && ptr->type() != LIST) { + throw std::runtime_error("top-level object should represent an R list"); + } etrack.validate(); + return ParsedList(std::move(ptr), std::move(version)); } @@ -476,6 +472,7 @@ ParsedList parse(const H5::Group& handle, Externals ext) { * @tparam Provisioner A class namespace defining static methods for creating new `Base` objects. * * @param handle Handle for a HDF5 group corresponding to the list. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -483,8 +480,8 @@ ParsedList parse(const H5::Group& handle, Externals ext) { * Any invalid representations in `contents` will cause an error to be thrown. 
*/ template -ParsedList parse(const H5::Group& handle) { - return parse(handle, uzuki2::DummyExternals(0)); +ParsedList parse(const H5::Group& handle, Options options = Options()) { + return parse(handle, uzuki2::DummyExternals(0), std::move(options)); } /** @@ -496,6 +493,7 @@ ParsedList parse(const H5::Group& handle) { * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. * @param ext Instance of an external reference resolver class. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -503,9 +501,9 @@ ParsedList parse(const H5::Group& handle) { * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const std::string& file, const std::string& name, Externals ext) { +ParsedList parse(const std::string& file, const std::string& name, Externals ext, Options options = Options()) { H5::H5File handle(file, H5F_ACC_RDONLY); - return parse(handle.openGroup(name), std::move(ext)); + return parse(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), std::move(options)); } /** @@ -516,6 +514,7 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext * * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -523,9 +522,9 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const std::string& file, const std::string& name) { +ParsedList parse(const std::string& file, const std::string& name, Options options = Options()) { H5::H5File handle(file, H5F_ACC_RDONLY); - return parse(handle.openGroup(name), uzuki2::DummyExternals(0)); + return parse(ritsuko::hdf5::open_group(handle, name.c_str()), uzuki2::DummyExternals(0), std::move(options)); } /** @@ -536,10 +535,11 @@ ParsedList parse(const std::string& file, const std::string& name) { * @param name Name of the HDF5 group corresponding to `handle`. * Only used for error messages. * @param num_external Expected number of external references. + * @param options Optional parameters. */ -inline void validate(const H5::Group& handle, int num_external = 0) { +inline void validate(const H5::Group& handle, int num_external = 0, Options options = Options()) { DummyExternals ext(num_external); - parse(handle, ext); + parse(handle, ext, std::move(options)); return; } @@ -550,10 +550,11 @@ inline void validate(const H5::Group& handle, int num_external = 0) { * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. * @param num_external Expected number of external references. + * @param options Optional parameters. 
*/ -inline void validate(const std::string& file, const std::string& name, int num_external = 0) { +inline void validate(const std::string& file, const std::string& name, int num_external = 0, Options options = Options()) { DummyExternals ext(num_external); - parse(file, name, ext); + parse(file, name, ext, std::move(options)); return; } diff --git a/inst/include/uzuki2/parse_json.hpp b/inst/include/uzuki2/parse_json.hpp index b380c92..b7e2fbb 100644 --- a/inst/include/uzuki2/parse_json.hpp +++ b/inst/include/uzuki2/parse_json.hpp @@ -395,6 +395,11 @@ struct Options { * If true, an extra thread is used to avoid blocking I/O operations. */ bool parallel = false; + + /** + * Whether to throw an error if the top-level R object is not an R list. + */ + bool strict_list = true; }; /** @@ -434,14 +439,21 @@ ParsedList parse(byteme::Reader& reader, Externals ext, Options options = Option if (vIt->second->type() != millijson::STRING) { throw std::runtime_error("expected a string in 'version'"); } - auto vptr = static_cast(vIt->second.get()); - version = parse_version_string(vptr->value); + const auto& vstr = static_cast(vIt->second.get())->value; + auto vraw = ritsuko::parse_version_string(vstr.c_str(), vstr.size(), /* skip_patch = */ true); + version.major = vraw.major; + version.minor = vraw.minor; } } ExternalTracker etrack(std::move(ext)); auto output = parse_object(contents.get(), etrack, "", version); + + if (options.strict_list && output->type() != LIST) { + throw std::runtime_error("top-level object should represent an R list"); + } etrack.validate(); + return ParsedList(std::move(output), std::move(version)); } From 3f53b949d53151eacd6c555dd9a134fe17a6b1f1 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 23 Nov 2023 23:39:32 -0800 Subject: [PATCH 2/2] Save factor codes as unsigned integers for new takane validators. This is, on the whole, more sensible, as downstream consumers don't have to check whether the codes are negative (which would make no sense outside of missing placeholders, and we don't need half the integer space for that). --- R/saveBaseFactor.R | 28 ++++++++++++++++++---------- R/saveDataFrame.R | 5 ++--- R/saveDataFrameFactor.R | 10 ++++------ tests/testthat/test-DataFrame.R | 2 +- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/R/saveBaseFactor.R b/R/saveBaseFactor.R index 64a391b..379931d 100644 --- a/R/saveBaseFactor.R +++ b/R/saveBaseFactor.R @@ -35,39 +35,47 @@ setMethod("saveObject", "factor", function(x, path, ...) 
{ fhandle <- H5Fopen(ofile) on.exit(H5Fclose(fhandle), add=TRUE) + ghandle <- H5Gopen(fhandle, host) + on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE) (function (){ - ghandle <- H5Gopen(fhandle, host) - on.exit(H5Gclose(ghandle), add=TRUE) h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE) if (is.ordered(x)) { h5writeAttribute(1L, ghandle, "ordered", asScalar=TRUE) } })() - .simple_save_codes(fhandle, host, x) - h5write(levels(x), fhandle, paste0(host, "/levels")) + .simple_save_codes(ghandle, x) + h5write(levels(x), ghandle, "levels") write("string_factor", file=file.path(path, "OBJECT")) invisible(NULL) }) -.simple_save_codes <- function(fhandle, host, x, save.names=TRUE) { +.simple_save_codes <- function(ghandle, x, save.names=TRUE) { codes <- as.integer(x) - 1L missing.placeholder <- NULL if (anyNA(codes)) { - missing.placeholder <- -1L + missing.placeholder <- nlevels(x) codes[is.na(codes)] <- missing.placeholder } - full.data.name <- paste0(host, "/codes") - h5write(codes, fhandle, full.data.name) + shandle <- H5Screate_simple(length(x)) + on.exit(H5Sclose(shandle), add=TRUE) + dhandle <- H5Dcreate(ghandle, "codes", dtype_id="H5T_NATIVE_UINT32", h5space=shandle) + on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE) + H5Dwrite(dhandle, codes) + if (!is.null(missing.placeholder)) { - addMissingPlaceholderAttributeForHdf5(fhandle, full.data.name, missing.placeholder) + ashandle <- H5Screate("H5S_SCALAR") + on.exit(H5Sclose(ashandle), add=TRUE, after=FALSE) + ahandle <- H5Acreate(dhandle, "missing-value-placeholder", dtype_id="H5T_NATIVE_UINT32", h5space=ashandle) + on.exit(H5Aclose(ahandle), add=TRUE, after=FALSE) + H5Awrite(ahandle, missing.placeholder) } if (save.names && !is.null(names(x))) { - h5write(names(x), fhandle, paste0(host, "/names")) + h5write(names(x), ghandle, "names") } } diff --git a/R/saveDataFrame.R b/R/saveDataFrame.R index 0440b90..e0b5c6c 100644 --- a/R/saveDataFrame.R +++ b/R/saveDataFrame.R @@ -91,11 +91,10 @@ setMethod("saveObject", "DataFrame", function(x, path, ...) { if (is.ordered(col)) { h5writeAttribute(1L, ghandle, "ordered", asScalar=TRUE) } + .simple_save_codes(ghandle, col, save.names=FALSE) + h5write(levels(col), ghandle, "levels"); })() - .simple_save_codes(fhandle, full.data.name, col, save.names=FALSE) - h5write(levels(col), fhandle, paste0(full.data.name, "/levels")); - } else if (.is_datetime(col)) { coltype <- "string" colformat <- "date-time" diff --git a/R/saveDataFrameFactor.R b/R/saveDataFrameFactor.R index 8cd6622..a79479b 100644 --- a/R/saveDataFrameFactor.R +++ b/R/saveDataFrameFactor.R @@ -32,13 +32,11 @@ setMethod("saveObject", "DataFrameFactor", function(x, path, ...) { fhandle <- H5Fopen(ofile) on.exit(H5Fclose(fhandle), add=TRUE) - (function (){ - ghandle <- H5Gopen(fhandle, host) - on.exit(H5Gclose(ghandle), add=TRUE) - h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE) - })() + ghandle <- H5Gopen(fhandle, host) + on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE) + h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE) - .simple_save_codes(fhandle, host, x) + .simple_save_codes(ghandle, x) stuff <- levels(x) altSaveObject(stuff, paste0(path, "/levels"), ...) 
diff --git a/tests/testthat/test-DataFrame.R b/tests/testthat/test-DataFrame.R index b0b9aeb..03b69c8 100644 --- a/tests/testthat/test-DataFrame.R +++ b/tests/testthat/test-DataFrame.R @@ -295,7 +295,7 @@ test_that("handling of NAs works correctly", { fpath <- file.path(tmp2, "basic_columns.h5") attrs <- rhdf5::h5readAttributes(fpath, "data_frame/data/2/codes") - expect_identical(attrs[["missing-value-placeholder"]], -1L) + expect_identical(attrs[["missing-value-placeholder"]], 2L) attrs <- rhdf5::h5readAttributes(fpath, "data_frame/data/3/codes") expect_null(attrs[["missing-value-placeholder"]])
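For reference, the read side implied by this change (0-based codes stored as unsigned integers, with the number of levels doubling as the missing-value placeholder) can be sketched in R as follows. This is illustrative only and not part of the patch: the file path and group name are hypothetical, and rhdf5 is assumed to be available.

library(rhdf5)
# Hypothetical file/group produced by saveObject() on a factor.
codes <- as.vector(h5read("example.h5", "fac/codes"))   # 0-based, non-negative by construction
lvls <- as.vector(h5read("example.h5", "fac/levels"))
attrs <- h5readAttributes("example.h5", "fac/codes")
placeholder <- attrs[["missing-value-placeholder"]]     # equals length(lvls) when NAs were present
if (!is.null(placeholder)) {
    codes[codes == placeholder] <- NA                   # no negative sentinel to special-case
}
f <- factor(lvls[codes + 1L], levels=lvls)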