diff --git a/.gitignore b/.gitignore index 749f4fa..765a5ce 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ build/ docs/html docs/latex +TEST_* diff --git a/CMakeLists.txt b/CMakeLists.txt index b83e4ed..fe51b94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,10 +20,11 @@ if(TAKANE_FETCH_EXTERN) add_subdirectory(extern) else() find_package(artifactdb_ritsuko CONFIG REQUIRED) + find_package(artifactdb_uzuki2 CONFIG REQUIRED) find_package(artifactdb_comservatory CONFIG REQUIRED) endif() -target_link_libraries(takane INTERFACE artifactdb::ritsuko artifactdb::comservatory) +target_link_libraries(takane INTERFACE artifactdb::ritsuko artifactdb::comservatory artifactdb::uzuki2) option(TAKANE_FIND_HDF5 "Try to find and link to HDF5 for takane." ON) if(TAKANE_FIND_HDF5) diff --git a/README.md b/README.md index f0a2c98..fcdac84 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,7 @@ For example, for the `hdf5_sparse_matrix`, we could do: ```cpp #include "takane/takane.hpp" - -takane::hdf5_sparse_matrix::Parameters params(group_name, { 10, 20 }); -params.type = takane::array::Type::BOOLEAN; - -takane::hdf5_sparse_matrix::validate(file_path, params); +takane::validate(dir); ``` Check out the [reference documentation](https://artifactdb.github.io/takane/) for more details. diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in index 70b1437..9c79692 100644 --- a/cmake/Config.cmake.in +++ b/cmake/Config.cmake.in @@ -2,6 +2,7 @@ include(CMakeFindDependencyMacro) find_dependency(artifactdb_ritsuko CONFIG REQUIRED) +find_dependency(artifactdb_uzuki2 CONFIG REQUIRED) find_dependency(artifactdb_comservatory CONFIG REQUIRED) if(@UZUKI2_FIND_HDF5@) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index e51656e..d0d36da 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -1,17 +1,23 @@ include(FetchContent) +FetchContent_Declare( + uzuki2 + GIT_REPOSITORY https://github.com/ArtifactDB/uzuki2 + GIT_TAG master +) + FetchContent_Declare( ritsuko GIT_REPOSITORY https://github.com/ArtifactDB/ritsuko GIT_TAG master ) -FetchContent_MakeAvailable(ritsuko) - FetchContent_Declare( comservatory GIT_REPOSITORY https://github.com/ArtifactDB/comservatory GIT_TAG master ) +FetchContent_MakeAvailable(uzuki2) +FetchContent_MakeAvailable(ritsuko) FetchContent_MakeAvailable(comservatory) diff --git a/include/takane/_height.hpp b/include/takane/_height.hpp new file mode 100644 index 0000000..f3a0eb8 --- /dev/null +++ b/include/takane/_height.hpp @@ -0,0 +1,114 @@ +#ifndef TAKANE_HEIGHT_HPP +#define TAKANE_HEIGHT_HPP + +#include +#include +#include +#include + +#include "utils_public.hpp" +#include "atomic_vector.hpp" +#include "string_factor.hpp" +#include "simple_list.hpp" +#include "data_frame.hpp" +#include "data_frame_factor.hpp" + +/** + * @file _height.hpp + * @brief Dispatch to functions for the object's height. 
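// A minimal usage sketch of the new directory-based entry points shown in the README above:
// the directory name here is hypothetical, the directory must contain an OBJECT file naming
// its type, and problems are reported by throwing std::runtime_error.
#include "takane/takane.hpp"
#include <filesystem>
#include <iostream>
#include <exception>

int main() {
    std::filesystem::path dir = "my_atomic_vector"; // hypothetical on-disk object
    try {
        takane::validate(dir);                               // dispatches on the object type
        std::cout << "height: " << takane::height(dir) << "\n";
    } catch (const std::exception& e) {
        std::cerr << e.what() << "\n";
        return 1;
    }
    return 0;
}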
+ */ + +namespace takane { + +/** + * @cond + */ +namespace internal_height { + +inline auto default_registry() { + std::unordered_map > registry; + registry["atomic_vector"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return atomic_vector::height(p, o); }; + registry["string_factor"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return string_factor::height(p, o); }; + registry["simple_list"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return simple_list::height(p, o); }; + registry["data_frame"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame::height(p, o); }; + registry["data_frame_factor"] = [](const std::filesystem::path& p, const Options& o) -> size_t { return data_frame_factor::height(p, o); }; + return registry; +} + +} +/** + * @endcond + */ + +/** + * Registry of functions to be used by `height()`. + */ +inline std::unordered_map > height_registry = internal_height::default_registry(); + +/** + * Override for application-defined height functions, to be used by `height()`. + * + * Any supplied function should accept the directory path, a string containing the object type, and additional `Options`. + * It should then return a pair indicating whether a height function for this object type was identified, and if so, the height itself. + * + * If no overriding function is found for the object type (or if `height_override` was not set at all), + * `height()` will instead search `height_registry` for an appropriate function. + */ +inline std::function(const std::filesystem::path&, const std::string&, const Options&)> height_override; + +/** + * Get the height of an object in a subdirectory, based on the supplied object type. + * This is used to check the shape of objects stored in vertical containers, e.g., columns of a `data_frame`. + * For vectors or other 1-dimensional objects, the height is usually just the length; + * for higher dimensional objects, the height is usually the extent of the first dimension. + * + * @param path Path to a directory representing an object. + * @param type Type of the object, typically determined from its `OBJECT` file. + * @param options Validation options, mostly for input performance. + * + * @return The object's height. + */ +inline size_t height(const std::filesystem::path& path, const std::string& type, const Options& options) { + if (!std::filesystem::exists(path) || std::filesystem::status(path).type() != std::filesystem::file_type::directory) { + throw std::runtime_error("expected '" + path.string() + "' to be a directory"); + } + + if (height_override) { + auto found = height_override(path, type, options); + if (found.first) { + return found.second; + } + } + + auto vrIt = height_registry.find(type); + if (vrIt == height_registry.end()) { + throw std::runtime_error("no registered height function for object type '" + type + "' at '" + path.string() + "'"); + } + + return (vrIt->second)(path, options); +} + +/** + * Get the height of an object in a subdirectory, using its `OBJECT` file to automatically determine the type. + * + * @param path Path to a directory containing an object. + * @param options Validation options, mostly for input performance. + * @return The object's height. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + return height(path, read_object_type(path), options); +} + +/** + * Overload of `height()` with default options. + * + * @param path Path to a directory containing an object. 
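// A sketch of how an application might extend the height machinery described above;
// "my_custom_vector" is a hypothetical object type and the bodies are stubs.
#include "takane/takane.hpp"
#include <filesystem>
#include <string>
#include <utility>

void register_custom_height() {
    // Per-type registration: the function receives the object's directory and the Options.
    takane::height_registry["my_custom_vector"] =
        [](const std::filesystem::path&, const takane::Options&) -> size_t {
            return 0; // e.g. read the length from the custom on-disk representation
        };

    // Or intercept every type at once; a {false, 0} result falls through to height_registry.
    takane::height_override =
        [](const std::filesystem::path&, const std::string& type, const takane::Options&) -> std::pair<bool, size_t> {
            if (type == "my_custom_vector") {
                return std::make_pair(true, static_cast<size_t>(0));
            }
            return std::make_pair(false, static_cast<size_t>(0));
        };
}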
+ * @return The object's height. + */ +inline size_t height(const std::filesystem::path& path) { + return height(path, Options()); +} + +} + +#endif diff --git a/include/takane/_validate.hpp b/include/takane/_validate.hpp new file mode 100644 index 0000000..0a2231b --- /dev/null +++ b/include/takane/_validate.hpp @@ -0,0 +1,106 @@ +#ifndef TAKANE_VALIDATE_HPP +#define TAKANE_VALIDATE_HPP + +#include +#include +#include +#include + +#include "utils_public.hpp" +#include "atomic_vector.hpp" +#include "string_factor.hpp" +#include "simple_list.hpp" +#include "data_frame.hpp" +#include "data_frame_factor.hpp" + +/** + * @file _validate.hpp + * @brief Validation dispatch function. + */ + +namespace takane { + +/** + * @cond + */ +namespace internal_validate { + +inline auto default_registry() { + std::unordered_map > registry; + registry["atomic_vector"] = [](const std::filesystem::path& p, const Options& o) { atomic_vector::validate(p, o); }; + registry["string_factor"] = [](const std::filesystem::path& p, const Options& o) { string_factor::validate(p, o); }; + registry["simple_list"] = [](const std::filesystem::path& p, const Options& o) { simple_list::validate(p, o); }; + registry["data_frame"] = [](const std::filesystem::path& p, const Options& o) { data_frame::validate(p, o); }; + registry["data_frame_factor"] = [](const std::filesystem::path& p, const Options& o) { data_frame_factor::validate(p, o); }; + return registry; +} + +} +/** + * @endcond + */ + +/** + * Registry of functions to be used by `validate()`. + */ +inline std::unordered_map > validate_registry = internal_validate::default_registry(); + +/** + * Override for application-defined validation functions, to be used by `validate()`. + * + * Any supplied function should accept the directory path, a string containing the object type, and additional `Options`. + * It should then return a boolean indicating whether a validation function for this object type was identified. + * + * If no overriding validator is found for the object type (or if `validate_override` was not set at all), + * `validate()` will instead search `validate_registry` for an appropriate function. + */ +inline std::function validate_override; + +/** + * Validate an object in a subdirectory, based on the supplied object type. + * + * @param path Path to a directory representing an object. + * @param type Type of the object, typically determined from its `OBJECT` file. + * @param options Validation options, mostly for input performance. + */ +inline void validate(const std::filesystem::path& path, const std::string& type, const Options& options) { + if (!std::filesystem::exists(path) || std::filesystem::status(path).type() != std::filesystem::file_type::directory) { + throw std::runtime_error("expected '" + path.string() + "' to be a directory"); + } + + if (validate_override) { + if (validate_override(path, type, options)) { + return; + } + } + + auto vrIt = validate_registry.find(type); + if (vrIt == validate_registry.end()) { + throw std::runtime_error("no registered validation function for object type '" + type + "' at '" + path.string() + "'"); + } + + (vrIt->second)(path, options); +} + +/** + * Validate an object in a subdirectory, using its `OBJECT` file to automatically determine the type. + * + * @param path Path to a directory containing an object. + * @param options Validation options, mostly for input performance. 
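// The equivalent extension points for validation, as a sketch; "my_custom_vector",
// "my_other_type" and the contents.h5 check are purely illustrative of what an
// application-defined validator might do.
#include "takane/takane.hpp"
#include <filesystem>
#include <stdexcept>
#include <string>

void register_custom_validator() {
    takane::validate_registry["my_custom_vector"] =
        [](const std::filesystem::path& p, const takane::Options&) {
            if (!std::filesystem::exists(p / "contents.h5")) { // assumed layout for this hypothetical type
                throw std::runtime_error("expected a 'contents.h5' file");
            }
        };

    // The override runs first: returning true means "this type was recognized and validated here",
    // so any failures should be thrown before returning; returning false defers to the registry.
    takane::validate_override =
        [](const std::filesystem::path&, const std::string& type, const takane::Options&) -> bool {
            return type == "my_other_type"; // hypothetical application-specific type
        };
}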
+ */ +inline void validate(const std::filesystem::path& path, const Options& options) { + validate(path, read_object_type(path), options); +} + +/** + * Overload of `validate()` with default options. + * + * @param path Path to a directory containing an object. + */ +inline void validate(const std::filesystem::path& path) { + validate(path, Options()); +} + +} + +#endif diff --git a/include/takane/atomic_vector.hpp b/include/takane/atomic_vector.hpp index 915f661..5278317 100644 --- a/include/takane/atomic_vector.hpp +++ b/include/takane/atomic_vector.hpp @@ -1,11 +1,14 @@ #ifndef TAKANE_ATOMIC_VECTOR_HPP #define TAKANE_ATOMIC_VECTOR_HPP -#include "comservatory/comservatory.hpp" +#include +#include +#include -#include "utils_csv.hpp" +#include "ritsuko/hdf5/hdf5.hpp" -#include +#include "utils_public.hpp" +#include "utils_hdf5.hpp" /** * @file atomic_vector.hpp @@ -21,157 +24,94 @@ namespace takane { namespace atomic_vector { /** - * Type of the atomic vector. - * - * - `INTEGER`: a number that can be represented by a 32-bit signed integer. - * - `NUMBER`: a number that can be represented by a double-precision float. - * - `STRING`: a string. - * - `BOOLEAN`: a boolean. - */ -enum class Type { - INTEGER, - NUMBER, - STRING, - BOOLEAN -}; - -/** - * @brief Parameters for validating the atomic vector file. + * @param path Path to the directory containing the atomic vector. + * @param options Validation options, typically for reading performance. */ -struct Parameters { - /** - * Length of the atomic vector. - */ - size_t length = 0; - - /** - * Type of the atomic vector. - */ - Type type = Type::INTEGER; - - /** - * Whether the vector is named. - */ - bool has_names = false; - - /** - * Whether to load and parse the file in parallel, see `comservatory::ReadOptions` for details. - */ - bool parallel = false; - - /** - * Version of the `atomic_vector` format. 
- */ - int version = 1; -}; +inline void validate(const std::filesystem::path& path, const Options& options) try { + H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); -/** - * @cond - */ -template -CsvContents validate_base(ParseCommand parse, const Parameters& params, CsvFieldCreator* creator) { - DummyCsvFieldCreator default_creator; - if (creator == NULL) { - creator = &default_creator; + const char* parent = "atomic_vector"; + if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { + throw std::runtime_error("expected an 'atomic_vector' group"); } + auto ghandle = handle.openGroup(parent); - comservatory::Contents contents; - CsvContents output; - if (params.has_names) { - auto ptr = creator->string(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvNameField(false, ptr)); + auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); + if (version.major != 1) { + throw std::runtime_error("unsupported version string '" + vstring + "'"); } - switch(params.type) { - case Type::INTEGER: - { - auto ptr = creator->integer(); - output.fields.emplace_back(ptr); - contents.fields.emplace_back(new CsvIntegerField(static_cast(params.has_names), ptr)); - } - break; - case Type::NUMBER: - { - auto ptr = creator->number(); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(ptr); + auto dhandle = ritsuko::hdf5::get_dataset(ghandle, "values"); + auto vlen = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false); + auto type = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "type"); + + const char* missing_attr_name = "missing-value-placeholder"; + bool has_missing = dhandle.attrExists(missing_attr_name); + + if (type == "string") { + if (dhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype for 'values'"); + } + + std::string missing_value; + if (has_missing) { + auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(dhandle, missing_attr_name, /* type_class_only = */ true); + missing_value = ritsuko::hdf5::load_scalar_string_attribute(missing_attr); + } + + if (ghandle.attrExists("format")) { + auto format = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "format"); + internal_hdf5::validate_string_format(dhandle, vlen, format, has_missing, missing_value, options.hdf5_buffer_size); + } + + } else { + if (type == "integer") { + if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32, true)) { + throw std::runtime_error("expected a datatype for 'values' that fits in a 32-bit signed integer"); } - break; - case Type::STRING: - { - auto ptr = creator->string(); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(ptr); + } else if (type == "boolean") { + if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32, true)) { + throw std::runtime_error("expected a datatype for 'values' that fits in a 32-bit signed integer"); } - break; - case Type::BOOLEAN: - { - auto ptr = creator->boolean(); - output.fields.emplace_back(nullptr); - contents.fields.emplace_back(ptr); + } else if (type == "number") { + if (ritsuko::hdf5::exceeds_float_limit(dhandle, 64)) { + throw std::runtime_error("expected a datatype for 'values' that fits in a 64-bit float"); } - break; - } + } else { + throw std::runtime_error("unsupported type '" + type + "'"); + } - comservatory::ReadOptions opt; - opt.parallel = params.parallel; - parse(contents, opt); - 
if (contents.num_records() != params.length) { - throw std::runtime_error("number of records in the CSV file does not match the expected length"); + if (has_missing) { + ritsuko::hdf5::get_missing_placeholder_attribute(dhandle, missing_attr_name); + } } - if (contents.names.back() != "values") { - throw std::runtime_error("column containing vector contents should be named 'values'"); + if (ghandle.exists("names")) { + auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'names' should be a string datatype class"); + } + auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + if (vlen != nlen) { + throw std::runtime_error("'names' and 'values' should have the same length"); + } } - output.reconstitute(contents.fields); - return output; -} -/** - * @endcond - */ - -/** - * Checks if a CSV is correctly formatted for the `atomic_vector` format. - * An error is raised if the file does not meet the specifications. - * - * @tparam Reader A **byteme** reader class. - * - * @param reader A stream of bytes from the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. - * Whether the `fields` member actually contains the CSV data depends on `creator`. - * If `params.has_names = true`, an additional column containing names is present at the start. - */ -template -CsvContents validate(Reader& reader, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read(reader, contents, opts); }, - params, - creator - ); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate an 'atomic_vector' at '" + path.string() + "'; " + std::string(e.what())); } /** - * Overload of `atomic_vector::validate()` that takes a file path. - * - * @param path Path to the CSV file. - * @param params Validation parameters. - * @param creator Factory to create objects for holding the contents of each CSV field. - * Defaults to a pointer to a `DummyFieldCreator` instance. - * - * @return Contents of the loaded CSV. + * @param path Path to the directory containing the atomic vector. + * @param options Validation options, typically for reading performance. + * @return Length of the vector. 
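// A sketch of a writer producing the smallest 'atomic_vector' layout accepted by the checks
// above. The group, dataset and attribute names are taken from the validator; the values,
// the directory argument and the assumption that the OBJECT file is created elsewhere are
// all illustrative.
#include "H5Cpp.h"
#include <string>
#include <vector>

void write_example_atomic_vector(const std::string& dir) {
    H5::H5File file(dir + "/contents.h5", H5F_ACC_TRUNC);
    auto ghandle = file.createGroup("atomic_vector");

    H5::StrType stype(H5::PredType::C_S1, H5T_VARIABLE);
    H5::DataSpace scalar(H5S_SCALAR);
    ghandle.createAttribute("version", stype, scalar).write(stype, std::string("1.0"));
    ghandle.createAttribute("type", stype, scalar).write(stype, std::string("integer"));

    std::vector<int> values { 1, 2, 3 };          // must fit in a 32-bit signed integer
    hsize_t dim = values.size();
    H5::DataSpace vspace(1, &dim);
    auto dhandle = ghandle.createDataSet("values", H5::PredType::NATIVE_INT32, vspace);
    dhandle.write(values.data(), H5::PredType::NATIVE_INT32);
}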
*/ -inline CsvContents validate(const char* path, const Parameters& params, CsvFieldCreator* creator = NULL) { - return validate_base( - [&](comservatory::Contents& contents, const comservatory::ReadOptions& opts) -> void { comservatory::read_file(path, contents, opts); }, - params, - creator - ); +inline size_t height(const std::filesystem::path& path, const Options&) { + H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("atomic_vector"); + auto dhandle = ghandle.openDataSet("values"); + return ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false); } } diff --git a/include/takane/data_frame.hpp b/include/takane/data_frame.hpp index 3f105fd..1e3729e 100644 --- a/include/takane/data_frame.hpp +++ b/include/takane/data_frame.hpp @@ -1,86 +1,264 @@ -#ifndef TAKANE_DATA_FRAME_HPP -#define TAKANE_DATA_FRAME_HPP +#ifndef TAKANE_HDF5_FRAME_HPP +#define TAKANE_HDF5_FRAME_HPP -#include +#include "H5Cpp.h" +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +#include #include +#include +#include +#include +#include -#include "WrappedOption.hpp" +#include "utils_public.hpp" +#include "utils_hdf5.hpp" +#include "utils_other.hpp" /** * @file data_frame.hpp - * @brief Common data frame utilities. + * @brief Validation for data frames. */ namespace takane { /** - * @namespace takane::data_frame - * @brief Definitions for abstract data frames. + * @cond */ +void validate(const std::filesystem::path&, const Options&); +size_t height(const std::filesystem::path&, const Options&); +/** + * @endcond + */ + namespace data_frame { /** - * Type of the data frame column. - * - * - `INTEGER`: a number that can be represented by a 32-bit signed integer. - * - `NUMBER`: a number that can be represented by a double-precision float. - * - `STRING`: a string. - * - `BOOLEAN`: a boolean. - * - `FACTOR`: a categorical factor, typically represented as integer indices into an array of unique levels. - * - `OTHER`: other column types that do not fall into the above categories. 
+ * @cond + */ +inline void validate_row_names(const H5::Group& handle, hsize_t num_rows) try { + if (handle.childObjType("row_names") != H5O_TYPE_DATASET) { + throw std::runtime_error("expected a 'row_names' dataset when row names are present"); + } + auto rnhandle = handle.openDataSet("row_names"); + if (rnhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected 'row_names' to be a string dataset"); + } + if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(), false) != num_rows) { + throw std::runtime_error("expected 'row_names' to have length equal to the number of rows"); + } +} catch (std::exception& e) { + throw std::runtime_error("failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); +} + +inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& options) try { + if (!ghandle.exists("column_names") || ghandle.childObjType("column_names") != H5O_TYPE_DATASET) { + throw std::runtime_error("expected a 'column_names' dataset"); + } + + auto cnhandle = ghandle.openDataSet("column_names"); + if (cnhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected 'column_names' to be a string dataset"); + } + + auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(), false); + + std::unordered_set column_names; + ritsuko::hdf5::load_1d_string_dataset( + cnhandle, + num_cols, + options.hdf5_buffer_size, + [&](size_t, const char* p, size_t l) { + if (l == 0) { + throw std::runtime_error("column names should not be empty strings"); + } + std::string col_name(p, p + l); + if (column_names.find(col_name) != column_names.end()) { + throw std::runtime_error("duplicated column name '" + col_name + "'"); + } + column_names.insert(std::move(col_name)); + } + ); + + return num_cols; + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate the column names for '" + ritsuko::hdf5::get_name(ghandle) + "'; " + std::string(e.what())); +} + +inline void validate_column(const H5::Group& dhandle, const std::string& dset_name, hsize_t num_rows, const Options& options) try { + if (dhandle.childObjType(dset_name) == H5O_TYPE_GROUP) { + auto fhandle = dhandle.openGroup(dset_name); + auto type = ritsuko::hdf5::load_scalar_string_attribute(fhandle, "type"); + if (type != "factor") { + throw std::runtime_error("expected HDF5 groups to have a 'type' attribute set to 'factor'"); + } + + if (fhandle.attrExists("ordered")) { + auto attr = ritsuko::hdf5::get_scalar_attribute(fhandle, "ordered"); + if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) { + throw std::runtime_error("an 'ordered' attribute on a factor column should have a datatype that fits in a 32-bit signed integer"); + } + } + + auto num_levels = internal_hdf5::validate_factor_levels(fhandle, "levels", options.hdf5_buffer_size); + auto num_codes = internal_hdf5::validate_factor_codes(fhandle, "codes", num_levels, options.hdf5_buffer_size); + if (num_codes != num_rows) { + throw std::runtime_error("expected column to have length equal to the number of rows"); + } + + + } else { + auto xhandle = ritsuko::hdf5::get_dataset(dhandle, dset_name.c_str()); + if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) { + throw std::runtime_error("expected column to have length equal to the number of rows"); + } + + const char* missing_attr_name = "missing-value-placeholder"; + bool has_missing = xhandle.attrExists(missing_attr_name); + + auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); + 
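// (The branches below mirror the atomic_vector rules: a "string" column must use a string
// datatype, optionally constrained by a "format" attribute, while "integer"/"boolean"
// columns must fit in a 32-bit signed integer and "number" columns in a 64-bit float.)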
if (type == "string") { + if (xhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected column " + dset_name + " to be a string dataset"); + } + + std::string missing_value; + if (has_missing) { + auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr_name, /* type_class_only = */ true); + missing_value = ritsuko::hdf5::load_scalar_string_attribute(missing_attr); + } + + if (xhandle.attrExists("format")) { + auto format = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "format"); + internal_hdf5::validate_string_format(xhandle, num_rows, format, has_missing, missing_value, options.hdf5_buffer_size); + } + + } else { + if (type == "integer") { + if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { + throw std::runtime_error("expected integer column to use a datatype that is a subset of a 32-bit signed integer"); + } + } else if (type == "boolean") { + if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { + throw std::runtime_error("expected boolean column to use a datatype that is a subset of a 32-bit signed integer"); + } + } else if (type == "number") { + if (ritsuko::hdf5::exceeds_float_limit(xhandle, 64)) { + throw std::runtime_error("expected number column to use a datatype that is a subset of a 64-bit float"); + } + } else { + throw std::runtime_error("unknown column type '" + type + "'"); + } + + if (has_missing) { + ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr_name); + } + } + } + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) + "/" + dset_name + "'; " + std::string(e.what())); +} +/** + * @endcond */ -enum class ColumnType { - INTEGER, - NUMBER, - STRING, - BOOLEAN, - FACTOR, - OTHER -}; /** - * Format of string columns in a data frame. - * - * - `NONE`: no format. - * - `DATE`: date in the YYYY-MM-DD format. - * - `DATE_TIME`: Internet date/time, following RFC3339. + * @param path Path to the directory containing the data frame. + * @param options Validation options, typically for reading performance. */ -enum class StringFormat { - NONE, - DATE, - DATE_TIME -}; +inline void validate(const std::filesystem::path& path, const Options& options) { + auto h5path = path / "basic_columns.h5"; + + H5::H5File handle(h5path, H5F_ACC_RDONLY); + if (!handle.exists("data_frame") || handle.childObjType("data_frame") != H5O_TYPE_GROUP) { + throw std::runtime_error("expected a 'data_frame' group"); + } + auto ghandle = handle.openGroup("data_frame"); + + auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); + if (version.major != 1) { + throw std::runtime_error("unsupported version '" + vstring + "'"); + } + + // Checking the number of rows. + auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); + if (ritsuko::hdf5::exceeds_integer_limit(attr, 64, false)) { + throw std::runtime_error("'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer"); + } + uint64_t num_rows = 0; + attr.read(H5::PredType::NATIVE_UINT64, &num_rows); + + // Checking row and column names. + if (ghandle.exists("row_names")) { + validate_row_names(ghandle, num_rows); + } + size_t NC = validate_column_names(ghandle, options); + + // Finally iterating through the columns. 
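// (Columns are keyed by their 0-based position: basic columns live as "data/<i>" datasets or
// factor groups inside this file, whereas any other column type is stored as its own takane
// object under "other_columns/<i>" and is validated through the generic dispatch below.)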
+ if (!ghandle.exists("data") || ghandle.childObjType("data") != H5O_TYPE_GROUP) { + throw std::runtime_error("expected a 'data_frame/data' group"); + } + auto dhandle = ghandle.openGroup("data"); + + hsize_t found = 0; + for (size_t c = 0; c < NC; ++c) { + std::string dset_name = std::to_string(c); + + if (!dhandle.exists(dset_name)) { + auto opath = path / "other_columns" / dset_name; + try { + ::takane::validate(opath, options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate 'other' column " + dset_name + "; " + std::string(e.what())); + } + if (::takane::height(opath, options) != num_rows) { + throw std::runtime_error("height of column " + dset_name + " of class '" + read_object_type(opath) + "' is not the same as the number of rows"); + } + + } else { + validate_column(dhandle, dset_name, num_rows, options); + ++found; + } + } + + if (found != dhandle.getNumObjs()) { + throw std::runtime_error("more objects present in the 'data_frame/data' group than expected"); + } + + // Checking the metadata. + try { + internal_other::validate_mcols(path / "column_annotations", NC, options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate 'column_annotations'; " + std::string(e.what())); + } + + try { + internal_other::validate_metadata(path / "other_annotations", options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate 'other_annotations'; " + std::string(e.what())); + } +} /** - * @brief Details for a column of a data frame. + * @param path Path to a directory containing a data frame. + * @param options Validation options, mostly for input performance. + * @return The number of rows. */ -struct ColumnDetails { - /** - * Name of the column. - */ - std::string name; - - /** - * Type of the column. - */ - ColumnType type = ColumnType::INTEGER; - - /** - * Format of string columns, only used if `type == ColumnType::STRING`. - */ - StringFormat string_format = StringFormat::NONE; - - /** - * Whether the factor levels are ordered, only used if `type == ColumnType::FACTOR`. - */ - bool factor_ordered = false; - - /** - * Unique factor levels, only used if `type == ColumnType::FACTOR`. - * This may be ignored by specific validation functions if the factor levels are available elsewhere. - */ - WrappedOption > factor_levels; -}; +inline size_t height(const std::filesystem::path& path, const Options&) { + auto h5path = path / "basic_columns.h5"; + + // Assume it's all valid already. + H5::H5File handle(h5path, H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("data_frame"); + auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); + uint64_t num_rows = 0; + attr.read(H5::PredType::NATIVE_UINT64, &num_rows); + return num_rows; +} } diff --git a/include/takane/data_frame_factor.hpp b/include/takane/data_frame_factor.hpp new file mode 100644 index 0000000..266c473 --- /dev/null +++ b/include/takane/data_frame_factor.hpp @@ -0,0 +1,131 @@ +#ifndef TAKANE_DATA_FRAME_FACTOR_HPP +#define TAKANE_DATA_FRAME_FACTOR_HPP + +#include +#include +#include + +#include "ritsuko/hdf5/hdf5.hpp" + +#include "utils_public.hpp" +#include "utils_hdf5.hpp" + +/** + * @file data_frame_factor.hpp + * @brief Validation for data frame factors. 
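// A sketch of a tiny on-disk 'data_frame' satisfying the checks above: one integer column and
// ten rows, with the optional row_names, column_annotations/ and other_annotations/ pieces
// assumed to be absent. Only the group/attribute/dataset names are taken from the validator;
// everything else is illustrative.
#include "H5Cpp.h"
#include <cstdint>
#include <string>
#include <vector>

void write_example_data_frame(const std::string& dir) {
    H5::H5File file(dir + "/basic_columns.h5", H5F_ACC_TRUNC);
    auto ghandle = file.createGroup("data_frame");

    H5::StrType stype(H5::PredType::C_S1, H5T_VARIABLE);
    H5::DataSpace scalar(H5S_SCALAR);
    ghandle.createAttribute("version", stype, scalar).write(stype, std::string("1.0"));

    uint64_t nrows = 10;
    ghandle.createAttribute("row-count", H5::PredType::STD_U64LE, scalar).write(H5::PredType::NATIVE_UINT64, &nrows);

    const char* colnames[] = { "foo" };            // non-empty, unique column names
    hsize_t ncols = 1;
    H5::DataSpace cspace(1, &ncols);
    ghandle.createDataSet("column_names", stype, cspace).write(colnames, stype);

    auto dhandle = ghandle.createGroup("data");
    std::vector<int32_t> values(nrows, 42);
    hsize_t vdim = nrows;
    H5::DataSpace vspace(1, &vdim);
    auto xhandle = dhandle.createDataSet("0", H5::PredType::NATIVE_INT32, vspace);
    xhandle.createAttribute("type", stype, scalar).write(stype, std::string("integer"));
    xhandle.write(values.data(), H5::PredType::NATIVE_INT32);
}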
+ */ + +namespace takane { + +/** + * @cond + */ +void validate(const std::filesystem::path&, const std::string&, const Options&); +size_t height(const std::filesystem::path&, const std::string&, const Options&); +/** + * @endcond + */ + +/** + * @namespace takane::data_frame_factor + * @brief Definitions for data frame factors. + */ +namespace data_frame_factor { + +/** + * Application-specific function to determine whether there are duplicated rows in the data frame containing the levels of a `data_frame_factor`. + * + * This should accept a path to the directory containing the data frame, a string specifying the object type, and additional reading options. + * It should return a boolean indicating whether any duplicate rows were found. + * + * If provided, this enables stricter checking of the uniqueness of the data frame levels. + * Currently, we don't provide a default method for `data_frame` objects, as it's kind of tedious and we haven't gotten around to it yet. + */ +inline std::function any_duplicated; + +/** + * @param path Path to the directory containing the data frame factor. + * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + H5::H5File handle(path / "contents.h5", H5F_ACC_RDONLY); + + const char* parent = "data_frame_factor"; + if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { + throw std::runtime_error("expected a 'data_frame_factor' group"); + } + auto ghandle = handle.openGroup(parent); + + auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); + if (version.major != 1) { + throw std::runtime_error("unsupported version string '" + vstring + "'"); + } + + // Validating the levels. + auto lpath = path / "levels"; + auto xtype = read_object_type(lpath); + if (!internal_other::ends_with(xtype, "data_frame")) { + throw std::runtime_error("expected 'levels' to be a 'data_frame' or one of its derivatives"); + } + + try { + ::takane::validate(lpath, xtype, options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate 'levels'; " + std::string(e.what())); + } + size_t num_levels = ::takane::height(lpath, xtype, options); + + if (any_duplicated) { + if (any_duplicated(lpath, xtype, options)) { + throw std::runtime_error("'levels' should not contain duplicated rows"); + } + } + + size_t num_codes = internal_hdf5::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size, /* allow_missing = */ false); + + if (ghandle.exists("names")) { + auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'names' should be a string datatype class"); + } + auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + if (num_codes != nlen) { + throw std::runtime_error("'names' and 'codes' should have the same length"); + } + } + + // Checking the metadata. 
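// A sketch of wiring in the optional duplicate check described above; the body is a stub,
// since how the 'levels' data_frame is actually loaded is left to the application.
#include "takane/takane.hpp"
#include <filesystem>
#include <string>

void enable_level_uniqueness_check() {
    takane::data_frame_factor::any_duplicated =
        [](const std::filesystem::path& path, const std::string&, const takane::Options&) -> bool {
            // e.g. load the data_frame at 'path' with application code and compare its rows.
            (void)path;
            return false;
        };
}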
+ try { + internal_other::validate_mcols(path / "element_annotations", num_codes, options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate 'element_annotations'; " + std::string(e.what())); + } + + try { + internal_other::validate_metadata(path / "other_annotations", options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate 'other_annotations'; " + std::string(e.what())); + } + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate a 'data_frame_factor' at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to the directory containing the data frame factor. + * @param options Validation options, typically for reading performance. + * @return Length of the factor. + */ +inline size_t height(const std::filesystem::path& path, const Options&) { + H5::H5File handle(path / "contents.h5", H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("data_frame_factor"); + auto dhandle = ghandle.openDataSet("codes"); + return ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false); +} + +} + +} + +#endif diff --git a/include/takane/hdf5_data_frame.hpp b/include/takane/hdf5_data_frame.hpp deleted file mode 100644 index afb5771..0000000 --- a/include/takane/hdf5_data_frame.hpp +++ /dev/null @@ -1,595 +0,0 @@ -#ifndef TAKANE_HDF5_DATA_FRAME_HPP -#define TAKANE_HDF5_DATA_FRAME_HPP - -#include "H5Cpp.h" -#include "ritsuko/ritsuko.hpp" -#include "ritsuko/hdf5/hdf5.hpp" - -#include "WrappedOption.hpp" -#include "data_frame.hpp" - -#include -#include -#include -#include - -/** - * @file hdf5_data_frame.hpp - * @brief Validation for HDF5 data frames. - */ - -namespace takane { - -/** - * @namespace takane::hdf5_data_frame - * @brief Definitions for HDF5 data frames. - */ -namespace hdf5_data_frame { - -/** - * @brief Parameters for validating the HDF5 data frame. - */ -struct Parameters { - /** - * @param group Name of the group containing the data frame's contents. - */ - Parameters(std::string group) : group(std::move(group)) {} - - /** - * Name of the group containing the data frame's contents. - */ - std::string group; - - /** - * Number of rows in the data frame. - */ - size_t num_rows = 0; - - /** - * Whether the data frame contains row names. - */ - bool has_row_names = false; - - /** - * Details about the expected columns of the data frame, in order. - * Note that any `factor_levels` inside each entry of `columns` is ignored if a `version` attribute is present on the `group`. - */ - WrappedOption > columns; - - /** - * Buffer size to use when reading values from the HDF5 file. - */ - hsize_t buffer_size = 10000; - - /** - * Version of the `data_frame` format. - * Ignored if a `version` attribute is present on the HDF5 group at `group`. - */ - int df_version = 2; - - /** - * Version of the `hdf5_data_frame` format, - * Ignored if a `version` attribute is present on the HDF5 group at `group`. 
- */ - int hdf5_version = 2; -}; - -/** - * @cond - */ -inline void validate_row_names(const H5::Group& handle, hsize_t num_rows) try { - if (!handle.exists("row_names") || handle.childObjType("row_names") != H5O_TYPE_DATASET) { - throw std::runtime_error("expected a 'row_names' dataset when row names are present"); - } - auto rnhandle = handle.openDataSet("row_names"); - if (rnhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected 'row_names' to be a string dataset"); - } - if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(), false) != num_rows) { - throw std::runtime_error("expected 'row_names' to have length equal to the number of rows"); - } -} catch (std::exception& e) { - throw std::runtime_error("failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what())); -} - -inline void validate_column_names(const H5::Group& ghandle, const Parameters& params) try { - if (!ghandle.exists("column_names") || ghandle.childObjType("column_names") != H5O_TYPE_DATASET) { - throw std::runtime_error("expected a 'column_names' dataset"); - } - - auto cnhandle = ghandle.openDataSet("column_names"); - if (cnhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected 'column_names' to be a string dataset"); - } - - const auto& columns = *(params.columns); - size_t num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(), false); - if (num_cols != columns.size()) { - throw std::runtime_error("length of 'column_names' should equal the expected number of columns"); - } - - { - std::unordered_set column_names; - for (const auto& col : columns) { - if (col.name.empty()) { - throw std::runtime_error("column names should not be empty strings"); - } - if (column_names.find(col.name) != column_names.end()) { - throw std::runtime_error("duplicated column name '" + col.name + "'"); - } - column_names.insert(col.name); - } - } - - ritsuko::hdf5::load_1d_string_dataset( - cnhandle, - num_cols, - params.buffer_size, - [&](size_t i, const char* p, size_t l) { - const auto& expected = columns[i].name; - if (l != expected.size() || strncmp(expected.c_str(), p, l)) { - throw std::runtime_error("expected name '" + expected + "' but got '" + std::string(p, p + l) + "' for column " + std::to_string(i)); - } - } - ); - -} catch (std::exception& e) { - throw std::runtime_error("failed to validate the column names for '" + ritsuko::hdf5::get_name(ghandle) + "'; " + std::string(e.what())); -} - -// Validation for the older versions where the factors are stored outside of the file. 
-inline void validate_column_v1_v2(const H5::Group& dhandle, const std::string& dset_name, const data_frame::ColumnDetails& curcol, const Parameters& params) try { - auto xhandle = ritsuko::hdf5::get_dataset(dhandle, dset_name.c_str()); - if (params.num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) { - throw std::runtime_error("expected column to have length equal to the number of rows"); - } - - const char* missing_attr = "missing-value-placeholder"; - - if (curcol.type == data_frame::ColumnType::NUMBER) { - if (xhandle.getTypeClass() != H5T_FLOAT) { - throw std::runtime_error("expected column to be a floating-point dataset"); - } - if (params.hdf5_version > 1 && xhandle.attrExists(missing_attr)) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - } - - } else if (curcol.type == data_frame::ColumnType::BOOLEAN) { - if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { - throw std::runtime_error("expected boolean column to use a datatype that is a subset of a 32-bit signed integer"); - } - if (params.hdf5_version > 1 && xhandle.attrExists(missing_attr)) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - } - - } else if (curcol.type == data_frame::ColumnType::INTEGER) { - if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { - throw std::runtime_error("expected integer column to use a datatype that is a subset of a 32-bit signed integer"); - } - if (params.hdf5_version > 1 && xhandle.attrExists(missing_attr)) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - } - - } else if (curcol.type == data_frame::ColumnType::STRING) { - if (xhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected column " + dset_name + " to be a string dataset"); - } - - bool has_missing = xhandle.attrExists(missing_attr); - std::string missing_value; - if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr, /* type_class_only = */ true); - missing_value = ritsuko::hdf5::load_scalar_string_attribute(attr); - } - - if (curcol.string_format == data_frame::StringFormat::DATE) { - ritsuko::hdf5::load_1d_string_dataset( - xhandle, - params.num_rows, - params.buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_date(p, l)) { - throw std::runtime_error("expected a date-formatted string in column (got '" + x + "')"); - } - } - ); - - } else if (curcol.string_format == data_frame::StringFormat::DATE_TIME) { - ritsuko::hdf5::load_1d_string_dataset( - xhandle, - params.num_rows, - params.buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_rfc3339(p, l)) { - throw std::runtime_error("expected a date/time-formatted string in column (got '" + x + "')"); - } - } - ); - } - - } else if (curcol.type == data_frame::ColumnType::FACTOR) { - if (params.df_version <= 1) { - if (xhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected column " + dset_name + " to be a string dataset"); - } - - bool has_missing = xhandle.attrExists(missing_attr); - std::string missing_string; - if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr, /* type_class_only = */ true); - missing_string = ritsuko::hdf5::load_scalar_string_attribute(attr); - } - - const auto& allowed = 
*(curcol.factor_levels); - ritsuko::hdf5::load_1d_string_dataset( - xhandle, - params.num_rows, - params.buffer_size, - [&](hsize_t, const char* p, size_t len) { - std::string x(p, p + len); - if (has_missing && x == missing_string) { - return; - } else if (allowed.find(x) == allowed.end()) { - throw std::runtime_error("column contains '" + x + "' that is not present in factor levels"); - } - } - ); - - } else if (params.df_version > 1) { - if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { - throw std::runtime_error("expected factor column to use a datatype that is a subset of a 32-bit signed integer"); - } - - int32_t placeholder = -2147483648; - bool has_missing = true; - if (params.hdf5_version > 1) { - has_missing = xhandle.attrExists(missing_attr); - if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - attr.read(H5::PredType::NATIVE_INT32, &placeholder); - } - } - - int32_t num_levels = curcol.factor_levels->size(); - - auto block_size = ritsuko::hdf5::pick_1d_block_size(xhandle.getCreatePlist(), params.num_rows, params.buffer_size); - std::vector buffer(block_size); - ritsuko::hdf5::iterate_1d_blocks( - params.num_rows, - block_size, - [&](hsize_t, hsize_t len, const H5::DataSpace& memspace, const H5::DataSpace& dataspace) { - xhandle.read(buffer.data(), H5::PredType::NATIVE_INT32, memspace, dataspace); - for (hsize_t i = 0; i < len; ++i) { - if (has_missing && buffer[i] == placeholder) { - continue; - } - if (buffer[i] < 0) { - throw std::runtime_error("expected factor indices to be non-negative in column " + dset_name); - } - if (buffer[i] >= num_levels) { - throw std::runtime_error("expected factor indices to be less than the number of levels in column " + dset_name); - } - } - } - ); - } - - } else { - throw std::runtime_error("no dataset should exist for columns of type 'other'"); - } -} catch (std::exception& e) { - throw std::runtime_error("failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) + "/" + dset_name + "'; " + std::string(e.what())); -} - -// Easier to just create a new function for the newer validators. 
-inline void validate_column_v3(const H5::Group& dhandle, const std::string& dset_name, const data_frame::ColumnDetails& curcol, const Parameters& params) try { - const char* missing_attr = "missing-value-placeholder"; - - if (dhandle.childObjType(dset_name) == H5O_TYPE_GROUP) { - if (curcol.type != data_frame::ColumnType::FACTOR) { - throw std::runtime_error("only factor columns should be represented as HDF5 groups"); - } - - auto fhandle = dhandle.openGroup(dset_name); - auto type = ritsuko::hdf5::load_scalar_string_attribute(fhandle, "type"); - if (type != "factor") { - throw std::runtime_error("expected factor column to have a 'type' attribute set to 'factor'"); - } - - int32_t val = 0; - if (fhandle.attrExists("ordered")) { - auto attr = ritsuko::hdf5::get_scalar_attribute(fhandle, "ordered"); - if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) { - throw std::runtime_error("an 'ordered' attribute on a factor column should have a datatype that fits in a 32-bit signed integer"); - } - attr.read(H5::PredType::NATIVE_INT32, &val); - } - if (val != curcol.factor_ordered) { - throw std::runtime_error("ordered status of factor is not consistent with the presence of the 'ordered' attribute"); - } - - size_t nlevels = 0; - { - auto lhandle = ritsuko::hdf5::get_dataset(fhandle, "levels"); - if (lhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected 'levels' to have a string datatype"); - } - try { - std::unordered_set collected; - nlevels = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); - ritsuko::hdf5::load_1d_string_dataset( - lhandle, - nlevels, - 10000, - [&](hsize_t, const char* start, size_t len) { - std::string x(start, start+len); - if (collected.find(x) != collected.end()) { - throw std::runtime_error("detected duplicate level '" + x + "'"); - } - collected.insert(std::move(x)); - } - ); - } catch (std::exception& e) { - throw std::runtime_error("failed to inspect 'levels'; " + std::string(e.what())); - } - } - - auto chandle = ritsuko::hdf5::get_dataset(fhandle, "codes"); - if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) { - throw std::runtime_error("expected factor column to use a datatype that is a subset of a 32-bit signed integer"); - } - if (params.num_rows != ritsuko::hdf5::get_1d_length(chandle.getSpace(), false)) { - throw std::runtime_error("expected column to have length equal to the number of rows"); - } - - bool has_missing = chandle.attrExists(missing_attr); - int32_t placeholder = 0; - if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(chandle, missing_attr); - attr.read(H5::PredType::NATIVE_INT32, &placeholder); - } - - // Casting it. 
- if (nlevels > static_cast(std::numeric_limits::max())) { - throw std::runtime_error("number of levels should not exceed the maximum value of a 32-bit integer"); - } - int32_t num_levels = nlevels; - - auto block_size = ritsuko::hdf5::pick_1d_block_size(chandle.getCreatePlist(), params.num_rows, params.buffer_size); - std::vector buffer(block_size); - ritsuko::hdf5::iterate_1d_blocks( - params.num_rows, - block_size, - [&](hsize_t, hsize_t len, const H5::DataSpace& memspace, const H5::DataSpace& dataspace) { - chandle.read(buffer.data(), H5::PredType::NATIVE_INT32, memspace, dataspace); - for (hsize_t i = 0; i < len; ++i) { - if (has_missing && buffer[i] == placeholder) { - continue; - } - if (buffer[i] < 0) { - throw std::runtime_error("expected factor indices to be non-negative in column " + dset_name); - } - if (buffer[i] >= num_levels) { - throw std::runtime_error("expected factor indices to be less than the number of levels in column " + dset_name); - } - } - } - ); - - } else { - auto xhandle = ritsuko::hdf5::get_dataset(dhandle, dset_name.c_str()); - if (params.num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) { - throw std::runtime_error("expected column to have length equal to the number of rows"); - } - - if (curcol.type == data_frame::ColumnType::NUMBER) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); - if (type != "number") { - throw std::runtime_error("expected number column to have a 'type' attribute set to 'number'"); - } - if (ritsuko::hdf5::exceeds_float_limit(xhandle, 64)) { - throw std::runtime_error("expected number column to use a datatype that is a subset of a 64-bit float"); - } - if (xhandle.attrExists(missing_attr)) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - } - - } else if (curcol.type == data_frame::ColumnType::BOOLEAN) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); - if (type != "boolean") { - throw std::runtime_error("expected boolean column to have a 'type' attribute set to 'boolean'"); - } - if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { - throw std::runtime_error("expected boolean column to use a datatype that is a subset of a 32-bit signed integer"); - } - if (xhandle.attrExists(missing_attr)) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - } - - } else if (curcol.type == data_frame::ColumnType::INTEGER) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); - if (type != "integer") { - throw std::runtime_error("expected integer column to have a 'type' attribute set to 'integer'"); - } - if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) { - throw std::runtime_error("expected integer column to use a datatype that is a subset of a 32-bit signed integer"); - } - if (xhandle.attrExists(missing_attr)) { - ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr); - } - - } else if (curcol.type == data_frame::ColumnType::STRING) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "type"); - if (type != "string") { - throw std::runtime_error("expected string column to have a 'type' attribute set to 'string'"); - } - if (xhandle.getTypeClass() != H5T_STRING) { - throw std::runtime_error("expected column " + dset_name + " to be a string dataset"); - } - bool has_missing = xhandle.attrExists(missing_attr); - std::string missing_value; - if (has_missing) { - auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(xhandle, missing_attr, /* 
type_class_only = */ true); - missing_value = ritsuko::hdf5::load_scalar_string_attribute(attr); - } - - if (curcol.string_format == data_frame::StringFormat::DATE) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "format"); - if (type != "date") { - throw std::runtime_error("expected date-formatted column to have a 'format' attribute set to 'date'"); - } - ritsuko::hdf5::load_1d_string_dataset( - xhandle, - params.num_rows, - params.buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_date(p, l)) { - throw std::runtime_error("expected a date-formatted string in column (got '" + x + "')"); - } - } - ); - - } else if (curcol.string_format == data_frame::StringFormat::DATE_TIME) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "format"); - if (type != "date-time") { - throw std::runtime_error("expected date/time-formatted column to have a 'format' attribute set to 'date-time'"); - } - ritsuko::hdf5::load_1d_string_dataset( - xhandle, - params.num_rows, - params.buffer_size, - [&](size_t, const char* p, size_t l) { - std::string x(p, p + l); - if (has_missing && missing_value == x) { - return; - } - if (!ritsuko::is_rfc3339(p, l)) { - throw std::runtime_error("expected a date/time-formatted string in column (got '" + x + "')"); - } - } - ); - - } else { - if (xhandle.attrExists("format")) { - auto type = ritsuko::hdf5::load_scalar_string_attribute(xhandle, "format"); - if (type != "none") { - throw std::runtime_error("any 'format' attribute on an unformatted string column should be 'none'"); - } - } - } - - } else { - throw std::runtime_error("no dataset should exist for columns of type 'other' or 'factor'"); - } - } - -} catch (std::exception& e) { - throw std::runtime_error("failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) + "/" + dset_name + "'; " + std::string(e.what())); -} -/** - * @endcond - */ - -/** - * Checks if a HDF5 data frame is correctly formatted. - * An error is raised if the file does not meet the specifications. - * - * @param handle Handle to a HDF5 file. - * @param params Validation parameters. - */ -inline void validate(const H5::H5File& handle, const Parameters& params) { - if (!handle.exists(params.group) || handle.childObjType(params.group) != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a '" + params.group + "' group"); - } - auto ghandle = handle.openGroup(params.group); - - // Inspecting the columns. - ritsuko::Version version; - if (ghandle.attrExists("version")) { - auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); - version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); - if (version.major != 1) { - throw std::runtime_error("unsupported version '" + vstring + "' for the '" + params.group + "' group"); - } - } - - // Checking the number of rows. 
- if (version.major > 0) { - auto attr = ritsuko::hdf5::get_scalar_attribute(ghandle, "row-count"); - if (ritsuko::hdf5::exceeds_integer_limit(attr, 64, false)) { - throw std::runtime_error("'row-count' attribute on '" + params.group + "' should have a datatype that fits in a 64-bit unsigned integer"); - } - uint64_t nrows = 0; - attr.read(H5::PredType::NATIVE_UINT64, &nrows); - if (nrows != params.num_rows) { - throw std::runtime_error("inconsistent number of rows in '" + params.group + "' (expected " + std::to_string(params.num_rows) + ", got " + std::to_string(nrows) + ")"); - } - } - - // Checking row and column names. - if (params.has_row_names) { - validate_row_names(ghandle, params.num_rows); - } - validate_column_names(ghandle, params); - - // Finally iterating through the columns. - if (!ghandle.exists("data") || ghandle.childObjType("data") != H5O_TYPE_GROUP) { - throw std::runtime_error("expected a '" + params.group + "/data' group"); - } - auto dhandle = ghandle.openGroup("data"); - - const auto& columns = *(params.columns); - size_t NC = columns.size(); - hsize_t found = 0; - for (size_t c = 0; c < NC; ++c) { - const auto& curcol = columns[c]; - - std::string dset_name = std::to_string(c); - if (!dhandle.exists(dset_name)) { - if (curcol.type == data_frame::ColumnType::OTHER) { - continue; - } - } - - if (version.major > 0) { - validate_column_v3(dhandle, dset_name, curcol, params); - } else { - validate_column_v1_v2(dhandle, dset_name, curcol, params); - } - - ++found; - } - - if (found != dhandle.getNumObjs()) { - throw std::runtime_error("more objects present in the '" + params.group + "/data' group than expected"); - } -} - -/** - * Overload of `hdf5_data_frame::validate()` that accepts a file path. - * - * @param path Path to the HDF5 file. - * @param params Validation parameters. - */ -inline void validate(const char* path, const Parameters& params) { - H5::H5File handle(path, H5F_ACC_RDONLY); - validate(handle, params); -} - -} - -} - -#endif diff --git a/include/takane/simple_list.hpp b/include/takane/simple_list.hpp new file mode 100644 index 0000000..5c41022 --- /dev/null +++ b/include/takane/simple_list.hpp @@ -0,0 +1,121 @@ +#ifndef TAKANE_SIMPLE_LIST_HPP +#define TAKANE_SIMPLE_LIST_HPP + +#include +#include +#include + +#include "uzuki2/uzuki2.hpp" +#include "byteme/byteme.hpp" + +#include "utils_public.hpp" + +/** + * @file simple_list.hpp + * @brief Validation for simple lists. + */ + +namespace takane { + +/** + * @cond + */ +void validate(const std::filesystem::path&, const Options&); +/** + * @endcond + */ + +/** + * @namespace takane::simple_list + * @brief Definitions for simple lists. + */ +namespace simple_list { + +/** + * @param path Path to the directory containing the simple list. + * @param options Validation options, typically for reading performance. 
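// A small sketch of the Options fields exercised in this changeset; both field names are
// taken from their uses in these validators, while the values themselves are arbitrary.
#include "takane/takane.hpp"
#include <filesystem>

void validate_with_tuned_options(const std::filesystem::path& dir) {
    takane::Options opt;
    opt.parallel_reads = true;     // forwarded to uzuki2's JSON parsing options by simple_list
    opt.hdf5_buffer_size = 50000;  // buffer/block size for 1-dimensional HDF5 reads
    takane::validate(dir, opt);
}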
+ */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + auto other_dir = path / "other_contents"; + int num_external = 0; + if (std::filesystem::exists(other_dir)) { + auto status = std::filesystem::status(other_dir); + if (status.type() != std::filesystem::file_type::directory) { + throw std::runtime_error("expected 'other_contents' to be a directory"); + } + + for (const auto& entry : std::filesystem::directory_iterator(other_dir)) { + try { + ::takane::validate(entry.path().string(), options); + } catch (std::exception& e) { + throw std::runtime_error("failed to validate external list object at '" + std::filesystem::relative(entry.path(), path).string() + "'; " + std::string(e.what())); + } + ++num_external; + } + } + + { + auto candidate = path / "list_contents.json.gz"; + if (std::filesystem::exists(candidate)) { + uzuki2::json::Options opt; + opt.parallel = options.parallel_reads; + byteme::SomeFileReader gzreader(candidate.string()); + uzuki2::json::validate(gzreader, num_external, opt); + return; + } + } + + { + auto candidate = path / "list_contents.h5"; + if (std::filesystem::exists(candidate)) { + uzuki2::hdf5::validate(candidate.string(), "simple_list", num_external); + return; + } + } + + throw std::runtime_error("could not determine format from the file names"); +} catch (std::exception& e) { + throw std::runtime_error("failed to validate a 'simple_list' at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to the directory containing the simple list. + * @param options Validation options, typically for reading performance. + * @return The number of list elements. + */ +inline size_t height(const std::filesystem::path& path, const Options& options) { + { + auto candidate = path / "list_contents.h5"; + if (std::filesystem::exists(candidate)) { + H5::H5File handle(candidate, H5F_ACC_RDONLY); + auto lhandle = handle.openGroup("simple_list"); + auto vhandle = lhandle.openGroup("data"); + return vhandle.getNumObjs(); + } + } + + // Not much choice but to parse the entire list here. We do so using the + // dummy, which still has enough self-awareness to hold its own length. + auto other_dir = path / "other_contents"; + int num_external = 0; + if (std::filesystem::exists(other_dir)) { + for (const auto& entry : std::filesystem::directory_iterator(other_dir)) { + (void)entry; // silence compiler warnings about unused variables. + ++num_external; + } + } + + uzuki2::json::Options opt; + opt.parallel = options.parallel_reads; + auto candidate = path / "list_contents.json.gz"; + byteme::SomeFileReader gzreader(candidate.string()); + uzuki2::DummyExternals ext(num_external); + auto ptr = uzuki2::json::parse(gzreader, std::move(ext), std::move(opt)); + return reinterpret_cast(ptr.get())->size(); +} + +} + +} + +#endif diff --git a/include/takane/string_factor.hpp b/include/takane/string_factor.hpp new file mode 100644 index 0000000..0b979e7 --- /dev/null +++ b/include/takane/string_factor.hpp @@ -0,0 +1,87 @@ +#ifndef TAKANE_STRING_FACTOR_HPP +#define TAKANE_STRING_FACTOR_HPP + +#include +#include +#include + +#include "ritsuko/hdf5/hdf5.hpp" + +#include "utils_public.hpp" +#include "utils_hdf5.hpp" + +/** + * @file string_factor.hpp + * @brief Validation for string factors. + */ + +namespace takane { + +/** + * @namespace takane::string_factor + * @brief Definitions for string factors. + */ +namespace string_factor { + +/** + * @param path Path to the directory containing the string factor. 
+ * @param options Validation options, typically for reading performance. + */ +inline void validate(const std::filesystem::path& path, const Options& options) try { + H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); + + const char* parent = "string_factor"; + if (!handle.exists(parent) || handle.childObjType(parent) != H5O_TYPE_GROUP) { + throw std::runtime_error("expected a 'string_factor' group"); + } + auto ghandle = handle.openGroup(parent); + + auto vstring = ritsuko::hdf5::load_scalar_string_attribute(ghandle, "version"); + auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true); + if (version.major != 1) { + throw std::runtime_error("unsupported version string '" + vstring + "'"); + } + + if (ghandle.attrExists("ordered")) { + auto oattr = ritsuko::hdf5::get_scalar_attribute(ghandle, "ordered"); + if (ritsuko::hdf5::exceeds_integer_limit(oattr, 32, true)) { + throw std::runtime_error("expected a datatype for the 'ordered' attribute that fits in a 32-bit signed integer"); + } + } + + // Number of levels. + size_t num_levels = internal_hdf5::validate_factor_levels(ghandle, "levels", options.hdf5_buffer_size); + size_t num_codes = internal_hdf5::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size); + + if (ghandle.exists("names")) { + auto nhandle = ritsuko::hdf5::get_dataset(ghandle, "names"); + if (nhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("'names' should be a string datatype class"); + } + auto nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false); + if (num_codes != nlen) { + throw std::runtime_error("'names' and 'codes' should have the same length"); + } + } + +} catch (std::exception& e) { + throw std::runtime_error("failed to validate a 'string_factor' at '" + path.string() + "'; " + std::string(e.what())); +} + +/** + * @param path Path to the directory containing the string factor. + * @param options Validation options, typically for reading performance. + * @return Length of the factor. 
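+ *
+ * For example (an illustrative sketch; the directory name is hypothetical):
+ * @code
+ * std::filesystem::path factor_dir = "my_string_factor";
+ * takane::Options opt;
+ * size_t len = takane::string_factor::height(factor_dir, opt);
+ * @endcode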
+ */ +inline size_t height(const std::filesystem::path& path, const Options&) { + H5::H5File handle((path / "contents.h5").string(), H5F_ACC_RDONLY); + auto ghandle = handle.openGroup("string_factor"); + auto dhandle = ghandle.openDataSet("codes"); + return ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false); +} + +} + +} + +#endif diff --git a/include/takane/takane.hpp b/include/takane/takane.hpp index 7817392..f2e11a7 100644 --- a/include/takane/takane.hpp +++ b/include/takane/takane.hpp @@ -1,19 +1,8 @@ #ifndef TAKANE_TAKANE_HPP #define TAKANE_TAKANE_HPP -#include "WrappedOption.hpp" -#include "array.hpp" -#include "atomic_vector.hpp" -#include "compressed_list.hpp" -#include "csv_data_frame.hpp" -#include "data_frame.hpp" -#include "factor.hpp" -#include "genomic_ranges.hpp" -#include "hdf5_data_frame.hpp" -#include "hdf5_dense_array.hpp" -#include "hdf5_sparse_matrix.hpp" -#include "sequence_information.hpp" -#include "utils_csv.hpp" +#include "_validate.hpp" +#include "_height.hpp" /** * @namespace takane diff --git a/include/takane/utils_hdf5.hpp b/include/takane/utils_hdf5.hpp new file mode 100644 index 0000000..5ae35b8 --- /dev/null +++ b/include/takane/utils_hdf5.hpp @@ -0,0 +1,127 @@ +#ifndef TAKANE_UTILS_HDF5_HPP +#define TAKANE_UTILS_HDF5_HPP + +#include +#include +#include +#include +#include + +#include "ritsuko/ritsuko.hpp" +#include "ritsuko/hdf5/hdf5.hpp" + +namespace takane { + +namespace internal_hdf5 { + +inline void validate_string_format(const H5::DataSet& handle, hsize_t len, const std::string& format, bool has_missing, const std::string& missing_value, hsize_t buffer_size) { + if (format == "date") { + ritsuko::hdf5::load_1d_string_dataset( + handle, + len, + buffer_size, + [&](size_t, const char* p, size_t l) { + std::string x(p, p + l); + if (has_missing && missing_value == x) { + return; + } + if (!ritsuko::is_date(p, l)) { + throw std::runtime_error("expected a date-formatted string (got '" + x + "')"); + } + } + ); + + } else if (format == "date-time") { + ritsuko::hdf5::load_1d_string_dataset( + handle, + len, + buffer_size, + [&](size_t, const char* p, size_t l) { + std::string x(p, p + l); + if (has_missing && missing_value == x) { + return; + } + if (!ritsuko::is_rfc3339(p, l)) { + throw std::runtime_error("expected a date/time-formatted string (got '" + x + "')"); + } + } + ); + + } else if (format != "none") { + throw std::runtime_error("unsupported format '" + format + "'"); + } +} + +inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) { + auto lhandle = ritsuko::hdf5::get_dataset(handle, name.c_str()); + if (lhandle.getTypeClass() != H5T_STRING) { + throw std::runtime_error("expected a string datatype for '" + name + "'"); + } + + auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false); + std::unordered_set present; + + ritsuko::hdf5::load_1d_string_dataset( + lhandle, + len, + buffer_size, + [&](hsize_t, const char* p, size_t len) { + std::string x(p, p + len); + if (present.find(x) != present.end()) { + throw std::runtime_error("'" + name + "' contains duplicated factor level '" + x + "'"); + } + present.insert(std::move(x)); + } + ); + + return len; +} + +inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) { + auto chandle = ritsuko::hdf5::get_dataset(handle, name.c_str()); + if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) { + throw std::runtime_error("expected a 
datatype for '" + name + "' that fits in a 32-bit signed integer"); + } + + auto len = ritsuko::hdf5::get_1d_length(chandle.getSpace(), false); + auto block_size = ritsuko::hdf5::pick_1d_block_size(chandle.getCreatePlist(), len, buffer_size); + std::vector buffer(block_size); + + bool has_missing = false; + int32_t missing_placeholder = 0; + if (allow_missing) { + const char* missing_attr_name = "missing-value-placeholder"; + has_missing = chandle.attrExists(missing_attr_name); + if (has_missing) { + auto missing_attr = ritsuko::hdf5::get_missing_placeholder_attribute(chandle, missing_attr_name); + missing_attr.read(H5::PredType::NATIVE_INT32, &missing_placeholder); + } + } + + ritsuko::hdf5::iterate_1d_blocks( + len, + block_size, + [&](hsize_t, hsize_t len, const H5::DataSpace& memspace, const H5::DataSpace& dataspace) { + chandle.read(buffer.data(), H5::PredType::NATIVE_INT32, memspace, dataspace); + for (hsize_t i = 0; i < len; ++i) { + if (has_missing && buffer[i] == missing_placeholder) { + continue; + } + if (buffer[i] < 0) { + throw std::runtime_error("expected factor codes to be non-negative"); + } + if (static_cast(buffer[i]) >= num_levels) { + throw std::runtime_error("expected factor codes to be less than the number of levels"); + } + } + } + ); + + return len; +} + +} + +} + +#endif diff --git a/include/takane/utils_other.hpp b/include/takane/utils_other.hpp new file mode 100644 index 0000000..81084ce --- /dev/null +++ b/include/takane/utils_other.hpp @@ -0,0 +1,58 @@ +#ifndef TAKANE_UTILS_OTHER_HPP +#define TAKANE_UTILS_OTHER_HPP + +#include +#include + +#include "utils_public.hpp" + +namespace takane { + +/** + * @cond + */ +void validate(const std::filesystem::path&, const std::string&, const Options&); +size_t height(const std::filesystem::path&, const std::string&, const Options&); +/** + * @endcond + */ + +namespace internal_other { + +inline bool ends_with(const std::string& full, const std::string& sub) { + return (full.size() >= sub.size() && full.find(sub) == full.size() - sub.size()); +} + +inline void validate_mcols(const std::filesystem::path& path, size_t expected, const Options& options) { + if (!std::filesystem::exists(path)) { + return; + } + + auto xtype = read_object_type(path); + if (!ends_with(xtype, "data_frame")) { + throw std::runtime_error("expected a 'data_frame' or one of its derivatives"); + } + ::takane::validate(path, xtype, options); + + if (::takane::height(path, xtype, options) != expected) { + throw std::runtime_error("unexpected number of rows"); + } +} + +inline void validate_metadata(const std::filesystem::path& path, const Options& options) { + if (!std::filesystem::exists(path)) { + return; + } + + auto xtype = read_object_type(path); + if (!ends_with(xtype, "simple_list")) { + throw std::runtime_error("expected a 'simple_list' or one of its derivatives"); + } + ::takane::validate(path, xtype, options); +} + +} + +} + +#endif diff --git a/include/takane/utils_public.hpp b/include/takane/utils_public.hpp new file mode 100644 index 0000000..be52889 --- /dev/null +++ b/include/takane/utils_public.hpp @@ -0,0 +1,53 @@ +#ifndef TAKANE_UTILS_PUBLIC_HPP +#define TAKANE_UTILS_PUBLIC_HPP + +#include +#include + +#include "H5Cpp.h" + +#include "byteme/byteme.hpp" + +/** + * @file utils_public.hpp + * @brief Exported utilities. + */ + +namespace takane { + +/** + * Reads the `OBJECT` file inside a directory to determine the object type. + * + * @param path Path to a directory containing an object. + * @return String containing the object type. 
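+ *
+ * A small usage sketch (illustrative; the directory name is hypothetical):
+ * @code
+ * std::filesystem::path dir = "my_object";
+ * auto type = takane::read_object_type(dir); // e.g., "data_frame"
+ * @endcode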
+ */ +inline std::string read_object_type(const std::filesystem::path& path) { + auto full = path / "OBJECT"; + byteme::RawFileReader reader(full.c_str()); + std::string output; + while (reader.load()) { + auto buffer = reinterpret_cast(reader.buffer()); + size_t available = reader.available(); + output.insert(output.end(), buffer, buffer + available); + } + return output; +} + +/** + * @brief Validation options, mostly related to input performance. + */ +struct Options { + /** + * Whether to parallelize reading from disk and parsing, when available. + */ + bool parallel_reads = true; + + /** + * Buffer size to use when reading data from a HDF5 file. + */ + hsize_t hdf5_buffer_size = 10000; +}; + +} + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4ac94ab..06552bf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -17,16 +17,22 @@ enable_testing() # Main test executable. add_executable( libtest - src/hdf5_data_frame.cpp - src/csv_data_frame.cpp - src/genomic_ranges.cpp - src/sequence_information.cpp + src/data_frame.cpp + # src/csv_data_frame.cpp + # src/genomic_ranges.cpp + # src/sequence_information.cpp src/atomic_vector.cpp - src/factor.cpp - src/compressed_list.cpp - src/array.cpp - src/hdf5_sparse_matrix.cpp - src/hdf5_dense_array.cpp + src/string_factor.cpp + src/simple_list.cpp + src/data_frame_factor.cpp + # src/factor.cpp + # src/compressed_list.cpp + # src/array.cpp + # src/hdf5_sparse_matrix.cpp + # src/hdf5_dense_array.cpp + src/utils_hdf5.cpp + src/utils_other.cpp + src/dispatch.cpp ) target_link_libraries( diff --git a/tests/src/atomic_vector.cpp b/tests/src/atomic_vector.cpp index 79d1456..8f5f648 100644 --- a/tests/src/atomic_vector.cpp +++ b/tests/src/atomic_vector.cpp @@ -1,85 +1,222 @@ #include #include -#include "takane/atomic_vector.hpp" +#include "takane/takane.hpp" #include "utils.h" -#include #include +#include +#include + +struct AtomicVectorTest : public::testing::Test { + static std::filesystem::path testdir() { + return "TEST_atomic_vector"; + } + + static H5::H5File initialize() { + auto path = testdir(); + initialize_directory(path, "atomic_vector"); + path.append("contents.h5"); + return H5::H5File(path, H5F_ACC_TRUNC); + } + + static H5::H5File reopen() { + auto path = testdir() / "contents.h5"; + return H5::H5File(path, H5F_ACC_RDWR); + } -static takane::CsvContents validate(const std::string& buffer, size_t length, takane::atomic_vector::Type type, bool has_names, takane::CsvFieldCreator* creator = NULL) { - std::string path = "TEST-atomic_vector.csv"; + template + static void expect_error(const std::string& msg, Args_&& ... 
args) { + EXPECT_ANY_THROW({ + try { + takane::validate(testdir(), std::forward(args)...); + } catch (std::exception& e) { + EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); + throw; + } + }); + } +}; + +TEST_F(AtomicVectorTest, Basic) { { - std::ofstream ohandle(path); - ohandle << buffer; + auto handle = initialize(); } + expect_error("expected an 'atomic_vector' group"); - takane::atomic_vector::Parameters params; - params.length = length; - params.type = type; - params.has_names = has_names; + { + auto handle = reopen(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "2.0"); + } + expect_error("unsupported version string"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.removeAttr("version"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + ghandle.createDataSet("values", H5::PredType::NATIVE_INT, H5S_SCALAR); + } + expect_error("1-dimensional dataset"); - return takane::atomic_vector::validate(path.c_str(), params, creator); + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("values"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); + hdf5_utils::attach_attribute(ghandle, "type", "foobar"); + } + expect_error("unsupported type"); + + // Success at last. + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.removeAttr("type"); + hdf5_utils::attach_attribute(ghandle, "type", "integer"); + } + takane::validate(testdir()); + EXPECT_EQ(takane::height(testdir()), 100); } -template -static void expect_error(const std::string& msg, const std::string& buffer, Args_&& ... args) { - EXPECT_ANY_THROW({ - try { - validate(buffer, std::forward(args)...); - } catch (std::exception& e) { - EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); - throw; +TEST_F(AtomicVectorTest, Types) { + // Integer. + { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "integer"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_FLOAT); + } + expect_error("32-bit signed integer"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("values"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); + } + takane::validate(testdir()); + } + + // Boolean. + { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "boolean"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_FLOAT); + } + expect_error("32-bit signed integer"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("values"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); } - }); + takane::validate(testdir()); + } + + // Number. 
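+ // ("number" values are expected to use a datatype that fits into a 64-bit float,
+ // so the 64-bit integer storage below is rejected before the double variant passes.)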
+ { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "number"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT64); + } + expect_error("64-bit float"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("values"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_DOUBLE); + } + takane::validate(testdir()); + } + + // String. + { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "string"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT); + } + expect_error("string datatype"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("values"); + hdf5_utils::spawn_data(ghandle, "values", 5, H5::StrType(0, 10)); + } + takane::validate(testdir()); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "format", "date"); + } + expect_error("date-formatted string"); + } } -TEST(AtomicVector, Basics) { - std::string buffer = "\"names\",\"values\"\n"; - buffer += "\"foo\",\"whee\"\n"; - buffer += "\"bar\",\"bob\"\n"; - buffer += "\"whee\",\"stuff\"\n"; - ::validate(buffer, 3, takane::atomic_vector::Type::STRING, true); - - expect_error("number of header names", buffer, 3, takane::atomic_vector::Type::STRING, false); - expect_error("number of records", buffer, 10, takane::atomic_vector::Type::STRING, true); - - FilledFieldCreator filled; - auto output = ::validate(buffer, 3, takane::atomic_vector::Type::STRING, true, &filled); - EXPECT_EQ(output.fields.size(), 2); - EXPECT_EQ(output.fields[0]->type(), comservatory::Type::STRING); - EXPECT_EQ(output.fields[1]->type(), comservatory::Type::STRING); - - buffer = "\"names\",\"blah\"\n"; - buffer += "\"foo\",\"whee\"\n"; - buffer += "\"bar\",\"bob\"\n"; - buffer += "\"whee\",\"stuff\"\n"; - expect_error("should be named 'values'", buffer, 3, takane::atomic_vector::Type::STRING, true); - - buffer = "\"values\"\n"; - buffer += "\"foo\"\n"; - buffer += "\"bar\"\n"; - buffer += "\"whee\"\n"; - ::validate(buffer, 3, takane::atomic_vector::Type::STRING, false); +TEST_F(AtomicVectorTest, Missingness) { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "integer"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); + auto dhandle = ghandle.openDataSet("values"); + auto attr = dhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_FLOAT, H5S_SCALAR); + } + expect_error("missing-value-placeholder"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + auto dhandle = ghandle.openDataSet("values"); + dhandle.removeAttr("missing-value-placeholder"); + auto attr = dhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT32, H5S_SCALAR); + int val = -1; + attr.write(H5::PredType::NATIVE_INT, &val); + } + takane::validate(testdir()); } -TEST(AtomicVector, Types) { - std::string buffer = "\"names\",\"values\"\n"; - buffer += "\"foo\",1.2\n"; - buffer += "\"bar\",3.4\n"; - buffer += "\"whee\",5.6\n"; - 
::validate(buffer, 3, takane::atomic_vector::Type::NUMBER, true); - expect_error("not an integer", buffer, 3, takane::atomic_vector::Type::INTEGER, true); - - buffer = "\"values\"\n"; - buffer += "true\n"; - buffer += "false\n"; - buffer += "true\n"; - ::validate(buffer, 3, takane::atomic_vector::Type::BOOLEAN, false); - - buffer = "\"values\"\n"; - buffer += "23231\n"; - buffer += "-112312\n"; - buffer += "81\n"; - ::validate(buffer, 3, takane::atomic_vector::Type::INTEGER, false); +TEST_F(AtomicVectorTest, NameChecks) { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "integer"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); + hdf5_utils::spawn_data(ghandle, "names", 100, H5::PredType::NATIVE_INT32); + } + expect_error("string datatype"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("names"); + hdf5_utils::spawn_data(ghandle, "names", 50, H5::StrType(0, 10)); + } + expect_error("same length"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("atomic_vector"); + ghandle.unlink("names"); + hdf5_utils::spawn_data(ghandle, "names", 100, H5::StrType(0, 10)); + } + takane::validate(testdir()); } diff --git a/tests/src/data_frame.cpp b/tests/src/data_frame.cpp new file mode 100644 index 0000000..87a30aa --- /dev/null +++ b/tests/src/data_frame.cpp @@ -0,0 +1,575 @@ +#include +#include + +#include "data_frame.h" +#include "simple_list.h" +#include "takane/takane.hpp" + +#include +#include +#include +#include + +struct Hdf5DataFrameTest : public ::testing::Test { + Hdf5DataFrameTest() { + dir = "TEST_data_frame"; + name = "data_frame"; + } + + std::filesystem::path dir; + std::string name; + + H5::H5File initialize() { + initialize_directory(dir, "data_frame"); + auto path = dir / "basic_columns.h5"; + return H5::H5File(std::string(path), H5F_ACC_TRUNC); + } + + H5::H5File reopen() { + auto path = dir / "basic_columns.h5"; + return H5::H5File(path, H5F_ACC_RDWR); + } + + template + void expect_error(const std::string& msg, Args_&& ... 
args) { + EXPECT_ANY_THROW({ + try { + takane::validate(dir, std::forward(args)...); + } catch (std::exception& e) { + EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); + throw; + } + }); + } +}; + +TEST_F(Hdf5DataFrameTest, Rownames) { + std::vector columns(1); + columns.front().name = "WHEE"; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + data_frame::mock(ghandle, 29, true, columns); + } + takane::validate(dir); + EXPECT_EQ(takane::height(dir), 29); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.unlink("row_names"); + ghandle.createGroup("row_names"); + } + expect_error("expected a 'row_names' dataset"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.unlink("row_names"); + ghandle.createDataSet("row_names", H5::PredType::NATIVE_INT, H5S_SCALAR); + } + expect_error("string dataset"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.unlink("row_names"); + + H5::StrType stype(0, H5T_VARIABLE); + hsize_t dummy = 20; + H5::DataSpace dspace(1, &dummy); + ghandle.createDataSet("row_names", stype, dspace); + } + expect_error("expected 'row_names' to have length"); +} + +TEST_F(Hdf5DataFrameTest, Colnames) { + std::vector columns(2); + columns[0].name = "Aaron"; + columns[1].name = "Barry"; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 29, false, columns); + } + takane::validate(dir); + EXPECT_EQ(takane::height(dir), 29); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.unlink("column_names"); + } + expect_error("dataset"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.createGroup("column_names"); + } + expect_error("dataset"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.unlink("column_names"); + ghandle.createDataSet("column_names", H5::PredType::NATIVE_INT, H5S_SCALAR); + } + expect_error("string dataset"); + + columns[1].name = "Aaron"; + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 29, false, columns); + } + expect_error("duplicated column name"); + + columns[0].name = ""; + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 29, false, columns); + } + expect_error("empty strings"); +} + +TEST_F(Hdf5DataFrameTest, General) { + std::vector columns(2); + columns.resize(2); + columns[0].name = "Aaron"; + columns[1].name = "Barry"; + + { + initialize(); + } + expect_error("'" + name + "' group"); + + H5::StrType stype(0, H5T_VARIABLE); + { + auto handle = reopen(); + auto ghandle = handle.createGroup(name); + auto attr = ghandle.createAttribute("version", stype, H5S_SCALAR); + attr.write(stype, std::string("2.0")); + } + expect_error("unsupported version"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + ghandle.removeAttr("version"); + auto attr = ghandle.createAttribute("version", stype, H5S_SCALAR); + attr.write(stype, std::string("1.0")); + ghandle.createAttribute("row-count", H5::PredType::NATIVE_INT8, H5S_SCALAR); + } + expect_error("64-bit unsigned"); +} + +TEST_F(Hdf5DataFrameTest, Data) { + std::vector columns(2); + columns[0].name = "Aaron"; + columns[1].name = "Barry"; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 33, false, columns); + ghandle.unlink("data"); + } + expect_error("'data_frame/data' group"); + + { + auto handle = 
reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.createGroup("data"); + auto fhandle = dhandle.createGroup("0"); + hdf5_utils::attach_attribute(fhandle, "type", "something"); + } + expect_error("expected HDF5 groups"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + hdf5_utils::spawn_data(dhandle, "0", 2, H5::PredType::NATIVE_INT32); + } + expect_error("length equal to the number of rows"); + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 33, false, columns); + auto dhandle = ghandle.openGroup("data"); + dhandle.createGroup("foo"); + } + expect_error("more objects present"); +} + +TEST_F(Hdf5DataFrameTest, Other) { + std::vector columns(2); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::OTHER; + columns[1].name = "Barry"; + columns[1].type = data_frame::ColumnType::OTHER; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 51, false, columns); + + std::filesystem::create_directory(dir / "other_columns"); + for (size_t i = 0; i < 2; ++i) { + auto subdir = dir / "other_columns" / std::to_string(i); + std::filesystem::create_directory(subdir); + std::ofstream output(subdir / "OBJECT"); + output << "data_frame"; + + std::vector subcolumns(1); + subcolumns[0].name = "version" + std::to_string(i + 1); + H5::H5File handle(subdir / "basic_columns.h5", H5F_ACC_TRUNC); + auto ghandle = handle.createGroup(name); + mock(ghandle, 51, false, subcolumns); + } + } + takane::validate(dir); + + { + std::filesystem::create_directory(dir / "other_columns"); + auto subdir = dir / "other_columns" / "0"; + std::vector subcolumns(1); + subcolumns[0].name = "version3"; + H5::H5File handle(subdir / "basic_columns.h5", H5F_ACC_TRUNC); + auto ghandle = handle.createGroup(name); + mock(ghandle, 32, false, subcolumns); + } + expect_error("height of column 0 of class 'data_frame'"); +} + +TEST_F(Hdf5DataFrameTest, Integer) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::INTEGER; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + data_frame::mock(ghandle, 33, false, columns); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 33, H5::PredType::NATIVE_INT64); + hdf5_utils::attach_attribute(xhandle, "type", "integer"); + } + expect_error("32-bit signed integer"); + + // Checking the missing value placeholder. 
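+ // (The placeholder attribute is expected to share the column's own datatype,
+ // as exercised below.)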
+ { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 33, H5::PredType::NATIVE_INT16); + hdf5_utils::attach_attribute(xhandle, "type", "integer"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT16, H5S_SCALAR); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + xhandle.removeAttr("missing-value-placeholder"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); + } + expect_error("same type as"); +} + +TEST_F(Hdf5DataFrameTest, Boolean) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::BOOLEAN; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + data_frame::mock(ghandle, 55, false, columns); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 55, H5::PredType::NATIVE_INT64); + hdf5_utils::attach_attribute(xhandle, "type", "boolean"); + } + expect_error("32-bit signed integer"); + + // Checking the missing value placeholder. + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 55, H5::PredType::NATIVE_INT8); + hdf5_utils::attach_attribute(xhandle, "type", "boolean"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + xhandle.removeAttr("missing-value-placeholder"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_UINT16, H5S_SCALAR); + } + expect_error("same type as"); +} + +TEST_F(Hdf5DataFrameTest, Number) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::NUMBER; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + data_frame::mock(ghandle, 99, false, columns); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 99, H5::PredType::NATIVE_INT64); + hdf5_utils::attach_attribute(xhandle, "type", "number"); + } + expect_error("64-bit float"); + + // Checking the missing value placeholder. 
+ { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 99, H5::PredType::NATIVE_DOUBLE); + hdf5_utils::attach_attribute(xhandle, "type", "number"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_DOUBLE, H5S_SCALAR); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + xhandle.removeAttr("missing-value-placeholder"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); + } + expect_error("same type as"); +} + +TEST_F(Hdf5DataFrameTest, String) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::STRING; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 72, false, columns); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 72, H5::PredType::NATIVE_INT); + hdf5_utils::attach_attribute(xhandle, "type", "string"); + } + expect_error("string dataset"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 72, H5::StrType(0, 5)); + hdf5_utils::attach_attribute(xhandle, "type", "string"); + hdf5_utils::attach_attribute(xhandle, "format", "whee"); + } + expect_error("unsupported format 'whee'"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + xhandle.removeAttr("format"); + hdf5_utils::attach_attribute(xhandle, "format", "none"); + } + takane::validate(dir); + + // Checking the missing value placeholder. + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + H5::StrType stype(0, H5T_VARIABLE); + auto ahandle = xhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); + ahandle.write(stype, std::string("asdasd")); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + xhandle.removeAttr("missing-value-placeholder"); + xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); + } + expect_error("same type class as"); +} + +TEST_F(Hdf5DataFrameTest, StringFormat) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::STRING; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 72, false, columns); + auto dhandle = ghandle.openGroup("data"); + auto xhandle = dhandle.openDataSet("0"); + hdf5_utils::attach_attribute(xhandle, "format", "date-time"); + } + expect_error("date/time-formatted string"); + + // But it's okay when we slap a placeholder on top. 
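+ // (Per internal_hdf5::validate_string_format(), values equal to the missing-value
+ // placeholder are skipped during date/time format checking.)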
+ { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + dhandle.unlink("0"); + auto xhandle = hdf5_utils::spawn_data(dhandle, "0", 72, H5::StrType(0, 5)); + hdf5_utils::attach_attribute(xhandle, "type", "string"); + hdf5_utils::attach_attribute(xhandle, "missing-value-placeholder", ""); + } + takane::validate(dir); +} + +TEST_F(Hdf5DataFrameTest, Factor) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::FACTOR; + columns[0].factor_levels = std::vector{ "kanon", "chisato", "sumire", "ren", "keke" }; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 99, false, columns); + } + takane::validate(dir); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto fhandle = dhandle.openGroup("0"); + fhandle.unlink("codes"); + hdf5_utils::spawn_data(fhandle, "codes", 80, H5::PredType::NATIVE_INT8); + } + expect_error("length equal to the number of rows"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto dhandle = ghandle.openGroup("data"); + auto fhandle = dhandle.openGroup("0"); + fhandle.unlink("codes"); + + std::vector replacement(99, columns[0].factor_levels.size()); + auto xhandle = hdf5_utils::spawn_data(fhandle, "codes", replacement.size(), H5::PredType::NATIVE_INT16); + xhandle.write(replacement.data(), H5::PredType::NATIVE_INT); + } + expect_error("less than the number of levels"); + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 99, false, columns); + + auto dhandle = ghandle.openGroup("data"); + auto fhandle = dhandle.openGroup("0"); + fhandle.unlink("levels"); + + std::vector levels(columns[0].factor_levels.begin(), columns[0].factor_levels.end()); + levels.push_back(levels[0]); + hdf5_utils::spawn_string_data(fhandle, "levels", H5T_VARIABLE, levels); + } + expect_error("duplicated factor level"); + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + mock(ghandle, 99, false, columns); + auto fhandle = ghandle.openGroup("data/0"); + fhandle.createAttribute("ordered", H5::PredType::NATIVE_FLOAT, H5S_SCALAR); + } + expect_error("32-bit signed integer"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup(name); + auto fhandle = ghandle.openGroup("data/0"); + fhandle.removeAttr("ordered"); + fhandle.createAttribute("ordered", H5::PredType::NATIVE_UINT8, H5S_SCALAR); + } + takane::validate(dir); +} + +TEST_F(Hdf5DataFrameTest, Metadata) { + std::vector columns(1); + columns[0].name = "Aaron"; + columns[0].type = data_frame::ColumnType::FACTOR; + columns[0].factor_levels = std::vector{ "kanon", "chisato", "sumire", "ren", "keke" }; + + auto cdir = dir / "column_annotations"; + auto odir = dir / "other_annotations"; + + { + auto handle = initialize(); + auto ghandle = handle.createGroup(name); + data_frame::mock(ghandle, 99, false, columns); + initialize_directory(cdir, "simple_list"); + } + expect_error("'data_frame' or one of its derivatives"); + + initialize_directory(cdir, "data_frame"); + data_frame::mock(cdir, columns.size(), false, {}); + initialize_directory(odir, "data_frame"); + expect_error("'simple_list' or one of its derivatives"); + + initialize_directory(odir, "simple_list"); + simple_list::mock(odir); + takane::validate(dir); +} diff --git a/tests/src/data_frame.h b/tests/src/data_frame.h new file mode 100644 index 0000000..55642e2 --- /dev/null 
+++ b/tests/src/data_frame.h @@ -0,0 +1,149 @@ +#ifndef DATA_FRAME_H +#define DATA_FRAME_H + +#include +#include +#include + +#include "H5Cpp.h" +#include "utils.h" + +namespace data_frame { + +enum class ColumnType { + INTEGER, + NUMBER, + STRING, + BOOLEAN, + FACTOR, + OTHER +}; + +enum class StringFormat { + NONE, + DATE, + DATE_TIME +}; + +struct ColumnDetails { + std::string name; + + ColumnType type = ColumnType::INTEGER; + + StringFormat string_format = StringFormat::NONE; + + bool factor_ordered = false; + + std::vector factor_levels; +}; + +inline void mock(const H5::Group& handle, hsize_t num_rows, bool has_row_names, const std::vector& columns) { + { + hsize_t ncol = columns.size(); + H5::DataSpace dspace(1, &ncol); + H5::StrType stype(0, H5T_VARIABLE); + auto dhandle = handle.createDataSet("column_names", stype, dspace); + + std::vector column_names; + column_names.reserve(ncol); + for (const auto& col : columns) { + column_names.push_back(col.name.c_str()); + } + + dhandle.write(column_names.data(), stype); + } + + if (has_row_names) { + H5::DataSpace dspace(1, &num_rows); + H5::StrType stype(0, H5T_VARIABLE); + auto dhandle = handle.createDataSet("row_names", stype, dspace); + + std::vector row_names; + row_names.reserve(num_rows); + std::vector row_names_ptr; + row_names_ptr.reserve(num_rows); + + for (hsize_t i = 0; i < num_rows; ++i) { + row_names.push_back(std::to_string(i)); + row_names_ptr.push_back(row_names.back().c_str()); + } + + dhandle.write(row_names_ptr.data(), stype); + } + + auto attr = handle.createAttribute("row-count", H5::PredType::NATIVE_UINT32, H5S_SCALAR); + attr.write(H5::PredType::NATIVE_HSIZE, &num_rows); + + H5::StrType stype(0, H5T_VARIABLE); + auto attr2 = handle.createAttribute("version", stype, H5S_SCALAR); + attr2.write(stype, std::string("1.0")); + + auto ghandle = handle.createGroup("data"); + size_t NC = columns.size(); + for (size_t c = 0; c < NC; ++c) { + const auto& curcol = columns[c]; + if (curcol.type == data_frame::ColumnType::OTHER) { + continue; + } + + std::string colname = std::to_string(c); + if (curcol.type == data_frame::ColumnType::INTEGER) { + auto dhandle = hdf5_utils::spawn_data(ghandle, colname, num_rows, H5::PredType::NATIVE_INT32); + std::vector dump(num_rows); + std::iota(dump.begin(), dump.end(), 0); + dhandle.write(dump.data(), H5::PredType::NATIVE_INT); + hdf5_utils::attach_attribute(dhandle, "type", "integer"); + + } else if (curcol.type == data_frame::ColumnType::NUMBER) { + std::vector dump(num_rows); + std::iota(dump.begin(), dump.end(), 0.5); + auto dhandle = hdf5_utils::spawn_data(ghandle, colname, num_rows, H5::PredType::NATIVE_DOUBLE); + dhandle.write(dump.data(), H5::PredType::NATIVE_DOUBLE); + hdf5_utils::attach_attribute(dhandle, "type", "number"); + + } else if (curcol.type == data_frame::ColumnType::BOOLEAN) { + std::vector dump(num_rows); + for (hsize_t i = 0; i < num_rows; ++i) { + dump[i] = i % 2; + } + auto dhandle = hdf5_utils::spawn_data(ghandle, colname, num_rows, H5::PredType::NATIVE_INT8); + dhandle.write(dump.data(), H5::PredType::NATIVE_INT); + hdf5_utils::attach_attribute(dhandle, "type", "boolean"); + + } else if (curcol.type == data_frame::ColumnType::STRING) { + std::vector raw_dump(num_rows); + for (hsize_t i = 0; i < num_rows; ++i) { + raw_dump[i] = std::to_string(i); + } + auto dhandle = hdf5_utils::spawn_string_data(ghandle, colname, H5T_VARIABLE, raw_dump); + hdf5_utils::attach_attribute(dhandle, "type", "string"); + + } else if (curcol.type == data_frame::ColumnType::FACTOR) { + auto 
dhandle = ghandle.createGroup(colname); + hdf5_utils::attach_attribute(dhandle, "type", "factor"); + if (curcol.factor_ordered) { + hdf5_utils::attach_attribute(dhandle, "ordered", 1); + } + + hsize_t nchoices = curcol.factor_levels.size(); + hdf5_utils::spawn_string_data(dhandle, "levels", H5T_VARIABLE, curcol.factor_levels); + + std::vector codes(num_rows); + for (hsize_t i = 0; i < num_rows; ++i) { + codes[i] = i % nchoices; + } + auto chandle = hdf5_utils::spawn_data(dhandle, "codes", num_rows, H5::PredType::NATIVE_INT16); + chandle.write(codes.data(), H5::PredType::NATIVE_INT); + } + } +} + +inline void mock(const std::filesystem::path& path, hsize_t num_rows, bool has_row_names, const std::vector& columns) { + H5::H5File handle(path / "basic_columns.h5", H5F_ACC_TRUNC); + auto ghandle = handle.createGroup("data_frame"); + mock(ghandle, num_rows, has_row_names, columns); +} + +} + +#endif diff --git a/tests/src/data_frame_factor.cpp b/tests/src/data_frame_factor.cpp new file mode 100644 index 0000000..08046ff --- /dev/null +++ b/tests/src/data_frame_factor.cpp @@ -0,0 +1,208 @@ +#include +#include + +#include "takane/takane.hpp" +#include "utils.h" +#include "data_frame.h" +#include "simple_list.h" + +#include +#include +#include + +struct DataFrameFactorTest : public::testing::Test { + static std::filesystem::path testdir() { + return "TEST_data_frame_factor"; + } + + static H5::H5File initialize() { + auto path = testdir(); + initialize_directory(path, "data_frame_factor"); + path.append("contents.h5"); + return H5::H5File(path, H5F_ACC_TRUNC); + } + + static H5::H5File reopen() { + auto path = testdir() / "contents.h5"; + return H5::H5File(path, H5F_ACC_RDWR); + } + + template + static void expect_error(const std::string& msg, Args_&& ... args) { + expect_validation_error(testdir(), msg, std::forward(args)...); + } +}; + +TEST_F(DataFrameFactorTest, Basic) { + { + auto handle = initialize(); + } + expect_error("expected a 'data_frame_factor' group"); + + { + auto handle = reopen(); + handle.createDataSet("data_frame_factor", H5::PredType::NATIVE_INT, H5S_SCALAR); + } + expect_error("expected a 'data_frame_factor' group"); + + { + auto handle = reopen(); + handle.unlink("data_frame_factor"); + auto ghandle = handle.createGroup("data_frame_factor"); + hdf5_utils::attach_attribute(ghandle, "version", "2.0"); + } + expect_error("unsupported version string"); + + auto ldir = testdir() / "levels"; + { + auto handle = reopen(); + auto ghandle = handle.openGroup("data_frame_factor"); + ghandle.removeAttr("version"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + initialize_directory(ldir, "foobar"); + } + expect_error("'levels'"); + + { + initialize_directory(ldir, "data_frame"); + data_frame::mock(ldir, 5, false, {}); + } + expect_error("'codes'"); + + // Success at last. 
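+ // (A 1-dimensional 'codes' dataset completes the object; its length is what
+ // height() reports for the factor.)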
+ { + auto handle = reopen(); + auto ghandle = handle.openGroup("data_frame_factor"); + hdf5_utils::spawn_data(ghandle, "codes", 100, H5::PredType::NATIVE_INT32); + } + takane::validate(testdir()); + EXPECT_EQ(takane::height(testdir()), 100); +} + +TEST_F(DataFrameFactorTest, Levels) { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("data_frame_factor"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + } + + auto ldir = testdir() / "levels"; + { + initialize_directory(ldir, "simple_list"); + } + expect_error("'data_frame' or one of its derivatives"); + + takane::data_frame_factor::any_duplicated = [](const std::filesystem::path&, const std::string&, const takane::Options&) -> bool { return true; }; + { + initialize_directory(ldir, "data_frame"); + data_frame::mock(ldir, 5, false, {}); + } + expect_error("duplicated rows"); + + takane::data_frame_factor::any_duplicated = nullptr; +} + +TEST_F(DataFrameFactorTest, Codes) { + { + auto handle = initialize(); + auto ghandle = handle.createGroup("data_frame_factor"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + + std::vector codes { 0, -1, 2, 1, 3, -1, 2 }; + auto dhandle = hdf5_utils::spawn_data(ghandle, "codes", codes.size(), H5::PredType::NATIVE_INT32); + dhandle.write(codes.data(), H5::PredType::NATIVE_INT); + + auto ldir = testdir() / "levels"; + initialize_directory(ldir, "data_frame"); + data_frame::mock(ldir, 5, false, {}); + } + expect_error("non-negative"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("data_frame_factor"); + auto dhandle = ghandle.openDataSet("codes"); + std::vector codes { 0, 1, 2, 1, 3, 100, 2 }; + dhandle.write(codes.data(), H5::PredType::NATIVE_INT); + } + expect_error("number of levels"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("data_frame_factor"); + auto dhandle = ghandle.openDataSet("codes"); + std::vector codes { 0, 1, 2, 1, 3, 4, 2 }; + dhandle.write(codes.data(), H5::PredType::NATIVE_INT); + } + takane::validate(testdir()); +} + +TEST_F(DataFrameFactorTest, Names) { + std::vector codes { 0, 1, 2, 1, 0, 1, 2 }; + { + auto handle = initialize(); + auto ghandle = handle.createGroup("data_frame_factor"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + + auto dhandle = hdf5_utils::spawn_data(ghandle, "codes", codes.size(), H5::PredType::NATIVE_INT32); + dhandle.write(codes.data(), H5::PredType::NATIVE_INT); + hdf5_utils::spawn_data(ghandle, "names", codes.size(), H5::PredType::NATIVE_INT); + + auto ldir = testdir() / "levels"; + initialize_directory(ldir, "data_frame"); + data_frame::mock(ldir, 5, false, {}); + } + expect_error("string datatype"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("data_frame_factor"); + ghandle.unlink("names"); + hdf5_utils::spawn_data(ghandle, "names", 50, H5::StrType(0, 10)); + } + expect_error("same length"); + + { + auto handle = reopen(); + auto ghandle = handle.openGroup("data_frame_factor"); + ghandle.unlink("names"); + hdf5_utils::spawn_data(ghandle, "names", codes.size(), H5::StrType(0, 10)); + } + takane::validate(testdir()); +} + +TEST_F(DataFrameFactorTest, Metadata) { + auto dir = testdir(); + auto edir = dir / "element_annotations"; + auto odir = dir / "other_annotations"; + + std::vector codes { 0, 1, 2, 1, 3, 1, 0, 2 }; + { + auto handle = initialize(); + auto ghandle = handle.createGroup("data_frame_factor"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + + auto dhandle = hdf5_utils::spawn_data(ghandle, "codes", codes.size(), 
H5::PredType::NATIVE_INT32); + dhandle.write(codes.data(), H5::PredType::NATIVE_INT); + + auto ldir = dir / "levels"; + initialize_directory(ldir, "data_frame"); + data_frame::mock(ldir, 5, false, {}); + + initialize_directory(edir, "simple_list"); + } + expect_error("'element_annotations'"); + + { + initialize_directory(edir, "data_frame"); + data_frame::mock(edir, codes.size(), false, {}); + initialize_directory(odir, "data_frame"); + } + expect_error("'other_annotations'"); + + { + initialize_directory(odir, "simple_list"); + simple_list::mock(odir); + } + takane::validate(testdir()); +} diff --git a/tests/src/dispatch.cpp b/tests/src/dispatch.cpp new file mode 100644 index 0000000..ebbb003 --- /dev/null +++ b/tests/src/dispatch.cpp @@ -0,0 +1,53 @@ +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +#include "takane/takane.hpp" +#include "utils.h" + +#include + +TEST(GenericDispatch, Validate) { + std::filesystem::path dir = "TEST_dispatcher"; + initialize_directory(dir, "foobar"); + expect_validation_error(dir, "no registered validation function"); + + takane::validate_override = [](const std::filesystem::path&, const std::string&, const takane::Options&) -> bool { return true; }; + takane::validate(dir); + + takane::validate_override = [](const std::filesystem::path&, const std::string&, const takane::Options&) -> bool { return false; }; + expect_validation_error(dir, "no registered validation function"); + takane::validate_override = nullptr; + + takane::validate_registry["foobar"] = [](const std::filesystem::path&, const takane::Options&) -> void {}; + takane::validate(dir); + takane::validate_registry.erase("foobar"); +} + +template +void expect_height_error(const std::filesystem::path& dir, const std::string& msg, Args_&& ... args) { + EXPECT_ANY_THROW({ + try { + takane::height(dir, std::forward(args)...); + } catch (std::exception& e) { + EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); + throw; + } + }); +} + +TEST(GenericDispatch, Height) { + std::filesystem::path dir = "TEST_dispatcher"; + initialize_directory(dir, "foobar"); + expect_height_error(dir, "no registered height function"); + + takane::height_override = [](const std::filesystem::path&, const std::string&, const takane::Options&) -> std::pair { return std::make_pair(true, 99); }; + EXPECT_EQ(takane::height(dir), 99); + + takane::height_override = [](const std::filesystem::path&, const std::string&, const takane::Options&) -> std::pair { return std::make_pair(false, 0); }; + expect_height_error(dir, "no registered height function"); + takane::height_override = nullptr; + + takane::height_registry["foobar"] = [](const std::filesystem::path&, const takane::Options&) -> size_t { return 11; }; + EXPECT_EQ(takane::height(dir), 11); + takane::height_registry.erase("foobar"); +} diff --git a/tests/src/hdf5_data_frame.cpp b/tests/src/hdf5_data_frame.cpp deleted file mode 100644 index e85b0d7..0000000 --- a/tests/src/hdf5_data_frame.cpp +++ /dev/null @@ -1,1110 +0,0 @@ -#include -#include - -#include "takane/hdf5_data_frame.hpp" - -#include -#include -#include - -struct Hdf5DataFrameTest : public ::testing::TestWithParam { - Hdf5DataFrameTest() { - path = "TEST-hdf5_data_frame.h5"; - name = "df"; - } - std::string path, name; - -public: - template - static void attach_type(const Handle& handle, const std::string& type) { - H5::StrType stype(0, H5T_VARIABLE); - auto attr = handle.createAttribute("type", stype, H5S_SCALAR); - attr.write(stype, type); - } - - template - static void attach_format(const Handle& handle, const 
std::string& format) { - H5::StrType stype(0, H5T_VARIABLE); - auto attr = handle.createAttribute("format", stype, H5S_SCALAR); - attr.write(stype, format); - } - - static H5::DataSet spawn_column(const H5::Group& handle, const std::string& name, hsize_t num_rows, int version, const H5::DataType& dtype, const std::string& ntype) { - H5::DataSpace dspace(1, &num_rows); - auto out = handle.createDataSet(name, dtype, dspace); - if (version >= 3) { - attach_type(out, ntype); - } - return out; - } - - static H5::DataSet spawn_integer_column(const H5::Group& handle, const std::string& name, hsize_t num_rows, int version, const H5::DataType& dtype = H5::PredType::NATIVE_INT32) { - return spawn_column(handle, name, num_rows, version, dtype, "integer"); - } - - static H5::DataSet spawn_number_column(const H5::Group& handle, const std::string& name, hsize_t num_rows, int version, const H5::DataType& dtype = H5::PredType::NATIVE_DOUBLE) { - return spawn_column(handle, name, num_rows, version, dtype, "number"); - } - - static H5::DataSet spawn_string_column(const H5::Group& handle, const std::string& name, hsize_t num_rows, int version, const H5::DataType& dtype) { - return spawn_column(handle, name, num_rows, version, dtype, "string"); - } - - static H5::DataSet spawn_boolean_column(const H5::Group& handle, const std::string& name, hsize_t num_rows, int version, const H5::DataType& dtype = H5::PredType::NATIVE_INT8) { - return spawn_column(handle, name, num_rows, version, dtype, "boolean"); - } - - template - static std::vector pointerize_strings(const Container_& x) { - std::vector output; - for (auto start = x.begin(), end = x.end(); start != end; ++start) { - output.push_back(start->c_str()); - } - return output; - } - -public: - static void create_hdf5_data_frame(const H5::Group& handle, hsize_t num_rows, bool has_row_names, const std::vector& columns, int version) { - { - hsize_t ncol = columns.size(); - H5::DataSpace dspace(1, &ncol); - H5::StrType stype(0, H5T_VARIABLE); - auto dhandle = handle.createDataSet("column_names", stype, dspace); - - std::vector column_names; - column_names.reserve(ncol); - for (const auto& col : columns) { - column_names.push_back(col.name.c_str()); - } - - dhandle.write(column_names.data(), stype); - } - - if (has_row_names) { - H5::DataSpace dspace(1, &num_rows); - H5::StrType stype(0, H5T_VARIABLE); - auto dhandle = handle.createDataSet("row_names", stype, dspace); - - std::vector row_names; - row_names.reserve(num_rows); - std::vector row_names_ptr; - row_names_ptr.reserve(num_rows); - - for (hsize_t i = 0; i < num_rows; ++i) { - row_names.push_back(std::to_string(i)); - row_names_ptr.push_back(row_names.back().c_str()); - } - - dhandle.write(row_names_ptr.data(), stype); - } - - if (version >= 3) { - auto attr = handle.createAttribute("row-count", H5::PredType::NATIVE_UINT32, H5S_SCALAR); - attr.write(H5::PredType::NATIVE_HSIZE, &num_rows); - - H5::StrType stype(0, H5T_VARIABLE); - auto attr2 = handle.createAttribute("version", stype, H5S_SCALAR); - attr2.write(stype, std::string("1.0")); - } - - auto ghandle = handle.createGroup("data"); - size_t NC = columns.size(); - for (size_t c = 0; c < NC; ++c) { - const auto& curcol = columns[c]; - if (curcol.type == takane::data_frame::ColumnType::OTHER) { - continue; - } - - std::string colname = std::to_string(c); - H5::DataSpace dspace(1, &num_rows); - - if (curcol.type == takane::data_frame::ColumnType::INTEGER) { - auto dhandle = spawn_integer_column(ghandle, colname, num_rows, version); - std::vector 
dump(num_rows); - std::iota(dump.begin(), dump.end(), 0); - dhandle.write(dump.data(), H5::PredType::NATIVE_INT); - - } else if (curcol.type == takane::data_frame::ColumnType::NUMBER) { - std::vector dump(num_rows); - std::iota(dump.begin(), dump.end(), 0.5); - auto dhandle = spawn_number_column(ghandle, colname, num_rows, version); - dhandle.write(dump.data(), H5::PredType::NATIVE_DOUBLE); - - } else if (curcol.type == takane::data_frame::ColumnType::BOOLEAN) { - std::vector dump(num_rows); - for (hsize_t i = 0; i < num_rows; ++i) { - dump[i] = i % 2; - } - auto dhandle = spawn_boolean_column(ghandle, colname, num_rows, version); - dhandle.write(dump.data(), H5::PredType::NATIVE_INT); - - } else if (curcol.type == takane::data_frame::ColumnType::STRING) { - std::vector raw_dump(num_rows); - for (hsize_t i = 0; i < num_rows; ++i) { - raw_dump[i] = std::to_string(i); - } - H5::StrType stype(0, H5T_VARIABLE); - auto dhandle = spawn_string_column(ghandle, colname, num_rows, version, stype); - auto dump = pointerize_strings(raw_dump); - dhandle.write(dump.data(), stype); - - } else if (curcol.type == takane::data_frame::ColumnType::FACTOR) { - if (version == 1) { - std::vector choices(curcol.factor_levels->begin(), curcol.factor_levels->end()); - std::vector dump(num_rows); - for (hsize_t i = 0; i < num_rows; ++i) { - dump[i] = choices[i % choices.size()].c_str(); - } - H5::StrType stype(0, H5T_VARIABLE); - auto dhandle = ghandle.createDataSet(colname, stype, dspace); - dhandle.write(dump.data(), stype); - - } else if (version == 2) { - int nchoices = curcol.factor_levels->size(); - std::vector dump(num_rows); - for (hsize_t i = 0; i < num_rows; ++i) { - dump[i] = i % nchoices; - } - auto dhandle = ghandle.createDataSet(colname, H5::PredType::NATIVE_INT16, dspace); - dhandle.write(dump.data(), H5::PredType::NATIVE_INT); - - } else { - auto dhandle = ghandle.createGroup(colname); - attach_type(dhandle, "factor"); - - hsize_t nchoices = 0; - { - auto dump = pointerize_strings(*(curcol.factor_levels)); - nchoices = dump.size(); - H5::StrType stype(0, H5T_VARIABLE); - H5::DataSpace dspace(1, &nchoices); - auto lhandle = dhandle.createDataSet("levels", stype, dspace); - lhandle.write(dump.data(), stype); - } - - std::vector codes(num_rows); - for (hsize_t i = 0; i < num_rows; ++i) { - codes[i] = i % nchoices; - } - auto chandle = dhandle.createDataSet("codes", H5::PredType::NATIVE_INT16, dspace); - chandle.write(codes.data(), H5::PredType::NATIVE_INT); - } - } - } - } - -public: - template - static void expect_error(const std::string& msg, Args_&& ... 
args) { - EXPECT_ANY_THROW({ - try { - takane::hdf5_data_frame::validate(std::forward(args)...); - } catch (std::exception& e) { - EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); - throw; - } - }); - } -}; - -TEST_P(Hdf5DataFrameTest, Rownames) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 29; - params.has_row_names = true; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns.front().name = "WHEE"; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, true, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.unlink("row_names"); - } - expect_error("expected a 'row_names' dataset", path.c_str(), params); - params.has_row_names = false; - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.createGroup("row_names"); - } - params.has_row_names = true; - expect_error("expected a 'row_names' dataset", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.unlink("row_names"); - ghandle.createDataSet("row_names", H5::PredType::NATIVE_INT, H5S_SCALAR); - } - expect_error("string dataset", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.unlink("row_names"); - - H5::StrType stype(0, H5T_VARIABLE); - hsize_t dummy = 20; - H5::DataSpace dspace(1, &dummy); - ghandle.createDataSet("row_names", stype, dspace); - } - expect_error("expected 'row_names' to have length", path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, Colnames) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 29; - auto& columns = params.columns.mutable_ref(); - columns.resize(2); - columns[0].name = "Aaron"; - columns[1].name = "Barry"; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - auto old = columns[1].name; - columns[1].name = "Charlie"; - expect_error("expected name 'Charlie'", path.c_str(), params); - columns[1].name = old; - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.unlink("column_names"); - } - expect_error("dataset", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.createGroup("column_names"); - } - expect_error("expected a 'column_names' dataset", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.unlink("column_names"); - ghandle.createDataSet("column_names", H5::PredType::NATIVE_INT, H5S_SCALAR); - } - expect_error("string dataset", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.unlink("column_names"); - - H5::StrType stype(0, H5T_VARIABLE); - hsize_t dummy = 10; - H5::DataSpace dspace(1, &dummy); - ghandle.createDataSet("column_names", stype, dspace); - } - 
expect_error("length of 'column_names'", path.c_str(), params); - - columns[1].name = "Aaron"; - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - expect_error("duplicated column name", path.c_str(), params); - - columns[0].name = ""; - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - expect_error("empty strings", path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, General) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 33; - auto& columns = params.columns.mutable_ref(); - columns.resize(2); - columns[0].name = "Aaron"; - columns[1].name = "Barry"; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - } - expect_error("'" + name + "' group", path.c_str(), params); - - H5::StrType stype(0, H5T_VARIABLE); - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.createGroup(name); - auto attr = ghandle.createAttribute("version", stype, H5S_SCALAR); - attr.write(stype, std::string("2.0")); - } - expect_error("unsupported version", path.c_str(), params); - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.removeAttr("version"); - auto attr = ghandle.createAttribute("version", stype, H5S_SCALAR); - attr.write(stype, std::string("1.0")); - ghandle.createAttribute("row-count", H5::PredType::NATIVE_INT8, H5S_SCALAR); - } - expect_error("64-bit unsigned", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - ghandle.removeAttr("row-count"); - ghandle.createAttribute("row-count", H5::PredType::NATIVE_UINT8, H5S_SCALAR); - } - expect_error("inconsistent number", path.c_str(), params); - } -} - -TEST_P(Hdf5DataFrameTest, Data) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 33; - auto& columns = params.columns.mutable_ref(); - columns.resize(2); - columns[0].name = "Aaron"; - columns[1].name = "Barry"; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - ghandle.unlink("data"); - } - expect_error("'df/data' group", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.createGroup("data"); - dhandle.createGroup("0"); - } - if (version <= 2) { - expect_error("expected a dataset", path.c_str(), params); - } else { - expect_error("only factor columns", path.c_str(), params); - } - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_integer_column(dhandle, "0", 2, version); - } - expect_error("length equal to the number of rows", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - auto dhandle = ghandle.openGroup("data"); - dhandle.createGroup("foo"); - } - expect_error("more objects present", path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, Other) { 
- takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 33; - auto& columns = params.columns.mutable_ref(); - columns.resize(2); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::OTHER; - columns[1].name = "Barry"; - columns[1].type = takane::data_frame::ColumnType::OTHER; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - hsize_t nr = params.num_rows; - H5::DataSpace dspace(1, &nr); - dhandle.createDataSet("0", H5::PredType::NATIVE_INT, dspace); - } - expect_error("'other'", path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, Integer) { - takane::hdf5_data_frame::Parameters params(name); - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::INTEGER; - params.num_rows = 33; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_integer_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_DOUBLE); - } - expect_error("expected integer column", path.c_str(), params); - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_boolean_column(dhandle, "0", params.num_rows, version); - } - expect_error("'type' attribute set to 'integer'", path.c_str(), params); - } - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_integer_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_INT64); - } - expect_error("32-bit signed integer", path.c_str(), params); - - // Checking the missing value placeholder. 
- if (version >= 2) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_integer_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_INT16); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT16, H5S_SCALAR); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - xhandle.removeAttr("missing-value-placeholder"); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); - } - expect_error("same type as", path.c_str(), params); - } -} - -TEST_P(Hdf5DataFrameTest, Boolean) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 33; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::BOOLEAN; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_boolean_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_DOUBLE); - } - expect_error("expected boolean column", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_boolean_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_INT64); - } - expect_error("32-bit signed integer", path.c_str(), params); - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_integer_column(dhandle, "0", params.num_rows, version); - } - expect_error("'type' attribute set to 'boolean'", path.c_str(), params); - } - - // Checking the missing value placeholder. 
- if (version >= 2) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_boolean_column(dhandle, "0", params.num_rows, version); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - xhandle.removeAttr("missing-value-placeholder"); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_UINT16, H5S_SCALAR); - } - expect_error("same type as", path.c_str(), params); - } -} - -TEST_P(Hdf5DataFrameTest, Number) { - auto version = GetParam(); - - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 27; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::NUMBER; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_number_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_INT); - } - if (version <= 2) { - expect_error("floating-point dataset", path.c_str(), params); - } else { - takane::hdf5_data_frame::validate(path.c_str(), params); - } - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_integer_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_DOUBLE); - } - expect_error("'type' attribute set to 'number'", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_number_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_INT64); - } - expect_error("64-bit float", path.c_str(), params); - } - - // Checking the missing value placeholder. 
- { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_number_column(dhandle, "0", params.num_rows, version); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_DOUBLE, H5S_SCALAR); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - xhandle.removeAttr("missing-value-placeholder"); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); - } - expect_error("same type as", path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, String) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 32; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::STRING; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_string_column(dhandle, "0", params.num_rows, version, H5::PredType::NATIVE_INT); - } - expect_error("string dataset", path.c_str(), params); - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - spawn_integer_column(dhandle, "0", params.num_rows, version, H5::StrType(0, H5T_VARIABLE)); - } - expect_error("'type' attribute set to 'string'", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, H5::StrType(0, H5T_VARIABLE)); - attach_format(xhandle, "whee"); - } - expect_error("should be 'none'", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - xhandle.removeAttr("format"); - attach_format(xhandle, "none"); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - } - - // Checking the missing value placeholder. 
- { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - - H5::StrType stype(0, H5T_VARIABLE); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - auto ahandle = xhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); - ahandle.write(stype, std::string("asdasd")); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - xhandle.removeAttr("missing-value-placeholder"); - xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT8, H5S_SCALAR); - } - expect_error("same type class as", path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, StringDate) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 32; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::STRING; - columns[0].string_format = takane::data_frame::StringFormat::DATE; - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - const char* exemplar = "2023-11-02"; - std::vector dump(params.num_rows, exemplar); - H5::StrType stype(0, H5T_VARIABLE); - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - xhandle.write(dump.data(), stype); - if (version >= 3) { - attach_format(xhandle, "date"); - } - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - attach_format(xhandle, "none"); - xhandle.write(dump.data(), stype); - } - expect_error("'format' attribute set to 'date'", path.c_str(), params); - } - - const char* violator = "asdasd"; - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - if (version >= 3) { - attach_format(xhandle, "date"); - } - dump.back() = violator; - xhandle.write(dump.data(), stype); - } - expect_error("date-formatted", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - auto ahandle = xhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); - ahandle.write(stype, std::string(violator)); - } - takane::hdf5_data_frame::validate(path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, StringDateTime) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 32; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::STRING; - columns[0].string_format = takane::data_frame::StringFormat::DATE_TIME; - - auto version = GetParam(); - 
params.df_version = version; - params.hdf5_version = version; - - const char* exemplar = "2023-11-02T23:01:02Z"; - std::vector dump(params.num_rows, exemplar); - H5::StrType stype(0, H5T_VARIABLE); - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - if (version >= 3) { - attach_format(xhandle, "date-time"); - } - xhandle.write(dump.data(), stype); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - attach_format(xhandle, "none"); - xhandle.write(dump.data(), stype); - } - expect_error("'format' attribute set to 'date-time'", path.c_str(), params); - } - - const char* violator = "asdasd"; - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - auto xhandle = spawn_string_column(dhandle, "0", params.num_rows, version, stype); - if (version >= 3) { - attach_format(xhandle, "date-time"); - } - dump.back() = violator; - xhandle.write(dump.data(), stype); - } - expect_error("date/time-formatted", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - auto xhandle = dhandle.openDataSet("0"); - auto ahandle = xhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); - ahandle.write(stype, std::string(violator)); - } - takane::hdf5_data_frame::validate(path.c_str(), params); -} - -TEST_P(Hdf5DataFrameTest, Factor) { - takane::hdf5_data_frame::Parameters params(name); - params.num_rows = 32; - auto& columns = params.columns.mutable_ref(); - columns.resize(1); - columns[0].name = "Aaron"; - columns[0].type = takane::data_frame::ColumnType::FACTOR; - std::vector levels{ "kanon", "chisato", "sumire", "ren", "keke" }; - columns[0].factor_levels.mutable_ref().insert(levels.begin(), levels.end()); - - auto version = GetParam(); - params.df_version = version; - params.hdf5_version = version; - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - if (version == 1) { - columns[0].factor_levels.mutable_ref().erase("chisato"); - expect_error("contains 'chisato'", path.c_str(), params); - - hsize_t nrows = params.num_rows; - H5::DataSpace dspace(1, &nrows); - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - dhandle.createDataSet("0", H5::PredType::NATIVE_DOUBLE, dspace); - } - expect_error("string dataset", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data"); - dhandle.unlink("0"); - - H5::StrType stype(0, H5T_VARIABLE); - auto xhandle = dhandle.createDataSet("0", stype, dspace); - const char* missing = "chisato"; - std::vector dump(nrows, missing); - xhandle.write(dump.data(), stype); 
- - auto ahandle = xhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR); // rescues the missing values. - ahandle.write(stype, std::string(missing)); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - - } else { - std::string code_name, group_name; - if (version == 2) { - group_name = "data"; - code_name = "0"; - } else { - group_name = "data/0"; - code_name = "codes"; - } - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup(group_name); - dhandle.unlink(code_name); - hsize_t nrows = params.num_rows + 10; - H5::DataSpace dspace(1, &nrows); - dhandle.createDataSet(code_name, H5::PredType::NATIVE_INT8, dspace); - } - expect_error("length equal to the number of rows", path.c_str(), params); - - hsize_t nrows = params.num_rows; - H5::DataSpace dspace(1, &nrows); - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup(group_name); - dhandle.unlink(code_name); - dhandle.createDataSet(code_name, H5::PredType::NATIVE_DOUBLE, dspace); - } - expect_error("expected factor column", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup(group_name); - dhandle.unlink(code_name); - dhandle.createDataSet(code_name, H5::PredType::NATIVE_INT64, dspace); - } - expect_error("32-bit signed integer", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup(group_name); - dhandle.unlink(code_name); - - auto xhandle = dhandle.createDataSet(code_name, H5::PredType::NATIVE_INT16, dspace); - std::vector replacement(nrows, columns[0].factor_levels->size()); - xhandle.write(replacement.data(), H5::PredType::NATIVE_INT); - } - expect_error("less than the number of levels", path.c_str(), params); - - // Using -1 as a placeholder value. - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup(group_name); - dhandle.unlink(code_name); - - auto xhandle = dhandle.createDataSet(code_name, H5::PredType::NATIVE_INT16, dspace); - std::vector replacement(nrows, -1); - xhandle.write(replacement.data(), H5::PredType::NATIVE_INT); - } - expect_error("non-negative", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup(group_name); - auto xhandle = dhandle.openDataSet(code_name); - auto ahandle = xhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT16, H5S_SCALAR); - int val = -1; - ahandle.write(H5::PredType::NATIVE_INT, &val); - } - takane::hdf5_data_frame::validate(path.c_str(), params); // rescues the negative values. 
- } - - if (version >= 3) { - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - auto dhandle = ghandle.openGroup("data/0"); - dhandle.unlink("levels"); - dhandle.createDataSet("levels", H5::PredType::NATIVE_INT, H5S_SCALAR); - } - expect_error("string datatype", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data/0"); - dhandle.unlink("levels"); - - std::vector levels(columns[0].factor_levels->begin(), columns[0].factor_levels->end()); - levels.push_back(levels[0]); - auto dump = pointerize_strings(levels); - - hsize_t nlevels = dump.size(); - H5::DataSpace dspace(1, &nlevels); - H5::StrType stype(0, H5T_VARIABLE); - auto xhandle = dhandle.createDataSet("levels", stype, dspace); - xhandle.write(dump.data(), stype); - } - expect_error("duplicate level", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_TRUNC); - auto ghandle = handle.createGroup(name); - create_hdf5_data_frame(ghandle, params.num_rows, false, columns, version); - auto dhandle = ghandle.openGroup("data/0"); - dhandle.removeAttr("type"); - attach_type(dhandle, "WHEE"); - } - expect_error("'type' attribute set to 'factor'", path.c_str(), params); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data/0"); - dhandle.removeAttr("type"); - attach_type(dhandle, "factor"); - dhandle.createAttribute("ordered", H5::PredType::NATIVE_UINT8, H5S_SCALAR); - } - takane::hdf5_data_frame::validate(path.c_str(), params); - auto params2 = params; - params2.columns.mutable_ref()[0].factor_ordered = true; - expect_error("not consistent", path.c_str(), params2); - - { - H5::H5File handle(path, H5F_ACC_RDWR); - auto ghandle = handle.openGroup(name); - auto dhandle = ghandle.openGroup("data/0"); - dhandle.removeAttr("ordered"); - dhandle.createAttribute("ordered", H5::PredType::NATIVE_DOUBLE, H5S_SCALAR); - } - expect_error("32-bit signed integer", path.c_str(), params); - } -} - -INSTANTIATE_TEST_SUITE_P( - Hdf5DataFrame, - Hdf5DataFrameTest, - ::testing::Values(1,2,3) // versions -); diff --git a/tests/src/simple_list.cpp b/tests/src/simple_list.cpp new file mode 100644 index 0000000..38c8bb3 --- /dev/null +++ b/tests/src/simple_list.cpp @@ -0,0 +1,158 @@ +#include +#include + +#include "takane/takane.hpp" +#include "utils.h" +#include "simple_list.h" + +#include +#include +#include + +struct SimpleListTest : public::testing::Test { + static std::filesystem::path testdir() { + return "TEST_simple_list"; + } + + static void initialize() { + initialize_directory(testdir(), "simple_list"); + } + + static void dump_json(const std::string& buffer) { + simple_list::dump_compressed_json(testdir(), buffer); + } + + template + static void expect_error(const std::string& msg, Args_&& ... args) { + EXPECT_ANY_THROW({ + try { + takane::validate(testdir(), std::forward(args)...); + } catch (std::exception& e) { + EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); + throw; + } + }); + } +}; + +TEST_F(SimpleListTest, Json) { + { + initialize(); + } + expect_error("could not determine format"); + + // Success! + { + dump_json("{ \"type\": \"list\", \"values\": [] }"); + } + takane::validate(testdir()); + EXPECT_EQ(takane::height(testdir()), 0); + + // Throwing in some externals. 
+ auto dir = testdir(); + dir.append("other_contents"); + { + std::ofstream x(dir); + } + expect_error("expected 'other_contents' to be a directory"); + + auto dir2 = dir; + dir2.append("0"); + { + std::filesystem::remove(dir); + std::filesystem::create_directory(dir); + std::ofstream x(dir2); + } + expect_error("failed to validate external list object at 'other_contents/0'"); + + { + std::filesystem::remove(dir2); + std::filesystem::create_directory(dir2); + auto opath = dir2; + opath.append("contents.h5"); + + H5::H5File handle(opath, H5F_ACC_TRUNC); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "integer"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); + + auto objpath = dir2; + objpath.append("OBJECT"); + std::ofstream output(objpath); + output << "atomic_vector"; + } + expect_error("fewer instances"); + + // Success again! + { + dump_json("{ \"type\": \"list\", \"values\": [ { \"type\": \"external\", \"index\": 0 } ] }"); + } + takane::validate(testdir()); + EXPECT_EQ(takane::height(testdir()), 1); +} + +TEST_F(SimpleListTest, Hdf5) { + // Success! + { + initialize(); + auto dir = testdir(); + dir.append("list_contents.h5"); + + H5::H5File handle(dir, H5F_ACC_TRUNC); + auto ghandle = handle.createGroup("simple_list"); + H5::StrType stype(0, H5T_VARIABLE); + auto ahandle = ghandle.createAttribute("uzuki_object", stype, H5S_SCALAR); + ahandle.write(stype, std::string("list")); + ghandle.createGroup("data"); + } + takane::validate(testdir()); + EXPECT_EQ(takane::height(testdir()), 0); + + // Throwing in some externals. + auto dir2 = testdir(); + dir2.append("other_contents"); + dir2.append("0"); + { + std::filesystem::create_directories(dir2); + auto opath = dir2; + opath.append("contents.h5"); + + H5::H5File handle(opath, H5F_ACC_TRUNC); + auto ghandle = handle.createGroup("atomic_vector"); + hdf5_utils::attach_attribute(ghandle, "version", "1.0"); + hdf5_utils::attach_attribute(ghandle, "type", "integer"); + hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32); + + auto objpath = dir2; + objpath.append("OBJECT"); + std::ofstream output(objpath); + output << "atomic_vector"; + } + expect_error("fewer instances"); + + // Success again! 
+    {
+        auto dir = testdir();
+        dir.append("list_contents.h5");
+        H5::H5File handle(dir, H5F_ACC_TRUNC);
+        auto ghandle = handle.createGroup("simple_list");
+
+        H5::StrType stype(0, H5T_VARIABLE);
+        auto ahandle = ghandle.createAttribute("uzuki_object", stype, H5S_SCALAR);
+        ahandle.write(stype, std::string("list"));
+
+        auto dhandle = ghandle.createGroup("data");
+        auto zhandle = dhandle.createGroup("0");
+        {
+            auto xhandle = zhandle.createAttribute("uzuki_object", stype, H5S_SCALAR);
+            xhandle.write(stype, std::string("external"));
+        }
+
+        auto xhandle = zhandle.createDataSet("index", H5::PredType::NATIVE_INT32, H5S_SCALAR);
+        int val = 0;
+        xhandle.write(&val, H5::PredType::NATIVE_INT);
+    }
+    takane::validate(testdir());
+    EXPECT_EQ(takane::height(testdir()), 1);
+}
diff --git a/tests/src/simple_list.h b/tests/src/simple_list.h
new file mode 100644
index 0000000..51b786a
--- /dev/null
+++ b/tests/src/simple_list.h
@@ -0,0 +1,23 @@
+#ifndef SIMPLE_LIST_H
+#define SIMPLE_LIST_H
+
+#include <filesystem>
+#include <string>
+
+#include "byteme/byteme.hpp"
+
+namespace simple_list {
+
+inline void dump_compressed_json(const std::filesystem::path& dir, const std::string& buffer) {
+    auto path = dir / "list_contents.json.gz";
+    byteme::GzipFileWriter writer(path.c_str());
+    writer.write(reinterpret_cast<const unsigned char*>(buffer.data()), buffer.size());
+}
+
+inline void mock(const std::filesystem::path& dir) {
+    dump_compressed_json(dir, "{ \"type\": \"list\", \"values\": [] }");
+}
+
+}
+
+#endif
diff --git a/tests/src/string_factor.cpp b/tests/src/string_factor.cpp
new file mode 100644
index 0000000..73441f3
--- /dev/null
+++ b/tests/src/string_factor.cpp
@@ -0,0 +1,174 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include "takane/takane.hpp"
+#include "utils.h"
+
+#include <filesystem>
+#include <string>
+#include <vector>
+
+struct StringFactorTest : public::testing::Test {
+    static std::filesystem::path testdir() {
+        return "TEST_string_factor";
+    }
+
+    static H5::H5File initialize() {
+        auto path = testdir();
+        initialize_directory(path, "string_factor");
+        path.append("contents.h5");
+        return H5::H5File(path, H5F_ACC_TRUNC);
+    }
+
+    static H5::H5File reopen() {
+        auto path = testdir() / "contents.h5";
+        return H5::H5File(path, H5F_ACC_RDWR);
+    }
+
+    template<typename ...Args_>
+    static void expect_error(const std::string& msg, Args_&& ... args) {
+        EXPECT_ANY_THROW({
+            try {
+                takane::validate(testdir(), std::forward<Args_>(args)...);
+            } catch (std::exception& e) {
+                EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+                throw;
+            }
+        });
+    }
+};
+
+TEST_F(StringFactorTest, Basic) {
+    {
+        auto handle = initialize();
+    }
+    expect_error("expected a 'string_factor' group");
+
+    {
+        auto handle = reopen();
+        handle.createDataSet("string_factor", H5::PredType::NATIVE_INT, H5S_SCALAR);
+    }
+    expect_error("expected a 'string_factor' group");
+
+    {
+        auto handle = reopen();
+        handle.unlink("string_factor");
+        auto ghandle = handle.createGroup("string_factor");
+        hdf5_utils::attach_attribute(ghandle, "version", "2.0");
+    }
+    expect_error("unsupported version string");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        ghandle.removeAttr("version");
+        hdf5_utils::attach_attribute(ghandle, "version", "1.0");
+    }
+    expect_error("'levels'");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        hdf5_utils::spawn_string_data(ghandle, "levels", 3, { "A", "B", "C", "D", "E" });
+    }
+    expect_error("'codes'");
+
+    // Success at last.
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        hdf5_utils::spawn_data(ghandle, "codes", 100, H5::PredType::NATIVE_INT32);
+    }
+    takane::validate(testdir());
+    EXPECT_EQ(takane::height(testdir()), 100);
+}
+
+TEST_F(StringFactorTest, Codes) {
+    {
+        auto handle = initialize();
+        auto ghandle = handle.createGroup("string_factor");
+        hdf5_utils::attach_attribute(ghandle, "version", "1.0");
+
+        std::vector<int> codes { 0, -1, 2, 1, 3, -1, 2 };
+        auto dhandle = hdf5_utils::spawn_data(ghandle, "codes", codes.size(), H5::PredType::NATIVE_INT32);
+        dhandle.write(codes.data(), H5::PredType::NATIVE_INT);
+        hdf5_utils::spawn_string_data(ghandle, "levels", 3, { "A", "B", "C" });
+    }
+    expect_error("non-negative");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        auto dhandle = ghandle.openDataSet("codes");
+        auto ahandle = dhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT32, H5S_SCALAR);
+        int val = -1;
+        ahandle.write(H5::PredType::NATIVE_INT, &val);
+    }
+    expect_error("number of levels");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        ghandle.unlink("levels");
+        hdf5_utils::spawn_string_data(ghandle, "levels", 3, { "A", "B", "C", "D" });
+    }
+    takane::validate(testdir());
+}
+
+TEST_F(StringFactorTest, Ordered) {
+    {
+        auto handle = initialize();
+        auto ghandle = handle.createGroup("string_factor");
+        hdf5_utils::attach_attribute(ghandle, "version", "1.0");
+
+        std::vector<int> codes { 0, 2, 1, 1, 2 };
+        auto dhandle = hdf5_utils::spawn_data(ghandle, "codes", codes.size(), H5::PredType::NATIVE_INT32);
+        dhandle.write(codes.data(), H5::PredType::NATIVE_INT);
+        hdf5_utils::spawn_string_data(ghandle, "levels", 3, { "A", "B", "C" });
+
+        hdf5_utils::attach_attribute(ghandle, "ordered", "TRUE");
+    }
+    expect_error("32-bit signed integer");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        ghandle.removeAttr("ordered");
+        auto ahandle = ghandle.createAttribute("ordered", H5::PredType::NATIVE_INT8, H5S_SCALAR);
+        int val = 1;
+        ahandle.write(H5::PredType::NATIVE_INT, &val);
+    }
+    takane::validate(testdir());
+}
+
+TEST_F(StringFactorTest, Names) {
+    std::vector<int> codes { 0, 1, 2, 1, 0, 1, 2 };
+    {
+        auto handle = initialize();
+        auto ghandle = handle.createGroup("string_factor");
+        hdf5_utils::attach_attribute(ghandle, "version", "1.0");
+
+        auto dhandle = hdf5_utils::spawn_data(ghandle, "codes", codes.size(), H5::PredType::NATIVE_INT32);
+        dhandle.write(codes.data(), H5::PredType::NATIVE_INT);
+        hdf5_utils::spawn_string_data(ghandle, "levels", 3, { "A", "B", "C" });
+
+        hdf5_utils::spawn_data(ghandle, "names", codes.size(), H5::PredType::NATIVE_INT);
+    }
+    expect_error("string datatype");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        ghandle.unlink("names");
+        hdf5_utils::spawn_data(ghandle, "names", 50, H5::StrType(0, 10));
+    }
+    expect_error("same length");
+
+    {
+        auto handle = reopen();
+        auto ghandle = handle.openGroup("string_factor");
+        ghandle.unlink("names");
+        hdf5_utils::spawn_data(ghandle, "names", codes.size(), H5::StrType(0, 10));
+    }
+    takane::validate(testdir());
+}
diff --git a/tests/src/utils.h b/tests/src/utils.h
index 86c7ef6..dc92501 100644
--- a/tests/src/utils.h
+++ b/tests/src/utils.h
@@ -1,20 +1,85 @@
 #ifndef UTILS_H
 #define UTILS_H
 
-#include "takane/utils_csv.hpp"
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <vector>
 
-struct FilledFieldCreator : public takane::CsvFieldCreator {
-    comservatory::StringField* string() {
-        return new comservatory::FilledStringField;
+#include "H5Cpp.h"
+#include "takane/takane.hpp"
+
+inline void initialize_directory(const std::filesystem::path& dir, const std::string& type) {
+    if (std::filesystem::exists(dir)) {
+        std::filesystem::remove_all(dir);
     }
+    std::filesystem::create_directory(dir);
+
+    auto objpath = dir / "OBJECT";
+    std::ofstream output(objpath);
+    output << type;
+}
+
+template<typename ...Args_>
+void expect_validation_error(const std::filesystem::path& dir, const std::string& msg, Args_&& ... args) {
+    EXPECT_ANY_THROW({
+        try {
+            takane::validate(dir, std::forward<Args_>(args)...);
+        } catch (std::exception& e) {
+            EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+            throw;
+        }
+    });
+}
+
+namespace hdf5_utils {
+
+template<class Handle>
+void attach_attribute(const Handle& handle, const std::string& name, const std::string& type) {
+    H5::StrType stype(0, H5T_VARIABLE);
+    auto attr = handle.createAttribute(name, stype, H5S_SCALAR);
+    attr.write(stype, type);
+}
-    comservatory::NumberField* number() {
-        return new comservatory::FilledNumberField;
+template<class Handle>
+void attach_attribute(const Handle& handle, const std::string& name, int val) {
+    auto attr = handle.createAttribute(name, H5::PredType::NATIVE_INT32, H5S_SCALAR);
+    attr.write(H5::PredType::NATIVE_INT, &val);
+}
+
+inline H5::DataSet spawn_data(const H5::Group& handle, const std::string& name, hsize_t len, const H5::DataType& dtype) {
+    H5::DataSpace dspace(1, &len);
+    return handle.createDataSet(name, dtype, dspace);
+}
+
+template<class Container_>
+std::vector<const char*> pointerize_strings(const Container_& x) {
+    std::vector<const char*> output;
+    for (auto start = x.begin(), end = x.end(); start != end; ++start) {
+        output.push_back(start->c_str());
     }
+    return output;
+}
+
+inline H5::DataSet spawn_string_data(const H5::Group& handle, const std::string& name, size_t strlen, const std::vector<std::string>& values) {
+    auto dhandle = spawn_data(handle, name, values.size(), H5::StrType(0, strlen));
-    comservatory::BooleanField* boolean() {
-        return new comservatory::FilledBooleanField;
+    if (strlen == H5T_VARIABLE) {
+        auto ptrs = pointerize_strings(values);
+        dhandle.write(ptrs.data(), dhandle.getStrType());
+    } else {
+        std::vector<char> buffer(strlen * values.size());
+        auto bIt = buffer.data();
+        for (size_t i = 0; i < values.size(); ++i) {
+            std::copy_n(values[i].begin(), std::min(strlen, values[i].size()), bIt);
+            bIt += strlen;
+        }
+        dhandle.write(buffer.data(), dhandle.getStrType());
     }
-};
+
+    return dhandle;
+}
+
+}
 
 #endif
diff --git a/tests/src/utils_hdf5.cpp b/tests/src/utils_hdf5.cpp
new file mode 100644
index 0000000..96ce036
--- /dev/null
+++ b/tests/src/utils_hdf5.cpp
@@ -0,0 +1,236 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include "takane/utils_hdf5.hpp"
+
+#include "utils.h"
+
+struct Hdf5StringFormatTest : public::testing::Test {
+    static std::string testpath() {
+        return "TEST_stringformat.h5";
+    }
+
+    template<typename ...Args_>
+    static void expect_error(const std::string& msg, Args_&& ... args) {
+        EXPECT_ANY_THROW({
+            try {
+                takane::internal_hdf5::validate_string_format(std::forward<Args_>(args)...);
+            } catch (std::exception& e) {
+                EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+                throw;
+            }
+        });
+    }
+};
+
+TEST_F(Hdf5StringFormatTest, None) {
+    auto path = testpath();
+
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        hdf5_utils::spawn_data(handle, "foobar", 10, H5::StrType(0, 10));
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        takane::internal_hdf5::validate_string_format(dhandle, 10, "none", false, "", 10000);
+        expect_error("unsupported format", dhandle, 10, "foobar", false, "", 10000);
+    }
+}
+
+TEST_F(Hdf5StringFormatTest, Date) {
+    auto path = testpath();
+
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        hdf5_utils::spawn_data(handle, "foobar", 5, H5::StrType(0, 10)); // must be 10 characters.
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        expect_error("date-formatted string", dhandle, 5, "date", false, "", 10000);
+        takane::internal_hdf5::validate_string_format(dhandle, 5, "date", true, "", 10000);
+    }
+
+    {
+        H5::H5File handle(path, H5F_ACC_RDWR);
+        handle.unlink("foobar");
+        hdf5_utils::spawn_string_data(handle, "foobar", 10, { "2023-01-05", "1999-12-05", "2002-05-23", "2010-08-18", "1987-06-15" });
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        takane::internal_hdf5::validate_string_format(dhandle, 5, "date", false, "", 10000);
+    }
+
+    // Checking for missing placeholder.
+    {
+        H5::H5File handle(path, H5F_ACC_RDWR);
+        auto dhandle = handle.openDataSet("foobar");
+
+        hsize_t len = 1;
+        H5::DataSpace memspace(1, &len);
+        H5::DataSpace filespace = dhandle.getSpace();
+        hsize_t start = 4;
+        filespace.selectHyperslab(H5S_SELECT_SET, &len, &start);
+
+        std::string placeholder = "aarontllun"; // must be 10 characters.
+        dhandle.write(placeholder.c_str(), dhandle.getStrType(), memspace, filespace);
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        expect_error("date-formatted string", dhandle, 5, "date", true, "foobar", 10000);
+        takane::internal_hdf5::validate_string_format(dhandle, 5, "date", true, "aarontllun", 10000);
+    }
+}
+
+TEST_F(Hdf5StringFormatTest, DateTime) {
+    auto path = testpath();
+
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        auto dhandle = hdf5_utils::spawn_data(handle, "foobar", 5, H5::StrType(0, H5T_VARIABLE));
+        std::vector<std::string> contents { "A", "BB", "CCC", "DDDD", "EEEEEE" };
+        auto ptrs = hdf5_utils::pointerize_strings(contents);
+        dhandle.write(ptrs.data(), dhandle.getStrType());
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        expect_error("date/time-formatted string", dhandle, 5, "date-time", false, "", 10000);
+    }
+
+    {
+        H5::H5File handle(path, H5F_ACC_RDWR);
+        auto dhandle = handle.openDataSet("foobar");
+        std::vector<std::string> contents;
+        for (size_t i = 0; i < 5; ++i) {
+            contents.push_back("2023-01-1" + std::to_string(i) + "T00:00:00Z");
+        }
+        auto ptrs = hdf5_utils::pointerize_strings(contents);
+        dhandle.write(ptrs.data(), dhandle.getStrType());
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        takane::internal_hdf5::validate_string_format(dhandle, 5, "date-time", false, "", 10000);
+    }
+
+    // Checking for missing placeholder.
+    {
+        H5::H5File handle(path, H5F_ACC_RDWR);
+        auto dhandle = handle.openDataSet("foobar");
+
+        hsize_t len = 1;
+        H5::DataSpace memspace(1, &len);
+        H5::DataSpace filespace = dhandle.getSpace();
+        hsize_t start = 4;
+        filespace.selectHyperslab(H5S_SELECT_SET, &len, &start);
+
+        std::string placeholder = "aarontllun"; // must be 10 characters.
+        auto pptr = placeholder.c_str();
+        dhandle.write(&pptr, dhandle.getStrType(), memspace, filespace);
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        auto dhandle = handle.openDataSet("foobar");
+        expect_error("date/time-formatted string", dhandle, 5, "date-time", true, "foobar", 10000);
+        takane::internal_hdf5::validate_string_format(dhandle, 5, "date-time", true, "aarontllun", 10000);
+    }
+}
+
+struct Hdf5FactorTest : public::testing::Test {
+    static std::string testpath() {
+        return "TEST_factorutils.h5";
+    }
+
+    template<typename ...Args_>
+    static void expect_error_levels(const std::string& msg, Args_&& ... args) {
+        EXPECT_ANY_THROW({
+            try {
+                takane::internal_hdf5::validate_factor_levels(std::forward<Args_>(args)...);
+            } catch (std::exception& e) {
+                EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+                throw;
+            }
+        });
+    }
+
+    template<typename ...Args_>
+    static void expect_error_codes(const std::string& msg, Args_&& ... args) {
+        EXPECT_ANY_THROW({
+            try {
+                takane::internal_hdf5::validate_factor_codes(std::forward<Args_>(args)...);
+            } catch (std::exception& e) {
+                EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+                throw;
+            }
+        });
+    }
+};
+
+TEST_F(Hdf5FactorTest, Levels) {
+    auto path = testpath();
+
+    size_t nlevels = 0;
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        hdf5_utils::spawn_data(handle, "fab", 10, H5::PredType::NATIVE_INT32);
+        hdf5_utils::spawn_data(handle, "foobar", 10, H5::StrType(0, 10000));
+
+        std::vector<std::string> levels { "A", "BB", "CCC", "DDDD", "EEEEE" };
+        nlevels = levels.size();
+        hdf5_utils::spawn_string_data(handle, "blah", H5T_VARIABLE, levels);
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        expect_error_levels("expected a string", handle, "fab", 10000);
+        expect_error_levels("duplicated factor level", handle, "foobar", 10000);
+        EXPECT_EQ(takane::internal_hdf5::validate_factor_levels(handle, "blah", 10000), nlevels);
+    }
+}
+
+TEST_F(Hdf5FactorTest, Codes) {
+    auto path = testpath();
+
+    size_t ncodes = 20;
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        hdf5_utils::spawn_data(handle, "fab", 10, H5::PredType::NATIVE_FLOAT);
+        hdf5_utils::spawn_data(handle, "blah", ncodes, H5::PredType::NATIVE_INT32);
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        expect_error_codes("32-bit signed integer", handle, "fab", 10, 10000);
+        expect_error_codes("less than the number of levels", handle, "blah", 0, 10000);
+        EXPECT_EQ(takane::internal_hdf5::validate_factor_codes(handle, "blah", 10, 10000), ncodes);
+    }
+
+    {
+        H5::H5File handle(path, H5F_ACC_RDWR);
+        auto dhandle = handle.openDataSet("blah");
+        std::vector<int> stuff(ncodes);
+        for (size_t i = 0; i < ncodes; ++i) {
+            stuff[i] = static_cast<int>(i) % 5 - 1;
+        }
+        dhandle.write(stuff.data(), H5::PredType::NATIVE_INT);
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        expect_error_codes("non-negative", handle, "blah", 4, 10000);
+    }
+
+    {
+        H5::H5File handle(path, H5F_ACC_RDWR);
+        auto dhandle = handle.openDataSet("blah");
+        auto ahandle = dhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT32, H5S_SCALAR);
+        int val = -1;
+        ahandle.write(H5::PredType::NATIVE_INT, &val);
+    }
+    {
+        H5::H5File handle(path, H5F_ACC_RDONLY);
+        EXPECT_EQ(takane::internal_hdf5::validate_factor_codes(handle, "blah", 4, 10000), ncodes);
+        expect_error_codes("number of levels", handle, "blah", 3, 10000);
+    }
+}
diff --git a/tests/src/utils_other.cpp b/tests/src/utils_other.cpp
new file mode 100644
index 0000000..bc5164b
--- /dev/null
+++ b/tests/src/utils_other.cpp
@@ -0,0 +1,59 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include "data_frame.h"
+#include "simple_list.h"
+#include "takane/utils_other.hpp"
+
+#include "utils.h"
+
+struct ValidateMetadataTest : public::testing::Test {
+    static std::filesystem::path testdir() {
+        return "TEST_validate";
+    }
+
+    template<typename ...Args_>
+    static void expect_error_mcols(const std::string& msg, Args_&& ... args) {
+        EXPECT_ANY_THROW({
+            try {
+                takane::internal_other::validate_mcols(std::forward<Args_>(args)...);
+            } catch (std::exception& e) {
+                EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+                throw;
+            }
+        });
+    }
+
+    template<typename ...Args_>
+    static void expect_error_metadata(const std::string& msg, Args_&& ... args) {
+        EXPECT_ANY_THROW({
+            try {
+                takane::internal_other::validate_metadata(std::forward<Args_>(args)...);
+            } catch (std::exception& e) {
+                EXPECT_THAT(e.what(), ::testing::HasSubstr(msg));
+                throw;
+            }
+        });
+    }
+};
+
+TEST_F(ValidateMetadataTest, Mcols) {
+    auto path = testdir();
+    initialize_directory(path, "data_frame");
+    data_frame::mock(path, 10, true, {});
+    takane::internal_other::validate_mcols(path, 10, takane::Options());
+    expect_error_mcols("unexpected number of rows", path, 20, takane::Options());
+
+    initialize_directory(path, "simple_list");
+    expect_error_mcols("'data_frame' or one of its derivatives", path, 10, takane::Options());
+}
+
+TEST_F(ValidateMetadataTest, Metadata) {
+    auto path = testdir();
+    initialize_directory(path, "simple_list");
+    simple_list::mock(path);
+    takane::internal_other::validate_metadata(path, takane::Options());
+
+    initialize_directory(path, "data_frame");
+    expect_error_metadata("'simple_list' or one of its derivatives", path, takane::Options());
+}
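The new tests above all follow the same pattern: `initialize_directory()` writes an `OBJECT` file naming the type, the `hdf5_utils` helpers from `tests/src/utils.h` populate a `contents.h5` file, and `takane::validate()` / `takane::height()` (or `expect_validation_error()`) check the result. As a purely illustrative sketch that is not part of this changeset, a test for another object type could reuse the same helpers. The `atomic_vector` layout below is copied from the external-object mocks in `simple_list.cpp`; the test name, directory name, and the expected error substring in the last step are assumptions, not values taken from the validator.

```cpp
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <filesystem>

#include "takane/takane.hpp"
#include "utils.h"

// Hypothetical sketch: mock a minimal atomic_vector with the helpers from
// tests/src/utils.h and run the generic validator over the directory.
TEST(AtomicVectorSketch, Basic) {
    std::filesystem::path dir = "TEST_atomic_vector_sketch";
    initialize_directory(dir, "atomic_vector");

    {
        // Same layout as the external atomic_vector mocks in simple_list.cpp.
        H5::H5File handle(dir / "contents.h5", H5F_ACC_TRUNC);
        auto ghandle = handle.createGroup("atomic_vector");
        hdf5_utils::attach_attribute(ghandle, "version", "1.0");
        hdf5_utils::attach_attribute(ghandle, "type", "integer");
        hdf5_utils::spawn_data(ghandle, "values", 100, H5::PredType::NATIVE_INT32);
    }

    takane::validate(dir);
    EXPECT_EQ(takane::height(dir), 100); // expected to match the length of 'values'.

    {
        // Break the object and confirm that validation now fails; the exact
        // error message is assumed here, not quoted from the validator.
        H5::H5File handle(dir / "contents.h5", H5F_ACC_RDWR);
        handle.openGroup("atomic_vector").unlink("values");
    }
    expect_validation_error(dir, "values");
}
```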