Skip to content

Commit

Permalink
Updated vendored libraries to get spatial experiment validators.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Dec 29, 2023
1 parent b1236ec commit 74d3e14
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 7 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: alabaster.base
Title: Save Bioconductor Objects To File
Version: 1.3.11
Date: 2023-12-27
Version: 1.3.12
Date: 2023-12-29
Authors@R: person("Aaron", "Lun", role=c("aut", "cre"), email="[email protected]")
License: MIT + file LICENSE
Description:
Expand Down
3 changes: 2 additions & 1 deletion R/readObject.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ read.registry$registry <- list(
atomic_vector_list="alabaster.ranges::readAtomicVectorList",
sequence_information="alabaster.ranges::readSeqinfo",
multi_sample_dataset="alabaster.mae::readMultiAssayExperiment",
sequence_string_set="alabaster.string::readXStringSet"
sequence_string_set="alabaster.string::readXStringSet",
spatial_experiment="alabaster.spatial::readSpatialExperiment"
)

#' @export
Expand Down
1 change: 1 addition & 0 deletions inst/include/takane/_dimensions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ inline DimensionsRegistry default_registry() {
registry["summarized_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> std::vector<size_t> { return summarized_experiment::dimensions(p, m, o); };
registry["ranged_summarized_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> std::vector<size_t> { return summarized_experiment::dimensions(p, m, o); };
registry["single_cell_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> std::vector<size_t> { return summarized_experiment::dimensions(p, m, o); };
registry["spatial_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> std::vector<size_t> { return summarized_experiment::dimensions(p, m, o); };

return registry;
}
Expand Down
2 changes: 2 additions & 0 deletions inst/include/takane/_height.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "dense_array.hpp"
#include "compressed_sparse_matrix.hpp"
#include "summarized_experiment.hpp"
#include "spatial_experiment.hpp"
#include "sequence_string_set.hpp"

/**
Expand Down Expand Up @@ -56,6 +57,7 @@ inline HeightRegistry default_registry() {
registry["summarized_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> size_t { return summarized_experiment::height(p, m, o); };
registry["ranged_summarized_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> size_t { return summarized_experiment::height(p, m, o); };
registry["single_cell_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> size_t { return summarized_experiment::height(p, m, o); };
registry["spatial_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> size_t { return summarized_experiment::height(p, m, o); };

registry["sequence_string_set"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) -> size_t { return sequence_string_set::height(p, m, o); };
return registry;
Expand Down
2 changes: 2 additions & 0 deletions inst/include/takane/_validate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "summarized_experiment.hpp"
#include "ranged_summarized_experiment.hpp"
#include "single_cell_experiment.hpp"
#include "spatial_experiment.hpp"
#include "multi_sample_dataset.hpp"
#include "sequence_string_set.hpp"

Expand Down Expand Up @@ -59,6 +60,7 @@ inline ValidateRegistry default_registry() {
registry["summarized_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) { summarized_experiment::validate(p, m, o); };
registry["ranged_summarized_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) { ranged_summarized_experiment::validate(p, m, o); };
registry["single_cell_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) { single_cell_experiment::validate(p, m, o); };
registry["spatial_experiment"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) { spatial_experiment::validate(p, m, o); };
registry["multi_sample_dataset"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) { multi_sample_dataset::validate(p, m, o); };
registry["sequence_string_set"] = [](const std::filesystem::path& p, const ObjectMetadata& m, const Options& o) { sequence_string_set::validate(p, m, o); };
return registry;
Expand Down
247 changes: 247 additions & 0 deletions inst/include/takane/spatial_experiment.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
#ifndef TAKANE_SPATIAL_EXPERIMENT_HPP
#define TAKANE_SPATIAL_EXPERIMENT_HPP

#include "ritsuko/hdf5/hdf5.hpp"

#include "single_cell_experiment.hpp"
#include "utils_factor.hpp"
#include "utils_public.hpp"
#include "utils_other.hpp"

#include <array>
#include <cmath>
#include <cstdint>
#include <filesystem>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

namespace takane {

/**
* @namespace takane::spatial_experiment
* @brief Definitions for spatial experiments.
*/
namespace spatial_experiment {

/**
* @cond
*/
namespace internal {

// Validates the 'coordinates' child object stored under 'path'.
//
// The child must be a 'dense_array' object that passes the generic takane
// validator, has exactly two dimensions with 2 or 3 columns, has 'ncols' rows
// (one per column of the parent experiment), and stores integer or
// floating-point values. Any violation is reported as a std::runtime_error.
inline void validate_coordinates(const std::filesystem::path& path, size_t ncols, const Options& options) {
    auto coords_dir = path / "coordinates";
    auto coords_meta = read_object_metadata(coords_dir);
    if (coords_meta.type != "dense_array") {
        throw std::runtime_error("'coordinates' should be a dense array");
    }

    // Only dense arrays of points are accepted for now; other representations
    // (polygons/hulls) may be allowed in the future.
    try {
        ::takane::validate(coords_dir, coords_meta, options);
    } catch (const std::exception& err) {
        throw std::runtime_error("failed to validate 'coordinates'; " + std::string(err.what()));
    }

    // Shape checks, performed in order: rank, then column count, then row count.
    auto extents = ::takane::dimensions(coords_dir, coords_meta, options);
    if (extents.size() != 2) {
        throw std::runtime_error("'coordinates' should be a 2-dimensional dense array");
    }

    const auto num_axes = extents[1];
    const bool axes_ok = (num_axes == 2 || num_axes == 3);
    if (!axes_ok) {
        throw std::runtime_error("'coordinates' should have 2 or 3 columns");
    }

    if (extents[0] != ncols) {
        throw std::runtime_error("number of rows in 'coordinates' should equal the number of columns in the 'spatial_experiment'");
    }

    // The generic dense array validator does not restrict the value type, so
    // the HDF5 dataset is inspected directly to make sure it is numeric.
    auto file_handle = ritsuko::hdf5::open_file(coords_dir / "array.h5");
    auto group_handle = ritsuko::hdf5::open_group(file_handle, "dense_array");
    auto data_handle = ritsuko::hdf5::open_dataset(group_handle, "data");
    auto type_class = data_handle.getTypeClass();
    const bool is_numeric = (type_class == H5T_INTEGER || type_class == H5T_FLOAT);
    if (!is_numeric) {
        throw std::runtime_error("values in 'coordinates' should be numeric");
    }
}

inline void validate_image(const std::filesystem::path& path, size_t i, const std::string& format) {
auto ipath = path / std::to_string(i);

if (format == "PNG") {
ipath += ".png";
byteme::RawFileReader reader(ipath, 10);
byteme::PerByte<unsigned char> pb(&reader);
bool okay = pb.valid();

// Magic number from http://www.libpng.org/pub/png/spec/1.2/png-1.2-pdg.html#PNG-file-signature
std::array<unsigned char, 8> expected { 137, 80, 78, 71, 13, 10, 26, 10 };
for (size_t i = 0; i < 8; ++i) {
if (!okay) {
throw std::runtime_error("incomplete PNG file signature for '" + ipath.string() + "'");
}
if (pb.get() != expected[i]) {
throw std::runtime_error("incorrect file signature for '" + ipath.string() + "'");
}
okay = pb.advance();
}

} else if (format == "TIFF") {
ipath += ".tif";
byteme::RawFileReader reader(ipath, 10);
byteme::PerByte<unsigned char> pb(&reader);
bool okay = pb.valid();

std::array<unsigned char, 4> observed;
for (size_t i = 0; i < 4; ++i) {
if (!okay) {
throw std::runtime_error("incomplete TIFF file signature for '" + ipath.string() + "'");
}
observed[i] = pb.get();
okay = pb.advance();
}

// Magic number from https://en.wikipedia.org/wiki/Magic_number_(programming)
std::array<unsigned char, 4> iisig = { 0x49, 0x49, 0x2A, 0x00 };
std::array<unsigned char, 4> mmsig = { 0x4D, 0x4D, 0x00, 0x2A };
if (observed != iisig && observed != mmsig) {
throw std::runtime_error("incorrect file signature for '" + ipath.string() + "'");
}

} else {
throw std::runtime_error("image format '" + format + "' is not currently supported");
}
}

// Validates the 'images' subdirectory of the spatial experiment at 'path'.
//
// The subdirectory must contain a 'mapping.h5' file with a 'spatial_experiment'
// group holding:
// - 'sample_names': unique strings, one per sample.
// - 'column_samples': one code per column of the experiment ('ncols' in total),
//   each indexing into 'sample_names'.
// - 'image_samples', 'image_ids', 'image_scale_factors', 'image_formats':
//   parallel datasets with one entry per image.
// Every image must refer to a valid sample, image IDs must be unique within a
// sample, scale factors must be finite and positive, and each sample must own
// at least one image. The image files themselves are then checked with
// validate_image(), and the directory must not contain anything else.
//
// Failures inside 'mapping.h5' are rethrown as std::runtime_error prefixed with
// the path to that file; other failures are thrown as-is.
inline void validate_images(const std::filesystem::path& path, size_t ncols, const Options& options) {
    auto image_dir = path / "images";
    auto mappath = image_dir / "mapping.h5";
    auto ihandle = ritsuko::hdf5::open_file(mappath);
    auto ghandle = ritsuko::hdf5::open_group(ihandle, "spatial_experiment");

    // Filled inside the try block and consumed afterwards, so that errors from
    // the image files are not reported as errors in 'mapping.h5'.
    std::vector<std::string> image_formats;
    try {
        // Reuses the factor validators, swapping in sample-specific wording
        // for their error messages.
        struct SampleMapMessenger {
            static std::string level() { return "sample name"; }
            static std::string levels() { return "sample names"; }
            static std::string codes() { return "sample assignments"; }
        };

        auto num_samples = internal_factor::validate_factor_levels<SampleMapMessenger>(ghandle, "sample_names", options.hdf5_buffer_size);
        auto num_codes = internal_factor::validate_factor_codes<SampleMapMessenger>(ghandle, "column_samples", num_samples, options.hdf5_buffer_size, true);
        if (num_codes != ncols) {
            throw std::runtime_error("length of 'column_samples' should equal the number of columns in the spatial experiment");
        }

        // Scanning through the image information; 'image_samples' defines the
        // number of images that all other per-image datasets must match.
        auto sample_handle = ritsuko::hdf5::open_dataset(ghandle, "image_samples");
        if (ritsuko::hdf5::exceeds_integer_limit(sample_handle, 64, false)) {
            throw std::runtime_error("expected a datatype for 'image_samples' that fits in a 64-bit unsigned integer");
        }
        auto num_images = ritsuko::hdf5::get_1d_length(sample_handle.getSpace(), false);

        auto id_handle = ritsuko::hdf5::open_dataset(ghandle, "image_ids");
        if (id_handle.getTypeClass() != H5T_STRING) {
            throw std::runtime_error("expected a string datatype for 'image_ids'");
        }
        if (ritsuko::hdf5::get_1d_length(id_handle.getSpace(), false) != num_images) {
            throw std::runtime_error("expected 'image_ids' to have the same length as 'image_samples'");
        }

        auto scale_handle = ritsuko::hdf5::open_dataset(ghandle, "image_scale_factors");
        if (ritsuko::hdf5::exceeds_float_limit(scale_handle, 64)) {
            throw std::runtime_error("expected a datatype for 'image_scale_factors' that fits in a 64-bit float");
        }
        if (ritsuko::hdf5::get_1d_length(scale_handle.getSpace(), false) != num_images) {
            throw std::runtime_error("expected 'image_scale_factors' to have the same length as 'image_samples'");
        }

        auto format_handle = ritsuko::hdf5::open_dataset(ghandle, "image_formats");
        if (format_handle.getTypeClass() != H5T_STRING) {
            throw std::runtime_error("expected a string datatype for 'image_formats'");
        }
        if (ritsuko::hdf5::get_1d_length(format_handle.getSpace(), false) != num_images) {
            throw std::runtime_error("expected 'image_formats' to have the same length as 'image_samples'");
        }

        ritsuko::hdf5::Stream1dNumericDataset<uint64_t> sample_stream(&sample_handle, num_images, options.hdf5_buffer_size);
        ritsuko::hdf5::Stream1dStringDataset id_stream(&id_handle, num_images, options.hdf5_buffer_size);
        ritsuko::hdf5::Stream1dNumericDataset<double> scale_stream(&scale_handle, num_images, options.hdf5_buffer_size);
        ritsuko::hdf5::Stream1dStringDataset format_stream(&format_handle, num_images, options.hdf5_buffer_size);

        // One set of image IDs per sample, to detect duplicates within a
        // sample and samples without any image.
        std::vector<std::unordered_set<std::string> > collected(num_samples);
        image_formats.reserve(num_images);

        for (hsize_t i = 0; i < num_images; ++i) {
            auto sample = sample_stream.get();
            if (sample >= num_samples) {
                throw std::runtime_error("entries of 'image_samples' should be less than the number of samples");
            }
            sample_stream.next();

            auto& present = collected[sample];
            auto id = id_stream.steal();
            if (present.find(id) != present.end()) {
                throw std::runtime_error("'image_ids' contains duplicated image IDs for the same sample ('" + id + "')");
            }
            present.insert(std::move(id));
            id_stream.next();

            auto sc = scale_stream.get();
            if (!std::isfinite(sc) || sc <= 0) {
                throw std::runtime_error("entries of 'image_scale_factors' should be finite and positive");
            }
            scale_stream.next();

            // Formats are only recorded here; they are checked against the
            // actual files once the mapping itself is known to be valid.
            auto fmt = format_stream.steal();
            image_formats.push_back(std::move(fmt));
            format_stream.next();
        }

        for (const auto& x : collected) {
            if (x.empty()) {
                throw std::runtime_error("each sample should map to one or more images in 'image_samples'");
            }
        }

    } catch (const std::exception& e) {
        throw std::runtime_error("failed to validate '" + mappath.string() + "'; " + std::string(e.what()));
    }

    // Now validating the images themselves; files are named by their index.
    size_t num_images = image_formats.size();
    for (size_t i = 0; i < num_images; ++i) {
        validate_image(image_dir, i, image_formats[i]);
    }

    size_t num_dir_obj = internal_other::count_directory_entries(image_dir);
    if (num_dir_obj - 1 != num_images) { // -1 to account for the mapping.h5 file itself.
        throw std::runtime_error("more objects than expected inside the 'images' subdirectory");
    }
}

}
/**
* @endcond
*/

/**
 * Validates a spatial experiment on disk.
 * The directory is first checked as a single cell experiment, after which the `spatial_experiment` version,
 * the `coordinates` object and the `images` subdirectory are checked.
 * A `std::runtime_error` is thrown if the major version is not 1.
 *
 * @param path Path to the directory containing the spatial experiment.
 * @param metadata Metadata for the object, typically read from its `OBJECT` file.
 * @param options Validation options, typically for reading performance.
 */
inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, const Options& options) {
    // All requirements of the parent type must hold before the spatial-specific checks.
    ::takane::single_cell_experiment::validate(path, metadata, options);

    const std::string& version_text = internal_json::extract_version_for_type(metadata.other, "spatial_experiment");
    auto parsed = ritsuko::parse_version_string(version_text.c_str(), version_text.size(), /* skip_patch = */ true);
    const bool supported = (parsed.major == 1);
    if (!supported) {
        throw std::runtime_error("unsupported version string '" + version_text + "'");
    }

    // The second dimension is the number of columns, which both the
    // coordinates and the image mapping must agree with.
    auto extents = ::takane::summarized_experiment::dimensions(path, metadata, options);
    const auto num_columns = extents[1];
    internal::validate_coordinates(path, num_columns, options);
    internal::validate_images(path, num_columns, options);
}

}

}

#endif
18 changes: 14 additions & 4 deletions inst/include/takane/utils_factor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,16 @@ void check_ordered_attribute(const H5Object_& handle) {
}
}

inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) {
// Default wording for the error messages thrown by validate_factor_levels()
// and validate_factor_codes(). It is the default 'ErrorMessenger_' template
// argument of both functions; callers can supply another type with the same
// three static functions to change the wording without changing the checks.
struct DefaultFactorMessenger {
// Singular noun for one entry, used when reporting a duplicated entry.
static std::string level() { return "factor level"; }
// Plural noun for the entries, used when reporting an out-of-range code.
static std::string levels() { return "levels"; }
// Noun for the codes that index into the entries.
static std::string codes() { return "factor codes"; }
};

// These factor level/code checks are useful elsewhere but with different error messages;
// in such cases, we just do some compile-time switches that only affect the error message.
template<class ErrorMessenger_ = DefaultFactorMessenger>
hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) {
auto lhandle = ritsuko::hdf5::open_dataset(handle, name.c_str());
if (lhandle.getTypeClass() != H5T_STRING) {
throw std::runtime_error("expected a string datatype for '" + name + "'");
Expand All @@ -42,15 +51,16 @@ inline hsize_t validate_factor_levels(const H5::Group& handle, const std::string
for (hsize_t i = 0; i < len; ++i, stream.next()) {
auto x = stream.steal();
if (present.find(x) != present.end()) {
throw std::runtime_error("'" + name + "' contains duplicated factor level '" + x + "'");
throw std::runtime_error("'" + name + "' contains duplicated " + ErrorMessenger_::level() + " '" + x + "'");
}
present.insert(std::move(x));
}

return len;
}

inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) {
template<class ErrorMessenger_ = DefaultFactorMessenger>
hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) {
auto chandle = ritsuko::hdf5::open_dataset(handle, name.c_str());
if (ritsuko::hdf5::exceeds_integer_limit(chandle, 64, false)) {
throw std::runtime_error("expected a datatype for '" + name + "' that fits in a 64-bit unsigned integer");
Expand All @@ -72,7 +82,7 @@ inline hsize_t validate_factor_codes(const H5::Group& handle, const std::string&
continue;
}
if (static_cast<hsize_t>(x) >= num_levels) {
throw std::runtime_error("expected factor codes to be less than the number of levels");
throw std::runtime_error("expected " + ErrorMessenger_::codes() + " to be less than the number of " + ErrorMessenger_::levels() + " in '" + name + "'");
}
}

Expand Down

0 comments on commit 74d3e14

Please sign in to comment.