Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero factor #21

Merged
merged 2 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions R/saveBaseFactor.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,39 +35,47 @@ setMethod("saveObject", "factor", function(x, path, ...) {

fhandle <- H5Fopen(ofile)
on.exit(H5Fclose(fhandle), add=TRUE)
ghandle <- H5Gopen(fhandle, host)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)
(function (){
ghandle <- H5Gopen(fhandle, host)
on.exit(H5Gclose(ghandle), add=TRUE)
h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE)
if (is.ordered(x)) {
h5writeAttribute(1L, ghandle, "ordered", asScalar=TRUE)
}
})()

.simple_save_codes(fhandle, host, x)
h5write(levels(x), fhandle, paste0(host, "/levels"))
.simple_save_codes(ghandle, x)
h5write(levels(x), ghandle, "levels")

write("string_factor", file=file.path(path, "OBJECT"))
invisible(NULL)
})

# Save the integer codes of `x` (and, optionally, its names) into HDF5.
#
# NOTE(review): this listing appears to show both sides of a diff without +/- markers
# (two consecutive signature lines, two consecutive placeholder assignments, and both
# h5write() and H5Dcreate()/H5Dwrite() paths). Confirm which lines are current before
# relying on the comments below.
#
# Arguments:
#   ghandle / fhandle, host - HDF5 handle (and group path) that the data is written under.
#   x          - object whose codes are taken via as.integer(); presumably a factor,
#                since nlevels(x) is used for the placeholder - TODO confirm.
#   save.names - if TRUE and `x` has names, the names are written as well.
.simple_save_codes <- function(fhandle, host, x, save.names=TRUE) {
.simple_save_codes <- function(ghandle, x, save.names=TRUE) {
# Subtract 1 so that the stored codes start from zero.
codes <- as.integer(x) - 1L

# Missing codes are overwritten with a placeholder value; the placeholder is kept
# so that it can be recorded as an attribute on the dataset further down.
missing.placeholder <- NULL
if (anyNA(codes)) {
missing.placeholder <- -1L
missing.placeholder <- nlevels(x)
codes[is.na(codes)] <- missing.placeholder
}

full.data.name <- paste0(host, "/codes")
h5write(codes, fhandle, full.data.name)
# Create a 1-dimensional dataspace of length(x) and a "codes" dataset declared with an
# unsigned 32-bit type, then write the codes into it. Each handle is registered for
# closing when the function exits.
shandle <- H5Screate_simple(length(x))
on.exit(H5Sclose(shandle), add=TRUE)
dhandle <- H5Dcreate(ghandle, "codes", dtype_id="H5T_NATIVE_UINT32", h5space=shandle)
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
H5Dwrite(dhandle, codes)

# Only when a placeholder was actually substituted: attach it to the dataset as a
# scalar "missing-value-placeholder" attribute.
if (!is.null(missing.placeholder)) {
addMissingPlaceholderAttributeForHdf5(fhandle, full.data.name, missing.placeholder)
ashandle <- H5Screate("H5S_SCALAR")
on.exit(H5Sclose(ashandle), add=TRUE, after=FALSE)
ahandle <- H5Acreate(dhandle, "missing-value-placeholder", dtype_id="H5T_NATIVE_UINT32", h5space=ashandle)
on.exit(H5Aclose(ahandle), add=TRUE, after=FALSE)
H5Awrite(ahandle, missing.placeholder)
}

# Names are written only when requested and present.
if (save.names && !is.null(names(x))) {
h5write(names(x), fhandle, paste0(host, "/names"))
h5write(names(x), ghandle, "names")
}
}

Expand Down
5 changes: 2 additions & 3 deletions R/saveDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,10 @@ setMethod("saveObject", "DataFrame", function(x, path, ...) {
if (is.ordered(col)) {
h5writeAttribute(1L, ghandle, "ordered", asScalar=TRUE)
}
.simple_save_codes(ghandle, col, save.names=FALSE)
h5write(levels(col), ghandle, "levels");
})()

.simple_save_codes(fhandle, full.data.name, col, save.names=FALSE)
h5write(levels(col), fhandle, paste0(full.data.name, "/levels"));

} else if (.is_datetime(col)) {
coltype <- "string"
colformat <- "date-time"
Expand Down
10 changes: 4 additions & 6 deletions R/saveDataFrameFactor.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,11 @@ setMethod("saveObject", "DataFrameFactor", function(x, path, ...) {

fhandle <- H5Fopen(ofile)
on.exit(H5Fclose(fhandle), add=TRUE)
(function (){
ghandle <- H5Gopen(fhandle, host)
on.exit(H5Gclose(ghandle), add=TRUE)
h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE)
})()
ghandle <- H5Gopen(fhandle, host)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)
h5writeAttribute("1.0", ghandle, "version", asScalar=TRUE)

.simple_save_codes(fhandle, host, x)
.simple_save_codes(ghandle, x)
stuff <- levels(x)
altSaveObject(stuff, paste0(path, "/levels"), ...)

Expand Down
4 changes: 2 additions & 2 deletions inst/include/fetch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ harvester() {

harvester millijson https://github.com/ArtifactDB/millijson v1.0.0
harvester byteme https://github.com/LTLA/byteme v1.1.0
harvester uzuki2 https://github.com/ArtifactDB/uzuki2 v1.3.0
harvester comservatory https://github.com/ArtifactDB/comservatory v2.0.1
harvester ritsuko https://github.com/ArtifactDB/ritsuko v0.3.3
harvester uzuki2 https://github.com/ArtifactDB/uzuki2 master
harvester ritsuko https://github.com/ArtifactDB/ritsuko master
harvester takane https://github.com/ArtifactDB/takane master
116 changes: 100 additions & 16 deletions inst/include/ritsuko/choose_missing_placeholder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,58 @@
namespace ritsuko {

/**
* Choose an appropriate placeholder for missing values in an integer dataset.
* @cond
*/
// Reports whether `candidate` occurs among the values in [start, end).
// When `Mask` is exactly `bool`, the mask argument is not consulted and every value is examined.
// Otherwise `mask` is advanced in step with `start`, and a value only counts as a match
// if its corresponding mask entry evaluates to false.
template<class Iterator, class Mask, class Type>
bool found(Iterator start, Iterator end, Mask mask, Type candidate) {
    if constexpr(!std::is_same<Mask, bool>::value) {
        while (start != end) {
            if (!*mask && candidate == *start) {
                return true;
            }
            ++start;
            ++mask;
        }
        return false;
    } else {
        const auto position = std::find(start, end, candidate);
        return position != end;
    }
}

// Collects the distinct values of [start, end) into an ordered set.
// When `Mask` is exactly `bool`, the mask argument is not consulted and all values are inserted.
// Otherwise `mask` is advanced in step with `start`, and only values whose corresponding
// mask entry evaluates to false are inserted.
template<class Iterator, class Mask, class Type = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::set<Type> create_unique_set(Iterator start, Iterator end, Mask mask) {
    std::set<Type> uniques;
    if constexpr(!std::is_same<Mask, bool>::value) {
        while (start != end) {
            if (!*mask) {
                uniques.insert(*start);
            }
            ++start;
            ++mask;
        }
    } else {
        uniques.insert(start, end);
    }
    return uniques;
}
/**
* @endcond
*/

/**
* Choose an appropriate placeholder for missing values in an integer dataset, after ignoring all the masked values.
* This will try the various special values (the minimum, the maximum, and for signed types, 0)
* before sorting the dataset and searching for an unused integer value.
*
* @tparam Iterator_ Forward iterator for integer values.
* @tparam Mask_ Random access iterator for mask values.
* @tparam Type_ Integer type pointed to by `Iterator_`.
*
* @param start Start of the dataset.
* @param end End of the dataset.
* @param mask Start of the mask vector.
* This should have the same length as `end - start`; each entry is true if the corresponding value of the integer dataset is masked, and false otherwise.
*
* @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
*/
template<class Iterator, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterator end) {
template<class Iterator, class Mask, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterator end, Mask mask) {
static_assert(std::numeric_limits<Type_>::is_integer);

// Trying important points first; minima and maxima, and 0.
Expand All @@ -42,7 +80,7 @@ std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterat
} else {
candidate = 0;
}
if (std::find(start, end, candidate) == end) {
if (!found(start, end, mask, candidate)) {
return std::make_pair(true, candidate);
}
}
Expand All @@ -55,14 +93,14 @@ std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterat
} else {
candidate = 0;
}
if (std::find(start, end, candidate) == end) {
if (!found(start, end, mask, candidate)) {
return std::make_pair(true, candidate);
}
}
}

// Well... going through it in order.
std::set<Type_> uniq_sort(start, end);
auto uniq_sort = create_unique_set(start, end, mask);
Type_ last = std::numeric_limits<Type_>::min();
for (auto x : uniq_sort) {
if (last + 1 < x) {
Expand All @@ -75,7 +113,23 @@ std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterat
}

/**
* Choose an appropriate placeholder for missing values in a floating-point dataset.
* Overload of `choose_missing_integer_placeholder()` where no values are masked.
*
* @tparam Iterator_ Forward iterator for integer values.
* @tparam Type_ Integer type pointed to by `Iterator_`.
*
* @param start Start of the dataset.
* @param end End of the dataset.
*
* @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
*/
template<class Iterator, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterator end) {
    // A plain `bool` in the mask position selects the unmasked code path of the three-argument overload.
    constexpr bool no_mask = false;
    return choose_missing_integer_placeholder(start, end, no_mask);
}

/**
* Choose an appropriate placeholder for missing values in a floating-point dataset, after ignoring all masked values.
* This will try the various IEEE special values (NaN, Inf, -Inf) and then some type-specific boundaries (the minimum, the maximum, and for signed types, 0)
* before sorting the dataset and searching for an unused float.
*
Expand All @@ -84,30 +138,43 @@ std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterat
*
* @param start Start of the dataset.
* @param end End of the dataset.
* @param mask Start of the mask vector.
* @param skip_nan Whether to skip NaN as a potential placeholder.
* Useful in frameworks like R that need special consideration of NaN payloads.
*
* @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
*/
template<class Iterator, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) {
template<class Iterator, class Mask, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator end, Mask mask, bool skip_nan) {
if constexpr(std::numeric_limits<Type_>::is_iec559) {
if (!skip_nan) {
bool has_nan = false;
for (auto x = start; x != end; ++x) {
if (std::isnan(*x)) {
has_nan = true;
break;

if constexpr(std::is_same<Mask, bool>::value) {
for (auto x = start; x != end; ++x) {
if (std::isnan(*x)) {
has_nan = true;
break;
}
}
} else {
auto sIt = mask;
for (auto x = start; x != end; ++x, ++sIt) {
if (!*sIt && std::isnan(*x)) {
has_nan = true;
break;
}
}
}

if (!has_nan) {
return std::make_pair(true, std::numeric_limits<Type_>::quiet_NaN());
}
}

for (int i = 0; i < 2; ++i) {
Type_ candidate = std::numeric_limits<Type_>::infinity() * (i == 0 ? 1 : -1);
if (std::find(start, end, candidate) == end) {
if (!found(start, end, mask, candidate)) {
return std::make_pair(true, candidate);
}
}
Expand All @@ -123,13 +190,13 @@ std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator
} else {
candidate = 0;
}
if (std::find(start, end, candidate) == end) {
if (!found(start, end, mask, candidate)) {
return std::make_pair(true, candidate);
}
}

// Well... going through it in order.
std::set<Type_> uniq_sort(start, end);
auto uniq_sort = create_unique_set(start, end, mask);
Type_ last = std::numeric_limits<Type_>::lowest();
for (auto x : uniq_sort) {
if (std::isfinite(x)) {
Expand All @@ -144,6 +211,23 @@ std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator
return std::make_pair(false, 0);
}

/**
* Overload of `choose_missing_float_placeholder()` where no values are masked.
*
* @tparam Iterator_ Forward iterator for floating-point values.
* @tparam Type_ Floating-point type pointed to by `Iterator_`.
*
* @param start Start of the dataset.
* @param end End of the dataset.
* @param skip_nan Whether to skip NaN as a potential placeholder.
*
* @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
*/
template<class Iterator, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) {
    // A plain `bool` in the mask position selects the unmasked code path of the four-argument overload.
    constexpr bool no_mask = false;
    return choose_missing_float_placeholder(start, end, no_mask, skip_nan);
}

}

#endif
Loading
Loading