Skip to content

Commit

Permalink
subword parallel search for ArrayFlex and ArrayPacked find() (#7367)
Browse files Browse the repository at this point in the history
* idea: subword parallel search

* better subword search

* better naming

* new methods for reading unaligned word from array of bitfields

* perf work on array with find based on parallel values comparison

* major cleanup of bitfield scanning

* de-templatified bit field search

* more tests and code generalization

* more tests

* new iterator optimized for linear scan

* eliminated last use of templates in subword parallel search

* optimization of some subword search methods

* working EQ cmp with parallel subword check

* fix in all_fields_NE

* make populate handle negative values

* commented out bypass which disabled subword search

* fix in fix of populate()

* bugfix and direct methods for signed GT and GE

* fix for GT condition

* enabled array perf tests (outside debug mode)

* fixed inner search loop

* made some perf tests non concurrent and silenced warnings

* moved call to match() into inner loop in subword parallel search

* Perf v2, find_with_marked for packed integer arrays (#7385)

* made find_first_marked() branch free

* various optimizations of find_first_marked, best one selected

* for some reason this is much better

* no warnings

* made search method selection more explicit and clear

* bunch of fixes..

* restore subword loop

* fix object store tests + use subword cmp always (which is faster on my machine)

---------

Co-authored-by: Finn Schiermer Andersen <[email protected]>

* Perf work for array flex (still missing timestamps) (#7397)

* WIP perf work for array flex

* more small stuff, nothing important

* parallel subword for eq and neq

* move find parallel inside loop for eq and neq

* LT parallel subword cmp

* GT find for array flex

* Int equality as good as Packed

* code review

---------

Co-authored-by: Finn Schiermer Andersen <[email protected]>
Co-authored-by: Finn Schiermer Andersen <[email protected]>
  • Loading branch information
3 people authored Mar 1, 2024
1 parent 4916543 commit cc3ae93
Show file tree
Hide file tree
Showing 11 changed files with 1,828 additions and 64 deletions.
526 changes: 522 additions & 4 deletions src/realm/array_direct.hpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/realm/array_encode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ void ArrayEncode::init(const char* h)
m_ndx_width = NodeHeader::get_elementB_size<Encoding::Flex>(h);
m_ndx_size = NodeHeader::get_arrayB_num_elements<Encoding::Flex>(h);
m_v_mask = 1ULL << (m_v_width - 1);
m_ndx_mask = 1ULL << (m_ndx_width - 1);
}
}

Expand Down
1 change: 1 addition & 0 deletions src/realm/array_encode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class ArrayEncode {
Encoding m_encoding{NodeHeader::Encoding::WTypBits}; // this is not ok .... probably
size_t m_v_width = 0, m_v_size = 0, m_ndx_width = 0, m_ndx_size = 0;
size_t m_v_mask = 0;
size_t m_ndx_mask = 0;

friend class ArrayPacked;
friend class ArrayFlex;
Expand Down
170 changes: 154 additions & 16 deletions src/realm/array_flex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,31 +159,169 @@ bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, size_t e

REALM_ASSERT_3(arr.m_width, !=, 0);

if constexpr (std::is_same_v<Equal, Cond>) {
find_eq(arr, value, start, end, baseindex, state);
}
else if constexpr (std::is_same_v<NotEqual, Cond>) {
find_neq(arr, value, start, end, baseindex, state);
}
else if constexpr (std::is_same_v<Less, Cond>) {
find_lt(arr, value, start, end, baseindex, state);
}
else if constexpr (std::is_same_v<Greater, Cond>) {
find_gt(arr, value, start, end, baseindex, state);
}

return true;
}

template <typename Cond, bool v>
inline size_t ArrayFlex::parallel_subword_find(const Array& arr, uint64_t value, size_t width_mask, size_t offset,
uint_least8_t width, size_t start, size_t end) const
{
const auto MSBs = populate(width, width_mask);
const auto search_vector = populate(width, value);
const auto field_count = num_fields_for_width(width);
const auto bit_count_pr_iteration = num_bits_for_width(width);
auto total_bit_count_left = static_cast<signed>(end - start) * width;
REALM_ASSERT(total_bit_count_left >= 0);
auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
else if constexpr (std::is_same_v<Cond, NotEqual>)
return find_all_fields_NE(MSBs, a, b);
else if constexpr (std::is_same_v<Cond, GreaterEqual>) {
if constexpr (v == true)
return find_all_fields_signed_GE(MSBs, a, b);
if constexpr (v == false)
return find_all_fields_unsigned_GE(MSBs, a, b);
REALM_UNREACHABLE();
}

else if constexpr (std::is_same_v<Cond, Greater>)
return find_all_fields_signed_GT(MSBs, a, b);
else if constexpr (std::is_same_v<Cond, Less>)
return find_all_fields_unsigned_LT(MSBs, a, b);
};

unaligned_word_iter it((uint64_t*)(arr.m_data), offset + start * width);
uint64_t vector = 0;
while (total_bit_count_left >= bit_count_pr_iteration) {
const auto word = it.get(bit_count_pr_iteration);
vector = bitwidth_cmp(word, search_vector);
if (vector) {
int sub_word_index = first_field_marked((int)width, vector);
return start + sub_word_index;
}
total_bit_count_left -= bit_count_pr_iteration;
start += field_count;
it.bump(bit_count_pr_iteration);
}
if (total_bit_count_left) { // final subword, may be partial
const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
vector = bitwidth_cmp(word, search_vector);
auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
vector &= last_word_mask;
if (vector) {
int sub_word_index = first_field_marked(width, vector);
return start + sub_word_index;
}
}
return end;
}

// Reports to `state` every element in [start, end) that is equal to `value`.
//
// The search runs in two stages, both using the word-parallel parallel_subword_find():
//   1. locate `value` among the `v_size` fields of width `v_width` at the start of the data;
//   2. scan the index fields (stored `v_size * v_width` bits into the data, `ndx_width` bits each) for entries
//      equal to the position found in stage 1.
// Each hit is reported as `hit + baseindex`.
//
// Returns false as soon as state->match() returns false, true otherwise (including when nothing matches).
bool ArrayFlex::find_eq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
                        QueryStateBase* state) const
{
    const auto& encoder = arr.m_encoder;
    const auto v_width = encoder.m_v_width;
    const auto v_size = encoder.m_v_size;
    const auto ndx_width = encoder.m_ndx_width;
    const auto offset = v_size * v_width;

    // Stage 1: position of `value` in the value area; v_size means "not stored", hence nothing can be equal.
    auto v_start = parallel_subword_find<Equal>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
    if (v_start == v_size)
        return true;

    // Stage 2: report every index field that refers to that position.
    while (start < end) {
        start = parallel_subword_find<Equal>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
        if (start < end)
            if (!state->match(start + baseindex))
                return false;

        ++start;
    }
    return true;
}

// Reports to `state` every element in [start, end) that is NOT equal to `value`.
//
// The search runs in two stages, both using the word-parallel parallel_subword_find():
//   1. locate `value` among the `v_size` fields of width `v_width` at the start of the data;
//   2. scan the index fields (stored `v_size * v_width` bits into the data, `ndx_width` bits each) for entries
//      that differ from the position found in stage 1.
// Each hit is reported as `hit + baseindex`.
//
// Returns false as soon as state->match() returns false, true otherwise.
bool ArrayFlex::find_neq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
                         QueryStateBase* state) const
{
    const auto& encoder = arr.m_encoder;
    const auto v_width = encoder.m_v_width;
    const auto v_size = encoder.m_v_size;
    const auto ndx_width = encoder.m_ndx_width;
    const auto offset = v_size * v_width;

    // Stage 1: position of `value` in the value area.
    auto v_start = parallel_subword_find<Equal>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
    if (v_start == v_size) {
        // `value` is not stored at all, so no element can be equal to it: every element in the requested range
        // satisfies "not equal" and must be reported (previously this case reported nothing).
        for (; start < end; ++start) {
            if (!state->match(start + baseindex))
                return false;
        }
        return true;
    }

    // Stage 2: report every index field that refers to some other position.
    while (start < end) {
        start = parallel_subword_find<NotEqual>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
        if (start < end)
            if (!state->match(start + baseindex))
                return false;
        ++start;
    }
    return true;
}

// Reports to `state` every element in [start, end) that is less than `value`.
//
// The search runs in two stages, both using the word-parallel parallel_subword_find():
//   1. find the position of the first stored value that is >= `value` (signed comparison) among the `v_size`
//      fields of width `v_width` at the start of the data;
//   2. scan the index fields (stored `v_size * v_width` bits into the data, `ndx_width` bits each) for entries
//      that are smaller (unsigned comparison) than that position.
// NOTE(review): stage 2 is only equivalent to "element < value" if the value area is sorted ascending — this is
// what the algorithm relies on; confirm against the encoder.
// Each hit is reported as `hit + baseindex`.
//
// Returns false as soon as state->match() returns false, true otherwise.
bool ArrayFlex::find_lt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
                        QueryStateBase* state) const
{
    const auto& encoder = arr.m_encoder;
    const auto v_width = encoder.m_v_width;
    const auto v_size = encoder.m_v_size;
    const auto ndx_width = encoder.m_ndx_width;
    const auto offset = v_size * v_width;

    // Stage 1: first stored value that is not less than `value`.
    auto v_start = parallel_subword_find<GreaterEqual>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
    if (v_start == v_size) {
        // No stored value is >= `value`, i.e. every stored value is less than it: every element in the
        // requested range matches and must be reported (previously this case reported nothing).
        for (; start < end; ++start) {
            if (!state->match(start + baseindex))
                return false;
        }
        return true;
    }

    // Stage 2: report every index field that refers to a position before v_start.
    while (start < end) {
        start = parallel_subword_find<Less>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
        if (start < end)
            if (!state->match(start + baseindex))
                return false;

        ++start;
    }
    return true;
}

// Reports to `state` every element in [start, end) that is greater than `value`.
//
// First the value area (`v_size` fields of `v_width` bits at the start of the data) is searched for the first
// stored value greater than `value` (signed comparison). If there is none, nothing can match. Otherwise the
// index fields (`ndx_width` bits each, located `v_size * v_width` bits into the data) are scanned for entries
// that are >= that position (unsigned comparison) and each hit is reported as `hit + baseindex`.
//
// Returns false as soon as state->match() returns false, true otherwise.
bool ArrayFlex::find_gt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
                        QueryStateBase* state) const
{
    const auto& enc = arr.m_encoder;
    const auto value_width = enc.m_v_width;
    const auto value_count = enc.m_v_size;
    const auto index_width = enc.m_ndx_width;
    const auto index_offset = value_count * value_width;

    const auto first_greater =
        parallel_subword_find<Greater>(arr, value, enc.m_v_mask, 0, value_width, 0, value_count);
    if (first_greater == value_count)
        return true; // no stored value exceeds `value`

    size_t pos = start;
    while (pos < end) {
        pos = parallel_subword_find<GreaterEqual, false>(arr, first_greater, enc.m_ndx_mask, index_offset,
                                                         index_width, pos, end);
        if (pos >= end)
            break; // scan exhausted without a further hit
        if (!state->match(pos + baseindex))
            return false;
        ++pos;
    }
    return true;
}
Expand Down
10 changes: 10 additions & 0 deletions src/realm/array_flex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,23 @@ class ArrayFlex {
int64_t get(const char*, size_t, size_t, size_t, size_t, size_t, size_t) const;
void get_chunk(const Array& h, size_t ndx, int64_t res[8]) const;
void set_direct(const Array&, size_t, int64_t) const;

template <typename Cond>
bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;

int64_t sum(const Array&, size_t, size_t) const;

private:
int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t, size_t, size_t) const;
bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;

template <typename Cond, bool = true> // true int64_t other uint64_t
inline size_t parallel_subword_find(const Array&, uint64_t, size_t, size_t, uint_least8_t, size_t, size_t) const;

bool find_eq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
bool find_neq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
bool find_lt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
bool find_gt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
};
} // namespace realm
#endif // REALM_ARRAY_COMPRESS_HPP
1 change: 1 addition & 0 deletions src/realm/array_integer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}

// Returns Array::size() reduced by one.
// NOTE(review): flagged by the original author as suspicious — if Array::size() were ever 0, the subtraction
// would wrap around. Presumably one slot is always present in the underlying array; confirm.
inline size_t ArrayIntNull::size() const noexcept
{
    const auto underlying_size = Array::size();
    return underlying_size - 1;
}

Expand Down
2 changes: 1 addition & 1 deletion src/realm/array_integer_tpl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ bool ArrayIntNull::find_impl(value_type opt_value, size_t start, size_t end, Que
}
// if encoded use specialised find
if (is_encoded())
return find_encoded<cond>(value, start2, end, baseindex2, state);
return find_encoded<cond>(value, start2, end2, baseindex2, state);
// Fall back to plain Array find.
return ArrayWithFind(*this).find<cond>(value, start2, end2, baseindex2, state);
}
Expand Down
80 changes: 65 additions & 15 deletions src/realm/array_packed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ void ArrayPacked::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) const
res[index++] = get(arr, i++);
}
}

template <typename Cond>
bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
QueryStateBase* state) const
Expand All @@ -138,28 +137,79 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t

REALM_ASSERT_3(arr.m_width, !=, 0);

auto cmp = [](int64_t v, int64_t value) {

// NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the
// queries run.
//
// Main idea around find.
// Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can
// contain in parallel. Once we have found the starting point, keep matching values as much as we can between
// start and end.
//
// E.g. we store the value 6 with a width of 4 bits (0110): 6 needs 4 bits because it is 110 (6) plus a sign bit 0.
// A 64-bit word can hold at most 16 such values. If we visit indices 0 to 15 of the same 64-bit word one at a
// time, we need to apply a mask and shift bits every time, then compare the values.
// This is not the cheapest thing to do. Instead we can compare all the values contained within 64 bits in one go
// and see if there is a match with what we are looking for, reducing the number of comparisons by ~logk(N), where
// K is the width of each single value within a 64-bit word and N is the total number of values stored in the array.

// in packed format a parallel subword find pays off also for width >= 32
while (start < end) {
start = parallel_subword_find<Cond>(arr, value, start, end);
if (start < end)
if (!state->match(start + baseindex))
return false;

++start;
}
return true;
}

// Scans the bit-packed fields [start, end) of `arr` (each `arr.m_width` bits wide) and returns the index of the
// first field satisfying `Cond` against `value`. Returns `end` when no field satisfies the condition.
//
// Several fields are compared per step: `value` is replicated across a 64-bit word with populate() and a whole
// word of fields is tested at once by one of the find_all_fields_* helpers. Supported conditions are Equal,
// NotEqual, Greater and Less; Greater and Less use the signed field comparisons.
template <typename Cond>
size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const
{
    const auto width = arr.m_width;
    const auto MSBs = populate(width, arr.get_encoder().width_mask());
    const auto search_vector = populate(width, value);
    const auto field_count = num_fields_for_width(width);
    const auto bit_count_pr_iteration = num_bits_for_width(width);
    auto total_bit_count_left = static_cast<signed>(end - start) * width;
    REALM_ASSERT(total_bit_count_left >= 0);

    // Dispatches, at compile time, to the word-wide comparison that matches Cond.
    auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
        if constexpr (std::is_same_v<Cond, Equal>)
            return find_all_fields_EQ(MSBs, a, b);
        else if constexpr (std::is_same_v<Cond, NotEqual>)
            return find_all_fields_NE(MSBs, a, b);
        else if constexpr (std::is_same_v<Cond, Greater>)
            return find_all_fields_signed_GT(MSBs, a, b);
        else if constexpr (std::is_same_v<Cond, Less>)
            return find_all_fields_signed_LT(MSBs, a, b);
    };

    unaligned_word_iter it((uint64_t*)arr.m_data, start * arr.m_width);
    uint64_t vector = 0;

    // Full words: every field in the fetched word belongs to the searched range.
    while (total_bit_count_left >= bit_count_pr_iteration) {
        const auto word = it.get(bit_count_pr_iteration);
        vector = bitwidth_cmp(word, search_vector);
        if (vector) {
            int sub_word_index = first_field_marked(width, vector);
            return start + sub_word_index;
        }
        total_bit_count_left -= bit_count_pr_iteration;
        start += field_count;
        it.bump(bit_count_pr_iteration);
    }

    if (total_bit_count_left) {                         // final subword, may be partial
        const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
        vector = bitwidth_cmp(word, search_vector);
        // Discard marks that fall above the bits actually belonging to the range.
        auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
        vector &= last_word_mask;
        if (vector) {
            int sub_word_index = first_field_marked(width, vector);
            return start + sub_word_index;
        }
    }
    return end;
}

bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
Expand Down
3 changes: 3 additions & 0 deletions src/realm/array_packed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class ArrayPacked {
private:
int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t) const;
bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;

template <typename Cond>
size_t parallel_subword_find(const Array&, int64_t, size_t, size_t) const;
};
} // namespace realm

Expand Down
1 change: 1 addition & 0 deletions src/realm/node_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,7 @@ size_t inline NodeHeader::get_byte_size_from_header(const char* header) noexcept
get_elementB_size<NodeHeader::Encoding::Flex>(h));
default:
REALM_ASSERT_RELEASE(false && "unknown encoding");
return 0; // kill a warning
}
}

Expand Down
Loading

0 comments on commit cc3ae93

Please sign in to comment.