Skip to content

Commit

Permalink
Perf v2, find_with_marked for packed integer arrays (#7385)
Browse files Browse the repository at this point in the history
* made find_first_marked() branch free

* various optimizations of find_first_marked, best one selected

* for some reason this is much better

* no warnings

* made search method selection more explicit and clear

* bunch of fixes..

* restore subword loop

* fix object store tests + use subword cmp always (which is faster on my machine)

---------

Co-authored-by: Finn Schiermer Andersen <[email protected]>
  • Loading branch information
nicola-cab and finnschiermer authored Feb 27, 2024
1 parent 22d0d35 commit e84499d
Show file tree
Hide file tree
Showing 7 changed files with 263 additions and 52 deletions.
214 changes: 209 additions & 5 deletions src/realm/array_direct.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,6 @@ constexpr int num_bits_table[65] = {-1, 64, 64, 63, 64, 60, 60, 63, // 0-7

/// Number of whole fields of `width` bits that fit into a single 64-bit word.
/// `width` must be non-zero and no larger than 32.
inline int num_fields_for_width(int width)
{
    // Packing several fields per word does not pay off for fields wider than 32 bits.
    REALM_ASSERT(width <= 32);
    // A zero width would make the division below invalid.
    REALM_ASSERT(width);
    constexpr int bits_per_word = 64;
    return bits_per_word / width;
}
Expand Down Expand Up @@ -634,9 +633,214 @@ inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B)

// Find the first field which has its MSB set (the MSB marks overflow after trial subtraction, or
// another requested condition).
// This may not be the most efficient method, but it is still much faster than reloading
// each bitfield individually and testing it. To be used after find_all_fields_XXX.
// TODO: Optimize this to log(N) time instead of linear.
/// Masks used to turn one isolated marker bit (the MSB of a field inside a 64-bit word)
/// into the index of the field that bit belongs to.
///
/// Fields are laid out from bit 0 upwards, field `i` occupying bits [i * width, (i + 1) * width).
/// Mask `mK` covers every field whose index has the bit with value K set, so the field index
/// is the OR of K over all masks that intersect the isolated marker bit.
struct find_field_desc {
    uint8_t levels; // number of masks, counted from m1, needed to express every field index
    uint64_t m1;    // fields whose index has bit value 1 set
    uint64_t m2;    // fields whose index has bit value 2 set
    uint64_t m4;    // fields whose index has bit value 4 set
    uint64_t m8;    // fields whose index has bit value 8 set
    uint64_t m16;   // fields whose index has bit value 16 set
    uint64_t m32;   // fields whose index has bit value 32 set
};

/// One descriptor per field width in bits (the array index is the width, 0..64).
/// Only the MSB of each field is ever tested against the masks, which is why several
/// neighbouring widths can share the same patterns. Unused masks are 0.
constexpr struct find_field_desc find_field_table[65] = {
    /* 0 */ {0, 0, 0, 0, 0, 0, 0},
    /* 1 */
    {6, 0xAAAAAAAAAAAAAAAA, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000,
     0xFFFFFFFF00000000},
    /* 2 */
    {5, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0},
    /* 3 */
    {5, 0b0000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000,
     0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0},
    /* 4 */
    {4, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0},
    /* 5 */
    {4, 0b0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000,
     0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
    /* 6 */
    {4, 0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
    /* 7 */
    {4, 0b1000'0000'1111'1110'0000'0011'1111'1000'0000'1111'1110'0000'0011'1111'1000'0000,
     0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
    /* 8 */
    {3, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0},
    /* 9 */
    {3, 0b1000'0000'0011'1111'1110'0000'0000'1111'1111'1000'0000'0011'1111'1110'0000'0000,
     0b0111'1111'1100'0000'0000'0000'0000'1111'1111'1111'1111'1100'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 10 */
    {3, 0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 11 */
    {3, 0b1111'1111'1000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1000'0000'0000,
     0b0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 12 */
    {3, 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 13 - four fields: bits 0-12, 13-25, 26-38 and 39-51 */
    {3, 0b0000'0000'0000'1111'1111'1111'1000'0000'0000'0011'1111'1111'1110'0000'0000'0000,
     0b0000'0000'0000'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 14 */
    {3, 0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 15 */
    {3, 0b0000'1111'1111'1111'1110'0000'0000'0000'0011'1111'1111'1111'1000'0000'0000'0000,
     0b0000'1111'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000'0000,
     0b1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 16 */
    {2, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0, 0},
    /* 17 - as we're only interested in msb of each field we can simplify and use same pattern
       for the next 4 entries */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 18 */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 19 */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 20 */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 21 - and next 4 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 22 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 23 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 24 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 25 - and 4 more */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 26 */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 27 */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 28 */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 29 - last 4 where multiple fields exist */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 30 */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 31 */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 32 */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 33 - from here to 64, there is only 1 possible result: 0 */
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0}};

#if 1
// Fixed-point reciprocals of the field width: inverse_width[w] == floor(2^22 / w).
// Multiplying a bit position by inverse_width[w] and shifting right by 22 replaces an
// integer division by w. Index 0 is a placeholder and is never used.
constexpr uint32_t inverse_width[65] = {
    (1u << 22) / 1, // index 0: never used
    (1u << 22) / 1,  (1u << 22) / 2,  (1u << 22) / 3,  (1u << 22) / 4,
    (1u << 22) / 5,  (1u << 22) / 6,  (1u << 22) / 7,  (1u << 22) / 8,
    (1u << 22) / 9,  (1u << 22) / 10, (1u << 22) / 11, (1u << 22) / 12,
    (1u << 22) / 13, (1u << 22) / 14, (1u << 22) / 15, (1u << 22) / 16,
    (1u << 22) / 17, (1u << 22) / 18, (1u << 22) / 19, (1u << 22) / 20,
    (1u << 22) / 21, (1u << 22) / 22, (1u << 22) / 23, (1u << 22) / 24,
    (1u << 22) / 25, (1u << 22) / 26, (1u << 22) / 27, (1u << 22) / 28,
    (1u << 22) / 29, (1u << 22) / 30, (1u << 22) / 31, (1u << 22) / 32,
    (1u << 22) / 33, (1u << 22) / 34, (1u << 22) / 35, (1u << 22) / 36,
    (1u << 22) / 37, (1u << 22) / 38, (1u << 22) / 39, (1u << 22) / 40,
    (1u << 22) / 41, (1u << 22) / 42, (1u << 22) / 43, (1u << 22) / 44,
    (1u << 22) / 45, (1u << 22) / 46, (1u << 22) / 47, (1u << 22) / 48,
    (1u << 22) / 49, (1u << 22) / 50, (1u << 22) / 51, (1u << 22) / 52,
    (1u << 22) / 53, (1u << 22) / 54, (1u << 22) / 55, (1u << 22) / 56,
    (1u << 22) / 57, (1u << 22) / 58, (1u << 22) / 59, (1u << 22) / 60,
    (1u << 22) / 61, (1u << 22) / 62, (1u << 22) / 63, (1u << 22) / 64,
};

/// Return the index of the first (lowest) field that has a marker bit set in `vector`.
///
/// @param width   field width in bits; used as an index into the 65-entry inverse_width table
/// @param vector  word with marker bits set; must be non-zero
/// @return        position of the lowest set bit divided by `width`
inline int first_field_marked(int width, uint64_t vector)
{
    // Counting trailing zeros of 0 is undefined for the builtin, and `width` must stay
    // inside the lookup table.
    REALM_ASSERT_DEBUG(vector != 0);
    REALM_ASSERT_DEBUG(width > 0 && width <= 64);
#if REALM_WINDOWS
    int tz = static_cast<int>(_tzcnt_u64(vector)); // TODO: not clear if this is ok on all platforms
#else
    int tz = __builtin_ctzll(vector);
#endif
    // Division by `width` done as a fixed-point multiplication with floor(2^22 / width).
    // Because the reciprocal is rounded down, the result can come out one too low when tz is
    // an exact multiple of width (e.g. width 3, tz 3); for marker bits placed at the MSB of
    // a field it matches tz / width, which the debug assertion below verifies.
    int field = (tz * inverse_width[width]) >> 22;
    REALM_ASSERT_DEBUG(field == (tz / width));
    return field;
}
#endif
#if 0
/// Table-driven variant: return the index of the field holding the lowest set bit of `vector`.
///
/// The lowest set bit is isolated and tested against the masks in find_field_table[width];
/// each mask that matches contributes one bit of the resulting field index. Only the
/// `levels` masks relevant for this width are evaluated. Returns 0 when `vector` is 0.
inline int first_field_marked(int width, uint64_t vector)
{
    // isolate least significant set bit
    vector = vector & (~vector + 1);
    const struct find_field_desc& desc = find_field_table[width];
    int result = 0;
    // Enter at the highest mask needed for this width and deliberately fall through all
    // lower ones, so exactly `levels` masks are tested.
    switch (desc.levels) {
        case 6:
            result |= (vector & desc.m32) ? 32 : 0;
            [[fallthrough]];
        case 5:
            result |= (vector & desc.m16) ? 16 : 0;
            [[fallthrough]];
        case 4:
            result |= (vector & desc.m8) ? 8 : 0;
            [[fallthrough]];
        case 3:
            result |= (vector & desc.m4) ? 4 : 0;
            [[fallthrough]];
        case 2:
            result |= (vector & desc.m2) ? 2 : 0;
            [[fallthrough]];
        case 1:
            result |= (vector & desc.m1) ? 1 : 0;
            break;
        default:
            break;
    }
    return result;
}
#endif
#if 0
/// Table-driven variant without a switch: every mask of find_field_table[width] is tested
/// against the lowest set bit of `vector`, and each hit contributes one bit of the field index.
inline int first_field_marked(int width, uint64_t vector)
{
    const struct find_field_desc& masks = find_field_table[width];
    // keep only the lowest set bit (x & -x, written with unsigned arithmetic)
    const uint64_t lowest = vector & (~vector + 1);
    // assemble the index bit by bit; unused masks are 0 and contribute nothing
    int index = static_cast<int>((lowest & masks.m1) != 0);
    index |= static_cast<int>((lowest & masks.m2) != 0) << 1;
    index |= static_cast<int>((lowest & masks.m4) != 0) << 2;
    index |= static_cast<int>((lowest & masks.m8) != 0) << 3;
    index |= static_cast<int>((lowest & masks.m16) != 0) << 4;
    index |= static_cast<int>((lowest & masks.m32) != 0) << 5;
    return index;
}
#endif
#if 0
inline int first_field_marked(int width, uint64_t vector)
{
int result = 0;
Expand All @@ -649,7 +853,7 @@ inline int first_field_marked(int width, uint64_t vector)
}
return -1;
}

#endif

namespace impl {

Expand Down
46 changes: 23 additions & 23 deletions src/realm/array_encode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,29 +97,29 @@ bool ArrayEncode::encode(const Array& origin, Array& arr) const
// return false;
return always_encode(origin, arr, true); // true packed, false flex

std::vector<int64_t> values;
std::vector<size_t> indices;
encode_values(origin, values, indices);
if (!values.empty()) {
size_t v_width, ndx_width;
const auto uncompressed_size = origin.get_byte_size();
const auto packed_size = packed_encoded_array_size(values, origin.size(), v_width);
const auto flex_size = flex_encoded_array_size(values, indices, v_width, ndx_width);

if (flex_size < packed_size && flex_size < uncompressed_size) {
const uint8_t flags = NodeHeader::get_flags(origin.get_header());
encode_array(s_flex, arr, flex_size, flags, v_width, ndx_width, values.size(), indices.size());
copy_into_encoded_array(s_flex, arr, values, indices);
return true;
}
else if (packed_size < uncompressed_size) {
const uint8_t flags = NodeHeader::get_flags(origin.get_header());
encode_array(s_packed, arr, packed_size, flags, v_width, origin.size());
copy_into_encoded_array(s_packed, origin, arr);
return true;
}
}
return false;
// std::vector<int64_t> values;
// std::vector<size_t> indices;
// encode_values(origin, values, indices);
// if (!values.empty()) {
// size_t v_width, ndx_width;
// const auto uncompressed_size = origin.get_byte_size();
// const auto packed_size = packed_encoded_array_size(values, origin.size(), v_width);
// const auto flex_size = flex_encoded_array_size(values, indices, v_width, ndx_width);
//
// if (flex_size < packed_size && flex_size < uncompressed_size) {
// const uint8_t flags = NodeHeader::get_flags(origin.get_header());
// encode_array(s_flex, arr, flex_size, flags, v_width, ndx_width, values.size(), indices.size());
// copy_into_encoded_array(s_flex, arr, values, indices);
// return true;
// }
// else if (packed_size < uncompressed_size) {
// const uint8_t flags = NodeHeader::get_flags(origin.get_header());
// encode_array(s_packed, arr, packed_size, flags, v_width, origin.size());
// copy_into_encoded_array(s_packed, origin, arr);
// return true;
// }
// }
// return false;
}

bool ArrayEncode::decode(Array& arr) const
Expand Down
2 changes: 2 additions & 0 deletions src/realm/array_integer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,13 @@ void ArrayIntNull::find_all(IntegerColumn* result, value_type value, size_t col_

// Runs the search selected by `cond` over [start, end) by delegating to find_impl.
// For an encoded array the end bound is moved one position further first.
// NOTE(review): if callers may pass the maximum size_t value as an "until the end" sentinel,
// the increment wraps around to 0 - confirm against the callers and find_impl.
bool ArrayIntNull::find(int cond, value_type value, size_t start, size_t end, QueryStateBase* state) const
{
    if (is_encoded()) {
        ++end;
    }
    return find_impl(cond, value, start, end, state);
}

// Equality search over [begin, end), forwarded to the templated find_first<Equal>.
// For an encoded array the end bound is moved one position further first.
// NOTE(review): if callers may pass the maximum size_t value as an "until the end" sentinel,
// the increment wraps around to 0 - confirm against the callers.
size_t ArrayIntNull::find_first(value_type value, size_t begin, size_t end) const
{
    if (is_encoded()) {
        ++end;
    }
    return find_first<Equal>(value, begin, end);
}

Expand Down
1 change: 1 addition & 0 deletions src/realm/array_integer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}

// Reports one element less than the underlying Array holds.
// NOTE(review): if Array::size() can ever be 0, the subtraction wraps around for an unsigned
// size type - confirm that the underlying array always contains at least one element.
inline size_t ArrayIntNull::size() const noexcept
{
    const auto stored_elements = Array::size();
    return stored_elements - 1;
}

Expand Down
43 changes: 23 additions & 20 deletions src/realm/array_packed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,36 +137,43 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t

REALM_ASSERT_3(arr.m_width, !=, 0);


// NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the
// queries run.
//
// Main idea around find.
// If the bit width is >= 32, then a linear scan is the fastest thing we can do, and a trivial comparison is as
// fast as it gets. If the bit width is less than 32, we can operate on the same 64-bit word differently.
// Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can
// contain in parallel. Once we have found the starting point, keep matching values as much as we can between
// start and end.
//
// EG: we store the value 6, with width 4bits (0110), 6 is 4 bits because, 110 (6) + sign bit 0.
// Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to
// apply a mask and a shift bits every time, then compare the values.
// This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and
// see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is
// the width of each single value within a 64 bit word and N is the total number of values stored in the array. On
// the other end if we have values of 32 bits or more, accessing twice or once the same 64 bits word is probably
// the cheapest thing to do.
return parallel_subword_find<Cond>(arr, value, start, end, baseindex, state);
// the width of each single value within a 64 bit word and N is the total number of values stored in the array.

while (start < end) {
start = parallel_subword_find<Cond>(arr, value, start, end);
if (start < end) {
if (!state->match(start + baseindex))
return false;
}
++start;
}
return true;
}

template <typename Cond>
bool ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
QueryStateBase* state) const
size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const
{
const auto width = arr.m_width;
const auto MSBs = populate(width, arr.get_encoder().width_mask());
const auto search_vector = populate(width, value);
const auto field_count = num_fields_for_width(width);
const auto bit_count_pr_iteration = num_bits_for_width(width);
signed total_bit_count_left = ((signed)end - start) * width;
auto total_bit_count_left = static_cast<signed>(end - start) * width;
REALM_ASSERT(total_bit_count_left >= 0);

auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
Expand All @@ -183,29 +190,25 @@ bool ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t
while (total_bit_count_left >= bit_count_pr_iteration) {
const auto word = it.get(bit_count_pr_iteration);
vector = bitwidth_cmp(word, search_vector);
while (vector) {
if (vector) {
int sub_word_index = first_field_marked(width, vector);
if (!state->match(start + sub_word_index + baseindex))
return false;
vector &= (vector - 1); // known bithack for clearing least significant bit
return start + sub_word_index;
}
total_bit_count_left -= bit_count_pr_iteration;
start += field_count;
it.bump(bit_count_pr_iteration);
}
if (total_bit_count_left) { // final subword, may be partial
if (!vector && total_bit_count_left) { // final subword, may be partial
const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
vector = bitwidth_cmp(word, search_vector);
auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
vector &= last_word_mask;
while (vector) {
if (vector) {
int sub_word_index = first_field_marked(width, vector);
if (!state->match(start + sub_word_index + baseindex))
return false;
vector &= (vector - 1);
return start + sub_word_index;
}
}
return true;
return arr.size();
}

bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
Expand Down
Loading

0 comments on commit e84499d

Please sign in to comment.