Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge new find_with_marked intp perf v2 #7385

Merged
214 changes: 209 additions & 5 deletions src/realm/array_direct.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,6 @@ constexpr int num_bits_table[65] = {-1, 64, 64, 63, 64, 60, 60, 63, // 0-7

// Number of whole fields of `width` bits that fit into a single 64-bit word.
// `width` must be non-zero and no larger than 32; both conditions are asserted.
inline int num_fields_for_width(int width)
{
    REALM_ASSERT(width <= 32); // wider fields do not benefit from this helper
    REALM_ASSERT(width);       // a zero width would divide by zero below
    constexpr int bits_per_word = 64;
    const int field_count = bits_per_word / width;
    return field_count;
}
Expand Down Expand Up @@ -634,9 +633,214 @@ inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B)

// Find the first field that has its MSB set (the MSB marks overflow after trial subtraction, or
// another requested condition).
// This may not be the most efficient method, but it is still much faster than reloading
// each bitfield individually and testing it. To be used after find_all_fields_XXX.
// TODO: Optimize this to log(N) time instead of linear.
// Per-width description used to turn an isolated marker bit into a field index.
//
// For a given field width, mask `mK` selects every bit position that belongs to a field
// whose index has bit K set. Testing a single isolated bit against m1, m2, m4, ... therefore
// yields the binary digits of the index of the field containing that bit.
//
// `levels` is the number of masks (starting from m1) needed to express every field
// index for that width; the remaining masks are zero.
struct find_field_desc {
    uint8_t levels;
    uint64_t m1;
    uint64_t m2;
    uint64_t m4;
    uint64_t m8;
    uint64_t m16;
    uint64_t m32;
};

// Indexed by field width in bits (0..64). Entry 0 is a placeholder.
// Masks for widths above 16 are only required to be correct for the most significant
// bit of each field, which is where the find_all_fields_XXX helpers place their marks.
constexpr struct find_field_desc find_field_table[65] = {
    /* 0 */ {0, 0, 0, 0, 0, 0, 0},
    /* 1 */
    {6, 0xAAAAAAAAAAAAAAAA, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000,
     0xFFFFFFFF00000000},
    /* 2 */
    {5, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0},
    /* 3 */
    {5, 0b0000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000,
     0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0},
    /* 4 */
    {4, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0},
    /* 5 */
    {4, 0b0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000,
     0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
    /* 6 */
    {4, 0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
    /* 7 */
    {4, 0b1000'0000'1111'1110'0000'0011'1111'1000'0000'1111'1110'0000'0011'1111'1000'0000,
     0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
    /* 8 */
    {3, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0},
    /* 9 */
    {3, 0b1000'0000'0011'1111'1110'0000'0000'1111'1111'1000'0000'0011'1111'1110'0000'0000,
     0b0111'1111'1100'0000'0000'0000'0000'1111'1111'1111'1111'1100'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 10 */
    {3, 0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 11 */
    {3, 0b1111'1111'1000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1000'0000'0000,
     0b0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 12 */
    {3, 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 13 - fields occupy bits 0-12, 13-25, 26-38 and 39-51:
            m1 = bits 13-25 and 39-51, m2 = bits 26-51, m4 = bits 52-63 (no field there) */
    {3, 0b0000'0000'0000'1111'1111'1111'1000'0000'0000'0011'1111'1111'1110'0000'0000'0000,
     0b0000'0000'0000'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000,
     0b1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 14 */
    {3, 0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 15 */
    {3, 0b0000'1111'1111'1111'1110'0000'0000'0000'0011'1111'1111'1111'1000'0000'0000'0000,
     0b0000'1111'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000'0000,
     0b1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
    /* 16 */
    {2, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0, 0},
    /* 17 - as we're only interested in msb of each field we can simplify and use same pattern
       for the next 4 entries */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 18 */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 19 */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 20 */
    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
    /* 21 - and next 4 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 22 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 23 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 24 */
    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
    /* 25 - and 4 more */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 26 */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 27 */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 28 */
    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
    /* 29 - last 4 where multiple fields exist */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 30 */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 31 */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 32 */
    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
    /* 33 - from here to 64, there is only 1 possible result: 0 */
    {0, 0, 0, 0, 0, 0, 0},
    /* 34 */ {0, 0, 0, 0, 0, 0, 0},
    /* 35 */ {0, 0, 0, 0, 0, 0, 0},
    /* 36 */ {0, 0, 0, 0, 0, 0, 0},
    /* 37 */ {0, 0, 0, 0, 0, 0, 0},
    /* 38 */ {0, 0, 0, 0, 0, 0, 0},
    /* 39 */ {0, 0, 0, 0, 0, 0, 0},
    /* 40 */ {0, 0, 0, 0, 0, 0, 0},
    /* 41 */ {0, 0, 0, 0, 0, 0, 0},
    /* 42 */ {0, 0, 0, 0, 0, 0, 0},
    /* 43 */ {0, 0, 0, 0, 0, 0, 0},
    /* 44 */ {0, 0, 0, 0, 0, 0, 0},
    /* 45 */ {0, 0, 0, 0, 0, 0, 0},
    /* 46 */ {0, 0, 0, 0, 0, 0, 0},
    /* 47 */ {0, 0, 0, 0, 0, 0, 0},
    /* 48 */ {0, 0, 0, 0, 0, 0, 0},
    /* 49 */ {0, 0, 0, 0, 0, 0, 0},
    /* 50 */ {0, 0, 0, 0, 0, 0, 0},
    /* 51 */ {0, 0, 0, 0, 0, 0, 0},
    /* 52 */ {0, 0, 0, 0, 0, 0, 0},
    /* 53 */ {0, 0, 0, 0, 0, 0, 0},
    /* 54 */ {0, 0, 0, 0, 0, 0, 0},
    /* 55 */ {0, 0, 0, 0, 0, 0, 0},
    /* 56 */ {0, 0, 0, 0, 0, 0, 0},
    /* 57 */ {0, 0, 0, 0, 0, 0, 0},
    /* 58 */ {0, 0, 0, 0, 0, 0, 0},
    /* 59 */ {0, 0, 0, 0, 0, 0, 0},
    /* 60 */ {0, 0, 0, 0, 0, 0, 0},
    /* 61 */ {0, 0, 0, 0, 0, 0, 0},
    /* 62 */ {0, 0, 0, 0, 0, 0, 0},
    /* 63 */ {0, 0, 0, 0, 0, 0, 0},
    /* 64 */ {0, 0, 0, 0, 0, 0, 0}};

#if 1
// Fixed-point reciprocals of the field width: entry w holds floor(2^22 / w), so that
// (x * inverse_width[w]) >> 22 can stand in for the division x / w.
// Entry 0 is a placeholder and is never used.
constexpr uint32_t inverse_width[65] = {
    (1u << 22) / 1, // index 0: never used
    (1u << 22) / 1,  (1u << 22) / 2,  (1u << 22) / 3,  (1u << 22) / 4,
    (1u << 22) / 5,  (1u << 22) / 6,  (1u << 22) / 7,  (1u << 22) / 8,
    (1u << 22) / 9,  (1u << 22) / 10, (1u << 22) / 11, (1u << 22) / 12,
    (1u << 22) / 13, (1u << 22) / 14, (1u << 22) / 15, (1u << 22) / 16,
    (1u << 22) / 17, (1u << 22) / 18, (1u << 22) / 19, (1u << 22) / 20,
    (1u << 22) / 21, (1u << 22) / 22, (1u << 22) / 23, (1u << 22) / 24,
    (1u << 22) / 25, (1u << 22) / 26, (1u << 22) / 27, (1u << 22) / 28,
    (1u << 22) / 29, (1u << 22) / 30, (1u << 22) / 31, (1u << 22) / 32,
    (1u << 22) / 33, (1u << 22) / 34, (1u << 22) / 35, (1u << 22) / 36,
    (1u << 22) / 37, (1u << 22) / 38, (1u << 22) / 39, (1u << 22) / 40,
    (1u << 22) / 41, (1u << 22) / 42, (1u << 22) / 43, (1u << 22) / 44,
    (1u << 22) / 45, (1u << 22) / 46, (1u << 22) / 47, (1u << 22) / 48,
    (1u << 22) / 49, (1u << 22) / 50, (1u << 22) / 51, (1u << 22) / 52,
    (1u << 22) / 53, (1u << 22) / 54, (1u << 22) / 55, (1u << 22) / 56,
    (1u << 22) / 57, (1u << 22) / 58, (1u << 22) / 59, (1u << 22) / 60,
    (1u << 22) / 61, (1u << 22) / 62, (1u << 22) / 63, (1u << 22) / 64,
};

// Return the index of the field (of `width` bits) that contains the lowest set bit of `vector`.
//
// The bit position is obtained by counting trailing zeros, and is then divided by `width`
// using the fixed-point reciprocal table: (pos * inverse_width[width]) >> 22.
//
// NOTE(review): `vector` is presumably required to be non-zero; __builtin_ctzll is
// undefined for 0 - confirm that every caller checks `vector` first.
// NOTE(review): the reciprocals are truncated, so the multiply/shift can come out one lower than
// pos / width when pos is an exact multiple of a non-power-of-two width (e.g. width 3, pos 3).
// The debug assert below therefore appears to rely on the marked bit being the MSB of its
// field - confirm against the find_all_fields_XXX producers.
inline int first_field_marked(int width, uint64_t vector)
{
#if REALM_WINDOWS
// NOTE: despite its name, `lz` holds the number of trailing (not leading) zero bits.
int lz = (int)_tzcnt_u64(vector); // TODO: not clear if this is ok on all platforms
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@finnschiermer this is a tmp fix, just to please the builders.

#else
// NOTE: despite its name, `lz` holds the number of trailing (not leading) zero bits.
int lz = __builtin_ctzll(vector);
#endif
// division by `width` done as multiplication with floor(2^22 / width) followed by a shift
int field = (lz * inverse_width[width]) >> 22;
// in debug builds, cross-check the fixed-point result against a real division
REALM_ASSERT_DEBUG(field == (lz / width));
return field;
}
#endif
#if 0
// Return the index of the field containing the lowest set bit of `vector`, decoded with
// the per-width masks in find_field_table.
//
// Only the first `levels` masks of the table entry are tested; each tested mask
// contributes one binary digit of the field index. A zero `vector` yields 0.
inline int first_field_marked(int width, uint64_t vector)
{
    // isolate least significant bit
    vector = vector & (~vector + 1);
    const struct find_field_desc& desc = find_field_table[width];
    int result = 0;
    // Every case deliberately falls through into the next one, so an entry with
    // levels == k tests exactly the k lowest masks (m1 .. m(2^(k-1))).
    switch (desc.levels) {
        case 6:
            result |= (vector & desc.m32) ? 32 : 0;
            [[fallthrough]];
        case 5:
            result |= (vector & desc.m16) ? 16 : 0;
            [[fallthrough]];
        case 4:
            result |= (vector & desc.m8) ? 8 : 0;
            [[fallthrough]];
        case 3:
            result |= (vector & desc.m4) ? 4 : 0;
            [[fallthrough]];
        case 2:
            result |= (vector & desc.m2) ? 2 : 0;
            [[fallthrough]];
        case 1:
            result |= (vector & desc.m1) ? 1 : 0;
            [[fallthrough]];
        default:
            break;
    }
    return result;
}
#endif
#if 0
inline int first_field_marked(int width, uint64_t vector)
{
// isolate least significant bit
vector = vector & (~vector + 1);
// directly compute position of set bit using table
const struct find_field_desc& desc = find_field_table[width];
return ((vector & desc.m1) ? 1 : 0) | ((vector & desc.m2) ? 2 : 0) | ((vector & desc.m4) ? 4 : 0) |
((vector & desc.m8) ? 8 : 0) | ((vector & desc.m16) ? 16 : 0) | ((vector & desc.m32) ? 32 : 0);
}
#endif
#if 0
inline int first_field_marked(int width, uint64_t vector)
{
int result = 0;
Expand All @@ -649,7 +853,7 @@ inline int first_field_marked(int width, uint64_t vector)
}
return -1;
}

#endif

namespace impl {

Expand Down
46 changes: 23 additions & 23 deletions src/realm/array_encode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,29 +97,29 @@ bool ArrayEncode::encode(const Array& origin, Array& arr) const
// return false;
return always_encode(origin, arr, true); // true packed, false flex

std::vector<int64_t> values;
std::vector<size_t> indices;
encode_values(origin, values, indices);
if (!values.empty()) {
size_t v_width, ndx_width;
const auto uncompressed_size = origin.get_byte_size();
const auto packed_size = packed_encoded_array_size(values, origin.size(), v_width);
const auto flex_size = flex_encoded_array_size(values, indices, v_width, ndx_width);

if (flex_size < packed_size && flex_size < uncompressed_size) {
const uint8_t flags = NodeHeader::get_flags(origin.get_header());
encode_array(s_flex, arr, flex_size, flags, v_width, ndx_width, values.size(), indices.size());
copy_into_encoded_array(s_flex, arr, values, indices);
return true;
}
else if (packed_size < uncompressed_size) {
const uint8_t flags = NodeHeader::get_flags(origin.get_header());
encode_array(s_packed, arr, packed_size, flags, v_width, origin.size());
copy_into_encoded_array(s_packed, origin, arr);
return true;
}
}
return false;
// std::vector<int64_t> values;
// std::vector<size_t> indices;
// encode_values(origin, values, indices);
// if (!values.empty()) {
// size_t v_width, ndx_width;
// const auto uncompressed_size = origin.get_byte_size();
// const auto packed_size = packed_encoded_array_size(values, origin.size(), v_width);
// const auto flex_size = flex_encoded_array_size(values, indices, v_width, ndx_width);
//
// if (flex_size < packed_size && flex_size < uncompressed_size) {
// const uint8_t flags = NodeHeader::get_flags(origin.get_header());
// encode_array(s_flex, arr, flex_size, flags, v_width, ndx_width, values.size(), indices.size());
// copy_into_encoded_array(s_flex, arr, values, indices);
// return true;
// }
// else if (packed_size < uncompressed_size) {
// const uint8_t flags = NodeHeader::get_flags(origin.get_header());
// encode_array(s_packed, arr, packed_size, flags, v_width, origin.size());
// copy_into_encoded_array(s_packed, origin, arr);
// return true;
// }
// }
// return false;
}

bool ArrayEncode::decode(Array& arr) const
Expand Down
2 changes: 2 additions & 0 deletions src/realm/array_integer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,13 @@ void ArrayIntNull::find_all(IntegerColumn* result, value_type value, size_t col_

// Condition-based search: forwards to find_impl() with the given bounds.
// When the array is encoded, the end bound is advanced by one element first.
// NOTE(review): if `end` can be the maximum size_t value (an "until the end" sentinel),
// the increment wraps around to 0 - confirm against callers.
bool ArrayIntNull::find(int cond, value_type value, size_t start, size_t end, QueryStateBase* state) const
{
    if (is_encoded()) {
        ++end;
    }
    return find_impl(cond, value, start, end, state);
}

// Equality search: forwards to find_first<Equal>() with the given bounds.
// When the array is encoded, the end bound is advanced by one element first.
// NOTE(review): if `end` can be the maximum size_t value (an "until the end" sentinel),
// the increment wraps around to 0 - confirm against callers.
size_t ArrayIntNull::find_first(value_type value, size_t begin, size_t end) const
{
    if (is_encoded()) {
        ++end;
    }
    return find_first<Equal>(value, begin, end);
}

Expand Down
1 change: 1 addition & 0 deletions src/realm/array_integer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}

// Logical element count: one less than the size reported by the underlying Array.
inline size_t ArrayIntNull::size() const noexcept
{
    // NOTE(review): if Array::size() is 0 this subtraction wraps around to the largest
    // size_t value - confirm that the underlying array always holds at least one element.
    const auto physical_size = Array::size();
    return physical_size - 1;
}

Expand Down
43 changes: 23 additions & 20 deletions src/realm/array_packed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,36 +137,43 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t

REALM_ASSERT_3(arr.m_width, !=, 0);


// NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the
// queries run.
//
// Main idea around find.
// If the bitwidth is >= 32, then a linear scan is the fastest thing we can do, and a trivial comparison is as
// fast as it gets. If the bitwidth is less than 32, we can operate on the same 64-bit word differently.
// Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can
// contain in parallel. Once we have found the starting point, keep matching values as much as we can between
// start and end.
//
// EG: we store the value 6, with width 4bits (0110), 6 is 4 bits because, 110 (6) + sign bit 0.
// Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to
// apply a mask and a shift bits every time, then compare the values.
// This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and
// see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is
// the width of each single value within a 64 bit word and N is the total number of values stored in the array. On
// the other hand, if we have values of 32 bits or more, accessing the same 64-bit word once or twice is probably
// the cheapest thing to do.
return parallel_subword_find<Cond>(arr, value, start, end, baseindex, state);
// the width of each single value within a 64 bit word and N is the total number of values stored in the array.

while (start < end) {
start = parallel_subword_find<Cond>(arr, value, start, end);
if (start < end) {
if (!state->match(start + baseindex))
return false;
}
++start;
}
return true;
}

template <typename Cond>
bool ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
QueryStateBase* state) const
size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const
{
const auto width = arr.m_width;
const auto MSBs = populate(width, arr.get_encoder().width_mask());
const auto search_vector = populate(width, value);
const auto field_count = num_fields_for_width(width);
const auto bit_count_pr_iteration = num_bits_for_width(width);
signed total_bit_count_left = ((signed)end - start) * width;
auto total_bit_count_left = static_cast<signed>(end - start) * width;
REALM_ASSERT(total_bit_count_left >= 0);

auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
Expand All @@ -183,29 +190,25 @@ bool ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t
while (total_bit_count_left >= bit_count_pr_iteration) {
const auto word = it.get(bit_count_pr_iteration);
vector = bitwidth_cmp(word, search_vector);
while (vector) {
if (vector) {
int sub_word_index = first_field_marked(width, vector);
if (!state->match(start + sub_word_index + baseindex))
return false;
vector &= (vector - 1); // known bithack for clearing least significant bit
return start + sub_word_index;
}
total_bit_count_left -= bit_count_pr_iteration;
start += field_count;
it.bump(bit_count_pr_iteration);
}
if (total_bit_count_left) { // final subword, may be partial
if (!vector && total_bit_count_left) { // final subword, may be partial
const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
vector = bitwidth_cmp(word, search_vector);
auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
vector &= last_word_mask;
while (vector) {
if (vector) {
int sub_word_index = first_field_marked(width, vector);
if (!state->match(start + sub_word_index + baseindex))
return false;
vector &= (vector - 1);
return start + sub_word_index;
}
}
return true;
return arr.size();
}

bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
Expand Down
Loading
Loading