subword parallel search for ArrayFlex and ArrayPacked find() (#7367)

* idea: subword parallel search * better subword search * better naming * new methods for reading unaligned word from array of bitfields * perf work on array with find based on parallel values comparison * major cleanup of bitfield scanning * de-templatified bit field search * more tests and code generalization * more tests * new iterator optimized for linear scan * eliminated last use of templates in subword parallel search * optimization of some subword search methods * working EQ cmp with parallel subword check * fix in all_fields_NE * make populate handle negative values * commented out bypass which disabled subword search * fix in fix of populate() * bugfix and direct methods for signed GT and GE * fix for GT condition * enabled array perf tests (outside debug mode) * fixed inner search loop * made some perf tests non concurrent and silenced warnings * moved call to match() into inner loop in subword parallel search * Perf v2, find_with_marked for packed integer arrays (#7385) * made find_first_marked() branch free * various optimizations of find_first_marked, best one selected * for some reason this is much better * no warnings * made search method selection more explicit and clear * bunch of fixes.. * restore subword loop * fix object store tests + use subword cmp always (which is faster on my machine) --------- Co-authored-by: Finn Schiermer Andersen <[email protected]> * Perf work for array flex (still missing timestamps) (#7397) * WIP perf work for array flex * more small stuff, nothing important * parallel subword for eq and neq * move find parallel inside loop for eq and neq * LT parallel subword cmp * GT find for array flex * Int equality as good as Packed * code review --------- Co-authored-by: Finn Schiermer Andersen <[email protected]> Co-authored-by: Finn Schiermer Andersen <[email protected]>
realm · Mar 1, 2024 · cc3ae93 · cc3ae93
1 parent 4916543
commit cc3ae93
Show file tree

Hide file tree

Showing 11 changed files with 1,828 additions and 64 deletions.
diff --git a/src/realm/array_direct.hpp b/src/realm/array_direct.hpp
diff --git a/src/realm/array_encode.cpp b/src/realm/array_encode.cpp
@@ -187,6 +187,7 @@ void ArrayEncode::init(const char* h)
         m_ndx_width = NodeHeader::get_elementB_size<Encoding::Flex>(h);
         m_ndx_size = NodeHeader::get_arrayB_num_elements<Encoding::Flex>(h);
         m_v_mask = 1ULL << (m_v_width - 1);
+        m_ndx_mask = 1ULL << (m_ndx_width - 1);
     }
 }
 

diff --git a/src/realm/array_encode.hpp b/src/realm/array_encode.hpp
@@ -72,6 +72,7 @@ class ArrayEncode {
     Encoding m_encoding{NodeHeader::Encoding::WTypBits}; // this is not ok .... probably
     size_t m_v_width = 0, m_v_size = 0, m_ndx_width = 0, m_ndx_size = 0;
     size_t m_v_mask = 0;
+    size_t m_ndx_mask = 0;
 
     friend class ArrayPacked;
     friend class ArrayFlex;

diff --git a/src/realm/array_flex.cpp b/src/realm/array_flex.cpp
@@ -159,31 +159,169 @@ bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, size_t e
 
     REALM_ASSERT_3(arr.m_width, !=, 0);
 
+    if constexpr (std::is_same_v<Equal, Cond>) {
+        find_eq(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<NotEqual, Cond>) {
+        find_neq(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Less, Cond>) {
+        find_lt(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Greater, Cond>) {
+        find_gt(arr, value, start, end, baseindex, state);
+    }
+
+    return true;
+}
+/**
+ * Returns the index of the first bit field in [start, end) that satisfies `Cond` against `value`,
+ * or `end` when no field does.
+ *
+ * Fields are `width` bits wide and field `i` starts at bit `offset + i * width` of `arr.m_data`.
+ * Several fields are compared per step: a word of fields is fetched through `unaligned_word_iter`
+ * and compared against `populate(width, value)` (presumably `value` repeated in every field).
+ *
+ * `Cond` selects the comparison helper: Equal, NotEqual, GreaterEqual (signed when `v` is true,
+ * unsigned when `v` is false), Greater (signed) or Less (unsigned). `v` is ignored for every
+ * condition other than GreaterEqual.
+ *
+ * `width_mask` is passed through `populate()` and handed to the helpers as `MSBs`.
+ */
+template <typename Cond, bool v>
+inline size_t ArrayFlex::parallel_subword_find(const Array& arr, uint64_t value, size_t width_mask, size_t offset,
+                                               uint_least8_t width, size_t start, size_t end) const
+{
+    const auto MSBs = populate(width, width_mask);
+    const auto search_vector = populate(width, value);
+    const auto field_count = num_fields_for_width(width); // fields consumed per full step
+    const auto bit_count_pr_iteration = num_bits_for_width(width); // bits consumed per full step
+    auto total_bit_count_left = static_cast<signed>(end - start) * width; // NOTE(review): narrows to int
+    REALM_ASSERT(total_bit_count_left >= 0);
+    auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
+        if constexpr (std::is_same_v<Cond, Equal>)
+            return find_all_fields_EQ(MSBs, a, b);
+        else if constexpr (std::is_same_v<Cond, NotEqual>)
+            return find_all_fields_NE(MSBs, a, b);
+        else if constexpr (std::is_same_v<Cond, GreaterEqual>) {
+            if constexpr (v == true)
+                return find_all_fields_signed_GE(MSBs, a, b);
+            if constexpr (v == false)
+                return find_all_fields_unsigned_GE(MSBs, a, b);
+            REALM_UNREACHABLE();
+        }
+
+        else if constexpr (std::is_same_v<Cond, Greater>)
+            return find_all_fields_signed_GT(MSBs, a, b);
+        else if constexpr (std::is_same_v<Cond, Less>)
+            return find_all_fields_unsigned_LT(MSBs, a, b);
+    };
+
+    unaligned_word_iter it((uint64_t*)(arr.m_data), offset + start * width);
+    uint64_t vector = 0;
+    while (total_bit_count_left >= bit_count_pr_iteration) { // full steps
+        const auto word = it.get(bit_count_pr_iteration);
+        vector = bitwidth_cmp(word, search_vector);
+        if (vector) {
+            int sub_word_index = first_field_marked((int)width, vector);
+            return start + sub_word_index; // `start` is the index of the first field of this word
+        }
+        total_bit_count_left -= bit_count_pr_iteration;
+        start += field_count;
+        it.bump(bit_count_pr_iteration);
+    }
+    if (total_bit_count_left) {                         // final subword, may be partial
+        const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
+        vector = bitwidth_cmp(word, search_vector);
+        auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
+        vector &= last_word_mask; // drop comparison bits that lie beyond the remaining fields
+        if (vector) {
+            int sub_word_index = first_field_marked(width, vector);
+            return start + sub_word_index;
+        }
+    }
+    return end;
+}
+/**
+ * Reports, through `state->match()`, every index in [start, end) whose element equals `value`.
+ *
+ * First locates `value` among the `v_size` value fields (`v_width` bits each, starting at bit 0), then
+ * scans the index fields (`ndx_width` bits each, starting at bit `v_size * v_width`) for that position.
+ * Returns true without matching when `value` is not stored; returns false as soon as `state->match()`
+ * does, true otherwise.
+ */
+bool ArrayFlex::find_eq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state) const
+{
     const auto& encoder = arr.m_encoder;
-    const auto data = (uint64_t*)arr.m_data;
     const auto v_width = encoder.m_v_width;
     const auto v_size = encoder.m_v_size;
     const auto ndx_width = encoder.m_ndx_width;
-    const auto mask = encoder.width_mask();
+    const auto offset = v_size * v_width; // bit position where the index fields begin
 
-    auto cmp = [](int64_t v, int64_t value) {
-        if constexpr (std::is_same_v<Cond, Equal>)
-            return v == value;
-        if constexpr (std::is_same_v<Cond, NotEqual>)
-            return v != value;
-        if constexpr (std::is_same_v<Cond, Greater>)
-            return v > value;
-        if constexpr (std::is_same_v<Cond, Less>)
-            return v < value;
-    };
+    auto v_start = parallel_subword_find<Equal>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (v_start == v_size) // `value` is not stored, so no element can equal it
+        return true;
+
+    while (start < end) {
+        start = parallel_subword_find<Equal>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+
+        ++start; // resume the scan after the index just examined
+    }
+    return true;
+}
+
+// Reports, through `state->match()`, every index in [start, end) whose element differs from `value`.
+//
+// `value` is first looked up among the `v_size` value fields (`v_width` bits each, starting at bit 0).
+// If it is stored at position `v_start`, the index fields (`ndx_width` bits each, starting at bit
+// `v_size * v_width`) are scanned for entries different from `v_start`; this presumes each value is
+// stored only once in the value table.
+// If it is not stored at all, every element differs from it and the whole range is reported.
+//
+// Returns false as soon as `state->match()` returns false, true otherwise.
+bool ArrayFlex::find_neq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                         QueryStateBase* state) const
+{
+    const auto& encoder = arr.m_encoder;
+    const auto v_width = encoder.m_v_width;
+    const auto v_size = encoder.m_v_size;
+    const auto ndx_width = encoder.m_ndx_width;
+    const auto offset = v_size * v_width;
+
+    auto v_start = parallel_subword_find<Equal>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (v_start == v_size) {
+        // `value` does not occur in the array, so *all* elements satisfy "not equal".
+        // Returning here without matching would silently drop every result.
+        for (; start < end; ++start) {
+            if (!state->match(start + baseindex))
+                return false;
+        }
+        return true;
+    }
+
+    while (start < end) {
+        start = parallel_subword_find<NotEqual>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+        ++start; // resume the scan after the index just examined
+    }
+    return true;
+}
 
+// Reports, through `state->match()`, every index in [start, end) whose element is less than `value`.
+//
+// The value fields are searched for the first entry that is >= `value` (signed comparison); call its
+// position `v_start`. Elements whose index field is below `v_start` (unsigned comparison) are then
+// reported. NOTE(review): this presumes the value fields are stored in ascending order -- confirm.
+// If no stored value is >= `value`, every element is less than it and the whole range is reported.
+//
+// Returns false as soon as `state->match()` returns false, true otherwise.
+bool ArrayFlex::find_lt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state) const
+{
+    const auto& encoder = arr.m_encoder;
+    const auto v_width = encoder.m_v_width;
+    const auto v_size = encoder.m_v_size;
+    const auto ndx_width = encoder.m_ndx_width;
     const auto offset = v_size * v_width;
-    bf_iterator it_index{data, static_cast<size_t>(offset), ndx_width, ndx_width, start};
-    for (; start < end; ++start, ++it_index) {
-        const auto v = sign_extend_field_by_mask(mask, read_bitfield(data, it_index.get_value() * v_width, v_width));
-        if (cmp(v, value))
+
+    auto v_start = parallel_subword_find<GreaterEqual>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (v_start == v_size) {
+        // No stored value is >= `value`, i.e. every element is less than it: report the whole
+        // range instead of returning without a single match.
+        for (; start < end; ++start) {
+            if (!state->match(start + baseindex))
+                return false;
+        }
+        return true;
+    }
+
+    while (start < end) {
+        start = parallel_subword_find<Less>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+
+        ++start; // resume the scan after the index just examined
+    }
+    return true;
+}
+/**
+ * Reports, through `state->match()`, every index in [start, end) whose element is greater than `value`.
+ *
+ * Locates the first value field that is greater than `value` (signed comparison), then reports the
+ * elements whose index field is >= that position (unsigned comparison).
+ * NOTE(review): this presumes the value fields are stored in ascending order -- confirm.
+ * Returns true without matching when no stored value is greater; returns false as soon as
+ * `state->match()` does, true otherwise.
+ */
+bool ArrayFlex::find_gt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state) const
+{
+    const auto& encoder = arr.m_encoder;
+    const auto v_width = encoder.m_v_width;
+    const auto v_size = encoder.m_v_size;
+    const auto ndx_width = encoder.m_ndx_width;
+    const auto offset = v_size * v_width; // bit position where the index fields begin
+
+    auto v_start = parallel_subword_find<Greater>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (v_start == v_size) // no stored value exceeds `value`, so nothing can match
+        return true;
+
+    while (start < end) {
+        start = parallel_subword_find<GreaterEqual, false>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start,
+                                                           end);
+        if (start < end)
             if (!state->match(start + baseindex))
                 return false;
+
+        ++start; // resume the scan after the index just examined
     }
     return true;
 }

diff --git a/src/realm/array_flex.hpp b/src/realm/array_flex.hpp
@@ -39,13 +39,23 @@ class ArrayFlex {
     int64_t get(const char*, size_t, size_t, size_t, size_t, size_t, size_t) const;
     void get_chunk(const Array& h, size_t ndx, int64_t res[8]) const;
     void set_direct(const Array&, size_t, int64_t) const;
+
     template <typename Cond>
     bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+
     int64_t sum(const Array&, size_t, size_t) const;
 
 private:
     int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t, size_t, size_t) const;
     bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
+
+    template <typename Cond, bool = true> // true int64_t other uint64_t
+    inline size_t parallel_subword_find(const Array&, uint64_t, size_t, size_t, uint_least8_t, size_t, size_t) const;
+
+    bool find_eq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+    bool find_neq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+    bool find_lt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+    bool find_gt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
 };
 } // namespace realm
 #endif // REALM_ARRAY_COMPRESS_HPP
diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp
@@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}
 
 inline size_t ArrayIntNull::size() const noexcept
 {
+    // NOTE(review): `Array::size() - 1` would wrap around if Array::size() returned 0; confirm it never does.
     return Array::size() - 1;
 }
 

diff --git a/src/realm/array_integer_tpl.hpp b/src/realm/array_integer_tpl.hpp
@@ -79,7 +79,7 @@ bool ArrayIntNull::find_impl(value_type opt_value, size_t start, size_t end, Que
         }
         // if encoded use specialised find
         if (is_encoded())
-            return find_encoded<cond>(value, start2, end, baseindex2, state);
+            return find_encoded<cond>(value, start2, end2, baseindex2, state);
         // Fall back to plain Array find.
         return ArrayWithFind(*this).find<cond>(value, start2, end2, baseindex2, state);
     }

diff --git a/src/realm/array_packed.cpp b/src/realm/array_packed.cpp
@@ -112,7 +112,6 @@ void ArrayPacked::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) const
         res[index++] = get(arr, i++);
     }
 }
-
 template <typename Cond>
 bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
                            QueryStateBase* state) const
@@ -138,28 +137,79 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t
 
     REALM_ASSERT_3(arr.m_width, !=, 0);
 
-    auto cmp = [](int64_t v, int64_t value) {
+
+    // NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the
+    // queries run.
+    //
+    // Main idea around find.
+    // Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can
+    // contain in parallel. Once we have found the starting point, keep matching values as much as we can between
+    // start and end.
+    //
+    // E.g. we store the value 6 with a width of 4 bits (0110): 6 needs 4 bits because it is 110 (6) plus a sign bit 0.
+    // A 64-bit word can hold at most 16 such values. If we visit index 0 to 15 within the same 64 bits one by one,
+    // we need to apply a mask and shift bits every time, then compare the values.
+    // This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and
+    // see if there is a match with what we are looking for, reducing the number of comparisons by ~logK(N), where K
+    // is the width of each single value within a 64-bit word and N is the total number of values stored in the array.
+
+    // in packed format a parallel subword find pays off also for width >= 32
+    while (start < end) {
+        start = parallel_subword_find<Cond>(arr, value, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+
+        ++start;
+    }
+    return true;
+}
+/**
+ * Returns the index of the first element in [start, end) that satisfies `Cond` against `value`,
+ * or `end` when there is none.
+ *
+ * Elements are `arr.m_width` bits wide and element `i` starts at bit `i * arr.m_width` of `arr.m_data`.
+ * A word of elements is compared per step against `populate(width, value)` (presumably `value`
+ * repeated in every field). The conditions handled are Equal, NotEqual, Greater and Less; the last
+ * two use the signed comparison helpers.
+ */
+template <typename Cond>
+size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const
+{
+    const auto width = arr.m_width;
+    const auto MSBs = populate(width, arr.get_encoder().width_mask());
+    const auto search_vector = populate(width, value);
+    const auto field_count = num_fields_for_width(width); // elements consumed per full step
+    const auto bit_count_pr_iteration = num_bits_for_width(width); // bits consumed per full step
+    auto total_bit_count_left = static_cast<signed>(end - start) * width; // NOTE(review): narrows to int
+    REALM_ASSERT(total_bit_count_left >= 0);
+    auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
         if constexpr (std::is_same_v<Cond, Equal>)
-            return v == value;
+            return find_all_fields_EQ(MSBs, a, b);
         if constexpr (std::is_same_v<Cond, NotEqual>)
-            return v != value;
+            return find_all_fields_NE(MSBs, a, b);
         if constexpr (std::is_same_v<Cond, Greater>)
-            return v > value;
+            return find_all_fields_signed_GT(MSBs, a, b);
         if constexpr (std::is_same_v<Cond, Less>)
-            return v < value;
+            return find_all_fields_signed_LT(MSBs, a, b);
     };
 
-    //~6/7x slower, we need to do a bitscan before to start this loop when values are less than 32 and 64 bits
-    bf_iterator it((uint64_t*)arr.m_data, 0, arr.m_width, arr.m_width, start);
-    const auto mask = arr.get_encoder().width_mask();
-    for (; start < end; ++start, ++it) {
-        const auto v = sign_extend_field_by_mask(mask, it.get_value());
-        if (cmp(v, value)) {
-            if (!state->match(start + baseindex))
-                return false;
+    unaligned_word_iter it((uint64_t*)arr.m_data, start * arr.m_width);
+    uint64_t vector = 0;
+    while (total_bit_count_left >= bit_count_pr_iteration) { // full steps
+        const auto word = it.get(bit_count_pr_iteration);
+        vector = bitwidth_cmp(word, search_vector);
+        if (vector) {
+            int sub_word_index = first_field_marked(width, vector);
+            return start + sub_word_index; // `start` is the index of the first element of this word
         }
+        total_bit_count_left -= bit_count_pr_iteration;
+        start += field_count;
+        it.bump(bit_count_pr_iteration);
     }
-    return true;
+    if (total_bit_count_left) {                         // final subword, may be partial
+        const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
+        vector = bitwidth_cmp(word, search_vector);
+        auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
+        vector &= last_word_mask; // drop comparison bits that lie beyond the remaining elements
+        if (vector) {
+            int sub_word_index = first_field_marked(width, vector);
+            return start + sub_word_index;
+        }
+    }
+    return end;
 }
 
 bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const

diff --git a/src/realm/array_packed.hpp b/src/realm/array_packed.hpp
@@ -48,6 +48,9 @@ class ArrayPacked {
 private:
     int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t) const;
     bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
+
+    template <typename Cond>
+    size_t parallel_subword_find(const Array&, int64_t, size_t, size_t) const;
 };
 } // namespace realm
 

diff --git a/src/realm/node_header.hpp b/src/realm/node_header.hpp
@@ -897,6 +897,7 @@ size_t inline NodeHeader::get_byte_size_from_header(const char* header) noexcept
                                                                 get_elementB_size<NodeHeader::Encoding::Flex>(h));
         default:
             REALM_ASSERT_RELEASE(false && "unknown encoding");
+            return 0; // kill a warning
     }
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {} @@
     inline size_t ArrayIntNull::size() const noexcept
     {
+        // this cannot be right, what if size is 0
         return Array::size() - 1;
     }
@@ Expand Down @@