realm · nicola-cab · Feb 27, 2024 · Feb 27, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/src/realm/array_direct.hpp b/src/realm/array_direct.hpp
@@ -457,7 +457,6 @@ constexpr int num_bits_table[65] = {-1, 64, 64, 63, 64, 60, 60, 63, // 0-7
 
 inline int num_fields_for_width(int width) // number of whole `width`-bit fields that fit in one 64-bit word
 {
-    REALM_ASSERT(width <= 32); // it will not pay off to use this for fields larger
     REALM_ASSERT(width); // a width of 0 would divide by zero below
     return 64 / width;
 }
@@ -634,9 +633,214 @@ inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B)
 
 // find the first field which have MSB set (marks overflow after trial subtraction, or other
 // requested condition).
-// This may not be the most efficient method, but it is still much faster than reloading
-// each bitfield individually and testing it. To be used after find_all_fields_XXX.
-// TODO: Optimize this to log(N) time instead of linear.
+// Lookup data used to translate a single marker bit into the index of the field that contains it.
+//   levels - how many of the masks below are significant for this field width; the switch based lookup
+//            enters at this level and falls through to the lower ones.
+//   mN     - the bit positions that belong to fields whose index has the binary digit N set. Testing a
+//            one-bit vector against every mask therefore yields the binary digits of the field index.
+// Member order matters: the table below relies on positional aggregate initialization.
+struct find_field_desc {
+    uint8_t levels;
+    uint64_t m1;
+    uint64_t m2;
+    uint64_t m4;
+    uint64_t m8;
+    uint64_t m16;
+    uint64_t m32;
+};
+
+// One descriptor per field width (index 0..64). Field 0 occupies the least significant bits of the word.
+// Only the most significant bit of every field has to be classified correctly, which is why some entries
+// share a simplified pattern and why bits above the last complete field may be set in a mask.
+constexpr struct find_field_desc find_field_table[65] = {
+    /* 0 - never a valid width */ {0, 0, 0, 0, 0, 0, 0},
+    /* 1 */
+    {6, 0xAAAAAAAAAAAAAAAA, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000,
+     0xFFFFFFFF00000000},
+    /* 2 */
+    {5, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0},
+    /* 3 */
+    {5, 0b0000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000,
+     0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
+     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0},
+    /* 4 */
+    {4, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0},
+    /* 5 */
+    {4, 0b0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000,
+     0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
+     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
+    /* 6 */
+    {4, 0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
+     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
+    /* 7 */
+    {4, 0b1000'0000'1111'1110'0000'0011'1111'1000'0000'1111'1110'0000'0011'1111'1000'0000,
+     0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
+     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
+    /* 8 */
+    {3, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0},
+    /* 9 */
+    {3, 0b1000'0000'0011'1111'1110'0000'0000'1111'1111'1000'0000'0011'1111'1110'0000'0000,
+     0b0111'1111'1100'0000'0000'0000'0000'1111'1111'1111'1111'1100'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 10 */
+    {3, 0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
+     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 11 */
+    {3, 0b1111'1111'1000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1000'0000'0000,
+     0b0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 12 */
+    {3, 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 13 - fields occupy bits 0-12, 13-25, 26-38 and 39-51:
+       m1 selects fields 1 and 3 (bits 13-25, 39-51), m2 selects fields 2 and 3 (bits 26-51),
+       m4 only covers the unused bits 52-63 */
+    {3, 0b0000'0000'0000'1111'1111'1111'1000'0000'0000'0011'1111'1111'1110'0000'0000'0000,
+     0b0000'0000'0000'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 14 */
+    {3, 0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
+     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 15 */
+    {3, 0b0000'1111'1111'1111'1110'0000'0000'0000'0011'1111'1111'1111'1000'0000'0000'0000,
+     0b0000'1111'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000'0000,
+     0b1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 16 */
+    {2, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0, 0},
+    /* 17 - as we're only interested in msb of each field we can simplify and use same pattern
+     for the next 4 entries */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 18 */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 19 */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 20 */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 21 - and next 4 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 22 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 23 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 24 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 25 - and 4 more */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 26 */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 27 */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 28 */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 29 - last 4 where multiple fields exist */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 30 */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 31 */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 32 */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 33 - from here to 64, there is only 1 possible result: 0 */
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0}};
+
+#if 1
+constexpr uint32_t inverse_width[65] = { // inverse_width[w] == 2^22 / w (truncated): fixed-point reciprocal of w
+    65536 * 64 / 1, // index 0 is never used; the entry only keeps the table directly indexable by width
+    65536 * 64 / 1,  65536 * 64 / 2,  65536 * 64 / 3,  65536 * 64 / 4,  65536 * 64 / 5,  65536 * 64 / 6,
+    65536 * 64 / 7,  65536 * 64 / 8,  65536 * 64 / 9,  65536 * 64 / 10, 65536 * 64 / 11, 65536 * 64 / 12,
+    65536 * 64 / 13, 65536 * 64 / 14, 65536 * 64 / 15, 65536 * 64 / 16, 65536 * 64 / 17, 65536 * 64 / 18,
+    65536 * 64 / 19, 65536 * 64 / 20, 65536 * 64 / 21, 65536 * 64 / 22, 65536 * 64 / 23, 65536 * 64 / 24,
+    65536 * 64 / 25, 65536 * 64 / 26, 65536 * 64 / 27, 65536 * 64 / 28, 65536 * 64 / 29, 65536 * 64 / 30,
+    65536 * 64 / 31, 65536 * 64 / 32, 65536 * 64 / 33, 65536 * 64 / 34, 65536 * 64 / 35, 65536 * 64 / 36,
+    65536 * 64 / 37, 65536 * 64 / 38, 65536 * 64 / 39, 65536 * 64 / 40, 65536 * 64 / 41, 65536 * 64 / 42,
+    65536 * 64 / 43, 65536 * 64 / 44, 65536 * 64 / 45, 65536 * 64 / 46, 65536 * 64 / 47, 65536 * 64 / 48,
+    65536 * 64 / 49, 65536 * 64 / 50, 65536 * 64 / 51, 65536 * 64 / 52, 65536 * 64 / 53, 65536 * 64 / 54,
+    65536 * 64 / 55, 65536 * 64 / 56, 65536 * 64 / 57, 65536 * 64 / 58, 65536 * 64 / 59, 65536 * 64 / 60,
+    65536 * 64 / 61, 65536 * 64 / 62, 65536 * 64 / 63, 65536 * 64 / 64,
+};
+
+// Return the index of the lowest field in `vector` that carries a marker bit.
+//
+// `width` is the field width in bits (1..64) and `vector` is a word in which marked fields have their
+// most significant bit set. `vector` must not be 0: counting the trailing zeros of 0 is undefined for
+// __builtin_ctzll, so callers have to test for "no match" before calling.
+//
+// The bit position is divided by `width` with a multiplication by the truncated reciprocal
+// inverse_width[width] (2^22 / width) followed by a shift. Because the reciprocal is truncated, the
+// product undershoots slightly for widths that do not divide 2^22. That is harmless for markers on a
+// field's most significant bit, but a set bit that sits exactly on a field boundary (e.g. width 3,
+// bit 3) would be attributed to the previous field; the debug assertion below catches such input.
+inline int first_field_marked(int width, uint64_t vector)
+{
+    REALM_ASSERT_DEBUG(vector != 0);
+    REALM_ASSERT_DEBUG(width >= 1 && width <= 64); // inverse_width has exactly 65 entries
+#if REALM_WINDOWS
+    const int trailing_zeros = static_cast<int>(_tzcnt_u64(vector)); // TODO: not clear if this is ok on all platforms
+#else
+    const int trailing_zeros = __builtin_ctzll(vector);
+#endif
+    // At most 63 * 2^22, which always fits in 32 bits.
+    const uint32_t scaled = static_cast<uint32_t>(trailing_zeros) * inverse_width[width];
+    const int field = static_cast<int>(scaled >> 22);
+    REALM_ASSERT_DEBUG(field == (trailing_zeros / width));
+    return field;
+}
+#endif
+#if 0
+inline int first_field_marked(int width, uint64_t vector) // disabled variant: decodes the index via find_field_table
+{
+    // isolate least significant set bit (v & -v); a zero vector stays zero and yields result 0
+    vector = vector & (~vector + 1);
+    const struct find_field_desc& desc = find_field_table[width];
+    int result = 0;
+    switch (desc.levels) {
+        // the following case entries are intended to fall through
+        // (this is a variant of Duff's Device): entering at case N tests the N lowest masks,
+        // each of which contributes one binary digit of the field index
+        // TODO: disable compiler warnings for it
+        case 6:
+            result |= (vector & desc.m32) ? 32 : 0;
+        case 5:
+            result |= (vector & desc.m16) ? 16 : 0;
+        case 4:
+            result |= (vector & desc.m8) ? 8 : 0;
+        case 3:
+            result |= (vector & desc.m4) ? 4 : 0;
+        case 2:
+            result |= (vector & desc.m2) ? 2 : 0;
+        case 1:
+            result |= (vector & desc.m1) ? 1 : 0;
+        default: // levels == 0: only one field fits in the word, so the index is always 0
+            break;
+    }
+    return result;
+}
+#endif
+#if 0
+inline int first_field_marked(int width, uint64_t vector) // disabled variant: branch-free use of all six masks
+{
+    // isolate least significant set bit (v & -v); a zero vector stays zero and yields result 0
+    vector = vector & (~vector + 1);
+    // directly compute position of set bit using table: every mask contributes one binary digit of the
+    // field index; desc.levels is ignored here, so unused masks must be 0 (or miss every field MSB)
+    const struct find_field_desc& desc = find_field_table[width];
+    return ((vector & desc.m1) ? 1 : 0) | ((vector & desc.m2) ? 2 : 0) | ((vector & desc.m4) ? 4 : 0) |
+           ((vector & desc.m8) ? 8 : 0) | ((vector & desc.m16) ? 16 : 0) | ((vector & desc.m32) ? 32 : 0);
+}
+#endif
+#if 0
 inline int first_field_marked(int width, uint64_t vector)
 {
     int result = 0;
@@ -649,7 +853,7 @@ inline int first_field_marked(int width, uint64_t vector)
     }
     return -1;
 }
-
+#endif
 
 namespace impl {
 

diff --git a/src/realm/array_encode.cpp b/src/realm/array_encode.cpp
@@ -97,29 +97,29 @@ bool ArrayEncode::encode(const Array& origin, Array& arr) const
     // return false;
     return always_encode(origin, arr, true); // true packed, false flex
 
-    std::vector<int64_t> values;
-    std::vector<size_t> indices;
-    encode_values(origin, values, indices);
-    if (!values.empty()) {
-        size_t v_width, ndx_width;
-        const auto uncompressed_size = origin.get_byte_size();
-        const auto packed_size = packed_encoded_array_size(values, origin.size(), v_width);
-        const auto flex_size = flex_encoded_array_size(values, indices, v_width, ndx_width);
-
-        if (flex_size < packed_size && flex_size < uncompressed_size) {
-            const uint8_t flags = NodeHeader::get_flags(origin.get_header());
-            encode_array(s_flex, arr, flex_size, flags, v_width, ndx_width, values.size(), indices.size());
-            copy_into_encoded_array(s_flex, arr, values, indices);
-            return true;
-        }
-        else if (packed_size < uncompressed_size) {
-            const uint8_t flags = NodeHeader::get_flags(origin.get_header());
-            encode_array(s_packed, arr, packed_size, flags, v_width, origin.size());
-            copy_into_encoded_array(s_packed, origin, arr);
-            return true;
-        }
-    }
-    return false;
+    //    std::vector<int64_t> values;
+    //    std::vector<size_t> indices;
+    //    encode_values(origin, values, indices);
+    //    if (!values.empty()) {
+    //        size_t v_width, ndx_width;
+    //        const auto uncompressed_size = origin.get_byte_size();
+    //        const auto packed_size = packed_encoded_array_size(values, origin.size(), v_width);
+    //        const auto flex_size = flex_encoded_array_size(values, indices, v_width, ndx_width);
+    //
+    //        if (flex_size < packed_size && flex_size < uncompressed_size) {
+    //            const uint8_t flags = NodeHeader::get_flags(origin.get_header());
+    //            encode_array(s_flex, arr, flex_size, flags, v_width, ndx_width, values.size(), indices.size());
+    //            copy_into_encoded_array(s_flex, arr, values, indices);
+    //            return true;
+    //        }
+    //        else if (packed_size < uncompressed_size) {
+    //            const uint8_t flags = NodeHeader::get_flags(origin.get_header());
+    //            encode_array(s_packed, arr, packed_size, flags, v_width, origin.size());
+    //            copy_into_encoded_array(s_packed, origin, arr);
+    //            return true;
+    //        }
+    //    }
+    //    return false;
 }
 
 bool ArrayEncode::decode(Array& arr) const

diff --git a/src/realm/array_integer.cpp b/src/realm/array_integer.cpp
@@ -185,11 +185,13 @@ void ArrayIntNull::find_all(IntegerColumn* result, value_type value, size_t col_
 
 bool ArrayIntNull::find(int cond, value_type value, size_t start, size_t end, QueryStateBase* state) const
 {
+    // For encoded arrays the exclusive upper bound is moved one slot further before delegating to
+    // find_impl. The adjustment is skipped when `end` is already the largest size_t, because `end + 1`
+    // would wrap around to 0 and silently turn the request into an empty range.
+    // NOTE(review): that value is presumably the "until the end" sentinel (npos) - confirm how
+    // find_impl resolves it for encoded arrays.
+    if (is_encoded() && end != static_cast<size_t>(-1))
+        ++end;
     return find_impl(cond, value, start, end, state);
 }
 
 size_t ArrayIntNull::find_first(value_type value, size_t begin, size_t end) const
 {
+    // For encoded arrays the exclusive upper bound is moved one slot further before delegating to the
+    // templated find_first. The adjustment is skipped when `end` is already the largest size_t, because
+    // `end + 1` would wrap around to 0 and silently turn the request into an empty range.
+    // NOTE(review): that value is presumably the "until the end" sentinel (npos) - confirm how
+    // find_first<Equal> resolves it for encoded arrays.
+    if (is_encoded() && end != static_cast<size_t>(-1))
+        ++end;
     return find_first<Equal>(value, begin, end);
 }
 

diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp
@@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}
 
 inline size_t ArrayIntNull::size() const noexcept
 {
+    // FIXME(review): cannot be right if Array::size() is 0 - the unsigned subtraction wraps to the largest size_t
     return Array::size() - 1;
 }
 

diff --git a/src/realm/array_packed.cpp b/src/realm/array_packed.cpp
@@ -137,36 +137,43 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t
 
     REALM_ASSERT_3(arr.m_width, !=, 0);
 
+
     // NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the
     // queries run.
     //
     // Main idea around find.
-    // If bitwidth is >=32 than a linear scan is the fastest thing we can do, and a trivial comparison can be as fast
-    // as it gets. If the bitwidh is less than 32, we can operate on the same 64 bit word diffently.
+    // Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can
+    // contain in parallel. Once we have found the starting point, keep matching values as much as we can between
+    // start and end.
     //
     // EG: we store the value 6, with width 4bits (0110), 6 is 4 bits because, 110 (6) + sign bit 0.
     // Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to
     // apply a mask and a shift bits every time, then compare the values.
     // This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and
     // see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is
-    // the width of each single value within a 64 bit word and N is the total number of values stored in the array. On
-    // the other end if we have values of 32 bits or more, accessing twice or once the same 64 bits word is probably
-    // the cheapest thing to do.
-    return parallel_subword_find<Cond>(arr, value, start, end, baseindex, state);
+    // the width of each single value within a 64 bit word and N is the total number of values stored in the array.
+
+    while (start < end) {
+        start = parallel_subword_find<Cond>(arr, value, start, end);
+        if (start < end) {
+            if (!state->match(start + baseindex))
+                return false;
+        }
+        ++start;
+    }
+    return true;
 }
 
 template <typename Cond>
-bool ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
-                                        QueryStateBase* state) const
+size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const
 {
     const auto width = arr.m_width;
     const auto MSBs = populate(width, arr.get_encoder().width_mask());
     const auto search_vector = populate(width, value);
     const auto field_count = num_fields_for_width(width);
     const auto bit_count_pr_iteration = num_bits_for_width(width);
-    signed total_bit_count_left = ((signed)end - start) * width;
+    auto total_bit_count_left = static_cast<signed>(end - start) * width;
     REALM_ASSERT(total_bit_count_left >= 0);
-
     auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
         if constexpr (std::is_same_v<Cond, Equal>)
             return find_all_fields_EQ(MSBs, a, b);
@@ -183,29 +190,25 @@ bool ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t
     while (total_bit_count_left >= bit_count_pr_iteration) {
         const auto word = it.get(bit_count_pr_iteration);
         vector = bitwidth_cmp(word, search_vector);
-        while (vector) {
+        if (vector) {
             int sub_word_index = first_field_marked(width, vector);
-            if (!state->match(start + sub_word_index + baseindex))
-                return false;
-            vector &= (vector - 1); // known bithack for clearing least significant bit
+            return start + sub_word_index;
         }
         total_bit_count_left -= bit_count_pr_iteration;
         start += field_count;
         it.bump(bit_count_pr_iteration);
     }
-    if (total_bit_count_left) {                         // final subword, may be partial
+    if (!vector && total_bit_count_left) {              // final subword, may be partial
         const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
         vector = bitwidth_cmp(word, search_vector);
         auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
         vector &= last_word_mask;
-        while (vector) {
+        if (vector) {
             int sub_word_index = first_field_marked(width, vector);
-            if (!state->match(start + sub_word_index + baseindex))
-                return false;
-            vector &= (vector - 1);
+            return start + sub_word_index;
         }
     }
-    return true;
+    return arr.size();
 }
 
 bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
-Original file line number
+Diff line change
@@ Expand Up / @@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {} @@
     inline size_t ArrayIntNull::size() const noexcept
     {
+        // FIXME(review): cannot be right if Array::size() is 0 - the unsigned subtraction wraps to the largest size_t
         return Array::size() - 1;
     }
@@ Expand Down @@