diff --git a/src/realm/array_direct.hpp b/src/realm/array_direct.hpp
index 9084f65e4ab..ab011903627 100644
--- a/src/realm/array_direct.hpp
+++ b/src/realm/array_direct.hpp
@@ -185,6 +185,43 @@ inline int64_t get_direct(const char* data, size_t width, size_t ndx) noexcept
     REALM_TEMPEX(return get_direct, width, (data, ndx));
 }
 
+// Reads 64-bit words that start at an arbitrary bit position inside an array of 64-bit
+// words. The position is kept as a pointer to a word plus a bit offset (0..63) into it.
+class unaligned_word_iter {
+public:
+    unaligned_word_iter(const uint64_t* data, size_t bit_offset)
+        : m_word_ptr(data + bit_offset / 64)
+        , m_in_word_offset(bit_offset % 64)
+    {
+    }
+    // Fetch the bits found at the current position.
+    // 'num_bits' is the number of bits the caller needs; the word after the current one
+    // is only read when those bits spill into it.
+    // WARNING: the result may contain garbage above the lowest 'num_bits' bits.
+    uint64_t get(unsigned num_bits)
+    {
+        // the shift fills the top of the result with zeroes
+        const uint64_t low_part = m_word_ptr[0] >> m_in_word_offset;
+        const bool spills_over = m_in_word_offset + num_bits > 64;
+        if (!spills_over)
+            return low_part;
+        // spilling over implies a non-zero offset (for num_bits <= 64), so this shift is < 64
+        const auto bits_from_first_word = 64 - m_in_word_offset;
+        // the shift leaves zeroes below the bits taken from the second word
+        const uint64_t high_part = m_word_ptr[1] << bits_from_first_word;
+        return low_part | high_part;
+    }
+    // Move the position forward by 'num_bits' bits.
+    void bump(unsigned num_bits)
+    {
+        const auto new_offset = m_in_word_offset + num_bits;
+        m_word_ptr += new_offset / 64;
+        m_in_word_offset = new_offset % 64;
+    }
+
+private:
+    const uint64_t* m_word_ptr;
+    unsigned m_in_word_offset;
+};
+
 // Read a bit field of up to 64 bits.
 // - Any alignment and size is supported
 // - The start of the 'data' area must be 64 bit aligned in all cases.
@@ -210,7 +247,7 @@ class bf_iterator {
         first_word_ptr = data_area + (field_position >> 6);
     }
 
-    uint64_t get_value() const
+    inline uint64_t get_full_word_with_value() const
     {
         auto in_word_position = field_position & 0x3F;
         auto first_word = first_word_ptr[0];
@@ -223,11 +260,43 @@ class bf_iterator {
             result |= second_word << first_word_size;
             // note: above shifts in zeroes below the bits we want
         }
+        return result;
+    }
+
+    inline uint64_t get_value() const
+    {
+        auto result = get_full_word_with_value();
         // discard any bits above the field we want
         if (field_size < 64)
             result &= (1ULL << field_size) - 1;
         return result;
     }
+
+    // Read the 64 bits that start at the current field position. Unless the position is
+    // word aligned this reads the word following the current one, so it must not be called
+    // when that word extends beyond the end of the array; use get_last_unaligned_word
+    // for that case.
+    inline uint64_t get_unaligned_word() const
+    {
+        const auto bit_offset = field_position & 0x3F;
+        const auto current_word = first_word_ptr[0];
+        if (bit_offset == 0)
+            return current_word; // aligned: the current word is the whole answer
+        // low part: top of the current word, shifted down (zeroes are shifted in above it)
+        const uint64_t low_part = current_word >> bit_offset;
+        // high part: bottom of the next word, shifted up (zeroes are shifted in below it)
+        const uint64_t high_part = first_word_ptr[1] << (64 - bit_offset);
+        return low_part | high_part;
+    }
+
+    // Read what is left of the current word from the field position upwards, without
+    // reading the following word. The bits above that remainder are zero.
+    inline uint64_t get_last_unaligned_word() const
+    {
+        const auto bit_offset = field_position & 0x3F;
+        // the shift fills the top of the result with zeroes
+        const uint64_t remainder = first_word_ptr[0] >> bit_offset;
+        return remainder;
+    }
     void set_value(uint64_t value) const
     {
         auto in_word_position = field_position & 0x3F;
@@ -257,7 +326,7 @@ class bf_iterator {
             first_word_ptr[1] = second_word;
         }
     }
-    void operator++()
+    inline void operator++()
     {
         auto next_field_position = field_position + step_size;
         if ((next_field_position >> 6) > (field_position >> 6)) {
@@ -287,11 +356,11 @@ class bf_ref {
     bf_iterator it;
 
 public:
-    bf_ref(bf_iterator& it)
+    inline bf_ref(bf_iterator& it)
         : it(it)
     {
     }
-    operator uint64_t() const
+    inline operator uint64_t() const
     {
         return it.get_value();
     }
@@ -344,6 +413,455 @@ inline std::pair<int64_t, int64_t> get_two(const char* data, size_t width, size_
     REALM_TEMPEX(return get_two, width, (data, ndx));
 }
 
+/* Subword parallel search
+
+    The following provides facilities for subword parallel search for bitfields of any size.
+    To simplify, the first bitfield must be aligned within the word: it must occupy the lowest
+    bits of the word.
+
+    In general the methods here return a vector with the most significant bit in each field
+    marking that a condition was met when comparing the corresponding pair of fields in two
+    vectors. Checking if any field meets a condition is as simple as comparing the return
+    vector against 0. Finding the first to meet a condition is also supported.
+
+    Vectors are "split" into fields according to a MSB vector, which indicates the most
+    significant bit of each field. The MSB must be passed in as an argument to most
+    bit field comparison functions. It can be generated by the field_sign_bit(width) function.
+
+    The simplest condition to test is any_field_NE(A,B), where A and B are words.
+    This condition should be true if any bitfield in A is not equal to the corresponding
+    field in B.
+
+    This is almost as simple as a direct word compare, but needs to take into account that
+    we may want to have part of the words undefined.
+*/
+constexpr int num_fields_table[65] = {-1, 64, 32, 21, 16, 12, 10, 9, // 0-7
+                                      8,  7,  6,  5,  5,  4,  4,  4, // 8-15
+                                      4,  3,  3,  3,  3,  3,  2,  2, // 16-23
+                                      2,  2,  2,  2,  2,  2,  2,  2, // 24-31
+                                      2,  1,  1,  1,  1,  1,  1,  1, // 32-39
+                                      1,  1,  1,  1,  1,  1,  1,  1, // 40-47
+                                      1,  1,  1,  1,  1,  1,  1,  1, // 48-55
+                                      1,  1,  1,  1,  1,  1,  1,  1, // 56-63
+                                      1};
+
+// Number of bits of a 64-bit word that are covered by whole fields of a given width,
+// i.e. (64 / width) * width. Index 0 holds a placeholder (-1).
+constexpr int num_bits_table[65] = {-1, 64, 64, 63, 64, 60, 60, 63, // 0-7
+                                    64, 63, 60, 55, 60, 52, 56, 60, // 8-15
+                                    64, 51, 54, 57, 60, 63, 44, 46, // 16-23
+                                    48, 50, 52, 54, 56, 58, 60, 62, // 24-31 (31: 2 fields * 31 bits = 62)
+                                    64, 33, 34, 35, 36, 37, 38, 39, // 32-39
+                                    40, 41, 42, 43, 44, 45, 46, 47, // 40-47
+                                    48, 49, 50, 51, 52, 53, 54, 55, // 48-55
+                                    56, 57, 58, 59, 60, 61, 62, 63, // 56-63
+                                    64};
+
+// Number of whole fields of 'width' bits that fit in a 64-bit word.
+// 'width' must be non-zero (asserted).
+inline int num_fields_for_width(int width)
+{
+    REALM_ASSERT(width);
+    const int fields_per_word = 64 / width;
+    return fields_per_word;
+}
+
+// Looks up num_fields_table for 'width' and returns the entry converted to uint64_t.
+// NOTE(review): despite its name this yields a field count, not a bit count - confirm
+// whether num_bits_table was intended here.
+inline uint64_t num_bits(int width)
+{
+    const int table_entry = num_fields_table[width];
+    return table_entry;
+}
+
+// Number of bits of a 64-bit word covered by whole fields of 'width' bits
+// (lookup in num_bits_table).
+inline int num_bits_for_width(int width)
+{
+    const int covered_bits = num_bits_table[width];
+    return covered_bits;
+}
+
+// Mask with the lowest num_bits_table[width] bits set, i.e. the bits of a word that
+// belong to whole fields of 'width' bits; the remaining top bits are zero.
+inline uint64_t cares_about(int width)
+{
+    const int unused_top_bits = 64 - num_bits_table[width];
+    return 0xFFFFFFFFFFFFFFFFULL >> unused_top_bits;
+}
+
+// Returns true when at least one field of 'width' bits in A differs from the field at
+// the same position in B. Bits outside the mask returned by cares_about(width) are ignored.
+// To find out which fields differ, use find_all_fields_NE instead.
+bool inline any_field_NE(int width, uint64_t A, uint64_t B)
+{
+    const uint64_t differing_bits = A ^ B;
+    return (differing_bits & cares_about(width)) != 0;
+}
+
+// Fill every field of a vector with the given value of the given width.
+// Only the lowest 'width' bits of 'value' are used; the rest is ignored.
+// When 64 is not a multiple of 'width', the bits above the last whole field receive the
+// low part of one more (truncated) copy.
+// 'width' is expected to be in the range 1..64.
+constexpr uint64_t populate(int width, uint64_t value)
+{
+    // keep only the field itself
+    const uint64_t field_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - width);
+    uint64_t filled = value & field_mask;
+    // Double the populated part until it covers the whole word: after each step the lowest
+    // 'populated_bits' bits hold back-to-back copies of the field.
+    for (int populated_bits = width; populated_bits < 64; populated_bits <<= 1) {
+        filled |= filled << populated_bits;
+    }
+    return filled;
+}
+
+// Vector in which only bit 0 of every 'width'-bit field is set.
+constexpr uint64_t field_bit0(int width)
+{
+    const uint64_t lowest_bit = 1;
+    return populate(width, lowest_bit);
+}
+
+// Vector in which only the sign bit (the highest bit) of every 'width'-bit field is set.
+constexpr uint64_t field_sign_bit(int width)
+{
+    const uint64_t sign_bit = 1ULL << (width - 1);
+    return populate(width, sign_bit);
+}
+
+/* Unsigned LT.
+
+    This can be determined by trial subtraction. However, some care must be exercised
+    since simply subtracting one vector from another will allow carries from one
+    bitfield to flow into the next one. To avoid this, we isolate bitfields by clamping
+    the MSBs to 1 in A and 0 in B before subtraction. After the subtraction the MSBs in
+    the result indicate borrows from the MSB. We then compute overflow (borrow OUT of MSB)
+    using boolean logic as described below.
+
+    Unsigned LT is also used to find all zero fields or all non-zero fields, so it is
+    the backbone of all comparisons returning vectors.
+*/
+
+// Field-wise unsigned "A < B" on two vectors of bit fields.
+// 'MSBs' has the most significant bit of every field set. The result has that bit set in
+// each field where the field of A is less than the field of B; all other bits are zero.
+// The test is a trial subtraction A - B per field: A is less than B exactly when the
+// subtraction borrows out of the field's most significant bit (the "overflow").
+inline uint64_t unsigned_LT_vector(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // 1. Subtract the bits below the MSB of each field. Clamping the MSB to 1 in A and to 0
+    //    in B keeps a borrow from spilling over into the neighbouring field. Afterwards the
+    //    MSB of the difference is cleared exactly where the lower bits had to borrow from
+    //    it, so the complement carries the borrow into the MSB.
+    const uint64_t lower_difference = (A | MSBs) - (B & ~MSBs);
+    const uint64_t borrow_in = ~lower_difference;
+
+    // 2. One-bit subtraction a - b - borrow_in at the MSB position. It borrows out when
+    //      a == 0 and at least one of b, borrow_in is 1, or
+    //      a == 1 and both b and borrow_in are 1.
+    const uint64_t borrow_out = (~A & (B | borrow_in)) | (A & B & borrow_in);
+
+    // 3. Only the MSB positions hold an answer; clear everything else.
+    return borrow_out & MSBs;
+}
+
+// Marks, in the MSB of the field, every field of A that is unsigned-less-than the
+// corresponding field of B.
+inline uint64_t find_all_fields_unsigned_LT(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    const uint64_t less_than = unsigned_LT_vector(MSBs, A, B);
+    return less_than;
+}
+
+// Marks, in the MSB of the field, every field where A and B differ.
+inline uint64_t find_all_fields_NE(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // A field of A ^ B is non-zero exactly where the fields differ, and a non-zero field
+    // is one for which the trial subtraction 0 - field overflows, i.e. 0 < field.
+    const uint64_t difference = A ^ B;
+    return unsigned_LT_vector(MSBs, 0, difference);
+}
+
+// Marks, in the MSB of the field, every field where A and B are equal.
+inline uint64_t find_all_fields_EQ(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // Equal is the complement of not-equal. Complementing sets every non-MSB bit as well,
+    // so the result is filtered down to the MSBs again.
+    return ~find_all_fields_NE(MSBs, A, B) & MSBs;
+}
+
+// Marks, in the MSB of the field, every field of A that is unsigned-less-than-or-equal
+// to the corresponding field of B.
+inline uint64_t find_all_fields_unsigned_LE(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // A <= B is !(A > B), and A > B is B < A: compute B < A ...
+    const uint64_t a_is_greater = unsigned_LT_vector(MSBs, B, A);
+    // ... complement it, and keep only the MSBs since the complement sets all other bits
+    return ~a_is_greater & MSBs;
+}
+
+// Marks, in the MSB of the field, every field of A that is unsigned-greater-than-or-equal
+// to the corresponding field of B.
+inline uint64_t find_all_fields_unsigned_GE(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // A >= B is the same as B <= A
+    const uint64_t greater_or_equal = find_all_fields_unsigned_LE(MSBs, B, A);
+    return greater_or_equal;
+}
+
+/*
+    Handling signed values
+
+    Trial subtraction only works as is for unsigned. We simply transform signed into unsigned
+    by pushing all values up by 1<<(field_width-1). This makes all negative values positive and positive
+    values remain positive, although larger. Any overflow during the push can be ignored.
+    After that transformation Trial subtraction should correctly detect the LT condition.
+
+*/
+
+
+// Marks, in the MSB of the field, every field where A < B with the fields read as
+// signed numbers.
+inline uint64_t find_all_fields_signed_LT(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // flipping the sign bit of every field turns the signed comparison into an unsigned one
+    const uint64_t biased_A = A ^ MSBs;
+    const uint64_t biased_B = B ^ MSBs;
+    return unsigned_LT_vector(MSBs, biased_A, biased_B);
+}
+
+// Marks, in the MSB of the field, every field where A <= B with the fields read as
+// signed numbers.
+inline uint64_t find_all_fields_signed_LE(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // flipping the sign bit of every field turns the signed comparison into an unsigned one
+    const uint64_t biased_A = A ^ MSBs;
+    const uint64_t biased_B = B ^ MSBs;
+    return find_all_fields_unsigned_LE(MSBs, biased_A, biased_B);
+}
+
+// Marks, in the MSB of the field, every field where A > B with the fields read as
+// signed numbers.
+inline uint64_t find_all_fields_signed_GT(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // swap the operands: A > B holds exactly when B < A
+    const uint64_t greater_than = find_all_fields_signed_LT(MSBs, B, A);
+    return greater_than;
+}
+
+// Marks, in the MSB of the field, every field where A >= B with the fields read as
+// signed numbers.
+inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // swap the operands: A >= B holds exactly when B <= A
+    const uint64_t greater_or_equal = find_all_fields_signed_LE(MSBs, B, A);
+    return greater_or_equal;
+}
+
+// find the first field which have MSB set (marks overflow after trial subtraction, or other
+// requested condition).
+struct find_field_desc {
+    uint8_t levels;
+    uint64_t m1;
+    uint64_t m2;
+    uint64_t m4;
+    uint64_t m8;
+    uint64_t m16;
+    uint64_t m32;
+};
+
+constexpr struct find_field_desc find_field_table[65] = {
+    /* 0 */ {0, 0, 0, 0, 0, 0},
+    /* 1 */
+    {6, 0xAAAAAAAAAAAAAAAA, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000,
+     0xFFFFFFFF00000000},
+    /* 2 */
+    {5, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0},
+    /* 3 */
+    {5, 0b0000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000,
+     0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
+     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0},
+    /* 4 */
+    {4, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0},
+    /* 5 */
+    {4, 0b0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000,
+     0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
+     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
+    /* 6 */
+    {4, 0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000,
+     0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
+    /* 7 */
+    {4, 0b1000'0000'1111'1110'0000'0011'1111'1000'0000'1111'1110'0000'0011'1111'1000'0000,
+     0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
+     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0},
+    /* 8 */
+    {3, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0},
+    /* 9 */
+    {3, 0b1000'0000'0011'1111'1110'0000'0000'1111'1111'1000'0000'0011'1111'1110'0000'0000,
+     0b0111'1111'1100'0000'0000'0000'0000'1111'1111'1111'1111'1100'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 10 */
+    {3, 0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000,
+     0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 11 */
+    {3, 0b1111'1111'1000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1000'0000'0000,
+     0b0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 12 */
+    {3, 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 13 */
+    {3, 0b1110'0000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1110'0000'0000'0000,
+     0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 14 */
+    {3, 0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000,
+     0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000,
+     0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 15 */
+    {3, 0b0000'1111'1111'1111'1110'0000'0000'0000'0011'1111'1111'1111'1000'0000'0000'0000,
+     0b0000'1111'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000'0000,
+     0b1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0},
+    /* 16 */
+    {2, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0, 0},
+    /* 17 - as we're only interested in msb of each field we can simplify and use same pattern
+     for the next 4 entries */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 18 */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 19 */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 20 */
+    {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0},
+    /* 21 - and next 4 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 22 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 23 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 24 */
+    {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0},
+    /* 25 - and 4 more */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 26 */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 27 */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 28 */
+    {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0},
+    /* 29 - last 4 where multiple fields exist */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 30 */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 31 */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 32 */
+    {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0},
+    /* 33 - from here to 64, there is only 1 possible result: 0 */
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 0}};
+
+#if 1
+constexpr uint32_t inverse_width[65] = {
+    65536 * 64 / 1, // never used
+    65536 * 64 / 1,  65536 * 64 / 2,  65536 * 64 / 3,  65536 * 64 / 4,  65536 * 64 / 5,  65536 * 64 / 6,
+    65536 * 64 / 7,  65536 * 64 / 8,  65536 * 64 / 9,  65536 * 64 / 10, 65536 * 64 / 11, 65536 * 64 / 12,
+    65536 * 64 / 13, 65536 * 64 / 14, 65536 * 64 / 15, 65536 * 64 / 16, 65536 * 64 / 17, 65536 * 64 / 18,
+    65536 * 64 / 19, 65536 * 64 / 20, 65536 * 64 / 21, 65536 * 64 / 22, 65536 * 64 / 23, 65536 * 64 / 24,
+    65536 * 64 / 25, 65536 * 64 / 26, 65536 * 64 / 27, 65536 * 64 / 28, 65536 * 64 / 29, 65536 * 64 / 30,
+    65536 * 64 / 31, 65536 * 64 / 32, 65536 * 64 / 33, 65536 * 64 / 34, 65536 * 64 / 35, 65536 * 64 / 36,
+    65536 * 64 / 37, 65536 * 64 / 38, 65536 * 64 / 39, 65536 * 64 / 40, 65536 * 64 / 41, 65536 * 64 / 42,
+    65536 * 64 / 43, 65536 * 64 / 44, 65536 * 64 / 45, 65536 * 64 / 46, 65536 * 64 / 47, 65536 * 64 / 48,
+    65536 * 64 / 49, 65536 * 64 / 50, 65536 * 64 / 51, 65536 * 64 / 52, 65536 * 64 / 53, 65536 * 64 / 54,
+    65536 * 64 / 55, 65536 * 64 / 56, 65536 * 64 / 57, 65536 * 64 / 58, 65536 * 64 / 59, 65536 * 64 / 60,
+    65536 * 64 / 61, 65536 * 64 / 62, 65536 * 64 / 63, 65536 * 64 / 64,
+};
+
+// Index of the field of 'width' bits that contains the lowest set bit of 'vector'.
+// NOTE(review): the count-trailing-zeros result is presumably undefined for a zero
+// vector - confirm that every caller checks for non-zero first.
+inline int first_field_marked(int width, uint64_t vector)
+{
+    // position of the lowest set bit (a count of trailing zeroes)
+#if REALM_WINDOWS
+    int trailing_zeros = (int)_tzcnt_u64(vector); // TODO: not clear if this is ok on all platforms
+#else
+    int trailing_zeros = __builtin_ctzll(vector);
+#endif
+    // divide by 'width' without a division: inverse_width[width] is 65536 * 64 / width,
+    // i.e. 2^22 / width rounded down, so multiply and drop the 22 fraction bits
+    int field = (trailing_zeros * inverse_width[width]) >> 22;
+    REALM_ASSERT_DEBUG(field == (trailing_zeros / width));
+    return field;
+}
+#endif
+#if 0
+inline int first_field_marked(int width, uint64_t vector)
+{
+    // isolate least significant bit
+    vector = vector & (~vector + 1);
+    const struct find_field_desc& desc = find_field_table[width];
+    int result = 0;
+    switch (desc.levels) {
+        // the following case entries are intended to fall through
+        // (this is a variant of Duff's Device)
+        // TODO: disable compiler warnings for it
+        case 6:
+            result |= (vector & desc.m32) ? 32 : 0;
+        case 5:
+            result |= (vector & desc.m16) ? 16 : 0;
+        case 4:
+            result |= (vector & desc.m8) ? 8 : 0;
+        case 3:
+            result |= (vector & desc.m4) ? 4 : 0;
+        case 2:
+            result |= (vector & desc.m2) ? 2 : 0;
+        case 1:
+            result |= (vector & desc.m1) ? 1 : 0;
+        default:
+            break;
+    }
+    return result;
+}
+#endif
+#if 0
+inline int first_field_marked(int width, uint64_t vector)
+{
+    // isolate least significant bit
+    vector = vector & (~vector + 1);
+    // directly compute position of set bit using table
+    const struct find_field_desc& desc = find_field_table[width];
+    return ((vector & desc.m1) ? 1 : 0) | ((vector & desc.m2) ? 2 : 0) | ((vector & desc.m4) ? 4 : 0) |
+           ((vector & desc.m8) ? 8 : 0) | ((vector & desc.m16) ? 16 : 0) | ((vector & desc.m32) ? 32 : 0);
+}
+#endif
+#if 0
+inline int first_field_marked(int width, uint64_t vector)
+{
+    int result = 0;
+    auto msb = 1ULL << (width - 1);
+    while (msb) {
+        if (vector & msb)
+            return result;
+        msb <<= width;
+        result++;
+    }
+    return -1;
+}
+#endif
+
 namespace impl {
 
 // Lower and Upper bound are mainly used in the B+tree implementation,
diff --git a/src/realm/array_encode.cpp b/src/realm/array_encode.cpp
index d751f6b5b75..e6e98e595e7 100644
--- a/src/realm/array_encode.cpp
+++ b/src/realm/array_encode.cpp
@@ -187,6 +187,7 @@ void ArrayEncode::init(const char* h)
         m_ndx_width = NodeHeader::get_elementB_size<Encoding::Flex>(h);
         m_ndx_size = NodeHeader::get_arrayB_num_elements<Encoding::Flex>(h);
         m_v_mask = 1ULL << (m_v_width - 1);
+        m_ndx_mask = 1ULL << (m_ndx_width - 1);
     }
 }
 
diff --git a/src/realm/array_encode.hpp b/src/realm/array_encode.hpp
index d161b91c88b..b8414299d0f 100644
--- a/src/realm/array_encode.hpp
+++ b/src/realm/array_encode.hpp
@@ -72,6 +72,7 @@ class ArrayEncode {
     Encoding m_encoding{NodeHeader::Encoding::WTypBits}; // this is not ok .... probably
     size_t m_v_width = 0, m_v_size = 0, m_ndx_width = 0, m_ndx_size = 0;
     size_t m_v_mask = 0;
+    size_t m_ndx_mask = 0;
 
     friend class ArrayPacked;
     friend class ArrayFlex;
diff --git a/src/realm/array_flex.cpp b/src/realm/array_flex.cpp
index 51e1e1d873c..f911f94fe37 100644
--- a/src/realm/array_flex.cpp
+++ b/src/realm/array_flex.cpp
@@ -159,31 +159,169 @@ bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, size_t e
 
     REALM_ASSERT_3(arr.m_width, !=, 0);
 
+    if constexpr (std::is_same_v<Equal, Cond>) {
+        find_eq(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<NotEqual, Cond>) {
+        find_neq(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Less, Cond>) {
+        find_lt(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Greater, Cond>) {
+        find_gt(arr, value, start, end, baseindex, state);
+    }
+
+    return true;
+}
+
+// Scan the packed fields [start, end) of 'width' bits each, located 'offset' bits into
+// arr.m_data, one word at a time, and return the index of the first field for which
+// "field Cond value" holds. Returns 'end' when no field matches.
+// 'width_mask' is replicated into every field and used as the MSB vector of the
+// field-wise comparison. The bool template parameter is only consulted for
+// GreaterEqual, where it selects the signed (true) or unsigned (false) comparison.
+template <typename Cond, bool v>
+inline size_t ArrayFlex::parallel_subword_find(const Array& arr, uint64_t value, size_t width_mask, size_t offset,
+                                               uint_least8_t width, size_t start, size_t end) const
+{
+    const auto MSBs = populate(width, width_mask);
+    const auto needle = populate(width, value); // 'value' replicated into every field
+    const auto fields_per_word = num_fields_for_width(width);
+    const auto bits_per_word = num_bits_for_width(width);
+    auto bits_left = static_cast<signed>(end - start) * width;
+    REALM_ASSERT(bits_left >= 0);
+
+    // field-wise comparison selected at compile time; matches are marked in the field MSBs
+    auto match_fields = [&MSBs](uint64_t a, uint64_t b) {
+        if constexpr (std::is_same_v<Cond, Equal>) {
+            return find_all_fields_EQ(MSBs, a, b);
+        }
+        else if constexpr (std::is_same_v<Cond, NotEqual>) {
+            return find_all_fields_NE(MSBs, a, b);
+        }
+        else if constexpr (std::is_same_v<Cond, GreaterEqual>) {
+            if constexpr (v)
+                return find_all_fields_signed_GE(MSBs, a, b);
+            else
+                return find_all_fields_unsigned_GE(MSBs, a, b);
+        }
+        else if constexpr (std::is_same_v<Cond, Greater>) {
+            return find_all_fields_signed_GT(MSBs, a, b);
+        }
+        else if constexpr (std::is_same_v<Cond, Less>) {
+            return find_all_fields_unsigned_LT(MSBs, a, b);
+        }
+    };
+
+    unaligned_word_iter it(reinterpret_cast<const uint64_t*>(arr.m_data), offset + start * width);
+
+    // whole words: each holds 'fields_per_word' complete fields
+    for (; bits_left >= bits_per_word; bits_left -= bits_per_word) {
+        const uint64_t hits = match_fields(it.get(bits_per_word), needle);
+        if (hits)
+            return start + first_field_marked((int)width, hits);
+        start += fields_per_word;
+        it.bump(bits_per_word);
+    }
+
+    if (bits_left) {
+        // Final, possibly partial word: request only the bits that are left, which limits
+        // the lookahead so memory beyond the array is not touched, and drop any marks
+        // above those bits.
+        const uint64_t tail_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - bits_left);
+        const uint64_t hits = match_fields(it.get(bits_left), needle) & tail_mask;
+        if (hits)
+            return start + first_field_marked(width, hits);
+    }
+    return end;
+}
+
+// Report, through state->match(), every position in [start, end) whose element equals
+// 'value'. Returns false as soon as state->match() returns false, otherwise true.
+bool ArrayFlex::find_eq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state) const
+{
     const auto& encoder = arr.m_encoder;
-    const auto data = (uint64_t*)arr.m_data;
     const auto v_width = encoder.m_v_width;
     const auto v_size = encoder.m_v_size;
     const auto ndx_width = encoder.m_ndx_width;
-    const auto mask = encoder.width_mask();
+    const auto offset = v_size * v_width;
 
-    auto cmp = [](int64_t v, int64_t value) {
-        if constexpr (std::is_same_v<Cond, Equal>)
-            return v == value;
-        if constexpr (std::is_same_v<Cond, NotEqual>)
-            return v != value;
-        if constexpr (std::is_same_v<Cond, Greater>)
-            return v > value;
-        if constexpr (std::is_same_v<Cond, Less>)
-            return v < value;
-    };
+    // locate 'value' among the v_size stored values (the fields at bit offset 0)
+    const auto value_ndx = parallel_subword_find<Equal>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (value_ndx == v_size)
+        return true; // 'value' is not stored, so no element can be equal to it
+
+    // report every position whose index field (starting at 'offset') refers to that value
+    for (; start < end; ++start) {
+        start = parallel_subword_find<Equal>(arr, value_ndx, encoder.m_ndx_mask, offset, ndx_width, start, end);
+        if (start < end && !state->match(start + baseindex))
+            return false;
+    }
+    return true;
+}
+
+// Report, through state->match(), every position in [start, end) whose element differs
+// from 'value'. Returns false as soon as state->match() returns false, otherwise true.
+bool ArrayFlex::find_neq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                         QueryStateBase* state) const
+{
+    const auto& encoder = arr.m_encoder;
+    const auto v_width = encoder.m_v_width;
+    const auto v_size = encoder.m_v_size;
+    const auto ndx_width = encoder.m_ndx_width;
+    const auto offset = v_size * v_width;
+
+    // locate 'value' among the v_size stored values (the fields at bit offset 0)
+    auto v_start = parallel_subword_find<Equal>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (v_start == v_size) {
+        // 'value' is not stored at all, so every element differs from it and every
+        // position in the range is a match
+        for (; start < end; ++start)
+            if (!state->match(start + baseindex))
+                return false;
+        return true;
+    }
+
+    // report every position whose index field (starting at 'offset') refers to another value
+    while (start < end) {
+        start = parallel_subword_find<NotEqual>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+        ++start;
+    }
+    return true;
+}
 
+// Report, through state->match(), every position in [start, end) whose element is less
+// than 'value'. Returns false as soon as state->match() returns false, otherwise true.
+// NOTE(review): comparing index fields against v_start presumably relies on the stored
+// values being sorted in ascending order - confirm against the encoder.
+bool ArrayFlex::find_lt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state) const
+{
+    const auto& encoder = arr.m_encoder;
+    const auto v_width = encoder.m_v_width;
+    const auto v_size = encoder.m_v_size;
+    const auto ndx_width = encoder.m_ndx_width;
     const auto offset = v_size * v_width;
-    bf_iterator it_index{data, static_cast<size_t>(offset), ndx_width, ndx_width, start};
-    for (; start < end; ++start, ++it_index) {
-        const auto v = sign_extend_field_by_mask(mask, read_bitfield(data, it_index.get_value() * v_width, v_width));
-        if (cmp(v, value))
+
+    // index of the first stored value that is >= 'value'
+    auto v_start = parallel_subword_find<GreaterEqual>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (v_start == v_size) {
+        // no stored value is >= 'value', so every element is less than it and every
+        // position in the range is a match
+        for (; start < end; ++start)
+            if (!state->match(start + baseindex))
+                return false;
+        return true;
+    }
+
+    // report every position whose index field (starting at 'offset') is below v_start
+    while (start < end) {
+        start = parallel_subword_find<Less>(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+
+        ++start;
+    }
+    return true;
+}
+
+// Report, through state->match(), every position in [start, end) whose element is greater
+// than 'value'. Returns false as soon as state->match() returns false, otherwise true.
+// NOTE(review): comparing index fields against the first greater value presumably relies
+// on the stored values being sorted in ascending order - confirm against the encoder.
+bool ArrayFlex::find_gt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state) const
+{
+    const auto& encoder = arr.m_encoder;
+    const auto v_width = encoder.m_v_width;
+    const auto v_size = encoder.m_v_size;
+    const auto ndx_width = encoder.m_ndx_width;
+    const auto offset = v_size * v_width;
+
+    // index of the first stored value that is greater than 'value'
+    const auto first_greater =
+        parallel_subword_find<Greater>(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size);
+    if (first_greater == v_size)
+        return true; // no stored value is greater, so no element can match
+
+    // report every position whose index field (starting at 'offset') is >= first_greater
+    while (start < end) {
+        start = parallel_subword_find<GreaterEqual, false>(arr, first_greater, encoder.m_ndx_mask, offset,
+                                                           ndx_width, start, end);
+        if (start < end)
             if (!state->match(start + baseindex))
                 return false;
+
+        ++start;
     }
     return true;
 }
diff --git a/src/realm/array_flex.hpp b/src/realm/array_flex.hpp
index f91f7213e12..2079db0a825 100644
--- a/src/realm/array_flex.hpp
+++ b/src/realm/array_flex.hpp
@@ -39,13 +39,23 @@ class ArrayFlex {
     int64_t get(const char*, size_t, size_t, size_t, size_t, size_t, size_t) const;
     void get_chunk(const Array& h, size_t ndx, int64_t res[8]) const;
     void set_direct(const Array&, size_t, int64_t) const;
+
     template <typename Cond>
     bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+
     int64_t sum(const Array&, size_t, size_t) const;
 
 private:
     int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t, size_t, size_t) const;
     bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
+
+    template <typename Cond, bool = true> // bool: true = compare fields as int64_t, false = as uint64_t
+    inline size_t parallel_subword_find(const Array&, uint64_t, size_t, size_t, uint_least8_t, size_t, size_t) const;
+
+    bool find_eq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+    bool find_neq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+    bool find_lt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
+    bool find_gt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const;
 };
 } // namespace realm
 #endif // REALM_ARRAY_COMPRESS_HPP
diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp
index 3248fcbad46..93f5e76ded5 100644
--- a/src/realm/array_integer.hpp
+++ b/src/realm/array_integer.hpp
@@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}
 
 inline size_t ArrayIntNull::size() const noexcept
 {
+    // FIXME: wraps around to a huge size_t if Array::size() is 0; presumably it is always >= 1 here - confirm.
     return Array::size() - 1;
 }
 
diff --git a/src/realm/array_integer_tpl.hpp b/src/realm/array_integer_tpl.hpp
index b145ffe67cd..cc021df8bb6 100644
--- a/src/realm/array_integer_tpl.hpp
+++ b/src/realm/array_integer_tpl.hpp
@@ -79,7 +79,7 @@ bool ArrayIntNull::find_impl(value_type opt_value, size_t start, size_t end, Que
         }
         // if encoded use specialised find
         if (is_encoded())
-            return find_encoded<cond>(value, start2, end, baseindex2, state);
+            return find_encoded<cond>(value, start2, end2, baseindex2, state);
         // Fall back to plain Array find.
         return ArrayWithFind(*this).find<cond>(value, start2, end2, baseindex2, state);
     }
diff --git a/src/realm/array_packed.cpp b/src/realm/array_packed.cpp
index db5fde8b5b4..86693042948 100644
--- a/src/realm/array_packed.cpp
+++ b/src/realm/array_packed.cpp
@@ -112,7 +112,6 @@ void ArrayPacked::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) const
         res[index++] = get(arr, i++);
     }
 }
-
 template <typename Cond>
 bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
                            QueryStateBase* state) const
@@ -138,28 +137,79 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t
 
     REALM_ASSERT_3(arr.m_width, !=, 0);
 
-    auto cmp = [](int64_t v, int64_t value) {
+
+    // NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the
+    // queries run.
+    //
+    // Main idea behind find:
+    // Look for the first position where the condition is met, comparing in parallel as many values as fit in a
+    // single 64-bit word. Once a match has been reported, resume the same search right after it and repeat until
+    // `end` is reached.
+    //
+    // E.g. we store the value 6 with a width of 4 bits (0110): 3 bits for 110 (6) plus a sign bit of 0.
+    // A 64-bit word holds at most 16 such values. Visiting indices 0 to 15 of that word one at a time means
+    // applying a mask and a shift for every value before comparing it.
+    // That is not cheap. Instead we can compare all the values contained in a 64-bit word in one go and see whether
+    // any of them matches what we are looking for. This reduces the number of comparisons by a factor of roughly
+    // 64/K, where K is the width in bits of each value (16x in the example above).
+
+    // in packed format a parallel subword find pays off also for width >= 32
+    while (start < end) {
+        start = parallel_subword_find<Cond>(arr, value, start, end);
+        if (start < end)
+            if (!state->match(start + baseindex))
+                return false;
+
+        ++start;
+    }
+    return true;
+}
+
+template <typename Cond> // only Equal, NotEqual, Greater and Less are handled by bitwidth_cmp below
+size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const
+{ // Returns the index of the first field in [start, end) that satisfies Cond against `value`, else `end`.
+    const auto width = arr.m_width;
+    const auto MSBs = populate(width, arr.get_encoder().width_mask()); // presumably width_mask() in every field
+    const auto search_vector = populate(width, value); // presumably `value` replicated into every field
+    const auto field_count = num_fields_for_width(width); // fields consumed per loop iteration
+    const auto bit_count_pr_iteration = num_bits_for_width(width); // bits consumed per loop iteration
+    auto total_bit_count_left = static_cast<signed>(end - start) * width; // bits still to scan in [start, end)
+    REALM_ASSERT(total_bit_count_left >= 0);
+    auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) {
         if constexpr (std::is_same_v<Cond, Equal>)
-            return v == value;
+            return find_all_fields_EQ(MSBs, a, b);
         if constexpr (std::is_same_v<Cond, NotEqual>)
-            return v != value;
+            return find_all_fields_NE(MSBs, a, b);
         if constexpr (std::is_same_v<Cond, Greater>)
-            return v > value;
+            return find_all_fields_signed_GT(MSBs, a, b);
         if constexpr (std::is_same_v<Cond, Less>)
-            return v < value;
+            return find_all_fields_signed_LT(MSBs, a, b);
     };
 
-    //~6/7x slower, we need to do a bitscan before to start this loop when values are less than 32 and 64 bits
-    bf_iterator it((uint64_t*)arr.m_data, 0, arr.m_width, arr.m_width, start);
-    const auto mask = arr.get_encoder().width_mask();
-    for (; start < end; ++start, ++it) {
-        const auto v = sign_extend_field_by_mask(mask, it.get_value());
-        if (cmp(v, value)) {
-            if (!state->match(start + baseindex))
-                return false;
+    unaligned_word_iter it((uint64_t*)arr.m_data, start * arr.m_width); // positioned at the first bit of `start`
+    uint64_t vector = 0; // comparison result; non-zero means some field in the word matched
+    while (total_bit_count_left >= bit_count_pr_iteration) { // iterations that cover field_count whole fields
+        const auto word = it.get(bit_count_pr_iteration);
+        vector = bitwidth_cmp(word, search_vector);
+        if (vector) {
+            int sub_word_index = first_field_marked(width, vector);
+            return start + sub_word_index;
         }
+        total_bit_count_left -= bit_count_pr_iteration;
+        start += field_count;
+        it.bump(bit_count_pr_iteration);
     }
-    return true;
+    if (total_bit_count_left) {                         // final subword, may be partial
+        const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array
+        vector = bitwidth_cmp(word, search_vector);
+        auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left); // low bits that are in range
+        vector &= last_word_mask; // drop marks for bits beyond `end`; get() may return garbage there
+        if (vector) {
+            int sub_word_index = first_field_marked(width, vector);
+            return start + sub_word_index;
+        }
+    }
+    return end; // nothing in [start, end) matched
 }
 
 bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
diff --git a/src/realm/array_packed.hpp b/src/realm/array_packed.hpp
index 790d04c80bf..985a9ea3f73 100644
--- a/src/realm/array_packed.hpp
+++ b/src/realm/array_packed.hpp
@@ -48,6 +48,9 @@ class ArrayPacked {
 private:
     int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t) const;
     bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
+
+    template <typename Cond>
+    size_t parallel_subword_find(const Array&, int64_t, size_t, size_t) const;
 };
 } // namespace realm
 
diff --git a/src/realm/node_header.hpp b/src/realm/node_header.hpp
index ed83db3ad39..e91a7b25def 100644
--- a/src/realm/node_header.hpp
+++ b/src/realm/node_header.hpp
@@ -897,6 +897,7 @@ size_t inline NodeHeader::get_byte_size_from_header(const char* header) noexcept
                                                                 get_elementB_size<NodeHeader::Encoding::Flex>(h));
         default:
             REALM_ASSERT_RELEASE(false && "unknown encoding");
+            return 0; // kill a warning
     }
 }
 
diff --git a/test/test_array_integer.cpp b/test/test_array_integer.cpp
index 35959140a82..f877c23ad84 100644
--- a/test/test_array_integer.cpp
+++ b/test/test_array_integer.cpp
@@ -34,12 +34,14 @@ using namespace realm::test_util;
 
 // #define ARRAY_PERFORMANCE_TESTING
 #if !defined(REALM_DEBUG) && defined(ARRAY_PERFORMANCE_TESTING)
-TEST(perf_array_encode_get_vs_array_get)
+NONCONCURRENT_TEST(perf_array_encode_get_vs_array_get_less_32bit)
+// ONLY(perf_array_encode_get_vs_array_get_less_32bit)
 {
     using namespace std;
     using namespace std::chrono;
     size_t n_values = 1000;
     size_t n_runs = 100;
+    std::cout << "   < 32 bit values " << std::endl;
     std::cout << "   N values = " << n_values << std::endl;
     std::cout << "   N runs = " << n_runs << std::endl;
 
@@ -129,12 +131,775 @@ TEST(perf_array_encode_get_vs_array_get)
     a_encoded.destroy();
 }
 
-TEST(Test_basic_find)
+
+NONCONCURRENT_TEST(Test_basic_find_EQ_less_32bit)
+// ONLY(Test_basic_find_EQ_less_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth < 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a.find_first(input_array[i]);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    //    std::cout << "Array: " << std::endl;
+    //    for(size_t i=0; i<a_encoded.size(); ++i)
+    //        std::cout << a_encoded.get(i) << ", ";
+    //    std::cout << std::endl;
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto v = a.find_first(input_array[i]);
+            auto v1 = a_encoded.find_first(input_array[i]);
+            REALM_ASSERT(v == v1);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a_encoded.find_first(input_array[i]);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a_encoded.get(ndx) == input_array[ndx]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-i);
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto v = a.find_first(input_array[i]);
+            auto v1 = a_encoded.find_first(input_array[i]);
+            REALM_ASSERT(v == v1);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a.find_first(input_array[i]);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a_encoded.find_first(input_array[i]);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_NEQ_value_less_32bit)
+// ONLY(Test_basic_find_NEQ_value_less_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth < 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<NotEqual>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(i, 0, a.size(), &state1);
+            a_encoded.find<NotEqual>(i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a_encoded.find<NotEqual>(i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<NotEqual>(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-i);
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // NEQ for signed integers is not working. TODO: investigate this.
+    // verify that both find the same thing
+
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(-i, 0, a.size(), &state1);
+            a_encoded.find<NotEqual>(-i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(-i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<NotEqual>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a_encoded.find<NotEqual>(-i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<NotEqual>(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_LT_value_less_32bit)
+// ONLY(Test_basic_find_LT_value_less_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth < 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1{};
+    QueryStateFindFirst state2{};
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // there is nothing less than 0
+            a.find<Less>(i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
+              << std::endl;
+    std::cout << "   Positive values - Array::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    //   std::cout << "Array: " << std::endl;
+    //   for(size_t i=0; i<a_encoded.size(); ++i)
+    //       std::cout << a_encoded.get(i) << ", ";
+    //    std::cout << std::endl;
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Less>(i, 0, a.size(), &state1);
+            a_encoded.find<Less>(i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // there is nothing less than 0
+            a_encoded.find<Less>(i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-i);
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Less>(-i, 0, a.size(), &state1);
+            a_encoded.find<Less>(-i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing is less than the most negative value
+            a.find<Less>(-i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
+              << std::endl;
+    std::cout << "   Negative values - Array::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing is less than the most negative value
+            a_encoded.find<Less>(-i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_GT_value_less_32bit)
+// ONLY(Test_basic_find_GT_value_less_32bit)
+{
+    // GT subword parallel search is not working... TODO : investigate
+    using namespace std;
+    using namespace std::chrono;
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth < 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing is greater than the largest value
+            a.find<Greater>(i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find<Greater>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    //       std::cout << "Array: " << std::endl;
+    //       for(size_t i=0; i<a_encoded.size(); ++i)
+    //           std::cout << a_encoded.get(i) << ", ";
+    //        std::cout << std::endl;
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Greater>(i, 0, a.size(), &state1);
+            a_encoded.find<Greater>(i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing bigger than the last val
+            a_encoded.find<Greater>(i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<Greater>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-i);
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Greater>(-i, 0, a.size(), &state1);
+            a_encoded.find<Greater>(-i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // nothing bigger than 0
+            a.find<Greater>(-i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find<Greater>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // nothing bigger than 0
+            a_encoded.find<Greater>(-i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<Greater>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+NONCONCURRENT_TEST(perf_array_encode_get_vs_array_get_greater_32bit)
+// ONLY(perf_array_encode_get_vs_array_get_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // 2^32: the smallest value that does not fit in 32 bits
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   >= 32 bit values " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i)
+            REALM_ASSERT(a.get(i) == input_array[i]);
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::get(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+              << std::endl;
+    std::cout << "   Positive values - Array::get(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+    t1 = high_resolution_clock::now();
+
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            REALM_ASSERT(a_encoded.get(i) == a.get(i));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::get(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+              << std::endl;
+    std::cout << "   Positive values - ArrayEncode::get(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-i);
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i)
+            REALM_ASSERT(a.get(i) == input_array[i]);
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << std::endl;
+
+    std::cout << "   Negative values - Array::get(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+              << std::endl;
+    std::cout << "   Negative values - Array::get(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            REALM_ASSERT(a_encoded.get(i) == a.get(i));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::get(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+              << std::endl;
+    std::cout << "   Negative values - ArrayEncode::get(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+// Checks find_first() agrees between a plain ArrayInteger and its encoded copy for |values| >= 2^32, and times both.
+NONCONCURRENT_TEST(Test_basic_find_EQ_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // 2^32: smallest value that does not fit in 32 bits
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g); // store the values in random order
+    for (const auto& v : input_array)
+        a.add(v);
+
+    auto t1 = high_resolution_clock::now(); // baseline: find_first() on the plain array
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a.find_first(start_value + i);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded); // 'a' is still read below as the reference for a_encoded
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            REALM_ASSERT(a.find_first(start_value + i) == a_encoded.find_first(start_value + i));
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a_encoded.find_first(start_value + i);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-(start_value + i)); // negated in size_t (wraps); stored as int64_t -(2^32 + i)
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            const auto k = -(start_value + i); // k is a (wrapped) size_t; both calls receive the same key
+            const auto v1 = a.find_first(k);
+            const auto v2 = a_encoded.find_first(k);
+            REALM_ASSERT(v1 == v2);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a.find_first(-(start_value + i));
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a_encoded.find_first(-(start_value + i));
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<Equal>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<Equal>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_NEQ_value_greater_32bit)
 {
     using namespace std;
     using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // 32 bit val
     size_t n_values = 1000;
     size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
     std::cout << "   N values = " << n_values << std::endl;
     std::cout << "   N runs = " << n_runs << std::endl;
 
@@ -144,26 +909,28 @@ TEST(Test_basic_find)
     a.create();
 
     for (size_t i = 0; i < n_values; i++)
-        input_array.push_back(i);
+        input_array.push_back(start_value + i);
     std::random_device rd;
     std::mt19937 g(rd());
     std::shuffle(input_array.begin(), input_array.end(), g);
     for (const auto& v : input_array)
         a.add(v);
 
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
     auto t1 = high_resolution_clock::now();
     for (size_t j = 0; j < n_runs; ++j) {
         for (size_t i = 0; i < n_values; ++i) {
-            auto ndx = a.find_first(i);
-            REALM_ASSERT(ndx != realm::not_found);
-            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+            a.find<NotEqual>(start_value + i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
         }
     }
     auto t2 = high_resolution_clock::now();
 
-    std::cout << "   Positive values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
-              << std::endl;
-    std::cout << "   Positive values - Array::find(): "
+    std::cout << "   Positive values - Array::find<NotEqual>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find<NotEqual>(): "
               << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
 
     a.try_encode(a_encoded);
@@ -173,22 +940,154 @@ TEST(Test_basic_find)
     // verify that both find the same thing
     for (size_t j = 0; j < n_runs; ++j) {
         for (size_t i = 0; i < n_values; ++i) {
-            REALM_ASSERT(a.find_first(i) == a_encoded.find_first(i));
+            a.find<NotEqual>(start_value + i, 0, a.size(), &state1);
+            a_encoded.find<NotEqual>(start_value + i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
         }
     }
 
     t1 = high_resolution_clock::now();
     for (size_t j = 0; j < n_runs; ++j) {
         for (size_t i = 0; i < n_values; ++i) {
-            auto ndx = a_encoded.find_first(i);
-            REALM_ASSERT(ndx != realm::not_found);
-            REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx));
+            a_encoded.find<NotEqual>(start_value + i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<NotEqual>(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-(start_value + i));
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(-(start_value + i), 0, a.size(), &state1);
+            a_encoded.find<NotEqual>(-(start_value + i), 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(-(start_value + i), 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<NotEqual>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a_encoded.find<NotEqual>(-(start_value + i), 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<NotEqual>(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<NotEqual>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_LT_value_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // 32 bit val
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) {
+            a.find<Less>(start_value + i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
+              << std::endl;
+    std::cout << "   Positive values - Array::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Less>(start_value + i, 0, a.size(), &state1);
+            a_encoded.find<Less>(start_value + i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) {
+            a_encoded.find<Less>(start_value + i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
         }
     }
     t2 = high_resolution_clock::now();
-    std::cout << "   Positive values - ArrayEncode::find_first(): " << duration_cast<milliseconds>(t2 - t1).count()
+    std::cout << "   Positive values - ArrayEncode::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count()
               << " ms" << std::endl;
-    std::cout << "   Positive values - ArrayEncode::find_first(): "
+    std::cout << "   Positive values - ArrayEncode::find<Less>(): "
               << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
 
     std::cout << std::endl;
@@ -198,57 +1097,196 @@ TEST(Test_basic_find)
     a.create();
     input_array.clear();
     for (size_t i = 0; i < n_values; i++)
-        input_array.push_back(-i);
+        input_array.push_back(-(start_value + i));
     std::random_device rd1;
     std::mt19937 g1(rd1());
     std::shuffle(input_array.begin(), input_array.end(), g1);
     for (const auto& v : input_array)
         a.add(v);
 
+    a.try_encode(a_encoded);
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
     // verify that both find the same thing
     for (size_t j = 0; j < n_runs; ++j) {
         for (size_t i = 0; i < n_values; ++i) {
-            REALM_ASSERT(a.find_first(-i) == a_encoded.find_first(-i));
+            a.find<Less>(-(start_value + i), 0, a.size(), &state1);
+            a_encoded.find<Less>(-(start_value + i), 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
         }
     }
 
     t1 = high_resolution_clock::now();
     for (size_t j = 0; j < n_runs; ++j) {
         for (size_t i = 0; i < n_values; ++i) {
-            auto ndx = a.find_first(-i);
-            REALM_ASSERT(ndx != realm::not_found);
-            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+            a.find<Less>(-(start_value + i), 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
         }
     }
     t2 = high_resolution_clock::now();
 
-    std::cout << "   Negative values - Array::find(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+    std::cout << "   Negative values - Array::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
               << std::endl;
-    std::cout << "   Negative values - Array::find(): "
+    std::cout << "   Negative values - Array::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a_encoded.find<Less>(-(start_value + i), 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayEncode::find<Less>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<Less>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+}
+
+// Checks find<Greater>() agrees between a plain ArrayInteger and its encoded copy for |values| >= 2^32, and times both.
+NONCONCURRENT_TEST(Test_basic_find_GT_value_greater_32bit)
+// ONLY(Test_basic_find_GT_value_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // 2^32: smallest value that does not fit in 32 bits
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a_encoded(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g); // store the values in random order
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // stop before the maximum: nothing is greater than it
+            a.find<Greater>(start_value + i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find<Greater>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_encode(a_encoded); // 'a' is still read below as the reference for a_encoded
+    CHECK(a_encoded.is_encoded());
+    CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            const auto k = start_value + i;
+            a.find<Greater>(k, 0, a.size(), &state1);
+            a_encoded.find<Greater>(k, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // same range as the plain-array loop above
+            a_encoded.find<Greater>(start_value + i, 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayEncode::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayEncode::find<Greater>(): "
               << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
 
+    std::cout << std::endl;
+
+    a.destroy();
+    a_encoded.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-(start_value + i)); // negated in size_t (wraps); stored as int64_t -(2^32 + i)
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
     a.try_encode(a_encoded);
     CHECK(a_encoded.is_encoded());
     CHECK(a_encoded.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // skip i == 0: -start_value is the largest element
+            a.find<Greater>(-(start_value + i), 0, a.size(), &state1);
+            a_encoded.find<Greater>(-(start_value + i), 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
     t1 = high_resolution_clock::now();
     for (size_t j = 0; j < n_runs; ++j) {
-        for (size_t i = 0; i < n_values; ++i) {
-            auto ndx = a_encoded.find_first(-i);
-            REALM_ASSERT(ndx != realm::not_found);
-            REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx));
+        for (size_t i = 1; i < n_values; ++i) { // skip i == 0: nothing is greater than -start_value
+            a.find<Greater>(-(start_value + i), 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find<Greater>(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) {
+            a_encoded.find<Greater>(-(start_value + i), 0, a_encoded.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state));
         }
     }
     t2 = high_resolution_clock::now();
-    std::cout << "   Negative values - ArrayEncode::find_first(): " << duration_cast<nanoseconds>(t2 - t1).count()
-              << " ns" << std::endl;
-    std::cout << "   Negative values - ArrayEncode::find_first(): "
+    std::cout << "   Negative values - ArrayEncode::find<Greater>(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayEncode::find<Greater>(): "
               << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
 
     a.destroy();
     a_encoded.destroy();
 }
+
 #endif
 
+// The following test is compiled out because packed is always on
+#if 0
 TEST(Test_ArrayInt_no_encode)
 {
     ArrayInteger a(Allocator::get_default());
@@ -267,6 +1305,7 @@ TEST(Test_ArrayInt_no_encode)
     a.destroy();
     a1.destroy();
 }
+#endif
 
 TEST(Test_array_same_size_less_bits)
 {
@@ -289,6 +1328,7 @@ TEST(Test_array_same_size_less_bits)
     a1.destroy();
 }
 
+#if 0
 TEST(Test_ArrayInt_encode_decode_needed)
 {
     ArrayInteger a(Allocator::get_default());
@@ -338,6 +1378,7 @@ TEST(Test_ArrayInt_encode_decode_needed)
     a.destroy();
     a1.destroy();
 }
+#endif
 
 TEST(Test_ArrayInt_negative_nums)
 {