diff --git a/src/realm/array_direct.hpp b/src/realm/array_direct.hpp
index 9084f65e4ab..ab011903627 100644
--- a/src/realm/array_direct.hpp
+++ b/src/realm/array_direct.hpp
@@ -185,6 +185,43 @@ inline int64_t get_direct(const char* data, size_t width, size_t ndx) noexcept
     REALM_TEMPEX(return get_direct, width, (data, ndx));
 }
 
+// An iterator for getting a 64 bit word from any bit address: 'bit_offset' bits past the 64-bit word 'data'.
+class unaligned_word_iter {
+public:
+    unaligned_word_iter(const uint64_t* data, size_t bit_offset)
+        : m_word_ptr(data + (bit_offset >> 6))
+        , m_in_word_offset(bit_offset & 0x3F)
+    {
+    }
+    // 'num_bits' number of bits, counted from the current position, which must be read. Does not move the iterator.
+    // WARNING returned word may be garbage above the first 'num_bits' bits.
+    uint64_t get(unsigned num_bits) const
+    {
+        auto first_word = m_word_ptr[0];
+        uint64_t result = first_word >> m_in_word_offset;
+        // note: above shifts in zeroes
+        if (m_in_word_offset + num_bits <= 64)
+            return result;
+        // if we're here, in_word_offset > 0 (given num_bits <= 64); the next word is only read when it is needed
+        auto first_word_size = 64 - m_in_word_offset;
+        auto second_word = m_word_ptr[1];
+        result |= second_word << first_word_size;
+        // note: above shifts in zeroes below the bits we want
+        return result;
+    }
+    // bump the iterator the specified number of bits
+    void bump(unsigned num_bits)
+    {
+        auto total_offset = m_in_word_offset + num_bits;
+        m_word_ptr += total_offset >> 6;
+        m_in_word_offset = total_offset & 0x3F;
+    }
+
+private:
+    const uint64_t* m_word_ptr;
+    unsigned m_in_word_offset;
+};
+
 // Read a bit field of up to 64 bits.
 // - Any alignment and size is supported
 // - The start of the 'data' area must be 64 bit aligned in all cases.
@@ -210,7 +247,7 @@ class bf_iterator { first_word_ptr = data_area + (field_position >> 6); } - uint64_t get_value() const + inline uint64_t get_full_word_with_value() const { auto in_word_position = field_position & 0x3F; auto first_word = first_word_ptr[0]; @@ -223,11 +260,43 @@ class bf_iterator { result |= second_word << first_word_size; // note: above shifts in zeroes below the bits we want } + return result; + } + + inline uint64_t get_value() const + { + auto result = get_full_word_with_value(); // discard any bits above the field we want if (field_size < 64) result &= (1ULL << field_size) - 1; return result; } + + // get unaligned word - this should not be called if the next word extends beyond + // end of array. For that particular case, you must use get_last_unaligned_word instead. + inline uint64_t get_unaligned_word() const + { + auto in_word_position = field_position & 0x3F; + auto first_word = first_word_ptr[0]; + if (in_word_position == 0) + return first_word; + uint64_t result = first_word >> in_word_position; + // note: above shifts in zeroes above the bitfield + auto first_word_size = 64 - in_word_position; + auto second_word = first_word_ptr[1]; + result |= second_word << first_word_size; + // note: above shifts in zeroes below the bits we want + return result; + } + + inline uint64_t get_last_unaligned_word() const + { + auto in_word_position = field_position & 0x3F; + auto first_word = first_word_ptr[0]; + uint64_t result = first_word >> in_word_position; + // note: above shifts in zeroes above the bitfield + return result; + } void set_value(uint64_t value) const { auto in_word_position = field_position & 0x3F; @@ -257,7 +326,7 @@ class bf_iterator { first_word_ptr[1] = second_word; } } - void operator++() + inline void operator++() { auto next_field_position = field_position + step_size; if ((next_field_position >> 6) > (field_position >> 6)) { @@ -287,11 +356,11 @@ class bf_ref { bf_iterator it; public: - bf_ref(bf_iterator& it) + inline 
bf_ref(bf_iterator& it) : it(it) { } - operator uint64_t() const + inline operator uint64_t() const { return it.get_value(); } @@ -344,6 +413,455 @@ inline std::pair get_two(const char* data, size_t width, size_ REALM_TEMPEX(return get_two, width, (data, ndx)); } +/* Subword parallel search + + The following provides facilities for subword parallel search for bitfields of any size. + To simplify, the first bitfield must be aligned within the word: it must occupy the lowest + bits of the word. + + In general the methods here return a vector with the most significant bit in each field + marking that a condition was met when comparing the corresponding pair of fields in two + vectors. Checking if any field meets a condition is as simple as comparing the return + vector against 0. Finding the first to meet a condition is also supported. + + Vectors are "split" into fields according to an MSB vector, which indicates the most + significant bit of each field. The MSB must be passed in as an argument to most + bit field comparison functions. It can be generated by the field_sign_bit function. + + The simplest condition to test is any_field_NE(A,B), where A and B are words. + This condition should be true if any bitfield in A is not equal to the corresponding + field in B. + + This is almost as simple as a direct word compare, but needs to take into account that + we may want to have part of the words undefined. 
+*/
+constexpr int num_fields_table[65] = {-1, 64, 32, 21, 16, 12, 10, 9, // 0-7
+                                      8,  7,  6,  5,  5,  4,  4,  4, // 8-15
+                                      4,  3,  3,  3,  3,  3,  2,  2, // 16-23
+                                      2,  2,  2,  2,  2,  2,  2,  2, // 24-31
+                                      2,  1,  1,  1,  1,  1,  1,  1, // 32-39
+                                      1,  1,  1,  1,  1,  1,  1,  1, // 40-47
+                                      1,  1,  1,  1,  1,  1,  1,  1, // 48-55
+                                      1,  1,  1,  1,  1,  1,  1,  1, // 56-63
+                                      1};
+// Both tables are indexed by field width w (entry 0 is an unused placeholder): num_fields_table[w] == 64 / w, num_bits_table[w] == (64 / w) * w.
+constexpr int num_bits_table[65] = {-1, 64, 64, 63, 64, 60, 60, 63, // 0-7
+                                    64, 63, 60, 55, 60, 52, 56, 60, // 8-15
+                                    64, 51, 54, 57, 60, 63, 44, 46, // 16-23
+                                    48, 50, 52, 54, 56, 58, 60, 62, // 24-31
+                                    64, 33, 34, 35, 36, 37, 38, 39, // 32-39
+                                    40, 41, 42, 43, 44, 45, 46, 47, // 40-47
+                                    48, 49, 50, 51, 52, 53, 54, 55, // 48-55
+                                    56, 57, 58, 59, 60, 61, 62, 63, // 56-63
+                                    64};
+// Number of whole fields of 'width' bits that fit in a 64-bit word. 'width' must be non-zero.
+inline int num_fields_for_width(int width)
+{
+    REALM_ASSERT(width);
+    return 64 / width;
+}
+// NOTE(review): despite its name this returns the number of FIELDS per word (num_fields_table), not a bit count - confirm intent.
+inline uint64_t num_bits(int width)
+{
+    return num_fields_table[width];
+}
+// Number of bits occupied by the whole fields of 'width' bits in a 64-bit word (num_bits_table lookup).
+inline int num_bits_for_width(int width)
+{
+    return num_bits_table[width];
+}
+// Mask with the low num_bits_table[width] bits set, i.e. the bits covered by whole fields. 'width' must be in 1..64.
+inline uint64_t cares_about(int width)
+{
+    return 0xFFFFFFFFFFFFFFFFULL >> (64 - num_bits_table[width]);
+}
+
+// true if any field in A differs from corresponding field in B. If you also want
+// to find which fields, use find_all_fields_NE instead.
+bool inline any_field_NE(int width, uint64_t A, uint64_t B)
+{
+    return (A ^ B) & cares_about(width);
+}
+
+// Populate all fields in a vector with a given value of a given width.
+// Bits outside of the given field are ignored.
+constexpr uint64_t populate(int width, uint64_t value) +{ + value &= 0xFFFFFFFFFFFFFFFFULL >> (64 - width); + if (width < 8) { + value |= value << width; + width <<= 1; + value |= value << width; + width <<= 1; + value |= value << width; + width <<= 1; + } + // width now in range 8..64 + if (width < 32) { + value |= value << width; + width <<= 1; + value |= value << width; + width <<= 1; + } + // width now in range 32..128 + if (width < 64) { + value |= value << width; + } + return value; +} + +// provides a set bit in pos 0 of each field, remaining bits zero +constexpr uint64_t field_bit0(int width) +{ + return populate(width, 1); +} + +// provides a set sign-bit in each field, remaining bits zero +constexpr uint64_t field_sign_bit(int width) +{ + return populate(width, 1ULL << (width - 1)); +} + +/* Unsigned LT. + + This can be determined by trial subtaction. However, some care must be exercised + since simply subtracting one vector from another will allow carries from one + bitfield to flow into the next one. To avoid this, we isolate bitfields by clamping + the MSBs to 1 in A and 0 in B before subtraction. After the subtraction the MSBs in + the result indicate borrows from the MSB. We then compute overflow (borrow OUT of MSB) + using boolean logic as described below. + + Unsigned LT is also used to find all zero fields or all non-zero fields, so it is + the backbone of all comparisons returning vectors. +*/ + +// compute the overflows in unsigned trial subtraction A-B. The overflows +// will be marked by 1 in the sign bit of each field in the result. Other +// bits in the result are zero. +// Overflow are detected for each field pair where A is less than B. +inline uint64_t unsigned_LT_vector(uint64_t MSBs, uint64_t A, uint64_t B) +{ + // 1. 
compute borrow from most significant bit + // Isolate bitfields inside A and B before subtraction (prevent carries from spilling over) + // do this by clamping most significant bit in A to 1, and msb in B to 0 + auto A_isolated = A | MSBs; // 1 op + auto B_isolated = B & ~MSBs; // 2 ops + auto borrows_into_sign_bit = ~(A_isolated - B_isolated); // 2 ops (total latency 4) + + // 2. determine what subtraction against most significant bit would give: + // A B borrow-in: (A-B-borrow-in) + // 0 0 0 (0-0-0) = 0 + // 0 0 1 (0-0-1) = 1 + borrow-out + // 0 1 0 (0-1-0) = 1 + borrow-out + // 0 1 1 (0-1-1) = 0 + borrow-out + // 1 0 0 (1-0-0) = 1 + // 1 0 1 (1-0-1) = 0 + // 1 1 0 (1-1-0) = 0 + // 1 1 1 (1-1-1) = 1 + borrow-out + // borrow-out = (~A & B) | (~A & borrow-in) | (A & B & borrow-in) + // The overflows are simply the borrow-out, now encoded into the sign bits of each field. + auto overflows = (~A & B) | (~A & borrows_into_sign_bit) | (A & B & borrows_into_sign_bit); + // ^ 6 ops, total latency 6 (4+2) + return overflows & MSBs; // 1 op, total latency 7 + // total of 12 ops and a latency of 7. On a beefy CPU 3-4 of those can run in parallel + // and still reach a combined latency of 10 or less. +} + +inline uint64_t find_all_fields_unsigned_LT(uint64_t MSBs, uint64_t A, uint64_t B) +{ + return unsigned_LT_vector(MSBs, A, B); +} + +inline uint64_t find_all_fields_NE(uint64_t MSBs, uint64_t A, uint64_t B) +{ + // 0 != A^B, same as asking 0 - (A^B) overflows. + return unsigned_LT_vector(MSBs, 0, A ^ B); +} + +inline uint64_t find_all_fields_EQ(uint64_t MSBs, uint64_t A, uint64_t B) +{ + // get the fields which are EQ and negate the result + auto all_fields_NE = find_all_fields_NE(MSBs, A, B); + auto all_fields_NE_negated = ~all_fields_NE; + // must filter the negated vector so only MSB are left. + return MSBs & all_fields_NE_negated; +} + +inline uint64_t find_all_fields_unsigned_LE(uint64_t MSBs, uint64_t A, uint64_t B) +{ + // Now A <= B is the same as !(A > B) so... 
+ // reverse A and B to turn (A>B) --> (B B is the same as B < A + return find_all_fields_signed_LT(MSBs, B, A); +} + +inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B) +{ + // A >= B is the same as B <= A + return find_all_fields_signed_LE(MSBs, B, A); +} + +// find the first field which have MSB set (marks overflow after trial subtraction, or other +// requested condition). +struct find_field_desc { + uint8_t levels; + uint64_t m1; + uint64_t m2; + uint64_t m4; + uint64_t m8; + uint64_t m16; + uint64_t m32; +}; + +constexpr struct find_field_desc find_field_table[65] = { + /* 0 */ {0, 0, 0, 0, 0, 0}, + /* 1 */ + {6, 0xAAAAAAAAAAAAAAAA, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, + 0xFFFFFFFF00000000}, + /* 2 */ + {5, 0xCCCCCCCCCCCCCCCC, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0}, + /* 3 */ + {5, 0b0000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000'1110'0011'1000, + 0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000, + 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000, + 0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000, + 0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0}, + /* 4 */ + {4, 0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0}, + /* 5 */ + {4, 0b0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000'1111'1000'0011'1110'0000, + 0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000, + 0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000, + 0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0}, + /* 6 */ + {4, 0b0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000'1111'1100'0000, + 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000, + 
0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000, + 0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0}, + /* 7 */ + {4, 0b1000'0000'1111'1110'0000'0011'1111'1000'0000'1111'1110'0000'0011'1111'1000'0000, + 0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000, + 0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000, + 0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0}, + /* 8 */ + {3, 0xFF00FF00FF00FF00, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0}, + /* 9 */ + {3, 0b1000'0000'0011'1111'1110'0000'0000'1111'1111'1000'0000'0011'1111'1110'0000'0000, + 0b0111'1111'1100'0000'0000'0000'0000'1111'1111'1111'1111'1100'0000'0000'0000'0000, + 0b1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 10 */ + {3, 0b0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000'1111'1111'1100'0000'0000, + 0b1111'0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000, + 0b1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 11 */ + {3, 0b1111'1111'1000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1000'0000'0000, + 0b0000'0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000, + 0b1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 12 */ + {3, 0b1111'0000'0000'0000'1111'1111'1111'0000'0000'0000'1111'1111'1111'0000'0000'0000, + 0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000, + 0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 13 */ + {3, 0b1110'0000'0000'0000'1111'1111'1110'0000'0000'0011'1111'1111'1110'0000'0000'0000, + 0b0000'0000'0000'0000'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000, + 
0b1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 14 */ + {3, 0b0000'0000'1111'1111'1111'1100'0000'0000'0000'1111'1111'1111'1100'0000'0000'0000, + 0b0000'0000'1111'1111'1111'1111'1111'1111'1111'0000'0000'0000'0000'0000'0000'0000, + 0b1111'1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 15 */ + {3, 0b0000'1111'1111'1111'1110'0000'0000'0000'0011'1111'1111'1111'1000'0000'0000'0000, + 0b0000'1111'1111'1111'1111'1111'1111'1111'1100'0000'0000'0000'0000'0000'0000'0000, + 0b1111'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000'0000, 0, 0, 0}, + /* 16 */ + {2, 0xFFFF0000FFFF0000, 0xFFFFFFFF00000000, 0, 0, 0, 0}, + /* 17 - as we're only interested in msb of each field we can simplify and use same pattern + for the next 4 entries */ + {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0}, + /* 18 */ + {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0}, + /* 19 */ + {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0}, + /* 20 */ + {2, 0xF00000FFFFF00000, 0xFFFFFF0000000000, 0, 0, 0, 0}, + /* 21 - and next 4 */ + {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0}, + /* 22 */ + {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0}, + /* 23 */ + {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0}, + /* 24 */ + {2, 0x0000FFFFFF000000, 0xFFFF000000000000, 0, 0, 0, 0}, + /* 25 - and 4 more */ + {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0}, + /* 26 */ + {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0}, + /* 27 */ + {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0}, + /* 28 */ + {2, 0x00FFFFFFF0000000, 0xFF00000000000000, 0, 0, 0, 0}, + /* 29 - last 4 where multiple fields exist */ + {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0}, + /* 30 */ + {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0}, + /* 31 */ + {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0}, + /* 32 */ + {1, 0xFFFFFFFF00000000, 0, 0, 0, 0, 0}, + /* 33 - from here to 64, there is only 1 possible 
result: 0 */ + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}}; + +#if 1 +constexpr uint32_t inverse_width[65] = { + 65536 * 64 / 1, // never used + 65536 * 64 / 1, 65536 * 64 / 2, 65536 * 64 / 3, 65536 * 64 / 4, 65536 * 64 / 5, 65536 * 64 / 6, + 65536 * 64 / 7, 65536 * 64 / 8, 65536 * 64 / 9, 65536 * 64 / 10, 65536 * 64 / 11, 65536 * 64 / 12, + 65536 * 64 / 13, 65536 * 64 / 14, 65536 * 64 / 15, 65536 * 64 / 16, 65536 * 64 / 17, 65536 * 64 / 18, + 65536 * 64 / 19, 65536 * 64 / 20, 65536 * 64 / 21, 65536 * 64 / 22, 65536 * 64 / 23, 65536 * 64 / 24, + 65536 * 64 / 25, 65536 * 64 / 26, 65536 * 64 / 27, 65536 * 64 / 28, 65536 * 64 / 29, 65536 * 64 / 30, + 65536 * 64 / 31, 65536 * 64 / 32, 65536 * 64 / 33, 65536 * 64 / 34, 65536 * 64 / 35, 65536 * 64 / 36, + 65536 * 64 / 37, 65536 * 64 / 38, 65536 * 64 / 39, 65536 * 64 / 40, 65536 * 64 / 41, 65536 * 64 / 42, + 65536 * 64 / 43, 65536 * 64 / 44, 65536 * 64 / 45, 65536 * 64 / 46, 65536 * 64 / 47, 65536 * 64 / 48, + 65536 * 64 / 49, 65536 * 64 / 50, 65536 * 64 / 51, 65536 * 64 / 52, 65536 * 64 / 53, 65536 * 64 / 54, + 65536 * 64 / 55, 65536 * 64 / 56, 65536 * 64 / 57, 65536 * 64 / 58, 65536 * 64 / 59, 65536 * 64 / 60, + 65536 * 64 / 61, 65536 * 64 / 62, 65536 * 64 / 63, 65536 * 64 / 64, 
+}; + +inline int first_field_marked(int width, uint64_t vector) +{ +#if REALM_WINDOWS + int lz = (int)_tzcnt_u64(vector); // TODO: not clear if this is ok on all platforms +#else + int lz = __builtin_ctzll(vector); +#endif + int field = (lz * inverse_width[width]) >> 22; + REALM_ASSERT_DEBUG(field == (lz / width)); + return field; +} +#endif +#if 0 +inline int first_field_marked(int width, uint64_t vector) +{ + // isolate least significant bit + vector = vector & (~vector + 1); + const struct find_field_desc& desc = find_field_table[width]; + int result = 0; + switch (desc.levels) { + // the following case entries are intended to fall through + // (this is a variant of Duff's Device) + // TODO: disable compiler warnings for it + case 6: + result |= (vector & desc.m32) ? 32 : 0; + case 5: + result |= (vector & desc.m16) ? 16 : 0; + case 4: + result |= (vector & desc.m8) ? 8 : 0; + case 3: + result |= (vector & desc.m4) ? 4 : 0; + case 2: + result |= (vector & desc.m2) ? 2 : 0; + case 1: + result |= (vector & desc.m1) ? 1 : 0; + default: + break; + } + return result; +} +#endif +#if 0 +inline int first_field_marked(int width, uint64_t vector) +{ + // isolate least significant bit + vector = vector & (~vector + 1); + // directly compute position of set bit using table + const struct find_field_desc& desc = find_field_table[width]; + return ((vector & desc.m1) ? 1 : 0) | ((vector & desc.m2) ? 2 : 0) | ((vector & desc.m4) ? 4 : 0) | + ((vector & desc.m8) ? 8 : 0) | ((vector & desc.m16) ? 16 : 0) | ((vector & desc.m32) ? 
32 : 0); +} +#endif +#if 0 +inline int first_field_marked(int width, uint64_t vector) +{ + int result = 0; + auto msb = 1ULL << (width - 1); + while (msb) { + if (vector & msb) + return result; + msb <<= width; + result++; + } + return -1; +} +#endif + namespace impl { // Lower and Upper bound are mainly used in the B+tree implementation, diff --git a/src/realm/array_encode.cpp b/src/realm/array_encode.cpp index d751f6b5b75..e6e98e595e7 100644 --- a/src/realm/array_encode.cpp +++ b/src/realm/array_encode.cpp @@ -187,6 +187,7 @@ void ArrayEncode::init(const char* h) m_ndx_width = NodeHeader::get_elementB_size(h); m_ndx_size = NodeHeader::get_arrayB_num_elements(h); m_v_mask = 1ULL << (m_v_width - 1); + m_ndx_mask = 1ULL << (m_ndx_width - 1); } } diff --git a/src/realm/array_encode.hpp b/src/realm/array_encode.hpp index d161b91c88b..b8414299d0f 100644 --- a/src/realm/array_encode.hpp +++ b/src/realm/array_encode.hpp @@ -72,6 +72,7 @@ class ArrayEncode { Encoding m_encoding{NodeHeader::Encoding::WTypBits}; // this is not ok .... 
probably size_t m_v_width = 0, m_v_size = 0, m_ndx_width = 0, m_ndx_size = 0; size_t m_v_mask = 0; + size_t m_ndx_mask = 0; friend class ArrayPacked; friend class ArrayFlex; diff --git a/src/realm/array_flex.cpp b/src/realm/array_flex.cpp index 51e1e1d873c..f911f94fe37 100644 --- a/src/realm/array_flex.cpp +++ b/src/realm/array_flex.cpp @@ -159,31 +159,169 @@ bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, size_t e REALM_ASSERT_3(arr.m_width, !=, 0); + if constexpr (std::is_same_v) { + find_eq(arr, value, start, end, baseindex, state); + } + else if constexpr (std::is_same_v) { + find_neq(arr, value, start, end, baseindex, state); + } + else if constexpr (std::is_same_v) { + find_lt(arr, value, start, end, baseindex, state); + } + else if constexpr (std::is_same_v) { + find_gt(arr, value, start, end, baseindex, state); + } + + return true; +} + +template +inline size_t ArrayFlex::parallel_subword_find(const Array& arr, uint64_t value, size_t width_mask, size_t offset, + uint_least8_t width, size_t start, size_t end) const +{ + const auto MSBs = populate(width, width_mask); + const auto search_vector = populate(width, value); + const auto field_count = num_fields_for_width(width); + const auto bit_count_pr_iteration = num_bits_for_width(width); + auto total_bit_count_left = static_cast(end - start) * width; + REALM_ASSERT(total_bit_count_left >= 0); + auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) { + if constexpr (std::is_same_v) + return find_all_fields_EQ(MSBs, a, b); + else if constexpr (std::is_same_v) + return find_all_fields_NE(MSBs, a, b); + else if constexpr (std::is_same_v) { + if constexpr (v == true) + return find_all_fields_signed_GE(MSBs, a, b); + if constexpr (v == false) + return find_all_fields_unsigned_GE(MSBs, a, b); + REALM_UNREACHABLE(); + } + + else if constexpr (std::is_same_v) + return find_all_fields_signed_GT(MSBs, a, b); + else if constexpr (std::is_same_v) + return find_all_fields_unsigned_LT(MSBs, a, b); + }; 
+ + unaligned_word_iter it((uint64_t*)(arr.m_data), offset + start * width); + uint64_t vector = 0; + while (total_bit_count_left >= bit_count_pr_iteration) { + const auto word = it.get(bit_count_pr_iteration); + vector = bitwidth_cmp(word, search_vector); + if (vector) { + int sub_word_index = first_field_marked((int)width, vector); + return start + sub_word_index; + } + total_bit_count_left -= bit_count_pr_iteration; + start += field_count; + it.bump(bit_count_pr_iteration); + } + if (total_bit_count_left) { // final subword, may be partial + const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array + vector = bitwidth_cmp(word, search_vector); + auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left); + vector &= last_word_mask; + if (vector) { + int sub_word_index = first_field_marked(width, vector); + return start + sub_word_index; + } + } + return end; +} + +bool ArrayFlex::find_eq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ const auto& encoder = arr.m_encoder; - const auto data = (uint64_t*)arr.m_data; const auto v_width = encoder.m_v_width; const auto v_size = encoder.m_v_size; const auto ndx_width = encoder.m_ndx_width; - const auto mask = encoder.width_mask(); + const auto offset = v_size * v_width; - auto cmp = [](int64_t v, int64_t value) { - if constexpr (std::is_same_v) - return v == value; - if constexpr (std::is_same_v) - return v != value; - if constexpr (std::is_same_v) - return v > value; - if constexpr (std::is_same_v) - return v < value; - }; + auto v_start = parallel_subword_find(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size); + if (v_start == v_size) + return true; + + while (start < end) { + start = parallel_subword_find(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end); + if (start < end) + if (!state->match(start + baseindex)) + return false; + + ++start; + } + return true; +} + +bool 
ArrayFlex::find_neq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto& encoder = arr.m_encoder; + const auto v_width = encoder.m_v_width; + const auto v_size = encoder.m_v_size; + const auto ndx_width = encoder.m_ndx_width; + const auto offset = v_size * v_width; + + auto v_start = parallel_subword_find(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size); + if (v_start == v_size) + return true; + + while (start < end) { + start = parallel_subword_find(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end); + if (start < end) + if (!state->match(start + baseindex)) + return false; + ++start; + } + return true; +} +bool ArrayFlex::find_lt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto& encoder = arr.m_encoder; + const auto v_width = encoder.m_v_width; + const auto v_size = encoder.m_v_size; + const auto ndx_width = encoder.m_ndx_width; const auto offset = v_size * v_width; - bf_iterator it_index{data, static_cast(offset), ndx_width, ndx_width, start}; - for (; start < end; ++start, ++it_index) { - const auto v = sign_extend_field_by_mask(mask, read_bitfield(data, it_index.get_value() * v_width, v_width)); - if (cmp(v, value)) + + auto v_start = parallel_subword_find(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size); + if (v_start == v_size) + return true; + + while (start < end) { + start = parallel_subword_find(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, end); + if (start < end) + if (!state->match(start + baseindex)) + return false; + + ++start; + } + return true; +} + +bool ArrayFlex::find_gt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto& encoder = arr.m_encoder; + const auto v_width = encoder.m_v_width; + const auto v_size = encoder.m_v_size; + const auto ndx_width = encoder.m_ndx_width; + const auto offset = 
v_size * v_width; + + auto v_start = parallel_subword_find(arr, value, encoder.m_v_mask, 0, v_width, 0, v_size); + if (v_start == v_size) + return true; + + while (start < end) { + start = parallel_subword_find(arr, v_start, encoder.m_ndx_mask, offset, ndx_width, start, + end); + if (start < end) if (!state->match(start + baseindex)) return false; + + ++start; } return true; } diff --git a/src/realm/array_flex.hpp b/src/realm/array_flex.hpp index f91f7213e12..2079db0a825 100644 --- a/src/realm/array_flex.hpp +++ b/src/realm/array_flex.hpp @@ -39,13 +39,23 @@ class ArrayFlex { int64_t get(const char*, size_t, size_t, size_t, size_t, size_t, size_t) const; void get_chunk(const Array& h, size_t ndx, int64_t res[8]) const; void set_direct(const Array&, size_t, int64_t) const; + template bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + int64_t sum(const Array&, size_t, size_t) const; private: int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t, size_t, size_t) const; bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const; + + template // true int64_t other uint64_t + inline size_t parallel_subword_find(const Array&, uint64_t, size_t, size_t, uint_least8_t, size_t, size_t) const; + + bool find_eq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + bool find_neq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + bool find_lt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + bool find_gt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; }; } // namespace realm #endif // REALM_ARRAY_COMPRESS_HPP diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp index 3248fcbad46..93f5e76ded5 100644 --- a/src/realm/array_integer.hpp +++ b/src/realm/array_integer.hpp @@ -158,6 +158,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {} inline size_t ArrayIntNull::size() const noexcept { + // this cannot be 
right, what if size is 0 return Array::size() - 1; } diff --git a/src/realm/array_integer_tpl.hpp b/src/realm/array_integer_tpl.hpp index b145ffe67cd..cc021df8bb6 100644 --- a/src/realm/array_integer_tpl.hpp +++ b/src/realm/array_integer_tpl.hpp @@ -79,7 +79,7 @@ bool ArrayIntNull::find_impl(value_type opt_value, size_t start, size_t end, Que } // if encoded use specialised find if (is_encoded()) - return find_encoded(value, start2, end, baseindex2, state); + return find_encoded(value, start2, end2, baseindex2, state); // Fall back to plain Array find. return ArrayWithFind(*this).find(value, start2, end2, baseindex2, state); } diff --git a/src/realm/array_packed.cpp b/src/realm/array_packed.cpp index db5fde8b5b4..86693042948 100644 --- a/src/realm/array_packed.cpp +++ b/src/realm/array_packed.cpp @@ -112,7 +112,6 @@ void ArrayPacked::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) const res[index++] = get(arr, i++); } } - template bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const @@ -138,28 +137,79 @@ bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t REALM_ASSERT_3(arr.m_width, !=, 0); - auto cmp = [](int64_t v, int64_t value) { + + // NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the + // queries run. + // + // Main idea around find. + // Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can + // contain in parallel. Once we have found the starting point, keep matching values as much as we can between + // start and end. + // + // EG: we store the value 6, with width 4bits (0110), 6 is 4 bits because, 110 (6) + sign bit 0. + // Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to + // apply a mask and a shift bits every time, then compare the values. 
+ // This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and + // see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is + // the width of each single value within a 64 bit word and N is the total number of values stored in the array. + + // in packed format a parallel subword find pays off also for width >= 32 + while (start < end) { + start = parallel_subword_find(arr, value, start, end); + if (start < end) + if (!state->match(start + baseindex)) + return false; + + ++start; + } + return true; +} + +template +size_t ArrayPacked::parallel_subword_find(const Array& arr, int64_t value, size_t start, size_t end) const +{ + const auto width = arr.m_width; + const auto MSBs = populate(width, arr.get_encoder().width_mask()); + const auto search_vector = populate(width, value); + const auto field_count = num_fields_for_width(width); + const auto bit_count_pr_iteration = num_bits_for_width(width); + auto total_bit_count_left = static_cast(end - start) * width; + REALM_ASSERT(total_bit_count_left >= 0); + auto bitwidth_cmp = [&MSBs](uint64_t a, uint64_t b) { if constexpr (std::is_same_v) - return v == value; + return find_all_fields_EQ(MSBs, a, b); if constexpr (std::is_same_v) - return v != value; + return find_all_fields_NE(MSBs, a, b); if constexpr (std::is_same_v) - return v > value; + return find_all_fields_signed_GT(MSBs, a, b); if constexpr (std::is_same_v) - return v < value; + return find_all_fields_signed_LT(MSBs, a, b); }; - //~6/7x slower, we need to do a bitscan before to start this loop when values are less than 32 and 64 bits - bf_iterator it((uint64_t*)arr.m_data, 0, arr.m_width, arr.m_width, start); - const auto mask = arr.get_encoder().width_mask(); - for (; start < end; ++start, ++it) { - const auto v = sign_extend_field_by_mask(mask, it.get_value()); - if (cmp(v, value)) { - if (!state->match(start + baseindex)) - return false; + 
unaligned_word_iter it((uint64_t*)arr.m_data, start * arr.m_width); + uint64_t vector = 0; + while (total_bit_count_left >= bit_count_pr_iteration) { + const auto word = it.get(bit_count_pr_iteration); + vector = bitwidth_cmp(word, search_vector); + if (vector) { + int sub_word_index = first_field_marked(width, vector); + return start + sub_word_index; } + total_bit_count_left -= bit_count_pr_iteration; + start += field_count; + it.bump(bit_count_pr_iteration); } - return true; + if (total_bit_count_left) { // final subword, may be partial + const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array + vector = bitwidth_cmp(word, search_vector); + auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left); + vector &= last_word_mask; + if (vector) { + int sub_word_index = first_field_marked(width, vector); + return start + sub_word_index; + } + } + return end; } bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const diff --git a/src/realm/array_packed.hpp b/src/realm/array_packed.hpp index 790d04c80bf..985a9ea3f73 100644 --- a/src/realm/array_packed.hpp +++ b/src/realm/array_packed.hpp @@ -48,6 +48,9 @@ class ArrayPacked { private: int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t) const; bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const; + + template + size_t parallel_subword_find(const Array&, int64_t, size_t, size_t) const; }; } // namespace realm diff --git a/src/realm/node_header.hpp b/src/realm/node_header.hpp index ed83db3ad39..e91a7b25def 100644 --- a/src/realm/node_header.hpp +++ b/src/realm/node_header.hpp @@ -897,6 +897,7 @@ size_t inline NodeHeader::get_byte_size_from_header(const char* header) noexcept get_elementB_size(h)); default: REALM_ASSERT_RELEASE(false && "unknown encoding"); + return 0; // kill a warning } } diff --git a/test/test_array_integer.cpp b/test/test_array_integer.cpp 
index 35959140a82..f877c23ad84 100644 --- a/test/test_array_integer.cpp +++ b/test/test_array_integer.cpp @@ -34,12 +34,14 @@ using namespace realm::test_util; // #define ARRAY_PERFORMANCE_TESTING #if !defined(REALM_DEBUG) && defined(ARRAY_PERFORMANCE_TESTING) -TEST(perf_array_encode_get_vs_array_get) +NONCONCURRENT_TEST(perf_array_encode_get_vs_array_get_less_32bit) +// ONLY(perf_array_encode_get_vs_array_get_less_32bit) { using namespace std; using namespace std::chrono; size_t n_values = 1000; size_t n_runs = 100; + std::cout << " < 32 bit values " << std::endl; std::cout << " N values = " << n_values << std::endl; std::cout << " N runs = " << n_runs << std::endl; @@ -129,12 +131,775 @@ TEST(perf_array_encode_get_vs_array_get) a_encoded.destroy(); } -TEST(Test_basic_find) + +NONCONCURRENT_TEST(Test_basic_find_EQ_less_32bit) +// ONLY(Test_basic_find_EQ_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a.get(ndx) == input_array[ndx]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << 
(double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // std::cout << "Array: " << std::endl; + // for(size_t i=0; i(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto v = a.find_first(input_array[i]); + auto v1 = a_encoded.find_first(input_array[i]); + REALM_ASSERT(v == v1); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a.get(ndx) == input_array[ndx]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a_encoded.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx)); + } + } + t2 = 
high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_NEQ_value_less_32bit) +// ONLY(Test_basic_find_NEQ_value_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1; + QueryStateFindFirst state2; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(i, 0, a.size(), &state1); + 
a_encoded.find(i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a_encoded.find(i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // NEQ for signed integers is not working. TODO: investigate this. 
+ // verify that both find the same thing + + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + a_encoded.find(-i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a_encoded.find(-i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_LT_value_less_32bit) +// ONLY(Test_basic_find_LT_value_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger 
a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1{}; + QueryStateFindFirst state2{}; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // there is nothing less than 0 + a.find(i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" + << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // std::cout << "Array: " << std::endl; + // for(size_t i=0; i(i, 0, a.size(), &state1); + a_encoded.find(i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // there is nothing less than 0 + a_encoded.find(i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + 
input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + a_encoded.find(-i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing less than the biggest negative number + a.find(-i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" + << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing less than the biggest negative number + a_encoded.find(-i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + 
+NONCONCURRENT_TEST(Test_basic_find_GT_value_less_32bit) +// ONLY(Test_basic_find_GT_value_less_32bit) +{ + // GT subword parallel search is not working... TODO : investigate + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1; + QueryStateFindFirst state2; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing greatest than the last number + a.find(i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // std::cout << "Array: " << std::endl; + // for(size_t i=0; i(i, 0, a.size(), &state1); + a_encoded.find(i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing bigger than the last val + a_encoded.find(i, 0, 
a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + a_encoded.find(-i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // nothing bigger than 0 + a.find(-i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // nothing bigger than 0 + a_encoded.find(-i, 0, a_encoded.size(), 
&state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + +NONCONCURRENT_TEST(perf_array_encode_get_vs_array_get_greater_32bit) +// ONLY(perf_array_encode_get_vs_array_get_greater_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t start_value = 0x0000000100000000; // 32 bit val + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " >= 32 bit values " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(start_value + i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) + REALM_ASSERT(a.get(i) == input_array[i]); + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Positive values - Array::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + t1 = high_resolution_clock::now(); + + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < 
n_values; ++i) { + REALM_ASSERT(a_encoded.get(i) == a.get(i)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Positive values - ArrayEncode::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) + REALM_ASSERT(a.get(i) == input_array[i]); + } + t2 = high_resolution_clock::now(); + + std::cout << std::endl; + + std::cout << " Negative values - Array::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Negative values - Array::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + REALM_ASSERT(a_encoded.get(i) == a.get(i)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Negative values - ArrayEncode::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_EQ_greater_32bit) +// ONLY(Test_basic_find_EQ_greater_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t start_value = 0x000001000000000; // 32 bit val 
+ size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth >= 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(start_value + i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a.find_first(start_value + i); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a.get(ndx) == input_array[ndx]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + REALM_ASSERT(a.find_first(start_value + i) == a_encoded.find_first(start_value + i)); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a_encoded.find_first(start_value + i); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " + << (double)duration_cast(t2 
- t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-(start_value + i)); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + const auto k = -(start_value + i); + const auto v1 = a.find_first(k); + const auto v2 = a_encoded.find_first(k); + REALM_ASSERT(v1 == v2); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a.find_first(-(start_value + i)); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a.get(ndx) == input_array[ndx]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a_encoded.find_first(-(start_value + i)); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + 
+NONCONCURRENT_TEST(Test_basic_find_NEQ_value_greater_32bit) { using namespace std; using namespace std::chrono; + size_t start_value = 0x0000000100000000; // 32 bit val size_t n_values = 1000; size_t n_runs = 100; + std::cout << " Value with bitwidth >= 32 " << std::endl; std::cout << " N values = " << n_values << std::endl; std::cout << " N runs = " << n_runs << std::endl; @@ -144,26 +909,28 @@ TEST(Test_basic_find) a.create(); for (size_t i = 0; i < n_values; i++) - input_array.push_back(i); + input_array.push_back(start_value + i); std::random_device rd; std::mt19937 g(rd()); std::shuffle(input_array.begin(), input_array.end(), g); for (const auto& v : input_array) a.add(v); + QueryStateFindFirst state1; + QueryStateFindFirst state2; auto t1 = high_resolution_clock::now(); for (size_t j = 0; j < n_runs; ++j) { for (size_t i = 0; i < n_values; ++i) { - auto ndx = a.find_first(i); - REALM_ASSERT(ndx != realm::not_found); - REALM_ASSERT(a.get(ndx) == input_array[ndx]); + a.find(start_value + i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); } } auto t2 = high_resolution_clock::now(); - std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" - << std::endl; - std::cout << " Positive values - Array::find(): " + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; a.try_encode(a_encoded); @@ -173,22 +940,154 @@ TEST(Test_basic_find) // verify that both find the same thing for (size_t j = 0; j < n_runs; ++j) { for (size_t i = 0; i < n_values; ++i) { - REALM_ASSERT(a.find_first(i) == a_encoded.find_first(i)); + a.find(start_value + i, 0, a.size(), &state1); + a_encoded.find(start_value + i, 0, a_encoded.size(), &state2); + 
REALM_ASSERT(state1.m_state == state2.m_state); } } t1 = high_resolution_clock::now(); for (size_t j = 0; j < n_runs; ++j) { for (size_t i = 0; i < n_values; ++i) { - auto ndx = a_encoded.find_first(i); - REALM_ASSERT(ndx != realm::not_found); - REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx)); + a_encoded.find(start_value + i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-(start_value + i)); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-(start_value + i), 0, a.size(), &state1); + a_encoded.find(-(start_value + i), 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-(start_value + i), 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << 
std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a_encoded.find(-(start_value + i), 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_LT_value_greater_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t start_value = 0x0000000100000000; // 32 bit val + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth >= 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(start_value + i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1; + QueryStateFindFirst state2; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { + a.find(start_value + i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = 
high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" + << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(start_value + i, 0, a.size(), &state1); + a_encoded.find(start_value + i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { + a_encoded.find(start_value + i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); } } t2 = high_resolution_clock::now(); - std::cout << " Positive values - ArrayEncode::find_first(): " << duration_cast(t2 - t1).count() + std::cout << " Positive values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() << " ms" << std::endl; - std::cout << " Positive values - ArrayEncode::find_first(): " + std::cout << " Positive values - ArrayEncode::find(): " << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; std::cout << std::endl; @@ -198,57 +1097,196 @@ TEST(Test_basic_find) a.create(); input_array.clear(); for (size_t i = 0; i < n_values; i++) - input_array.push_back(-i); + input_array.push_back(-(start_value + i)); std::random_device rd1; std::mt19937 g1(rd1()); std::shuffle(input_array.begin(), input_array.end(), g1); for (const auto& v : input_array) a.add(v); + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + // verify that both find the same thing 
for (size_t j = 0; j < n_runs; ++j) { for (size_t i = 0; i < n_values; ++i) { - REALM_ASSERT(a.find_first(-i) == a_encoded.find_first(-i)); + a.find(-(start_value + i), 0, a.size(), &state1); + a_encoded.find(-(start_value + i), 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); } } t1 = high_resolution_clock::now(); for (size_t j = 0; j < n_runs; ++j) { for (size_t i = 0; i < n_values; ++i) { - auto ndx = a.find_first(-i); - REALM_ASSERT(ndx != realm::not_found); - REALM_ASSERT(a.get(ndx) == input_array[ndx]); + a.find(-(start_value + i), 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); } } t2 = high_resolution_clock::now(); - std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() << " ns" + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" << std::endl; - std::cout << " Negative values - Array::find(): " + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a_encoded.find(-(start_value + i), 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + a_encoded.destroy(); +} + +// NONCONCURRENT_TEST(Test_basic_find_GT_value_greater_32bit) +NONCONCURRENT_TEST(Test_basic_find_GT_value_greater_32bit) +// 
ONLY(Test_basic_find_GT_value_greater_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t start_value = 0x0000100000000; // 32 bit val + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth >= 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger a_encoded(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(start_value + i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1; + QueryStateFindFirst state2; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { + a.find(start_value + i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_encode(a_encoded); + CHECK(a_encoded.is_encoded()); + CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + const auto k = start_value + i; + a.find(k, 0, a.size(), &state1); + a_encoded.find(k, 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { + 
a_encoded.find(start_value + i, 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayEncode::find(): " << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + std::cout << std::endl; + + a.destroy(); + a_encoded.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-(start_value + i)); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + a.try_encode(a_encoded); CHECK(a_encoded.is_encoded()); CHECK(a_encoded.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { + a.find(-(start_value + i), 0, a.size(), &state1); + a_encoded.find(-(start_value + i), 0, a_encoded.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + t1 = high_resolution_clock::now(); for (size_t j = 0; j < n_runs; ++j) { for (size_t i = 0; i < n_values; ++i) { - auto ndx = a_encoded.find_first(-i); - REALM_ASSERT(ndx != realm::not_found); - REALM_ASSERT(a_encoded.get(ndx) == a.get(ndx)); + a.find(-(start_value + i), 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = 
high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { + a_encoded.find(-(start_value + i), 0, a_encoded.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(a_encoded.get(state2.m_state) == a.get(state2.m_state)); } } t2 = high_resolution_clock::now(); - std::cout << " Negative values - ArrayEncode::find_first(): " << duration_cast(t2 - t1).count() - << " ns" << std::endl; - std::cout << " Negative values - ArrayEncode::find_first(): " + std::cout << " Negative values - ArrayEncode::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayEncode::find(): " << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; a.destroy(); a_encoded.destroy(); } + #endif +// packed is always on +#if 0 TEST(Test_ArrayInt_no_encode) { ArrayInteger a(Allocator::get_default()); @@ -267,6 +1305,7 @@ TEST(Test_ArrayInt_no_encode) a.destroy(); a1.destroy(); } +#endif TEST(Test_array_same_size_less_bits) { @@ -289,6 +1328,7 @@ TEST(Test_array_same_size_less_bits) a1.destroy(); } +#if 0 TEST(Test_ArrayInt_encode_decode_needed) { ArrayInteger a(Allocator::get_default()); @@ -338,6 +1378,7 @@ TEST(Test_ArrayInt_encode_decode_needed) a.destroy(); a1.destroy(); } +#endif TEST(Test_ArrayInt_negative_nums) {