From 3f36c18f2cd5df9f29de0685fcf62250732ea008 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 13:30:04 +0100 Subject: [PATCH 01/15] test perf --- benchmark.cpp | 4 + libalgebra.h | 244 +++++++++++++++++--------------------------------- 2 files changed, 87 insertions(+), 161 deletions(-) diff --git a/benchmark.cpp b/benchmark.cpp index f140a2e..df9f7b7 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -7,6 +7,10 @@ #include void generate_random_data(uint64_t* data, uint32_t range, uint32_t n) { + // Clear data + uint32_t n_bitmaps = ceil(n / 64.0); + memset(data, 0, sizeof(uint64_t)*n_bitmaps); + // PRNG std::uniform_int_distribution distr(0, range-1); // right inclusive std::random_device rd; // obtain a random number from hardware diff --git a/libalgebra.h b/libalgebra.h index 2a92c7a..3ee0e12 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -181,9 +181,9 @@ void STORM_aligned_free(void* memblock) { // disable noise #ifdef __GNUC__ -#define WARN_UNUSED __attribute__((warn_unused_result)) +#define STORM_WARN_UNUSED __attribute__((warn_unused_result)) #else -#define WARN_UNUSED +#define STORM_WARN_UNUSED #endif /*------ SIMD definitions --------*/ @@ -197,11 +197,15 @@ void STORM_aligned_free(void* memblock) { ****************************/ #ifndef __has_builtin - #define __has_builtin(x) 0 + #define STORM_HAS_BUILTIN(x) 0 +#else + #define STORM_HAS_BUILTIN(x) __has_builtin(x) #endif #ifndef __has_attribute - #define __has_attribute(x) 0 + #define STORM_HAS_ATTRIBUTE(x) 0 +#else + #define STORM_HAS_ATTRIBUTE(x) __has_attribute(x) #endif #ifdef __GNUC__ @@ -229,12 +233,12 @@ void STORM_aligned_free(void* memblock) { (defined(__cplusplus) || \ defined(_MSC_VER) || \ (GNUC_PREREQ(4, 2) || \ - __has_builtin(__sync_val_compare_and_swap))) + STORM_HAS_BUILTIN(__sync_val_compare_and_swap))) #define HAVE_CPUID #endif #if GNUC_PREREQ(4, 2) || \ - __has_builtin(__builtin_popcount) + STORM_HAS_BUILTIN(__builtin_popcount) #define HAVE_BUILTIN_POPCOUNT #endif @@ -275,7 +279,7 @@ void STORM_aligned_free(void* memblock) { #if defined(HAVE_CPUID) && \ CLANG_PREREQ(3, 8) && \ - __has_attribute(target) && \ + STORM_HAS_ATTRIBUTE(target) && \ (!defined(_MSC_VER) || defined(__AVX2__)) && \ (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) #define HAVE_SSE42 @@ -283,6 +287,13 @@ void STORM_aligned_free(void* memblock) { #define HAVE_AVX512 #endif +// +#if !defined(_MSC_VER) + #define STORM_TARGET(x) __attribute__ ((target (x))) +#else + #define STORM_TARGET(x) 0 +#endif + #ifdef __cplusplus extern "C" { #endif @@ -349,7 +360,7 @@ void STORM_run_cpuid(int eax, int ecx, int* abcd) { defined(HAVE_AVX512) static -int get_xcr0() { +int STORM_get_xcr0() { int xcr0; #if defined(_MSC_VER) @@ -364,7 +375,7 @@ int get_xcr0() { #endif static -int get_cpuid() { +int STORM_get_cpuid() { int flags = 0; int abcd[4]; @@ -394,7 +405,7 @@ int get_cpuid() { int ymm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM; int zmm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM | STORM_XSTATE_ZMM; - int xcr0 = get_xcr0(); + int xcr0 = STORM_get_xcr0(); if ((xcr0 & ymm_mask) == ymm_mask) { STORM_run_cpuid(7, 0, abcd); @@ -476,7 +487,6 @@ uint64_t STORM_POPCOUNT(uint64_t x) { * use pure integer algorithm */ #else -// TODO: FIXME STORM_FORCE_INLINE uint64_t STORM_POPCOUNT(uint64_t x) { return STORM_popcount64(x); @@ -779,18 +789,14 @@ const uint8_t STORM_popcnt_lookup8bit[256] = { } #endif -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") STORM_FORCE_INLINE uint64_t STORM_POPCOUNT_SSE(const __m128i n) { return(STORM_POPCOUNT(_mm_cvtsi128_si64(n)) + STORM_POPCOUNT(_mm_cvtsi128_si64(_mm_unpackhi_epi64(n, n)))); } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") STORM_FORCE_INLINE void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) { __m128i u = _mm_xor_si128(a, b); @@ -817,9 +823,7 @@ void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) { * @param b * @param c */ -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") STORM_FORCE_INLINE void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h, __m128i* STORM_RESTRICT l, @@ -833,9 +837,7 @@ void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h, // By @aqrit (https://github.com/aqrit) // @see: https://gist.github.com/aqrit/cb52b2ac5b7d0dfe9319c09d27237bf3 -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, uint32_t len, uint32_t* flag_counts) { const __m128i zero = _mm_setzero_si128(); @@ -1032,9 +1034,7 @@ int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, uint32_t len, uint32_t* fl return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t len, uint32_t* flags) { const __m128i* data_vectors = (const __m128i*)(array); @@ -1104,9 +1104,7 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { for (uint32_t i = len - (len % (16 * 8)); i < len; ++i) { @@ -1204,9 +1202,7 @@ int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uin return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, @@ -1258,9 +1254,7 @@ uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, @@ -1312,9 +1306,7 @@ uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, @@ -1366,9 +1358,7 @@ uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -1388,9 +1378,7 @@ uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -1410,9 +1398,7 @@ uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -1450,9 +1436,7 @@ uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, } #endif -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") STORM_FORCE_INLINE void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) { __m256i u = _mm256_xor_si256(a, b); @@ -1460,9 +1444,7 @@ void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) { *l = _mm256_xor_si256(u, c); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") STORM_FORCE_INLINE void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h, __m256i* STORM_RESTRICT l, @@ -1474,9 +1456,7 @@ void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h, *l = _mm256_xor_si256(u, c); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_t len, uint32_t* flags) { const __m256i* data_vectors = (const __m256i*)(array); @@ -1545,9 +1525,7 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) { @@ -1647,9 +1625,7 @@ int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, ui } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static __m256i STORM_popcnt256(__m256i v) { __m256i lookup1 = _mm256_setr_epi8( @@ -1676,9 +1652,7 @@ __m256i STORM_popcnt256(__m256i v) { } // modified from https://github.com/WojciechMula/sse-popcount -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, const uint8_t* STORM_RESTRICT data2, @@ -1747,9 +1721,7 @@ uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT da } // modified from https://github.com/WojciechMula/sse-popcount -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, const uint8_t* STORM_RESTRICT data2, @@ -1818,9 +1790,7 @@ uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, } // modified from https://github.com/WojciechMula/sse-popcount -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, const uint8_t* STORM_RESTRICT data2, @@ -1888,9 +1858,7 @@ uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, return result; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_popcnt_avx2(const __m256i* data, uint64_t size) { @@ -1955,9 +1923,7 @@ uint64_t STORM_popcnt_avx2(const __m256i* data, uint64_t size) * @see https://arxiv.org/abs/1611.07612 */ // In this version we perform the operation A&B as input into the CSA operator. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, @@ -2016,9 +1982,7 @@ uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, } // In this version we perform the operation A|B as input into the CSA operator. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, @@ -2077,9 +2041,7 @@ uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, } // In this version we perform the operation A^B as input into the CSA operator. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, @@ -2137,9 +2099,7 @@ uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, cnt64[3]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2159,9 +2119,7 @@ uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2181,9 +2139,7 @@ uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2203,9 +2159,7 @@ uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2214,9 +2168,7 @@ uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, return STORM_intersect_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2225,9 +2177,7 @@ uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, return STORM_union_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2245,9 +2195,7 @@ uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, #include -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE __m512i STORM_popcnt512(__m512i v) { __m512i m1 = _mm512_set1_epi8(0x55); @@ -2260,9 +2208,7 @@ __m512i STORM_popcnt512(__m512i v) { return _mm512_sad_epu8(t3, _mm512_setzero_si512()); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) { *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); @@ -2272,9 +2218,7 @@ void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) { // By Wojciech Muła // @see https://github.com/WojciechMula/sse-popcount/blob/master/popcnt-avx512-harley-seal.cpp#L3 // @see https://arxiv.org/abs/1611.07612 -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE __m512i STORM_avx512_popcount(const __m512i v) { const __m512i m1 = _mm512_set1_epi8(0x55); // 01010101 @@ -2288,9 +2232,7 @@ __m512i STORM_avx512_popcount(const __m512i v) { } // 512i-version of carry-save adder subroutine. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, __m512i* STORM_RESTRICT l, @@ -2300,9 +2242,7 @@ void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, *l = _mm512_ternarylogic_epi32(c, b, *l, 0x96); // 10010110 } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, uint64_t size) { @@ -2362,9 +2302,7 @@ uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, uint64_t size) cnt64[7]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) { @@ -2462,9 +2400,7 @@ int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint32_t len, uint32_t* flags) { #define AND_OR 0xea // ternary function: (a & b) | c @@ -2537,9 +2473,7 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t len, uint32_t* flags) { __m512i counters[16]; @@ -2671,9 +2605,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le * Wojciech Mula (23 Nov 2016). * @see https://arxiv.org/abs/1611.07612 */ -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, @@ -2735,9 +2667,7 @@ uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, cnt64[7]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, @@ -2799,9 +2729,7 @@ uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, cnt64[7]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, @@ -2865,9 +2793,7 @@ uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, // Functions // AVX512 -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2887,9 +2813,7 @@ uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -2909,9 +2833,7 @@ uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, @@ -3030,12 +2952,12 @@ uint32_t STORM_get_alignment() { #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3080,12 +3002,12 @@ STORM_compute_func STORM_get_intersect_count_func(const uint32_t n_bitmaps_vecto #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3128,12 +3050,12 @@ STORM_compute_func STORM_get_union_count_func(const uint32_t n_bitmaps_vector) { #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3176,12 +3098,12 @@ STORM_compute_func STORM_get_diff_count_func(const uint32_t n_bitmaps_vector) { #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3227,12 +3149,12 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const uint6 #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3275,12 +3197,12 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3323,12 +3245,12 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3376,12 +3298,12 @@ uint64_t STORM_popcnt(const uint8_t* data, uint32_t size) { #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3453,12 +3375,12 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t len, uint32_t* flags) { #if defined(HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); From 4ef80c846b9b9ff299db77f16a00e66c7142ddf0 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 14:13:36 +0100 Subject: [PATCH 02/15] speedup travis --- .travis.yml | 7 +++- benchmark.cpp | 110 +++++++++++++++++--------------------------------- 2 files changed, 44 insertions(+), 73 deletions(-) diff --git a/.travis.yml b/.travis.yml index 633dbe6..fada22c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -191,4 +191,9 @@ before_script: script: - cmake . - make - - ./benchmark \ No newline at end of file + - | + if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then + sudo ./benchmark 10 + else + ./benchmark 10 + fi \ No newline at end of file diff --git a/benchmark.cpp b/benchmark.cpp index df9f7b7..5314b19 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -662,92 +662,58 @@ asm volatile("RDTSCP\n\t" return 0; } -int benchmark(uint32_t range, uint32_t n_values) { - std::cout << "Range = [0," << range << ") with bits=" << n_values << std::endl; - +int benchmark(int n_repetitions) { // Align some bitmaps. - uint32_t n_bitmaps = ceil(range / 64.0); - uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); - uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); - - generate_random_data(bitmaps, range, n_values); - generate_random_data(bitmaps2, range, n_values); + uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); + uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); - // Compute the population bit count. - uint64_t bitcnt = STORM_popcnt((uint8_t*)bitmaps, n_bitmaps*8); - std::cout << "POPCNT=" << bitcnt << std::endl; - - // Compute the positional bit count. - uint32_t flag_count[16]; - int pospopcnt = STORM_pospopcnt_u16((uint16_t*)bitmaps, n_bitmaps*4, &flag_count[0]); - uint32_t pospopcnt_total = flag_count[0]; - std::cout << "POSPOPCNT=" << flag_count[0]; - for (int i = 1; i < 16; ++i) { - std::cout << "," << flag_count[i]; - pospopcnt_total += flag_count[i]; + std::vector ranges = {4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576}; + std::vector reps; + if (n_repetitions <= 0) { + reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,50,50,50,50,50,50,50,25,25,25}; + } else { + reps = std::vector(ranges.size(), n_repetitions); } - std::cout << std::endl; - std::cout << "POSPOPCNT total=" << pospopcnt_total << std::endl; - - // Compute intersect count - uint64_t intersect_count = STORM_intersect_count(bitmaps, bitmaps2, n_bitmaps); - std::cout << "intersect count=" << intersect_count << std::endl; - - // Compute union count - uint64_t union_count = STORM_union_count(bitmaps, bitmaps2, n_bitmaps); - std::cout << "union count=" << union_count << std::endl; - - // Compute diff count - uint64_t diff_count = STORM_diff_count(bitmaps, bitmaps2, n_bitmaps); - std::cout << "diff count=" << diff_count << std::endl; - - { - // Align some bitmaps. - uint32_t n_bitmaps = ceil(range / 64.0); - uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); - uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); - - std::vector ranges = {4,8,32,128,256,512,1024,2048,4096,8192,65536,262144,1048576}; - std::vector iterations = {2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000}; - assert(ranges.size() == iterations.size()); - - for (int i = 0; i < ranges.size(); ++i) { - bench_unit unit_intsec, unit_union, unit_diff; - + + + for (int i = 0; i < ranges.size(); ++i) { + bench_unit unit_intsec, unit_union, unit_diff; + uint32_t n_bitmaps = ceil(ranges[i] / 64.0); + #ifdef __linux__ - linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, iterations[i], bitmaps, range, n_values, ranges[i], true); - linux_popcount_wrapper("popcount",&STORM_popcnt, iterations[i], bitmaps, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); + linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); + linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); #else - popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, iterations[i], bitmaps, range, n_values, ranges[i], unit_intsec); - popcount_wrapper("popcount",&STORM_popcnt, iterations[i], bitmaps, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_union); - set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_diff); + popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); + popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_union); + set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_diff); #endif - } - - // Clean up. - STORM_aligned_free(bitmaps); - STORM_aligned_free(bitmaps2); } // Clean up. STORM_aligned_free(bitmaps); STORM_aligned_free(bitmaps2); + return 1; } -int main(int argc, char **argv) { - benchmark(2048000, 50000); +int main(int argc, char **argv) { + int n_repetitions = -1; + if (argc > 2) { + n_repetitions = std::atoi(argv[1]); + } + benchmark(n_repetitions); return EXIT_SUCCESS; } \ No newline at end of file From 73450bb522c55226b78be8a8fb690d01368cb4b6 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 14:24:16 +0100 Subject: [PATCH 03/15] optinal use of perf subsystem --- .travis.yml | 4 ++-- benchmark.cpp | 64 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/.travis.yml b/.travis.yml index fada22c..e7fbd6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,7 +193,7 @@ script: - make - | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then - sudo ./benchmark 10 + sudo ./benchmark -r 10 else - ./benchmark 10 + ./benchmark -r 10 fi \ No newline at end of file diff --git a/benchmark.cpp b/benchmark.cpp index 5314b19..e5784b1 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -662,7 +662,7 @@ asm volatile("RDTSCP\n\t" return 0; } -int benchmark(int n_repetitions) { +int benchmark(int n_repetitions, bool use_perf = true) { // Align some bitmaps. uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); @@ -680,25 +680,30 @@ int benchmark(int n_repetitions) { bench_unit unit_intsec, unit_union, unit_diff; uint32_t n_bitmaps = ceil(ranges[i] / 64.0); + if (use_perf) { #ifdef __linux__ - linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); - linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); + linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); #else - popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); - popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_union); - set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_diff); + std::cerr << "perf counter are only available on Linux systems!" + exit(EXIT_FAILURE); #endif + } else { + popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); + popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_union); + set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); + set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_diff); + } } // Clean up. @@ -709,11 +714,28 @@ int benchmark(int n_repetitions) { } int main(int argc, char **argv) { + bool verbose = false; + bool perf_subsystem = false; + int c; int n_repetitions = -1; - if (argc > 2) { - n_repetitions = std::atoi(argv[1]); - } - benchmark(n_repetitions); + + while ((c = getopt(argc, argv, "vpr:")) != -1) { + switch (c) { + case 'r': + n_repetitions = atoi(optarg); + break; + case 'v': + verbose = true; + break; + case 'p': + perf_subsystem = true; + break; + default: + abort(); + } + } + + benchmark(n_repetitions, perf_subsystem); return EXIT_SUCCESS; } \ No newline at end of file From 3697de528ed40556188cdb66c985226a9c50a9c5 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 14:30:17 +0100 Subject: [PATCH 04/15] windows fix for getops --- benchmark.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/benchmark.cpp b/benchmark.cpp index e5784b1..fb51389 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -662,7 +662,7 @@ asm volatile("RDTSCP\n\t" return 0; } -int benchmark(int n_repetitions, bool use_perf = true) { +int benchmark(int n_repetitions, bool use_perf = false) { // Align some bitmaps. uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); @@ -714,6 +714,7 @@ int benchmark(int n_repetitions, bool use_perf = true) { } int main(int argc, char **argv) { +#if !defined(_MSC_VER) bool verbose = false; bool perf_subsystem = false; int c; @@ -736,6 +737,13 @@ int main(int argc, char **argv) { } benchmark(n_repetitions, perf_subsystem); +#else + int n_repetitions = -1; + if (argc > 2) { + n_repetitions = std::atoi(argv[1]); + } + benchmark(n_repetitions, false); +#endif return EXIT_SUCCESS; } \ No newline at end of file From aecbe15ffbabd390cbadee520254b6d06af72a2b Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 18:34:22 +0100 Subject: [PATCH 05/15] windows fix --- benchmark.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmark.cpp b/benchmark.cpp index fb51389..3fa0145 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -5,6 +5,9 @@ #include #include #include +#if !defined(_MSC_VER) +#include "getopt.h" +#endif void generate_random_data(uint64_t* data, uint32_t range, uint32_t n) { // Clear data @@ -741,7 +744,7 @@ int main(int argc, char **argv) { int n_repetitions = -1; if (argc > 2) { n_repetitions = std::atoi(argv[1]); - } + } benchmark(n_repetitions, false); #endif From 7b32e9050fbc45be5ce8d81eaa2e5a7dec8646b5 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 18:35:33 +0100 Subject: [PATCH 06/15] typo --- benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark.cpp b/benchmark.cpp index 3fa0145..dcec8b9 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -694,7 +694,7 @@ int benchmark(int n_repetitions, bool use_perf = false) { linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); #else - std::cerr << "perf counter are only available on Linux systems!" + std::cerr << "perf counter are only available on Linux systems!" << std::endl; exit(EXIT_FAILURE); #endif } else { From ab74ea1d911fce6888591c0550dce38304742227 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 18:52:21 +0100 Subject: [PATCH 07/15] bench --- benchmark.cpp | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/benchmark.cpp b/benchmark.cpp index dcec8b9..1f33720 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -681,31 +681,30 @@ int benchmark(int n_repetitions, bool use_perf = false) { for (int i = 0; i < ranges.size(); ++i) { bench_unit unit_intsec, unit_union, unit_diff; - uint32_t n_bitmaps = ceil(ranges[i] / 64.0); if (use_perf) { #ifdef __linux__ - linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); - linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); - linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, true); + linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], true); + linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); #else std::cerr << "perf counter are only available on Linux systems!" << std::endl; exit(EXIT_FAILURE); #endif } else { - popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); - popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_union); - set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_intsec); - set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], n_bitmaps, unit_diff); + popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], unit_intsec); + popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_union); + set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_diff); } } From 602b4570af46bd04e08be17020e19ebfdc7dc5f7 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sat, 17 Aug 2019 21:21:00 +0100 Subject: [PATCH 08/15] table output --- benchmark.cpp | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/benchmark.cpp b/benchmark.cpp index 1f33720..d0f943f 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -197,8 +197,25 @@ int linux_set_algebra_wrapper(std::string name, std::vector mins = compute_mins(allresults); std::vector avg = compute_averages(allresults); - printf("%s-%u:\n",name.c_str(),n_bitmaps); if (verbose) { + printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", + name.c_str(), + n_bitmaps, + double(mins[1]) / mins[0], + double(mins[0]) / (2*n_bitmaps), + double(mins[1]) / (2*n_bitmaps), + mins[0], + mins[1], + mins[2], + mins[3], + mins[4], + avg[0], + avg[1], + avg[2], + avg[3], + avg[4]); + } else { + printf("%s-%u:\n",name.c_str(),n_bitmaps); printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " "instructions per 64-bit word %4.3f \n", double(mins[1]) / mins[0], double(mins[0]) / (2*n_bitmaps), double(mins[1]) / (2*n_bitmaps)); @@ -209,8 +226,6 @@ int linux_set_algebra_wrapper(std::string name, printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " "cache ref., %8.1f cache mis.\n", avg[0], avg[1], avg[2], avg[3], avg[4]); - } else { - printf("cycles per 64-bit word: %4.3f; ref cycles per 64-bit word: %4.3f \n", double(mins[0]) / (2*n_bitmaps), double(mins[5]) / (2*n_bitmaps)); } return 1; @@ -255,8 +270,25 @@ int linux_popcount_wrapper(std::string name, std::vector mins = compute_mins(allresults); std::vector avg = compute_averages(allresults); - printf("%s-%u:\n",name.c_str(),n_bitmaps); if (verbose) { + printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", + name.c_str(), + n_bitmaps, + double(mins[1]) / mins[0], + double(mins[0]) / n_bitmaps, + double(mins[1]) / n_bitmaps, + mins[0], + mins[1], + mins[2], + mins[3], + mins[4], + avg[0], + avg[1], + avg[2], + avg[3], + avg[4]); + } else { + printf("%s-%u:\n",name.c_str(),n_bitmaps); printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " "instructions per 64-bit word %4.3f \n", double(mins[1]) / mins[0], double(mins[0]) / n_bitmaps, double(mins[1]) / n_bitmaps); @@ -267,8 +299,6 @@ int linux_popcount_wrapper(std::string name, printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " "cache ref., %8.1f cache mis.\n", avg[0], avg[1], avg[2], avg[3], avg[4]); - } else { - printf("cycles per 64-bit word: %4.3f; ref cycles per 64-bit word: %4.3f \n", double(mins[0]) / n_bitmaps, double(mins[5]) / n_bitmaps); } return 1; @@ -678,6 +708,14 @@ int benchmark(int n_repetitions, bool use_perf = false) { reps = std::vector(ranges.size(), n_repetitions); } + if (use_perf) { +#ifndef __linux__ + std::cerr << "perf counter are only available on Linux systems!" << std::endl; + exit(EXIT_FAILURE); +#endif + printf("Algorithm\tWords\tInstructions/cycle\tCycles/word\tInstructions/word\tMinCycles\tMinInstructions\tMinBranchMiss\tMinCacheRef\tminCacheMiss\tAvgCycles\tAvgInstructions\tAvgBranchMiss\tAvgCacheRef\tAvgCacheMiss\n"); + } + for (int i = 0; i < ranges.size(); ++i) { bench_unit unit_intsec, unit_union, unit_diff; From 09c15500feb164d6381bae6a0e17037e539f313c Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sun, 18 Aug 2019 08:29:33 +0100 Subject: [PATCH 09/15] change to size_t --- benchmark.cpp | 147 +++++++++---------------------- libalgebra.h | 237 ++++++++++++++++++++++++-------------------------- 2 files changed, 154 insertions(+), 230 deletions(-) diff --git a/benchmark.cpp b/benchmark.cpp index d0f943f..93708a8 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -259,10 +259,11 @@ int linux_popcount_wrapper(std::string name, volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization for (uint32_t i = 0; i < iterations; i++) { generate_random_data(data, range, n_values); + size_t n_vals = n_bitmaps*8; unified.start(); // Call argument subroutine pointer. - total += (*f)((uint8_t*)data, n_bitmaps*8); + total += (*f)((uint8_t*)data, n_vals); unified.end(results); allresults.push_back(results); } @@ -305,73 +306,6 @@ int linux_popcount_wrapper(std::string name, } #endif // end is linux -uint8_t popcnt_lookup8bit[256] = { - /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, - /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, - /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, - /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, - /* 10 */ 1, /* 11 */ 2, /* 12 */ 2, /* 13 */ 3, - /* 14 */ 2, /* 15 */ 3, /* 16 */ 3, /* 17 */ 4, - /* 18 */ 2, /* 19 */ 3, /* 1a */ 3, /* 1b */ 4, - /* 1c */ 3, /* 1d */ 4, /* 1e */ 4, /* 1f */ 5, - /* 20 */ 1, /* 21 */ 2, /* 22 */ 2, /* 23 */ 3, - /* 24 */ 2, /* 25 */ 3, /* 26 */ 3, /* 27 */ 4, - /* 28 */ 2, /* 29 */ 3, /* 2a */ 3, /* 2b */ 4, - /* 2c */ 3, /* 2d */ 4, /* 2e */ 4, /* 2f */ 5, - /* 30 */ 2, /* 31 */ 3, /* 32 */ 3, /* 33 */ 4, - /* 34 */ 3, /* 35 */ 4, /* 36 */ 4, /* 37 */ 5, - /* 38 */ 3, /* 39 */ 4, /* 3a */ 4, /* 3b */ 5, - /* 3c */ 4, /* 3d */ 5, /* 3e */ 5, /* 3f */ 6, - /* 40 */ 1, /* 41 */ 2, /* 42 */ 2, /* 43 */ 3, - /* 44 */ 2, /* 45 */ 3, /* 46 */ 3, /* 47 */ 4, - /* 48 */ 2, /* 49 */ 3, /* 4a */ 3, /* 4b */ 4, - /* 4c */ 3, /* 4d */ 4, /* 4e */ 4, /* 4f */ 5, - /* 50 */ 2, /* 51 */ 3, /* 52 */ 3, /* 53 */ 4, - /* 54 */ 3, /* 55 */ 4, /* 56 */ 4, /* 57 */ 5, - /* 58 */ 3, /* 59 */ 4, /* 5a */ 4, /* 5b */ 5, - /* 5c */ 4, /* 5d */ 5, /* 5e */ 5, /* 5f */ 6, - /* 60 */ 2, /* 61 */ 3, /* 62 */ 3, /* 63 */ 4, - /* 64 */ 3, /* 65 */ 4, /* 66 */ 4, /* 67 */ 5, - /* 68 */ 3, /* 69 */ 4, /* 6a */ 4, /* 6b */ 5, - /* 6c */ 4, /* 6d */ 5, /* 6e */ 5, /* 6f */ 6, - /* 70 */ 3, /* 71 */ 4, /* 72 */ 4, /* 73 */ 5, - /* 74 */ 4, /* 75 */ 5, /* 76 */ 5, /* 77 */ 6, - /* 78 */ 4, /* 79 */ 5, /* 7a */ 5, /* 7b */ 6, - /* 7c */ 5, /* 7d */ 6, /* 7e */ 6, /* 7f */ 7, - /* 80 */ 1, /* 81 */ 2, /* 82 */ 2, /* 83 */ 3, - /* 84 */ 2, /* 85 */ 3, /* 86 */ 3, /* 87 */ 4, - /* 88 */ 2, /* 89 */ 3, /* 8a */ 3, /* 8b */ 4, - /* 8c */ 3, /* 8d */ 4, /* 8e */ 4, /* 8f */ 5, - /* 90 */ 2, /* 91 */ 3, /* 92 */ 3, /* 93 */ 4, - /* 94 */ 3, /* 95 */ 4, /* 96 */ 4, /* 97 */ 5, - /* 98 */ 3, /* 99 */ 4, /* 9a */ 4, /* 9b */ 5, - /* 9c */ 4, /* 9d */ 5, /* 9e */ 5, /* 9f */ 6, - /* a0 */ 2, /* a1 */ 3, /* a2 */ 3, /* a3 */ 4, - /* a4 */ 3, /* a5 */ 4, /* a6 */ 4, /* a7 */ 5, - /* a8 */ 3, /* a9 */ 4, /* aa */ 4, /* ab */ 5, - /* ac */ 4, /* ad */ 5, /* ae */ 5, /* af */ 6, - /* b0 */ 3, /* b1 */ 4, /* b2 */ 4, /* b3 */ 5, - /* b4 */ 4, /* b5 */ 5, /* b6 */ 5, /* b7 */ 6, - /* b8 */ 4, /* b9 */ 5, /* ba */ 5, /* bb */ 6, - /* bc */ 5, /* bd */ 6, /* be */ 6, /* bf */ 7, - /* c0 */ 2, /* c1 */ 3, /* c2 */ 3, /* c3 */ 4, - /* c4 */ 3, /* c5 */ 4, /* c6 */ 4, /* c7 */ 5, - /* c8 */ 3, /* c9 */ 4, /* ca */ 4, /* cb */ 5, - /* cc */ 4, /* cd */ 5, /* ce */ 5, /* cf */ 6, - /* d0 */ 3, /* d1 */ 4, /* d2 */ 4, /* d3 */ 5, - /* d4 */ 4, /* d5 */ 5, /* d6 */ 5, /* d7 */ 6, - /* d8 */ 4, /* d9 */ 5, /* da */ 5, /* db */ 6, - /* dc */ 5, /* dd */ 6, /* de */ 6, /* df */ 7, - /* e0 */ 3, /* e1 */ 4, /* e2 */ 4, /* e3 */ 5, - /* e4 */ 4, /* e5 */ 5, /* e6 */ 5, /* e7 */ 6, - /* e8 */ 4, /* e9 */ 5, /* ea */ 5, /* eb */ 6, - /* ec */ 5, /* ed */ 6, /* ee */ 6, /* ef */ 7, - /* f0 */ 4, /* f1 */ 5, /* f2 */ 5, /* f3 */ 6, - /* f4 */ 5, /* f5 */ 6, /* f6 */ 6, /* f7 */ 7, - /* f8 */ 5, /* f9 */ 6, /* fa */ 6, /* fb */ 7, - /* fc */ 6, /* fd */ 7, /* fe */ 7, /* ff */ 8 -}; - struct bench_unit { bench_unit() : valid(false), cycles(0), cycles_local(0), times(0), times_local(0){} @@ -396,7 +330,7 @@ uint64_t get_cpu_cycles() { #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, uint32_t len) { +uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] & data2[i]); @@ -406,14 +340,14 @@ uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, uint32_t len) { for (int j = 0; j < len; j += 8) { // total += STORM_popcount64(data[i]); // diff = data1[i] & data2[i]; - total += popcnt_lookup8bit[data[j+0]]; - total += popcnt_lookup8bit[data[j+1]]; - total += popcnt_lookup8bit[data[j+2]]; - total += popcnt_lookup8bit[data[j+3]]; - total += popcnt_lookup8bit[data[j+4]]; - total += popcnt_lookup8bit[data[j+5]]; - total += popcnt_lookup8bit[data[j+6]]; - total += popcnt_lookup8bit[data[j+7]]; + total += STORM_popcnt_lookup8bit[data[j+0]]; + total += STORM_popcnt_lookup8bit[data[j+1]]; + total += STORM_popcnt_lookup8bit[data[j+2]]; + total += STORM_popcnt_lookup8bit[data[j+3]]; + total += STORM_popcnt_lookup8bit[data[j+4]]; + total += STORM_popcnt_lookup8bit[data[j+5]]; + total += STORM_popcnt_lookup8bit[data[j+6]]; + total += STORM_popcnt_lookup8bit[data[j+7]]; } return total; @@ -422,7 +356,7 @@ uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, uint32_t len) { #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, uint32_t len) { +uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] & data2[i]); @@ -433,14 +367,14 @@ uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,cons for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] & data2[i]); diff = data1[i] & data2[i]; - total += popcnt_lookup8bit[b8[0]]; - total += popcnt_lookup8bit[b8[1]]; - total += popcnt_lookup8bit[b8[2]]; - total += popcnt_lookup8bit[b8[3]]; - total += popcnt_lookup8bit[b8[4]]; - total += popcnt_lookup8bit[b8[5]]; - total += popcnt_lookup8bit[b8[6]]; - total += popcnt_lookup8bit[b8[7]]; + total += STORM_popcnt_lookup8bit[b8[0]]; + total += STORM_popcnt_lookup8bit[b8[1]]; + total += STORM_popcnt_lookup8bit[b8[2]]; + total += STORM_popcnt_lookup8bit[b8[3]]; + total += STORM_popcnt_lookup8bit[b8[4]]; + total += STORM_popcnt_lookup8bit[b8[5]]; + total += STORM_popcnt_lookup8bit[b8[6]]; + total += STORM_popcnt_lookup8bit[b8[7]]; } return total; @@ -449,7 +383,7 @@ uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,cons #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, uint32_t len) { +uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] | data2[i]); @@ -460,14 +394,14 @@ uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const ui for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] | data2[i]); diff = data1[i] | data2[i]; - total += popcnt_lookup8bit[b8[0]]; - total += popcnt_lookup8bit[b8[1]]; - total += popcnt_lookup8bit[b8[2]]; - total += popcnt_lookup8bit[b8[3]]; - total += popcnt_lookup8bit[b8[4]]; - total += popcnt_lookup8bit[b8[5]]; - total += popcnt_lookup8bit[b8[6]]; - total += popcnt_lookup8bit[b8[7]]; + total += STORM_popcnt_lookup8bit[b8[0]]; + total += STORM_popcnt_lookup8bit[b8[1]]; + total += STORM_popcnt_lookup8bit[b8[2]]; + total += STORM_popcnt_lookup8bit[b8[3]]; + total += STORM_popcnt_lookup8bit[b8[4]]; + total += STORM_popcnt_lookup8bit[b8[5]]; + total += STORM_popcnt_lookup8bit[b8[6]]; + total += STORM_popcnt_lookup8bit[b8[7]]; } return total; @@ -476,7 +410,7 @@ uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const ui #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, uint32_t len) { +uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] ^ data2[i]); @@ -487,14 +421,14 @@ uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uin for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] ^ data2[i]); diff = data1[i] ^ data2[i]; - total += popcnt_lookup8bit[b8[0]]; - total += popcnt_lookup8bit[b8[1]]; - total += popcnt_lookup8bit[b8[2]]; - total += popcnt_lookup8bit[b8[3]]; - total += popcnt_lookup8bit[b8[4]]; - total += popcnt_lookup8bit[b8[5]]; - total += popcnt_lookup8bit[b8[6]]; - total += popcnt_lookup8bit[b8[7]]; + total += STORM_popcnt_lookup8bit[b8[0]]; + total += STORM_popcnt_lookup8bit[b8[1]]; + total += STORM_popcnt_lookup8bit[b8[2]]; + total += STORM_popcnt_lookup8bit[b8[3]]; + total += STORM_popcnt_lookup8bit[b8[4]]; + total += STORM_popcnt_lookup8bit[b8[5]]; + total += STORM_popcnt_lookup8bit[b8[6]]; + total += STORM_popcnt_lookup8bit[b8[7]]; } return total; @@ -510,7 +444,7 @@ int set_algebra_wrapper(std::string name, uint64_t* STORM_RESTRICT data2, uint32_t range, uint32_t n_values, - uint32_t n_bitmaps, + size_t n_bitmaps, bench_unit& unit) { uint32_t cycles_low = 0, cycles_high = 0; @@ -655,9 +589,10 @@ asm volatile("RDTSCP\n\t" "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); #endif + size_t n_b = n_bitmaps*8; for (int i = 0; i < iterations; ++i) { // Call argument subroutine pointer. - total += (*f)((uint8_t*)data, n_bitmaps); + total += (*f)((uint8_t*)data, n_b); } #ifndef _MSC_VER diff --git a/libalgebra.h b/libalgebra.h index 3ee0e12..7150332 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -287,7 +287,7 @@ void STORM_aligned_free(void* memblock) { #define HAVE_AVX512 #endif -// +// Target attribute #if !defined(_MSC_VER) #define STORM_TARGET(x) __attribute__ ((target (x))) #else @@ -498,7 +498,7 @@ uint64_t STORM_POPCOUNT(uint64_t x) { static uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, - uint64_t size) + size_t size) { const uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -520,7 +520,7 @@ uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1, static uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, - uint64_t size) + size_t size) { const uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -542,7 +542,7 @@ uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1, static uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, - uint64_t size) + size_t size) { const uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -562,10 +562,10 @@ uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1, } static -int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, size_t len, uint32_t* out) { for (int i = 0; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((data[i] & (1 << j)) >> j); + out[j] += ((data[i] & (1 << j)) >> j); } } @@ -591,7 +591,7 @@ uint64_t STORM_pospopcnt_loadu_u64(const void* ptr) { // By @aqrit (https://github.com/aqrit) // @see: https://gist.github.com/aqrit/c729815b0165c139d0bac642ab7ee104 static -int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, uint32_t n, uint32_t* out) { +int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, size_t n, uint32_t* out) { while (n >= 8) { uint64_t counter_a = 0; // 4 packed 12-bit counters uint64_t counter_b = 0; @@ -693,10 +693,10 @@ int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, uint32_t n, u STORM_FORCE_INLINE uint64_t STORM_popcount64(uint64_t x) { - uint64_t m1 = 0x5555555555555555ll; - uint64_t m2 = 0x3333333333333333ll; - uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; - uint64_t h01 = 0x0101010101010101ll; + uint64_t m1 = UINT64_C(0x5555555555555555); + uint64_t m2 = UINT64_C(0x3333333333333333); + uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F); + uint64_t h01 = UINT64_C(0x0101010101010101); x -= (x >> 1) & m1; x = (x & m2) + ((x >> 2) & m2); @@ -782,13 +782,6 @@ const uint8_t STORM_popcnt_lookup8bit[256] = { #include -#ifndef STORM_POPCOUNT_SSE4 -#define STORM_POPCOUNT_SSE4(A, B) { \ - A += STORM_POPCOUNT(_mm_extract_epi64(B, 0)); \ - A += STORM_POPCOUNT(_mm_extract_epi64(B, 1)); \ -} -#endif - STORM_TARGET("sse4.2") STORM_FORCE_INLINE uint64_t STORM_POPCOUNT_SSE(const __m128i n) { @@ -839,7 +832,7 @@ void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h, // @see: https://gist.github.com/aqrit/cb52b2ac5b7d0dfe9319c09d27237bf3 STORM_TARGET("sse4.2") static -int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, uint32_t len, uint32_t* flag_counts) { +int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, size_t len, uint32_t* flag_counts) { const __m128i zero = _mm_setzero_si128(); const __m128i mask_lo_byte = _mm_srli_epi16(_mm_cmpeq_epi8(zero, zero), 8); const __m128i mask_lo_cnt = _mm_srli_epi16(mask_lo_byte, 2); @@ -1036,7 +1029,7 @@ int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, uint32_t len, uint32_t* fl STORM_TARGET("sse4.2") static -int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) { const __m128i* data_vectors = (const __m128i*)(array); const uint32_t n_cycles = len / 8; @@ -1053,8 +1046,8 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t U(0,1) U(2,3) U(4,5) U(6,7) for (int i = 0; i < 8; ++i) { -#define A0(p) flags[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p)); -#define A1(k) flags[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k)); +#define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p)); +#define A1(k) out[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k)); #define A(p, k) A0(p) A1(k) A(0,1) A(2, 3) A(4,5) A(6, 7) @@ -1088,7 +1081,7 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t i *= 8; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1106,10 +1099,10 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t STORM_TARGET("sse4.2") static -int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (16 * 8)); i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1169,7 +1162,7 @@ int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uin for (size_t i = 0; i < 16; ++i) { _mm_storeu_si128((__m128i*)buffer, counter[i]); for (size_t z = 0; z < 8; z++) { - flags[i] += 16 * (uint32_t)buffer[z]; + out[i] += 16 * (uint32_t)buffer[z]; } } } @@ -1177,26 +1170,26 @@ int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uin _mm_storeu_si128((__m128i*)buffer, v1); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((buffer[i] & (1 << j)) >> j); + out[j] += ((buffer[i] & (1 << j)) >> j); } } _mm_storeu_si128((__m128i*)buffer, v2); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 2 * ((buffer[i] & (1 << j)) >> j); + out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm_storeu_si128((__m128i*)buffer, v4); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 4 * ((buffer[i] & (1 << j)) >> j); + out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm_storeu_si128((__m128i*)buffer, v8); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 8 * ((buffer[i] & (1 << j)) >> j); + out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } return 0; @@ -1206,7 +1199,7 @@ STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m128i ones = _mm_setzero_si128(); __m128i twos = _mm_setzero_si128(); @@ -1258,7 +1251,7 @@ STORM_TARGET("sse4.2") static uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m128i ones = _mm_setzero_si128(); __m128i twos = _mm_setzero_si128(); @@ -1310,7 +1303,7 @@ STORM_TARGET("sse4.2") static uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m128i ones = _mm_setzero_si128(); __m128i twos = _mm_setzero_si128(); @@ -1362,7 +1355,7 @@ STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m128i* r1 = (__m128i*)b1; @@ -1382,7 +1375,7 @@ STORM_TARGET("sse4.2") static uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m128i* r1 = (__m128i*)b1; @@ -1402,7 +1395,7 @@ STORM_TARGET("sse4.2") static uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m128i* r1 = (__m128i*)b1; @@ -1427,15 +1420,6 @@ uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, #include -#ifndef STORM_POPCOUNT_AVX2 -#define STORM_POPCOUNT_AVX2(A, B) { \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 0)); \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 1)); \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 2)); \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 3)); \ -} -#endif - STORM_TARGET("avx2") STORM_FORCE_INLINE void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) { @@ -1458,7 +1442,7 @@ void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h, STORM_TARGET("avx2") static -int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) { const __m256i* data_vectors = (const __m256i*)(array); const uint32_t n_cycles = len / 16; @@ -1474,8 +1458,8 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ U(0,1) U(2, 3) U(4, 5) U(6, 7) for (int i = 0; i < 8; ++i) { -#define A0(p) flags[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p)); -#define A1(k) flags[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k)); +#define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p)); +#define A1(k) out[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k)); #define A(p, k) A0(p) A1(k) A(0,1) A(2, 3) A(4, 5) A(6, 7) @@ -1508,7 +1492,7 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ i *= 16; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1527,10 +1511,10 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ STORM_TARGET("avx2") static -int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1591,7 +1575,7 @@ int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, ui for (size_t i = 0; i < 16; ++i) { _mm256_storeu_si256((__m256i*)buffer, counter[i]); for (size_t z = 0; z < 16; z++) { - flags[i] += 16 * (uint32_t)buffer[z]; + out[i] += 16 * (uint32_t)buffer[z]; } } } @@ -1599,26 +1583,26 @@ int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, ui _mm256_storeu_si256((__m256i*)buffer, v1); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((buffer[i] & (1 << j)) >> j); + out[j] += ((buffer[i] & (1 << j)) >> j); } } _mm256_storeu_si256((__m256i*)buffer, v2); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 2 * ((buffer[i] & (1 << j)) >> j); + out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm256_storeu_si256((__m256i*)buffer, v4); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 4 * ((buffer[i] & (1 << j)) >> j); + out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm256_storeu_si256((__m256i*)buffer, v8); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 8 * ((buffer[i] & (1 << j)) >> j); + out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } return 0; @@ -1927,7 +1911,7 @@ STORM_TARGET("avx2") static uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -1986,7 +1970,7 @@ STORM_TARGET("avx2") static uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -2045,7 +2029,7 @@ STORM_TARGET("avx2") static uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -2103,7 +2087,7 @@ STORM_TARGET("avx2") static uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m256i* r1 = (__m256i*)b1; @@ -2122,8 +2106,8 @@ uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, STORM_TARGET("avx2") static uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, - const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const uint64_t* STORM_RESTRICT b2, + const size_t n_ints) { uint64_t count = 0; const __m256i* r1 = (__m256i*)b1; @@ -2142,8 +2126,8 @@ uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, STORM_TARGET("avx2") static uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1, - const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const uint64_t* STORM_RESTRICT b2, + const size_t n_ints) { uint64_t count = 0; const __m256i* r1 = (__m256i*)b1; @@ -2163,7 +2147,7 @@ STORM_TARGET("avx2") static uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_intersect_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } @@ -2172,7 +2156,7 @@ STORM_TARGET("avx2") static uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_union_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } @@ -2181,7 +2165,7 @@ STORM_TARGET("avx2") static uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_diff_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } @@ -2244,7 +2228,7 @@ void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, STORM_TARGET("avx512bw") static -uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, uint64_t size) +uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2304,10 +2288,10 @@ uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, uint64_t size) STORM_TARGET("avx512bw") static -int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -2364,7 +2348,7 @@ int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len for (size_t i = 0; i < 16; ++i) { _mm512_storeu_si512((__m512i*)buffer, counter[i]); for (size_t z = 0; z < 32; z++) { - flags[i] += 16 * (uint32_t)buffer[z]; + out[i] += 16 * (uint32_t)buffer[z]; } } } @@ -2372,28 +2356,28 @@ int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len _mm512_storeu_si512((__m512i*)buffer, v1); for (size_t i = 0; i < 32; i++) { for (int j = 0; j < 16; j++) { - flags[j] += 1 * ((buffer[i] & (1 << j)) >> j); + out[j] += 1 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v2); for (size_t i = 0; i < 32; i++) { for (int j = 0; j < 16; j++) { - flags[j] += 2 * ((buffer[i] & (1 << j)) >> j); + out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v4); for (size_t i = 0; i < 32; i++) { for (int j = 0; j < 16; j++) { - flags[j] += 4 * ((buffer[i] & (1 << j)) >> j); + out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v8); for (size_t i = 0; i < 32; i++) { for (int j = 0; j < 16; j++) { - flags[j] += 8 * ((buffer[i] & (1 << j)) >> j); + out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } @@ -2402,7 +2386,7 @@ int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len STORM_TARGET("avx512bw") static -int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, size_t len, uint32_t* out) { #define AND_OR 0xea // ternary function: (a & b) | c const __m512i* data_vectors = (const __m512i*)(data); const uint32_t n_cycles = len / 32; @@ -2420,8 +2404,8 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint U(0,1) U( 2, 3) U( 4, 5) U( 6, 7) for (int i = 0; i < 8; ++i) { -#define A0(p) flags[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p)); -#define A1(k) flags[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k)); +#define A0(p) out[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p)); +#define A1(k) out[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k)); #define A(p, k) A0(p) A1(k) A(0,1) A(2, 3) A(4,5) A(6, 7) @@ -2455,7 +2439,7 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint i *= 32; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((data[i] & (1 << j)) >> j); + out[j] += ((data[i] & (1 << j)) >> j); } } @@ -2475,7 +2459,7 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint STORM_TARGET("avx512bw") static -int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, size_t len, uint32_t* out) { __m512i counters[16]; for (size_t i = 0; i < 16; ++i) { @@ -2544,7 +2528,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le // Update. for (size_t i = 0; i < 16; ++i) { _mm512_storeu_si512((__m512i*)tmp, counters[i]); - for (int j = 0; j < 32; ++j) flags[i] += tmp[j]; + for (int j = 0; j < 32; ++j) out[i] += tmp[j]; } // Reset. for (size_t i = 0; i < 16; ++i) { @@ -2577,7 +2561,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le i *= 512; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -2593,7 +2577,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le for (size_t i = 0; i < 16; ++i) { _mm512_storeu_si512((__m512i*)tmp, counters[i]); - for (int j = 0; j < 32; ++j) flags[i] += tmp[j]; + for (int j = 0; j < 32; ++j) out[i] += tmp[j]; } return 0; } @@ -2609,7 +2593,7 @@ STORM_TARGET("avx512bw") static uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2671,7 +2655,7 @@ STORM_TARGET("avx512bw") static uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2732,8 +2716,8 @@ uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, STORM_TARGET("avx512bw") static uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, - const __m512i* STORM_RESTRICT data2, - uint64_t size) + const __m512i* STORM_RESTRICT data2, + size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2797,7 +2781,7 @@ STORM_TARGET("avx512bw") static uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m512i* r1 = (const __m512i*)(b1); @@ -2817,7 +2801,7 @@ STORM_TARGET("avx512bw") static uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m512i* r1 = (const __m512i*)(b1); @@ -2837,7 +2821,7 @@ STORM_TARGET("avx512bw") static uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m512i* r1 = (const __m512i*)(b1); @@ -2859,7 +2843,7 @@ uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, ****************************/ STORM_FORCE_INLINE -uint64_t STORM_popcount64_unrolled(const uint64_t* data, uint64_t size) { +uint64_t STORM_popcount64_unrolled(const uint64_t* data, size_t size) { uint64_t i = 0; uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -2884,7 +2868,7 @@ uint64_t STORM_popcount64_unrolled(const uint64_t* data, uint64_t size) { STORM_FORCE_INLINE uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_intersect_count_unrolled(b1, b2, n_ints); } @@ -2892,7 +2876,7 @@ uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1, STORM_FORCE_INLINE uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_union_count_unrolled(b1, b2, n_ints); } @@ -2900,7 +2884,7 @@ uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1, STORM_FORCE_INLINE uint64_t STORM_diff_count_scalar(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_diff_count_unrolled(b1, b2, n_ints); } @@ -2910,21 +2894,19 @@ uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, const uint32_t* STORM_RESTRICT l1, const uint32_t* STORM_RESTRICT l2, - const uint32_t n1, - const uint32_t n2) + const size_t n1, + const size_t n2) { uint64_t count = 0; #define MOD(x) (( (x) * 64 ) >> 6) if(n1 < n2) { for (int i = 0; i < n1; ++i) { - count += ((b2[l1[i] >> 6] & (1L << MOD(l1[i]))) != 0); - // __builtin_prefetch(&b2[l1[i] >> 6], 0, _MM_HINT_T0); + count += ((b2[l1[i] >> 6] & (1L << MOD(l1[i]))) != 0); } } else { for (int i = 0; i < n2; ++i) { count += ((b1[l2[i] >> 6] & (1L << MOD(l2[i]))) != 0); - // __builtin_prefetch(&b1[l2[i] >> 6], 0, _MM_HINT_T0); } } #undef MOD @@ -2935,11 +2917,9 @@ uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1, /* ************************************* * Function pointer definitions. ***************************************/ -typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const uint32_t); -typedef uint64_t (*STORM_compute_lfunc)(const uint64_t*, const uint64_t*, - const uint32_t*, const uint32_t*, const uint32_t, const uint32_t); -typedef int (STORM_pposcnt_func)(const uint16_t*, uint32_t, uint32_t*); -typedef uint64_t (STORM_popcnt_func)(const uint8_t*, uint32_t); +typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const size_t); +typedef int (STORM_pposcnt_func)(const uint16_t*, size_t, uint32_t*); +typedef uint64_t (STORM_popcnt_func)(const uint8_t*, size_t); /* ************************************* * Alignment @@ -2997,7 +2977,7 @@ uint32_t STORM_get_alignment() { // Return the optimal intersection function given the range [0, n_bitmaps_vector) // and the available instruction set at run-time. static -STORM_compute_func STORM_get_intersect_count_func(const uint32_t n_bitmaps_vector) { +STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) { #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3045,7 +3025,7 @@ STORM_compute_func STORM_get_intersect_count_func(const uint32_t n_bitmaps_vecto } static -STORM_compute_func STORM_get_union_count_func(const uint32_t n_bitmaps_vector) { +STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3093,7 +3073,7 @@ STORM_compute_func STORM_get_union_count_func(const uint32_t n_bitmaps_vector) { } static -STORM_compute_func STORM_get_diff_count_func(const uint32_t n_bitmaps_vector) { +STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) { #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3144,7 +3124,10 @@ STORM_compute_func STORM_get_diff_count_func(const uint32_t n_bitmaps_vector) { // Return the optimal intersection function given the range [0, n_bitmaps_vector) // and the available instruction set at run-time. static -uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, const uint32_t n_len) { +uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, + const uint64_t* STORM_RESTRICT data2, + const size_t n_len) +{ #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3192,7 +3175,10 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const uint6 } static -uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, const uint32_t n_len) { +uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, + const uint64_t* STORM_RESTRICT data2, + const size_t n_len) +{ #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3240,7 +3226,10 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* } static -uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, const uint32_t n_len) { +uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, + const uint64_t* STORM_RESTRICT data2, + const size_t n_len) +{ #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3291,7 +3280,7 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* * POPCNT and POSPOPCNT functions. ***************************************/ static -uint64_t STORM_popcnt(const uint8_t* data, uint32_t size) { +uint64_t STORM_popcnt(const uint8_t* data, size_t size) { uint64_t cnt = 0; uint64_t i; @@ -3369,8 +3358,8 @@ uint64_t STORM_popcnt(const uint8_t* data, uint32_t size) { } static -int STORM_pospopcnt_u16(const uint16_t* data, uint32_t len, uint32_t* flags) { - memset(flags, 0, sizeof(uint32_t)*16); +int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { + memset(out, 0, sizeof(uint32_t)*16); #if defined(HAVE_CPUID) #if defined(__cplusplus) @@ -3394,34 +3383,34 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t len, uint32_t* flags) { #if defined(HAVE_AVX512) if ((cpuid & STORM_bit_AVX512BW)) { - if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, flags)); // small - else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, flags)); // small - else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, flags)); // medium - else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, flags)); // medium3 - else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, flags)); // fix + if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small + else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, out)); // small + else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, out)); // medium + else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, out)); // medium3 + else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, out)); // fix } #endif #if defined(HAVE_AVX2) if ((cpuid & STORM_bit_AVX2)) { - if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, flags)); // small - else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, flags)); // medium - else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, flags)); // large + if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small + else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, out)); // medium + else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, out)); // large } #endif #if defined(HAVE_SSE4) if ((cpuid & STORM_bit_SSE42)) { - return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, flags)); + return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, out)); } #endif #ifndef _MSC_VER - return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, flags)); // fallback scalar + return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, out)); // fallback scalar #else - return(STORM_pospopcnt_u16_scalar_naive(data, len, flags)); + return(STORM_pospopcnt_u16_scalar_naive(data, len, out)); #endif } From bc1a9750b7dd277b32ff8fbe6334092747e1b57f Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sun, 18 Aug 2019 08:46:55 +0100 Subject: [PATCH 10/15] semantics --- libalgebra.h | 187 +++++++++++++++++++++++++-------------------------- 1 file changed, 93 insertions(+), 94 deletions(-) diff --git a/libalgebra.h b/libalgebra.h index 7150332..5c9c550 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -65,8 +65,8 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef LIBALGEBRA_H_9827563662203 -#define LIBALGEBRA_H_9827563662203 +#ifndef LIBALGEBRA_H_8723467365934 +#define LIBALGEBRA_H_8723467365934 /* ************************************* * Includes @@ -77,32 +77,18 @@ #include #include -// Safety +/* ************************************* +* Safety +***************************************/ + #if !(defined(__APPLE__)) && !(defined(__FreeBSD__)) #include // this should never be needed but there are some reports that it is needed. #endif -/* ************************************* - * Support. - * - * These subroutines and definitions are taken from the CRoaring repo - * by Daniel Lemire et al. available under the Apache 2.0 License - * (same as libintersect.h): - * https://github.com/RoaringBitmap/CRoaring/ - ***************************************/ #if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8 #error This code assumes 64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported. #endif -/* === Compiler specifics === */ - -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ -# define STORM_RESTRICT restrict -#else -/* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */ -# define STORM_RESTRICT /* disable */ -#endif - /**************************** * Memory management * @@ -110,6 +96,11 @@ * STORM_aligned_malloc and STORM_aligned_free to prevent clashing with the * same subroutines in Roaring. These subroutines are included here * since there is no hard dependency on using Roaring bitmaps. +* +* These subroutines and definitions are taken from the CRoaring repo +* by Daniel Lemire et al. available under the Apache 2.0 License +* (same as libalgebra.h): +* https://github.com/RoaringBitmap/CRoaring/ ****************************/ // portable version of posix_memalign #ifndef _MSC_VER @@ -150,15 +141,18 @@ void STORM_aligned_free(void* memblock) { // portable alignment #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ # include -# define STORM_ALIGN(n) alignas(n) +# define STORM_ALIGN(n) alignas(n) #elif defined(__GNUC__) -# define STORM_ALIGN(n) __attribute__ ((aligned(n))) +# define STORM_ALIGN(n) __attribute__ ((aligned(n))) #elif defined(_MSC_VER) -# define STORM_ALIGN(n) __declspec(align(n)) +# define STORM_ALIGN(n) __declspec(align(n)) #else -# define STORM_ALIGN(n) /* disabled */ +# define STORM_ALIGN(n) /* disabled */ #endif +/* ************************************* +* Compiler Specific Options +***************************************/ // Taken from XXHASH #ifdef _MSC_VER /* Visual Studio */ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ @@ -179,19 +173,6 @@ void STORM_aligned_free(void* memblock) { # endif /* __STDC_VERSION__ */ #endif -// disable noise -#ifdef __GNUC__ -#define STORM_WARN_UNUSED __attribute__((warn_unused_result)) -#else -#define STORM_WARN_UNUSED -#endif - -/*------ SIMD definitions --------*/ - -#define STORM_SSE_ALIGNMENT 16 -#define STORM_AVX2_ALIGNMENT 32 -#define STORM_AVX512_ALIGNMENT 64 - /**************************** * General checks ****************************/ @@ -208,6 +189,20 @@ void STORM_aligned_free(void* memblock) { #define STORM_HAS_ATTRIBUTE(x) __has_attribute(x) #endif +// disable noise +#ifdef __GNUC__ +#define STORM_WARN_UNUSED __attribute__((warn_unused_result)) +#else +#define STORM_WARN_UNUSED +#endif + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define STORM_RESTRICT restrict +#else +/* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */ +# define STORM_RESTRICT /* disable */ +#endif + #ifdef __GNUC__ #define GNUC_PREREQ(x, y) \ (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) @@ -294,13 +289,19 @@ void STORM_aligned_free(void* memblock) { #define STORM_TARGET(x) 0 #endif + +/**************************** +* CPUID and SIMD +****************************/ + +#define STORM_SSE_ALIGNMENT 16 +#define STORM_AVX2_ALIGNMENT 32 +#define STORM_AVX512_ALIGNMENT 64 + #ifdef __cplusplus extern "C" { #endif -/**************************** -* CPUID -****************************/ #if defined(HAVE_CPUID) #if defined(_MSC_VER) @@ -310,13 +311,13 @@ extern "C" { // CPUID flags. See https://en.wikipedia.org/wiki/CPUID for more info. /* %ecx bit flags */ -#define STORM_bit_POPCNT (1 << 23) // POPCNT instruction -#define STORM_bit_SSE41 (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19] -#define STORM_bit_SSE42 (1 << 20) // CPUID.01H:ECX.SSE41[Bit 20] +#define STORM_CPUID_runtime_bit_POPCNT (1 << 23) // POPCNT instruction +#define STORM_CPUID_runtime_bit_SSE41 (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19] +#define STORM_CPUID_runtime_bit_SSE42 (1 << 20) // CPUID.01H:ECX.SSE41[Bit 20] /* %ebx bit flags */ -#define STORM_bit_AVX2 (1 << 5) // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5] -#define STORM_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions +#define STORM_CPUID_runtime_bit_AVX2 (1 << 5) // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5] +#define STORM_CPUID_runtime_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions /* xgetbv bit flags */ #define STORM_XSTATE_SSE (1 << 1) @@ -382,16 +383,16 @@ int STORM_get_cpuid() { STORM_run_cpuid(1, 0, abcd); // Check for POPCNT instruction - if ((abcd[2] & STORM_bit_POPCNT) == STORM_bit_POPCNT) - flags |= STORM_bit_POPCNT; + if ((abcd[2] & STORM_CPUID_runtime_bit_POPCNT) == STORM_CPUID_runtime_bit_POPCNT) + flags |= STORM_CPUID_runtime_bit_POPCNT; // Check for SSE4.1 instruction set - if ((abcd[2] & STORM_bit_SSE41) == STORM_bit_SSE41) - flags |= STORM_bit_SSE41; + if ((abcd[2] & STORM_CPUID_runtime_bit_SSE41) == STORM_CPUID_runtime_bit_SSE41) + flags |= STORM_CPUID_runtime_bit_SSE41; // Check for SSE4.2 instruction set - if ((abcd[2] & STORM_bit_SSE42) == STORM_bit_SSE42) - flags |= STORM_bit_SSE42; + if ((abcd[2] & STORM_CPUID_runtime_bit_SSE42) == STORM_CPUID_runtime_bit_SSE42) + flags |= STORM_CPUID_runtime_bit_SSE42; #if defined(HAVE_AVX2) || \ defined(HAVE_AVX512) @@ -410,12 +411,12 @@ int STORM_get_cpuid() { if ((xcr0 & ymm_mask) == ymm_mask) { STORM_run_cpuid(7, 0, abcd); - if ((abcd[1] & STORM_bit_AVX2) == STORM_bit_AVX2) - flags |= STORM_bit_AVX2; + if ((abcd[1] & STORM_CPUID_runtime_bit_AVX2) == STORM_CPUID_runtime_bit_AVX2) + flags |= STORM_CPUID_runtime_bit_AVX2; if ((xcr0 & zmm_mask) == zmm_mask) { - if ((abcd[1] & STORM_bit_AVX512BW) == STORM_bit_AVX512BW) - flags |= STORM_bit_AVX512BW; + if ((abcd[1] & STORM_CPUID_runtime_bit_AVX512BW) == STORM_CPUID_runtime_bit_AVX512BW) + flags |= STORM_CPUID_runtime_bit_AVX512BW; } } @@ -2900,14 +2901,12 @@ uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1, uint64_t count = 0; #define MOD(x) (( (x) * 64 ) >> 6) - if(n1 < n2) { - for (int i = 0; i < n1; ++i) { + if (n1 < n2) { + for (int i = 0; i < n1; ++i) count += ((b2[l1[i] >> 6] & (1L << MOD(l1[i]))) != 0); - } } else { - for (int i = 0; i < n2; ++i) { + for (int i = 0; i < n2; ++i) count += ((b1[l2[i] >> 6] & (1L << MOD(l2[i]))) != 0); - } } #undef MOD return(count); @@ -2950,19 +2949,19 @@ uint32_t STORM_get_alignment() { uint32_t alignment = 0; #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW)) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { // 16*512 alignment = STORM_AVX512_ALIGNMENT; } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && alignment == 0) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && alignment == 0) { // 16*256 alignment = STORM_AVX2_ALIGNMENT; } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && alignment == 0) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && alignment == 0) { // 16*128 alignment = STORM_SSE_ALIGNMENT; } #endif @@ -3000,23 +2999,23 @@ STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_intersect_count_avx512; } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_intersect_count_avx2; } - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { return &STORM_intersect_count_lookup_avx2; } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_intersect_count_sse4; } #endif @@ -3048,23 +3047,23 @@ STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_union_count_avx512; } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_union_count_avx2; } - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { return &STORM_union_count_lookup_avx2; } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_union_count_sse4; } #endif @@ -3096,23 +3095,23 @@ STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) { #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_diff_count_avx512; } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_diff_count_avx2; } - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { return &STORM_diff_count_lookup_avx2; } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_diff_count_sse4; } #endif @@ -3150,23 +3149,23 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_len >= 128) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_intersect_count_avx512(data1, data2, n_len); } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_len >= 64) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_intersect_count_avx2(data1, data2, n_len); } - if ((cpuid & STORM_bit_AVX2) && n_len >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { return STORM_intersect_count_lookup_avx2(data1, data2, n_len); } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_len >= 32) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_intersect_count_sse4(data1, data2, n_len); } #endif @@ -3201,23 +3200,23 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_len >= 128) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_union_count_avx512(data1, data2, n_len); } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_len >= 64) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_union_count_avx2(data1, data2, n_len); } - if ((cpuid & STORM_bit_AVX2) && n_len >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { return STORM_union_count_lookup_avx2(data1, data2, n_len); } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_len >= 32) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_union_count_sse4(data1, data2, n_len); } #endif @@ -3252,23 +3251,23 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_len >= 128) { // 16*512 + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_diff_count_avx512(data1, data2, n_len); } #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_len >= 64) { // 16*256 + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_diff_count_avx2(data1, data2, n_len); } - if ((cpuid & STORM_bit_AVX2) && n_len >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { return STORM_diff_count_lookup_avx2(data1, data2, n_len); } #endif #if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_len >= 32) { // 16*128 + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_diff_count_sse4(data1, data2, n_len); } #endif @@ -3306,7 +3305,7 @@ uint64_t STORM_popcnt(const uint8_t* data, size_t size) { #if defined(HAVE_AVX512) /* AVX512 requires arrays >= 1024 bytes */ - if ((cpuid & STORM_bit_AVX512BW) && + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && size >= 1024) { cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64); @@ -3319,7 +3318,7 @@ uint64_t STORM_popcnt(const uint8_t* data, size_t size) { #if defined(HAVE_AVX2) /* AVX2 requires arrays >= 512 bytes */ - if ((cpuid & STORM_bit_AVX2) && + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && size >= 512) { cnt += STORM_popcnt_avx2((const __m256i*)data, size / 32); @@ -3331,7 +3330,7 @@ uint64_t STORM_popcnt(const uint8_t* data, size_t size) { #if defined(HAVE_POPCNT) - if (cpuid & STORM_bit_POPCNT) { + if (cpuid & STORM_CPUID_runtime_bit_POPCNT) { cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8); data += size - size % 8; size = size % 8; @@ -3381,7 +3380,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { #endif #if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW)) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, out)); // small @@ -3392,7 +3391,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { #endif #if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2)) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2)) { if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, out)); // medium @@ -3401,7 +3400,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { #endif #if defined(HAVE_SSE4) - if ((cpuid & STORM_bit_SSE42)) + if ((cpuid & STORM_CPUID_runtime_bit_SSE42)) { return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, out)); } @@ -3418,4 +3417,4 @@ int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { } /* extern "C" */ #endif -#endif /* LIBALGEBRA_H_9827563662203 */ +#endif /* LIBALGEBRA_H_8723467365934 */ From 57cf5314990f0371eaecf7c41172af2d4570304f Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Sun, 18 Aug 2019 09:34:09 +0100 Subject: [PATCH 11/15] results --- README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index bd16743..51bca3e 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ The core algorithms are described in the papers: ### Speedup -Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays and compute the same statistics many times. This reflect the fastest possible throughput if you never have to leave the destination cache-level. +Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays in a aligned memory location and compute the same statistics many times. This reflect the fastest possible throughput if you never have to leave the destination cache-level. The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3). ### POSPOPCNT @@ -49,24 +49,58 @@ CPUs compared to a naive unvectorized solution ### POPCNT Fold speedup compared to a naive unvectorized algorithm -(`popcount_scalar_naive_nosimd`) for different array sizes. - -| Algorithm | 8 | 32 | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | -|-----------|-----|-----|------|------|------|-------|-------|-------|-------| -| popcnt | 3.3 | 6.5 | 20.1 | 25.4 | 62.4 | 116.7 | 175.1 | 226.4 | 247.8 | +(`popcount_scalar_naive_nosimd`) for different array sizes as (CPU cycles/64-bit word, Instructions/64-bit word): + +| Words | libalgebra.h | Scalar | Speedup | +|---------|---------------|---------------|---------| +| 4 | 28.5 (36.75) | 27.25 (34) | 1 | +| 8 | 18.38 (25.5) | 15.63 (30.5) | 0.9 | +| 16 | 10.81 (19.94) | 13.75 (28.75) | 1.3 | +| 32 | 7.91 (17.16) | 11.25 (27.88) | 1.4 | +| 64 | 3.47 (3.92) | 9.7 (27.44) | 2.8 | +| 128 | 2.23 (1.93) | 8.97 (27.22) | 4 | +| 256 | 1.09 (1.29) | 8.6 (27.11) | 7.9 | +| 512 | 0.7 (0.98) | 8.35 (27.06) | 11.9 | +| 1024 | 0.48 (0.82) | 8.22 (27.03) | 17.2 | +| 2048 | 0.35 (0.74) | 8.12 (27.01) | 23.5 | +| 4096 | 0.29 (0.7) | 8.09 (27.01) | 28.1 | +| 8192 | 0.24 (0.68) | 8.04 (27) | 33.2 | +| 16384 | 0.22 (0.67) | 8.02 (27) | 36.6 | +| 32768 | **0.21 (0.66)** | 8.02 (27) | **38.7** | +| 65536 | 0.83 (0.66) | 8.01 (27) | 9.7 | +| 131072 | 0.47 (0.66) | 8.02 (27) | 17 | +| 262144 | 0.49 (0.66) | 8.05 (27) | 16.5 | +| 524288 | 0.48 (0.66) | 8.07 (27) | 16.9 | +| 1048576 | 0.44 (0.66) | 8.07 (27) | 18.5 | ### Set algebra Fold speedup compared to naive unvectorized solution (`*_scalar_naive_nosimd`) -for different array sizes (in number of _pairs_ of 64-bit values). These +for different array sizes (in number of _pairs_ of 64-bit word but results reported per _single_ 64-bit word). These functions are identifical with the exception of the bitwise operator used (AND, OR, or XOR) which all have identical latency and throughput (CPI). -| Algorithm | 8 | 32 | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | -|-----------------|------|-------|-------|-------|-------|-------|-------|-------|-------| -| intersect count | 4.73 | 10.8 | 17.58 | 24.82 | 31 | 35.78 | 37.75 | 23.08 | 20.81 | -| union count | 4.64 | 10.96 | 17.19 | 24.88 | 31.09 | 35.74 | 37.95 | 22.92 | 21.11 | -| diff count | 4.57 | 10.93 | 17.31 | 24.78 | 30.98 | 35.74 | 37.87 | 23.31 | 21.42 | +| Words | libalgebra.h | Scalar | Speedup | +|---------|--------------|---------------|---------| +| 4 | 18.25 (8.63) | 14.63 (22.75) | 0.8 | +| 8 | 9.5 (5.44) | 10.25 (20.88) | 1.1 | +| 16 | 5.28 (3.84) | 7.91 (19.94) | 1.5 | +| 32 | 2.88 (2.56) | 6.88 (19.47) | 2.4 | +| 64 | 1.98 (2.06) | 5.94 (19.23) | 3 | +| 128 | 1.05 (0.89) | 5.49 (19.12) | 5.2 | +| 256 | 0.61 (0.64) | 5.24 (19.06) | 8.6 | +| 512 | 0.42 (0.51) | 5.13 (19.03) | 12.3 | +| 1024 | 0.29 (0.45) | 5.06 (19.02) | 17.4 | +| 2048 | 0.21 (0.41) | 5.04 (19.01) | 24.2 | +| 4096 | 0.17 (0.4) | 5.02 (19) | 29 | +| 8192 | 0.15 (0.39) | 5.01 (19) | 34.3 | +| 16384 | **0.13 (0.39)** | 5.01 (19) | **37.6** | +| 32768 | 0.54 (0.39) | 5 (19) | 9.3 | +| 65536 | 0.43 (0.38) | 5 (19) | 11.7 | +| 131072 | 0.28 (0.38) | 5 (19) | 17.8 | +| 262144 | 0.25 (0.38) | 5 (19) | 20.4 | +| 524288 | 0.24 (0.38) | 5 (19) | 20.7 | +| 1048576 | 0.22 (0.38) | 5 (19) | 22.9 | ## C/C++ API @@ -111,6 +145,25 @@ uint64_t STORM_union_count(const uint64_t* data1, const uint64_t* data2, const u uint64_t STORM_diff_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); ``` +### Advanced use + +Retrieve a function pointer to the optimal function given the target length. + +```C +STORM_compute_func STORM_get_intersection_count_func(const size_t n_bitmaps_vector); +STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector); +STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector); +``` + +Portable memory alignment. + +```C +#include "libalgebra.h" + +void* STORM_aligned_malloc(size_t alignment, size_t size); +void STORM_aligned_free(void* memblock); +``` + ## How it works On x86 CPUs ```libalgebra.h``` uses a combination of algorithms depending on the input vector size and what instruction set your CPU supports. These checks are performed during **run-time**. \ No newline at end of file From 825a9547ec9ae42ae90870582358ef7c157fb677 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Mon, 19 Aug 2019 00:11:41 +0100 Subject: [PATCH 12/15] popcnt sse4-csa --- benchmark.cpp | 99 ++++++++-------- libalgebra.h | 310 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 276 insertions(+), 133 deletions(-) diff --git a/benchmark.cpp b/benchmark.cpp index 93708a8..09fba3d 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -9,24 +9,28 @@ #include "getopt.h" #endif -void generate_random_data(uint64_t* data, uint32_t range, uint32_t n) { +uint64_t* generate_random_data(uint32_t n_bitmaps) { // Clear data - uint32_t n_bitmaps = ceil(n / 64.0); - memset(data, 0, sizeof(uint64_t)*n_bitmaps); + // uint32_t n_bitmaps = ceil(n / 64.0); + // memset(data, 0, sizeof(uint64_t)*n_bitmaps); + uint64_t* mem = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); // PRNG - std::uniform_int_distribution distr(0, range-1); // right inclusive + std::uniform_int_distribution distr(0, std::numeric_limits::max()-1); // right inclusive std::random_device rd; // obtain a random number from hardware std::mt19937 eng(rd()); // seed the generator // Generate some random data. uint32_t n_unique = 0; // while (n_unique < n) { - for (int i = 0; i < n; ++i) { - uint32_t val = distr(eng); - n_unique += (data[val / 64] & (1ULL << (val % 64))) == 0; - data[val / 64] |= 1ULL << (val % 64); + for (int i = 0; i < n_bitmaps; ++i) { + uint32_t val1 = distr(eng); + uint32_t val2 = distr(eng); + uint64_t x = ((uint64_t)val1 << 32) | val2; + mem[i] = x; } + + return mem; } #ifdef __linux__ @@ -159,8 +163,6 @@ compute_averages(std::vector< std::vector > allresults) { int linux_set_algebra_wrapper(std::string name, STORM_compute_func f, int iterations, - uint64_t* STORM_RESTRICT data1, - uint64_t* STORM_RESTRICT data2, uint32_t range, uint32_t n_values, uint32_t n_bitmaps, @@ -184,14 +186,17 @@ int linux_set_algebra_wrapper(std::string name, volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization for (uint32_t i = 0; i < iterations; i++) { - generate_random_data(data1, range, n_values); - generate_random_data(data2, range, n_values); + uint64_t* mem1 = generate_random_data(n_values); + uint64_t* mem2 = generate_random_data(n_values); unified.start(); // Call argument subroutine pointer. - total += (*f)(data1, data2, n_bitmaps); + total += (*f)(mem1, mem2, n_bitmaps); unified.end(results); allresults.push_back(results); + + STORM_aligned_free(mem1); + STORM_aligned_free(mem2); } std::vector mins = compute_mins(allresults); @@ -234,10 +239,9 @@ int linux_set_algebra_wrapper(std::string name, int linux_popcount_wrapper(std::string name, STORM_popcnt_func f, int iterations, - uint64_t* data, uint32_t range, uint32_t n_values, - uint32_t n_bitmaps, + uint32_t n_bitmaps, bool verbose) { std::vector evts; @@ -258,14 +262,15 @@ int linux_popcount_wrapper(std::string name, volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization for (uint32_t i = 0; i < iterations; i++) { - generate_random_data(data, range, n_values); - size_t n_vals = n_bitmaps*8; + uint64_t* mem1 = generate_random_data(n_values); unified.start(); // Call argument subroutine pointer. - total += (*f)((uint8_t*)data, n_vals); + total += (*f)((uint8_t*)mem1, n_bitmaps*8); unified.end(results); allresults.push_back(results); + + STORM_aligned_free(mem1); } std::vector mins = compute_mins(allresults); @@ -276,8 +281,8 @@ int linux_popcount_wrapper(std::string name, name.c_str(), n_bitmaps, double(mins[1]) / mins[0], - double(mins[0]) / n_bitmaps, - double(mins[1]) / n_bitmaps, + double(mins[0]) / (n_bitmaps), + double(mins[1]) / (n_bitmaps), mins[0], mins[1], mins[2], @@ -292,7 +297,7 @@ int linux_popcount_wrapper(std::string name, printf("%s-%u:\n",name.c_str(),n_bitmaps); printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " "instructions per 64-bit word %4.3f \n", - double(mins[1]) / mins[0], double(mins[0]) / n_bitmaps, double(mins[1]) / n_bitmaps); + double(mins[1]) / mins[0], double(mins[0]) / (n_bitmaps), double(mins[1]) / (n_bitmaps)); // first we display mins printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu " "cache ref., %8llu cache mis.\n", @@ -440,8 +445,6 @@ typedef std::chrono::high_resolution_clock::time_point clockdef; int set_algebra_wrapper(std::string name, STORM_compute_func f, int iterations, - uint64_t* STORM_RESTRICT data1, - uint64_t* STORM_RESTRICT data2, uint32_t range, uint32_t n_values, size_t n_bitmaps, @@ -474,8 +477,8 @@ asm volatile("RDTSCP\n\t" "mov %%eax, %1\n\t" "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); #endif - generate_random_data(data1, range, n_values); - generate_random_data(data2, range, n_values); + uint64_t* mem1 = generate_random_data(n_values); + uint64_t* mem2 = generate_random_data(n_values); volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization clockdef t1 = std::chrono::high_resolution_clock::now(); @@ -496,7 +499,7 @@ asm volatile("RDTSCP\n\t" for (int i = 0; i < iterations; ++i) { // Call argument subroutine pointer. - total += (*f)(data1, data2, n_bitmaps); + total += (*f)(mem1, mem2, n_bitmaps); } #ifndef _MSC_VER @@ -513,6 +516,9 @@ asm volatile("RDTSCP\n\t" clockdef t2 = std::chrono::high_resolution_clock::now(); auto time_span = std::chrono::duration_cast(t2 - t1); + STORM_aligned_free(mem1); + STORM_aligned_free(mem2); + uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); @@ -537,7 +543,6 @@ asm volatile("RDTSCP\n\t" int popcount_wrapper(std::string name, STORM_popcnt_func f, int iterations, - uint64_t* data, uint32_t range, uint32_t n_values, uint32_t n_bitmaps, @@ -570,7 +575,7 @@ asm volatile("RDTSCP\n\t" "mov %%eax, %1\n\t" "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); #endif - generate_random_data(data, range, n_values); + uint64_t* mem = generate_random_data(n_values); volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization clockdef t1 = std::chrono::high_resolution_clock::now(); @@ -592,7 +597,7 @@ asm volatile("RDTSCP\n\t" size_t n_b = n_bitmaps*8; for (int i = 0; i < iterations; ++i) { // Call argument subroutine pointer. - total += (*f)((uint8_t*)data, n_b); + total += (*f)((uint8_t*)mem, n_b); } #ifndef _MSC_VER @@ -609,6 +614,8 @@ asm volatile("RDTSCP\n\t" clockdef t2 = std::chrono::high_resolution_clock::now(); auto time_span = std::chrono::duration_cast(t2 - t1); + STORM_aligned_free(mem); + uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); @@ -638,7 +645,7 @@ int benchmark(int n_repetitions, bool use_perf = false) { std::vector ranges = {4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576}; std::vector reps; if (n_repetitions <= 0) { - reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,50,50,50,50,50,50,50,25,25,25}; + reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,150,150,150,150,150,150,150,100,100,100}; } else { reps = std::vector(ranges.size(), n_repetitions); } @@ -657,27 +664,27 @@ int benchmark(int n_repetitions, bool use_perf = false) { if (use_perf) { #ifdef __linux__ - linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], true); - linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], true); - linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); - linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); - linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); - linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); - linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); - linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], true); + linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); #else std::cerr << "perf counter are only available on Linux systems!" << std::endl; exit(EXIT_FAILURE); #endif } else { - popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], unit_intsec); - popcount_wrapper("popcount",&STORM_popcnt, reps[i], bitmaps, ranges[i], ranges[i], ranges[i], unit_intsec); - set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); - set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); - set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); - set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_union); - set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_intsec); - set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], bitmaps, bitmaps2, ranges[i], ranges[i], ranges[i], unit_diff); + popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_union); + set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_diff); } } diff --git a/libalgebra.h b/libalgebra.h index 5c9c550..5c5e476 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -229,57 +229,57 @@ void STORM_aligned_free(void* memblock) { defined(_MSC_VER) || \ (GNUC_PREREQ(4, 2) || \ STORM_HAS_BUILTIN(__sync_val_compare_and_swap))) - #define HAVE_CPUID + #define STORM_HAVE_CPUID #endif #if GNUC_PREREQ(4, 2) || \ STORM_HAS_BUILTIN(__builtin_popcount) - #define HAVE_BUILTIN_POPCOUNT + #define STORM_HAVE_BUILTIN_POPCOUNT #endif #if GNUC_PREREQ(4, 2) || \ CLANG_PREREQ(3, 0) - #define HAVE_ASM_POPCNT + #define STORM_HAVE_ASM_POPCNT #endif -#if defined(HAVE_CPUID) && \ - (defined(HAVE_ASM_POPCNT) || \ +#if defined(STORM_HAVE_CPUID) && \ + (defined(STORM_HAVE_ASM_POPCNT) || \ defined(_MSC_VER)) - #define HAVE_POPCNT + #define STORM_HAVE_POPCNT #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ GNUC_PREREQ(4, 9) - #define HAVE_SSE42 - #define HAVE_AVX2 + #define STORM_HAVE_SSE42 + #define STORM_HAVE_AVX2 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ GNUC_PREREQ(5, 0) - #define HAVE_AVX512 + #define STORM_HAVE_AVX512 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ defined(_MSC_VER) && \ defined(__AVX2__) - #define HAVE_SSE42 - #define HAVE_AVX2 + #define STORM_HAVE_SSE42 + #define STORM_HAVE_AVX2 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ defined(_MSC_VER) && \ defined(__AVX512__) - #define HAVE_AVX512 + #define STORM_HAVE_AVX512 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ CLANG_PREREQ(3, 8) && \ STORM_HAS_ATTRIBUTE(target) && \ (!defined(_MSC_VER) || defined(__AVX2__)) && \ (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) - #define HAVE_SSE42 - #define HAVE_AVX2 - #define HAVE_AVX512 + #define STORM_HAVE_SSE42 + #define STORM_HAVE_AVX2 + #define STORM_HAVE_AVX512 #endif // Target attribute @@ -302,7 +302,7 @@ void STORM_aligned_free(void* memblock) { extern "C" { #endif -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(_MSC_VER) #include @@ -357,8 +357,8 @@ void STORM_run_cpuid(int eax, int ecx, int* abcd) { #endif } -#if defined(HAVE_AVX2) || \ - defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX2) || \ + defined(STORM_HAVE_AVX512) static int STORM_get_xcr0() { @@ -394,8 +394,8 @@ int STORM_get_cpuid() { if ((abcd[2] & STORM_CPUID_runtime_bit_SSE42) == STORM_CPUID_runtime_bit_SSE42) flags |= STORM_CPUID_runtime_bit_SSE42; -#if defined(HAVE_AVX2) || \ - defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX2) || \ + defined(STORM_HAVE_AVX512) int osxsave_mask = (1 << 27); @@ -424,10 +424,10 @@ int STORM_get_cpuid() { return flags; } -#endif // defined(HAVE_CPUID) +#endif // defined(STORM_HAVE_CPUID) /// Taken from libpopcnt.h -#if defined(HAVE_ASM_POPCNT) && \ +#if defined(STORM_HAVE_ASM_POPCNT) && \ defined(__x86_64__) STORM_FORCE_INLINE @@ -437,7 +437,7 @@ uint64_t STORM_POPCOUNT(uint64_t x) return x; } -#elif defined(HAVE_ASM_POPCNT) && \ +#elif defined(STORM_HAVE_ASM_POPCNT) && \ defined(__i386__) STORM_FORCE_INLINE @@ -477,7 +477,7 @@ uint64_t STORM_POPCOUNT(uint64_t x) } /* non x86 CPUs */ -#elif defined(HAVE_BUILTIN_POPCOUNT) +#elif defined(STORM_HAVE_BUILTIN_POPCOUNT) STORM_FORCE_INLINE uint64_t STORM_POPCOUNT(uint64_t x) { @@ -779,7 +779,7 @@ const uint8_t STORM_popcnt_lookup8bit[256] = { * SSE4.1 functions ****************************/ -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) #include @@ -1352,6 +1352,57 @@ uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } +STORM_TARGET("sse4.2") +static +uint64_t STORM_popcnt_csa_sse4(const __m128i* STORM_RESTRICT data, + size_t size) +{ + __m128i ones = _mm_setzero_si128(); + __m128i twos = _mm_setzero_si128(); + __m128i fours = _mm_setzero_si128(); + __m128i eights = _mm_setzero_si128(); + __m128i sixteens = _mm_setzero_si128(); + __m128i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t cnt64 = 0; + +#define LOAD(a) (_mm_loadu_si128(&data[i+a])) + + for (/**/; i < limit; i += 16) { + STORM_CSA128(&twosA, &ones, ones, LOAD(0), LOAD(1)); + STORM_CSA128(&twosB, &ones, ones, LOAD(2), LOAD(3)); + STORM_CSA128(&foursA, &twos, twos, twosA, twosB); + STORM_CSA128(&twosA, &ones, ones, LOAD(4), LOAD(5)); + STORM_CSA128(&twosB, &ones, ones, LOAD(6), LOAD(7)); + STORM_CSA128(&foursB, &twos, twos, twosA, twosB); + STORM_CSA128(&eightsA, &fours, fours, foursA, foursB); + STORM_CSA128(&twosA, &ones, ones, LOAD(8), LOAD(9)); + STORM_CSA128(&twosB, &ones, ones, LOAD(10), LOAD(11)); + STORM_CSA128(&foursA, &twos, twos, twosA, twosB); + STORM_CSA128(&twosA, &ones, ones, LOAD(12), LOAD(13)); + STORM_CSA128(&twosB, &ones, ones, LOAD(14), LOAD(15)); + STORM_CSA128(&foursB, &twos, twos, twosA, twosB); + STORM_CSA128(&eightsB, &fours, fours, foursA, foursB); + STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB); + + cnt64 += STORM_POPCOUNT_SSE(sixteens); + } +#undef LOAD + + cnt64 <<= 4; + cnt64 += STORM_POPCOUNT_SSE(eights) << 3; + cnt64 += STORM_POPCOUNT_SSE(fours) << 2; + cnt64 += STORM_POPCOUNT_SSE(twos) << 1; + cnt64 += STORM_POPCOUNT_SSE(ones) << 0; + + for (/**/; i < size; ++i) + cnt64 = STORM_POPCOUNT_SSE(_mm_loadu_si128(&data[i])); + + return cnt64; +} + STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1, @@ -1411,13 +1462,31 @@ uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, return(count); } + +STORM_TARGET("sse4.2") +static +uint64_t STORM_popcnt_sse4(const uint64_t* STORM_RESTRICT data, + const size_t n_ints) +{ + uint64_t count = 0; + const __m128i* r1 = (__m128i*)data; + const uint32_t n_cycles = n_ints / 2; + + count += STORM_popcnt_csa_sse4(r1, n_cycles); + + for (int i = n_cycles*2; i < n_ints; ++i) { + count += STORM_POPCOUNT(data[i]); + } + + return(count); +} #endif /**************************** * AVX256 functions ****************************/ -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) #include @@ -1845,7 +1914,7 @@ uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, STORM_TARGET("avx2") static -uint64_t STORM_popcnt_avx2(const __m256i* data, uint64_t size) +uint64_t STORM_popcnt_csa_avx2(const __m256i* data, uint64_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -2170,13 +2239,35 @@ uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, { return STORM_diff_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } + +STORM_TARGET("avx2") +static +uint64_t STORM_popcnt_avx2(const uint64_t* data, + const size_t n_ints) +{ + uint64_t count = 0; + const uint32_t n_cycles = n_ints / 4; + const uint32_t n_cycles_sse = (n_ints % 4) / 2; + + const __m256i* r1 = (__m256i*)&data[0]; + const __m128i* r2 = (__m128i*)&data[n_cycles_sse*4]; + + count += STORM_popcnt_csa_avx2(r1, n_cycles); + count += STORM_popcnt_csa_sse4(r2, n_cycles_sse); + + for (int i = (4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) { + count += STORM_POPCOUNT(data[i]); + } + + return count; +} #endif /**************************** * AVX512BW functions ****************************/ -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) #include @@ -2229,7 +2320,7 @@ void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, STORM_TARGET("avx512bw") static -uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, size_t size) +uint64_t STORM_popcnt_csa_avx512bw(const __m512i* STORM_RESTRICT data, size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2355,29 +2446,29 @@ int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, size_t len, } _mm512_storeu_si512((__m512i*)buffer, v1); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { out[j] += 1 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v2); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v4); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v8); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } @@ -2837,6 +2928,30 @@ uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, return(count); } + +STORM_TARGET("avx2") +static +uint64_t STORM_popcnt_avx512(const uint64_t* data, + const size_t n_ints) +{ + uint64_t count = 0; + const uint64_t n_64b = n_ints / 64; + const uint32_t n_cycles = n_64b / 8; + const uint32_t n_cycles_avx2 = (n_64b % 8) / 4; + const uint32_t n_cycles_sse = ((n_64b % 8) % 4) / 2; + + const __m512i* r1 = (__m512i*)&data[0]; + const __m256i* r2 = (__m256i*)&data[n_cycles*8]; + const __m128i* r3 = (__m128i*)&data[n_cycles*8+n_cycles_avx2*4]; + + count += STORM_popcnt_csa_avx512bw(r1, n_cycles); + count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2); + count += STORM_popcnt_csa_sse4(r3, n_cycles_sse); + + for (int i = (8*n_cycles + 4*n_cycles + 2*n_cycles_sse); i < n_64b; ++i) { + count += STORM_POPCOUNT(data[i]); + } +} #endif /**************************** @@ -2928,7 +3043,7 @@ typedef uint64_t (STORM_popcnt_func)(const uint8_t*, size_t); static uint32_t STORM_get_alignment() { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -2948,19 +3063,19 @@ uint32_t STORM_get_alignment() { #endif uint32_t alignment = 0; -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { // 16*512 alignment = STORM_AVX512_ALIGNMENT; } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && alignment == 0) { // 16*256 alignment = STORM_AVX2_ALIGNMENT; } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && alignment == 0) { // 16*128 alignment = STORM_SSE_ALIGNMENT; } @@ -2978,7 +3093,7 @@ uint32_t STORM_get_alignment() { static STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -2998,13 +3113,13 @@ STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_intersect_count_avx512; } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_intersect_count_avx2; } @@ -3014,7 +3129,7 @@ STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_intersect_count_sse4; } @@ -3026,7 +3141,7 @@ STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) static STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -3046,13 +3161,13 @@ STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_union_count_avx512; } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_union_count_avx2; } @@ -3062,7 +3177,7 @@ STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_union_count_sse4; } @@ -3074,7 +3189,7 @@ STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { static STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -3094,13 +3209,13 @@ STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) { #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_diff_count_avx512; } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_diff_count_avx2; } @@ -3110,7 +3225,7 @@ STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) { } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_diff_count_sse4; } @@ -3128,7 +3243,7 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const size_t n_len) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -3148,13 +3263,13 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_intersect_count_avx512(data1, data2, n_len); } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_intersect_count_avx2(data1, data2, n_len); } @@ -3164,7 +3279,7 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_intersect_count_sse4(data1, data2, n_len); } @@ -3179,7 +3294,7 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const size_t n_len) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -3199,13 +3314,13 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_union_count_avx512(data1, data2, n_len); } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_union_count_avx2(data1, data2, n_len); } @@ -3215,7 +3330,7 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_union_count_sse4(data1, data2, n_len); } @@ -3230,7 +3345,7 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const size_t n_len) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); @@ -3250,13 +3365,13 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_diff_count_avx512(data1, data2, n_len); } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_diff_count_avx2(data1, data2, n_len); } @@ -3266,7 +3381,7 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, } #endif -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_diff_count_sse4(data1, data2, n_len); } @@ -3282,10 +3397,11 @@ static uint64_t STORM_popcnt(const uint8_t* data, size_t size) { uint64_t cnt = 0; uint64_t i; + // size /= 8; -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) - /* C++11 thread-safe singleton */ + /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; @@ -3302,39 +3418,59 @@ uint64_t STORM_popcnt(const uint8_t* data, size_t size) { #endif #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) /* AVX512 requires arrays >= 1024 bytes */ if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && size >= 1024) { - cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64); - data += size - size % 64; - size = size % 64; + // cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64); + // data += size - size % 64; + // size = size % 64; + cnt += STORM_popcnt_avx512((uint64_t*)data, size/8); + data += size - size % 8; + size = size % 8; } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) /* AVX2 requires arrays >= 512 bytes */ if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && size >= 512) { - cnt += STORM_popcnt_avx2((const __m256i*)data, size / 32); - data += size - size % 32; - size = size % 32; + cnt += STORM_popcnt_avx2((uint64_t*)data, size/8); + data += size - size % 8; + size = size % 8; + // data += size - size % 32; + // size = size % 32; + } + +#endif + +#if defined(HAVE_SSE4) + + /* AVX2 requires arrays >= 512 bytes */ + if ((cpuid & STORM_CPUID_runtime_bit_SSE42) && + size >= 256) + { + cnt += STORM_popcnt_sse4((uint64_t*)data, size/8)); + data += size - size % 8; + size = size % 8; + // data += size - size % 32; + // size = size % 32; } #endif -#if defined(HAVE_POPCNT) +#if defined(STORM_HAVE_POPCNT) if (cpuid & STORM_CPUID_runtime_bit_POPCNT) { cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8); data += size - size % 8; size = size % 8; - for (i = 0; i < size; i++) + for (i = 0; i < size; ++i) cnt += STORM_popcount64(data[i]); return cnt; @@ -3350,7 +3486,7 @@ uint64_t STORM_popcnt(const uint8_t* data, size_t size) { } /* pure integer popcount algorithm */ - for (i = 0; i < size; i++) + for (i = 0; i < size; ++i) cnt += STORM_popcount64(data[i]); return cnt; @@ -3360,9 +3496,9 @@ static int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { memset(out, 0, sizeof(uint32_t)*16); -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) - /* C++11 thread-safe singleton */ + /* C++11 thread-safe singleton */ static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; @@ -3379,7 +3515,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { #endif #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small @@ -3390,7 +3526,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) if ((cpuid & STORM_CPUID_runtime_bit_AVX2)) { if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small From bc0d45af5aa71ee0b155300b85e5b9445948b7e6 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Mon, 19 Aug 2019 00:13:01 +0100 Subject: [PATCH 13/15] return --- libalgebra.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libalgebra.h b/libalgebra.h index 5c5e476..1948dde 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -2951,6 +2951,8 @@ uint64_t STORM_popcnt_avx512(const uint64_t* data, for (int i = (8*n_cycles + 4*n_cycles + 2*n_cycles_sse); i < n_64b; ++i) { count += STORM_POPCOUNT(data[i]); } + + return count; } #endif From c15019736cf2eb0bb7414987757f5d4278526a93 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Mon, 19 Aug 2019 00:22:39 +0100 Subject: [PATCH 14/15] assertion --- benchmark.cpp | 58 +++++++++++++++++++++++++++------------------------ libalgebra.h | 9 ++++---- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/benchmark.cpp b/benchmark.cpp index 09fba3d..905f006 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -33,6 +33,32 @@ uint64_t* generate_random_data(uint32_t n_bitmaps) { return mem; } +#if !defined(__clang__) && !defined(_MSC_VER) +__attribute__((optimize("no-tree-vectorize"))) +#endif +uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) { + uint64_t total = 0; + // for (int i = 0; i < len; ++i) { + // total += STORM_popcount64(data1[i] & data2[i]); + // } + // assert(len % 8 == 0); + + for (int j = 0; j < len; j += 8) { + // total += STORM_popcount64(data[i]); + // diff = data1[i] & data2[i]; + total += STORM_popcnt_lookup8bit[data[j+0]]; + total += STORM_popcnt_lookup8bit[data[j+1]]; + total += STORM_popcnt_lookup8bit[data[j+2]]; + total += STORM_popcnt_lookup8bit[data[j+3]]; + total += STORM_popcnt_lookup8bit[data[j+4]]; + total += STORM_popcnt_lookup8bit[data[j+5]]; + total += STORM_popcnt_lookup8bit[data[j+6]]; + total += STORM_popcnt_lookup8bit[data[j+7]]; + } + + return total; +} + #ifdef __linux__ #include // for __NR_perf_event_open @@ -266,10 +292,14 @@ int linux_popcount_wrapper(std::string name, unified.start(); // Call argument subroutine pointer. - total += (*f)((uint8_t*)mem1, n_bitmaps*8); + uint64_t a = (*f)((uint8_t*)mem1, n_bitmaps*8); unified.end(results); allresults.push_back(results); + uint64_t b = popcount_scalar_naive_nosimd((uint8_t*)mem1, n_bitmaps*8); + assert(a == b); + total += a; + STORM_aligned_free(mem1); } @@ -332,32 +362,6 @@ uint64_t get_cpu_cycles() { return result; }; -#if !defined(__clang__) && !defined(_MSC_VER) -__attribute__((optimize("no-tree-vectorize"))) -#endif -uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) { - uint64_t total = 0; - // for (int i = 0; i < len; ++i) { - // total += STORM_popcount64(data1[i] & data2[i]); - // } - // assert(len % 8 == 0); - - for (int j = 0; j < len; j += 8) { - // total += STORM_popcount64(data[i]); - // diff = data1[i] & data2[i]; - total += STORM_popcnt_lookup8bit[data[j+0]]; - total += STORM_popcnt_lookup8bit[data[j+1]]; - total += STORM_popcnt_lookup8bit[data[j+2]]; - total += STORM_popcnt_lookup8bit[data[j+3]]; - total += STORM_popcnt_lookup8bit[data[j+4]]; - total += STORM_popcnt_lookup8bit[data[j+5]]; - total += STORM_popcnt_lookup8bit[data[j+6]]; - total += STORM_popcnt_lookup8bit[data[j+7]]; - } - - return total; -} - #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif diff --git a/libalgebra.h b/libalgebra.h index 1948dde..ac0a790 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -2935,10 +2935,9 @@ uint64_t STORM_popcnt_avx512(const uint64_t* data, const size_t n_ints) { uint64_t count = 0; - const uint64_t n_64b = n_ints / 64; - const uint32_t n_cycles = n_64b / 8; - const uint32_t n_cycles_avx2 = (n_64b % 8) / 4; - const uint32_t n_cycles_sse = ((n_64b % 8) % 4) / 2; + const uint32_t n_cycles = n_ints / 8; + const uint32_t n_cycles_avx2 = (n_ints % 8) / 4; + const uint32_t n_cycles_sse = ((n_ints % 8) % 4) / 2; const __m512i* r1 = (__m512i*)&data[0]; const __m256i* r2 = (__m256i*)&data[n_cycles*8]; @@ -2948,7 +2947,7 @@ uint64_t STORM_popcnt_avx512(const uint64_t* data, count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2); count += STORM_popcnt_csa_sse4(r3, n_cycles_sse); - for (int i = (8*n_cycles + 4*n_cycles + 2*n_cycles_sse); i < n_64b; ++i) { + for (int i = (8*n_cycles + 4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) { count += STORM_POPCOUNT(data[i]); } From 03d13a50be9f255c3107e65a43fe870341a01efe Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Mon, 19 Aug 2019 00:42:10 +0100 Subject: [PATCH 15/15] results in readme --- README.md | 84 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 51bca3e..d8b98ec 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ The core algorithms are described in the papers: ### Speedup -Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays in a aligned memory location and compute the same statistics many times. This reflect the fastest possible throughput if you never have to leave the destination cache-level. +Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays in a aligned memory location and compute the same statistics many times using the command `benchmark -p -r 10000` (required Linux `perf` subsystem). This reflect the fastest possible throughput if you never have to leave the destination cache-level. The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3). ### POSPOPCNT @@ -51,27 +51,27 @@ CPUs compared to a naive unvectorized solution Fold speedup compared to a naive unvectorized algorithm (`popcount_scalar_naive_nosimd`) for different array sizes as (CPU cycles/64-bit word, Instructions/64-bit word): -| Words | libalgebra.h | Scalar | Speedup | -|---------|---------------|---------------|---------| -| 4 | 28.5 (36.75) | 27.25 (34) | 1 | -| 8 | 18.38 (25.5) | 15.63 (30.5) | 0.9 | -| 16 | 10.81 (19.94) | 13.75 (28.75) | 1.3 | -| 32 | 7.91 (17.16) | 11.25 (27.88) | 1.4 | -| 64 | 3.47 (3.92) | 9.7 (27.44) | 2.8 | -| 128 | 2.23 (1.93) | 8.97 (27.22) | 4 | -| 256 | 1.09 (1.29) | 8.6 (27.11) | 7.9 | -| 512 | 0.7 (0.98) | 8.35 (27.06) | 11.9 | -| 1024 | 0.48 (0.82) | 8.22 (27.03) | 17.2 | -| 2048 | 0.35 (0.74) | 8.12 (27.01) | 23.5 | -| 4096 | 0.29 (0.7) | 8.09 (27.01) | 28.1 | -| 8192 | 0.24 (0.68) | 8.04 (27) | 33.2 | -| 16384 | 0.22 (0.67) | 8.02 (27) | 36.6 | -| 32768 | **0.21 (0.66)** | 8.02 (27) | **38.7** | -| 65536 | 0.83 (0.66) | 8.01 (27) | 9.7 | -| 131072 | 0.47 (0.66) | 8.02 (27) | 17 | -| 262144 | 0.49 (0.66) | 8.05 (27) | 16.5 | -| 524288 | 0.48 (0.66) | 8.07 (27) | 16.9 | -| 1048576 | 0.44 (0.66) | 8.07 (27) | 18.5 | +| Words | libalgebra.h | Scalar | Speedup | +|---------|--------------|---------------|---------| +| 4 | 27.75 (37) | 26.75 (33.5) | 1 | +| 8 | 16.38 (25.5) | 17.38 (30.25) | 1.1 | +| 16 | 10.5 (19.94) | 12.75 (28.63) | 1.2 | +| 32 | 7.72 (17.16) | 10.69 (27.81) | 1.4 | +| 64 | 3.09 (4.36) | 9.61 (27.41) | 3.1 | +| 128 | 2.53 (2.73) | 8.84 (27.2) | 3.5 | +| 256 | 1.35 (1.7) | 8.5 (27.1) | 6.3 | +| 512 | 0.67 (1.18) | 8.33 (27.05) | 12.4 | +| 1024 | 0.5 (0.92) | 8.25 (27.03) | 16.4 | +| 2048 | 0.41 (0.79) | 8.15 (27.01) | 20.1 | +| 4096 | 0.46 (0.72) | 8.12 (27.01) | 17.8 | +| 8192 | 0.39 (0.69) | 8.11 (27) | 21 | +| 16384 | 0.39 (0.67) | 8.1 (27) | 20.6 | +| 32768 | 0.89 (0.66) | 8.1 (27) | 9.1 | +| 65536 | 0.84 (0.66) | 8.1 (27) | 9.6 | +| 131072 | 0.68 (0.66) | 8.09 (27) | 11.9 | +| 262144 | 1.11 (0.66) | 8.09 (27) | 7.3 | +| 524288 | 1.84 (0.66) | 8.12 (27) | 4.4 | +| 1048576 | 1.95 (0.66) | 8.15 (27) | 4.2 | ### Set algebra @@ -82,25 +82,25 @@ OR, or XOR) which all have identical latency and throughput (CPI). | Words | libalgebra.h | Scalar | Speedup | |---------|--------------|---------------|---------| -| 4 | 18.25 (8.63) | 14.63 (22.75) | 0.8 | -| 8 | 9.5 (5.44) | 10.25 (20.88) | 1.1 | -| 16 | 5.28 (3.84) | 7.91 (19.94) | 1.5 | -| 32 | 2.88 (2.56) | 6.88 (19.47) | 2.4 | -| 64 | 1.98 (2.06) | 5.94 (19.23) | 3 | -| 128 | 1.05 (0.89) | 5.49 (19.12) | 5.2 | -| 256 | 0.61 (0.64) | 5.24 (19.06) | 8.6 | -| 512 | 0.42 (0.51) | 5.13 (19.03) | 12.3 | -| 1024 | 0.29 (0.45) | 5.06 (19.02) | 17.4 | -| 2048 | 0.21 (0.41) | 5.04 (19.01) | 24.2 | -| 4096 | 0.17 (0.4) | 5.02 (19) | 29 | -| 8192 | 0.15 (0.39) | 5.01 (19) | 34.3 | -| 16384 | **0.13 (0.39)** | 5.01 (19) | **37.6** | -| 32768 | 0.54 (0.39) | 5 (19) | 9.3 | -| 65536 | 0.43 (0.38) | 5 (19) | 11.7 | -| 131072 | 0.28 (0.38) | 5 (19) | 17.8 | -| 262144 | 0.25 (0.38) | 5 (19) | 20.4 | -| 524288 | 0.24 (0.38) | 5 (19) | 20.7 | -| 1048576 | 0.22 (0.38) | 5 (19) | 22.9 | +| 4 | 17.63 (8.63) | 14.63 (22.75) | 0.8 | +| 8 | 8.13 (5.44) | 10 (20.88) | 1.2 | +| 16 | 4.69 (3.84) | 7.91 (19.94) | 1.7 | +| 32 | 2.38 (2.56) | 6.59 (19.47) | 2.8 | +| 64 | 1.82 (2.06) | 5.87 (19.23) | 3.2 | +| 128 | 0.88 (0.89) | 5.43 (19.12) | 6.2 | +| 256 | 0.57 (0.64) | 5.18 (19.06) | 9.2 | +| 512 | 0.41 (0.51) | 5.11 (19.03) | 12.4 | +| 1024 | 0.33 (0.45) | 5.06 (19.02) | 15.3 | +| 2048 | 0.39 (0.41) | 5.03 (19.01) | 13.1 | +| 4096 | 0.36 (0.4) | 5.02 (19) | 13.9 | +| 8192 | 0.37 (0.39) | 5.01 (19) | 13.7 | +| 16384 | 0.55 (0.39) | 5.01 (19) | 9.1 | +| 32768 | 0.55 (0.39) | 5 (19) | 9.2 | +| 65536 | 0.52 (0.38) | 5 (19) | 9.7 | +| 131072 | 0.56 (0.38) | 5.01 (19) | 9 | +| 262144 | 1.25 (0.38) | 5.02 (19) | 4 | +| 524288 | 1.76 (0.38) | 5.03 (19) | 2.9 | +| 1048576 | 1.81 (0.38) | 5.07 (19) | 2.8 | ## C/C++ API @@ -135,7 +135,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t size, uint32_t* flags); * Compute the intersection, union, or diff cardinality between pairs of bitmaps * @data1: A 64-bit array * @data2: A 64-bit array - * @size: Size of data in bytes + * @size: Size of data in 64-bit words */ // Intersect cardinality uint64_t STORM_intersect_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size);