From 209fac738c10cc7283211669f09437a188cb96ff Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 21 Aug 2023 16:59:39 -0500 Subject: [PATCH 01/31] Add dynamic_bitset and bitarray to benchmarks --- .gitmodules | 9 +++ benchmark/CMakeLists.txt | 10 +++- benchmark/src/benchmark_main.cc | 98 ++++++++++++++++++++++----------- benchmark/src/shift_bench.hpp | 57 +++++++++++++++++++ 4 files changed, 141 insertions(+), 33 deletions(-) create mode 100644 .gitmodules diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..29bb15d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "benchmark/ext/BitArray"] + path = benchmark/ext/BitArray + url = https://github.com/noporpoise/BitArray.git +[submodule "benchmark/ext/itsy_bitsy"] + path = benchmark/ext/itsy_bitsy + url = https://github.com/ThePhD/itsy_bitsy.git +[submodule "benchmark/ext/dynamic_bitset"] + path = benchmark/ext/dynamic_bitset + url = https://github.com/pinam45/dynamic_bitset.git diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index b5d8dfd..6870bea 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -12,9 +12,15 @@ set(CMAKE_BUILD_TYPE Release) file(GLOB BENCH_SOURCES "src/*.cc") add_executable(bitlib-bench ${BENCH_SOURCES}) +add_subdirectory(ext/dynamic_bitset) + # specify benchmark-specific libraries -include_directories(${googlebench_SOURCE_DIR}/benchmark/include src/utils) -target_link_libraries(bitlib-bench PRIVATE benchmark::benchmark -pthread) +include_directories( + ${googlebench_SOURCE_DIR}/benchmark/include + src/utils + ext/BitArray + ext/itsy_bitsy/include) +target_link_libraries(bitlib-bench PRIVATE benchmark::benchmark -pthread ${CMAKE_CURRENT_LIST_DIR}/ext/BitArray/libbitarr.a sul::dynamic_bitset) target_compile_options(bitlib-bench PUBLIC -O3 -DNDEBUG -march=native -Wpedantic) install(TARGETS bitlib-bench DESTINATION .) 
diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index bb68366..7312234 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -96,7 +96,7 @@ void register_bool_containers(F test_lambda_f, std::string func_name, unsigned i //BENCHMARK_MAIN(); int main(int argc, char** argv) { - unsigned int size_small = 1 << 4; + unsigned int size_small = 1 << 8; unsigned int size_medium = 1 << 8; unsigned int size_large = 1 << 16; unsigned int size_huge = 1 << 22; @@ -110,41 +110,77 @@ int main(int argc, char** argv) { BM_BitShiftLeft_UU, "bit::shift_left (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetShiftLeft, + "dynamic_bitset::shift_left (small)", + size_small); + register_word_containers( + BM_BitArrayShiftLeft, + "bitarray::shift_left (small)", + size_small); register_bool_containers( BM_BoolShiftLeft, "std::shift_left (small)", size_small); register_word_containers( BM_BitShiftLeft, - "bit::shift_left (large) (AA)", + "bit::shift_left (huge) (AA)", size_huge); register_word_containers( BM_BitShiftLeft_UU, - "bit::shift_left (large) (UU)", + "bit::shift_left (huge) (UU)", + size_huge); + register_word_containers( + BM_DynamicBitsetShiftLeft, + "dynamic_bitset::shift_left (huge)", + size_huge); + register_word_containers( + BM_BitArrayShiftLeft, + "bitarray::shift_left (huge)", size_huge); register_bool_containers( BM_BoolShiftLeft, - "std::shift_left (large)", + "std::shift_left (huge)", size_huge); + register_word_containers( + BM_BitShiftRight, + "bit::shift_right (small) (AA)", + size_small); register_word_containers( BM_BitShiftRight_UU, "bit::shift_right (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetShiftRight, + "dynamic_bitset::shift_right (small)", + size_small); + register_word_containers( + BM_BitArrayShiftRight, + "bitarray::shift_right (small)", + size_small); register_bool_containers( BM_BoolShiftRight, "std::shift_right (small)", size_small);
register_word_containers( BM_BitShiftRight, - "bit::shift_right (large) (AA)", + "bit::shift_right (huge) (AA)", size_huge); register_word_containers( BM_BitShiftRight_UU, - "bit::shift_right (large) (UU)", + "bit::shift_right (huge) (UU)", + size_huge); + register_word_containers( + BM_DynamicBitsetShiftRight, + "dynamic_bitset::shift_right (huge)", + size_huge); + register_word_containers( + BM_BitArrayShiftRight, + "bitarray::shift_right (huge)", size_huge); register_bool_containers( BM_BoolShiftRight, - "std::shift_right (large)", + "std::shift_right (huge)", size_huge); // Reverse benchmarks @@ -158,15 +194,15 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitReverse, - "bit::reverse (large) (AA)", + "bit::reverse (huge) (AA)", size_huge); register_word_containers( BM_BitReverse_UU, - "bit::reverse (large) (UU)", + "bit::reverse (huge) (UU)", size_huge); register_bool_containers( BM_BoolReverse, - "std::reverse (large)", + "std::reverse (huge)", size_huge); // transform benchmarks @@ -184,15 +220,15 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitTransformUnaryAA, - "bit::transform(UnaryOp) (large) (AA)", + "bit::transform(UnaryOp) (huge) (AA)", size_huge); register_word_containers( BM_BitTransformUnaryUU, - "bit::transform(UnaryOp) (large) (UU)", + "bit::transform(UnaryOp) (huge) (UU)", size_huge); register_bool_containers( BM_BoolTransformUnary, - "std::transform(UnaryOp) (large)", + "std::transform(UnaryOp) (huge)", size_huge); register_word_containers( BM_BitTransformBinaryAA, @@ -208,15 +244,15 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitTransformBinaryAA, - "bit::transform(BinaryOp) (large) (AA)", + "bit::transform(BinaryOp) (huge) (AA)", size_huge); register_word_containers( BM_BitTransformBinaryUU, - "bit::transform(BinaryOp) (large) (UU)", + "bit::transform(BinaryOp) (huge) (UU)", size_huge); register_bool_containers( BM_BoolTransformBinary, - 
"std::transform(BinaryOp) (large)", + "std::transform(BinaryOp) (huge)", size_huge); // Rotate benchmarks @@ -230,11 +266,11 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitRotate, - "bit::rotate (large) (ARA)", + "bit::rotate (huge) (ARA)", size_huge); register_bool_containers( BM_BoolRotate, - "std::rotate (large)", + "std::rotate (huge)", size_huge); // Count benchmarks @@ -248,11 +284,11 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCount, - "bit::count (large) (AA)", + "bit::count (huge) (AA)", size_huge); register_bool_containers( BM_BoolCount, - "std::count (large)", + "std::count (huge)", size_huge); // swap_ranges benchmarks @@ -270,15 +306,15 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitSwapRangesAA, - "bit::swap_ranges (large) (AA)", + "bit::swap_ranges (huge) (AA)", size_huge); register_word_containers( BM_BitSwapRangesUU, - "bit::swap_ranges (large) (UU)", + "bit::swap_ranges (huge) (UU)", size_huge); register_bool_containers( BM_BoolSwapRanges, - "std::swap_ranges (large)", + "std::swap_ranges (huge)", size_huge); // copy benchmarks @@ -292,11 +328,11 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCopy, - "bit::copy (large) (UU)", + "bit::copy (huge) (UU)", size_huge); register_bool_containers( BM_BoolCopy, - "std::copy (large)", + "std::copy (huge)", size_huge); // Equal benchmarks @@ -310,11 +346,11 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitEqual, - "bit::equal (large) (UU)", + "bit::equal (huge) (UU)", size_huge); register_bool_containers( BM_BoolEqual, - "std::equal (large)", + "std::equal (huge)", size_huge); // move benchmarks @@ -328,11 +364,11 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitMove, - "bit::move (large) (UU)", + "bit::move (huge) (UU)", size_huge); register_bool_containers( BM_BoolMove, - "std::move (large)", + 
"std::move (huge)", size_huge); // copy_backward benchmarks @@ -346,11 +382,11 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCopyBackward, - "bit::copy_backward (large) (UU)", + "bit::copy_backward (huge) (UU)", size_huge); register_bool_containers( BM_BoolCopyBackward, - "std::copy_backward (large)", + "std::copy_backward (huge)", size_huge); // fill benchmarks diff --git a/benchmark/src/shift_bench.hpp b/benchmark/src/shift_bench.hpp index d85d608..80f8e13 100644 --- a/benchmark/src/shift_bench.hpp +++ b/benchmark/src/shift_bench.hpp @@ -1,7 +1,10 @@ #include #include +#include #include #include "bitlib/bitlib.hpp" +#include "bit_array.h" +#include auto BM_BitShiftLeft = [](benchmark::State& state, auto input) { @@ -20,6 +23,7 @@ auto BM_BitShiftLeft = [](benchmark::State& state, auto input) { } }; + auto BM_BitShiftLeft_UU = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using word_type = typename std::tuple_element<1, decltype(input)>::type; @@ -37,6 +41,33 @@ auto BM_BitShiftLeft_UU = [](benchmark::State& state, auto input) { } }; +auto BM_BitArrayShiftLeft = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + auto n = total_bits / 2; + for (auto _ : state) { + bit_array_shift_right(bitarr, n, 0); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + + +auto BM_DynamicBitsetShiftLeft = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + 
sul::dynamic_bitset<> bitset1(total_bits, 1); + auto n = total_bits / 2; + for (auto _ : state) { + bitset1 <<= n; + benchmark::ClobberMemory(); + } +}; + auto BM_BoolShiftLeft = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; @@ -83,6 +114,32 @@ auto BM_BitShiftRight_UU = [](benchmark::State& state, auto input) { } }; +auto BM_DynamicBitsetShiftRight = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 1); + auto n = total_bits / 2; + for (auto _ : state) { + bitset1 >>= n; + benchmark::ClobberMemory(); + } +}; + +auto BM_BitArrayShiftRight = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + auto n = total_bits / 2; + for (auto _ : state) { + bit_array_shift_right(bitarr, n, 0); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + auto BM_BoolShiftRight = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; From 799d18fd61d27cc7ef9706bca8cf04fb8fce5335 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 21 Aug 2023 17:07:49 -0500 Subject: [PATCH 02/31] Add fill benchmarks --- benchmark/src/benchmark_main.cc | 16 ++++++++++++++++ benchmark/src/fill_bench.hpp | 26 ++++++++++++++++++++++++++ benchmark/src/shift_bench.hpp | 2 +- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/benchmark/src/benchmark_main.cc 
b/benchmark/src/benchmark_main.cc index 7312234..3f7b3db 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -394,6 +394,14 @@ int main(int argc, char** argv) { BM_BitFill, "bit::fill (small) (UU)", size_small); + register_bool_containers( + BM_DynamicBitsetFill, + "dynamic_bitset::fill (small)", + size_small); + register_bool_containers( + BM_BitArrayFill, + "bitarray::fill (small)", + size_small); register_bool_containers( BM_BoolFill, "std::fill (small)", @@ -402,6 +410,14 @@ int main(int argc, char** argv) { BM_BitFill, "bit::fill (huge) (UU)", size_huge); + register_bool_containers( + BM_DynamicBitsetFill, + "dynamic_bitset::fill (huge)", + size_huge); + register_bool_containers( + BM_BitArrayFill, + "bitarray::fill (huge)", + size_huge); register_bool_containers( BM_BoolFill, "std::fill (huge)", diff --git a/benchmark/src/fill_bench.hpp b/benchmark/src/fill_bench.hpp index 58ff116..9b17b93 100644 --- a/benchmark/src/fill_bench.hpp +++ b/benchmark/src/fill_bench.hpp @@ -1,6 +1,8 @@ #include #include #include "bitlib/bit-algorithms/fill.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitFill = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; @@ -17,6 +19,30 @@ auto BM_BitFill = [](benchmark::State& state, auto input) { } }; +auto BM_BitArrayFill = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + bit_array_set_region(bitarr, 2, total_bits - 5); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_DynamicBitsetFill = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using 
word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 0); + for (auto _ : state) { + bitset1.set(2, total_bits - 5, true); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolFill = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; diff --git a/benchmark/src/shift_bench.hpp b/benchmark/src/shift_bench.hpp index 80f8e13..0a444cf 100644 --- a/benchmark/src/shift_bench.hpp +++ b/benchmark/src/shift_bench.hpp @@ -4,7 +4,7 @@ #include #include "bitlib/bitlib.hpp" #include "bit_array.h" -#include +#include "sul/dynamic_bitset.hpp" auto BM_BitShiftLeft = [](benchmark::State& state, auto input) { From 65ed78232a3c193745c41ccb669a5948b11fb851 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 21 Aug 2023 17:26:45 -0500 Subject: [PATCH 03/31] Add libpopcnt --- include/bitlib/bit-algorithms/libpopcnt.h | 798 ++++++++++++++++++++++ 1 file changed, 798 insertions(+) create mode 100644 include/bitlib/bit-algorithms/libpopcnt.h diff --git a/include/bitlib/bit-algorithms/libpopcnt.h b/include/bitlib/bit-algorithms/libpopcnt.h new file mode 100644 index 0000000..ffcd976 --- /dev/null +++ b/include/bitlib/bit-algorithms/libpopcnt.h @@ -0,0 +1,798 @@ +/* + * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit + * population count) in an array as quickly as possible using + * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. + * + * Copyright (c) 2016 - 2020, Kim Walisch + * Copyright (c) 2016 - 2018, Wojciech Muła + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LIBPOPCNT_H +#define LIBPOPCNT_H + +#include +#include + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +#ifdef __GNUC__ + #define GNUC_PREREQ(x, y) \ + (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) +#else + #define GNUC_PREREQ(x, y) 0 +#endif + +#ifdef __clang__ + #define CLANG_PREREQ(x, y) \ + (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) +#else + #define CLANG_PREREQ(x, y) 0 +#endif + +#if (_MSC_VER < 1900) && \ + !defined(__cplusplus) + #define inline __inline +#endif + +#if (defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64)) + #define X86_OR_X64 +#endif + +#if GNUC_PREREQ(4, 2) || \ + __has_builtin(__builtin_popcount) + #define HAVE_BUILTIN_POPCOUNT +#endif + +#if GNUC_PREREQ(4, 2) || \ + CLANG_PREREQ(3, 0) + #define HAVE_ASM_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + (defined(HAVE_ASM_POPCNT) || \ + defined(_MSC_VER)) + #define HAVE_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(4, 9) + #define HAVE_AVX2 +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(5, 0) + #define HAVE_AVX512 +#endif + +#if defined(X86_OR_X64) + /* MSVC compatible compilers (Windows) */ + #if defined(_MSC_VER) + /* clang-cl (LLVM 10 from 2020) requires /arch:AVX2 or + * /arch:AVX512 to enable vector instructions */ + #if defined(__clang__) + #if defined(__AVX2__) + #define HAVE_AVX2 + #endif + #if defined(__AVX512__) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* MSVC 2017 or later does not require + * /arch:AVX2 or /arch:AVX512 */ + #elif _MSC_VER >= 1910 + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* Clang (Unix-like OSes) */ + #elif CLANG_PREREQ(3, 8) && \ + __has_attribute(target) && \ + (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif +#endif + +/* + * Only enable CPUID runtime checks if this 
is really + * needed. E.g. do not enable if user has compiled + * using -march=native on a CPU that supports AVX512. + */ +#if defined(X86_OR_X64) && \ + (defined(__cplusplus) || \ + defined(_MSC_VER) || \ + (GNUC_PREREQ(4, 2) || \ + __has_builtin(__sync_val_compare_and_swap))) && \ + ((defined(HAVE_AVX512) && !(defined(__AVX512__) || defined(__AVX512BW__))) || \ + (defined(HAVE_AVX2) && !defined(__AVX2__)) || \ + (defined(HAVE_POPCNT) && !defined(__POPCNT__))) + #define HAVE_CPUID +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This uses fewer arithmetic operations than any other known + * implementation on machines with fast multiplication. + * It uses 12 arithmetic operations, one of which is a multiply. + * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation + */ +static inline uint64_t popcount64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ll; + uint64_t m2 = 0x3333333333333333ll; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; + uint64_t h01 = 0x0101010101010101ll; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +#if defined(HAVE_ASM_POPCNT) && \ + defined(__x86_64__) + +static inline uint64_t popcnt64(uint64_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +#elif defined(HAVE_ASM_POPCNT) && \ + defined(__i386__) + +static inline uint32_t popcnt32(uint32_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcnt32((uint32_t) x) + + popcnt32((uint32_t)(x >> 32)); +} + +#elif defined(_MSC_VER) && \ + defined(_M_X64) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u64(x); +} + +#elif defined(_MSC_VER) && \ + defined(_M_IX86) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u32((uint32_t) x) + + _mm_popcnt_u32((uint32_t)(x >> 32)); +} + +/* non x86 CPUs */ +#elif defined(HAVE_BUILTIN_POPCOUNT) + 
+static inline uint64_t popcnt64(uint64_t x) +{ + return __builtin_popcountll(x); +} + +/* no hardware POPCNT, + * use pure integer algorithm */ +#else + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcount64(x); +} + +#endif + +#if defined(HAVE_CPUID) + +#if defined(_MSC_VER) + #include + #include +#endif + +/* %ecx bit flags */ +#define bit_POPCNT (1 << 23) + +/* %ebx bit flags */ +#define bit_AVX2 (1 << 5) +#define bit_AVX512 (1 << 30) + +/* xgetbv bit flags */ +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +static inline void run_cpuid(int eax, int ecx, int* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + int ebx = 0; + int edx = 0; + + #if defined(__i386__) && \ + defined(__PIC__) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=D" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #else + __asm__ ("cpuid;" + : "+b" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #endif + + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + +static inline int get_xcr0() +{ + int xcr0; + +#if defined(_MSC_VER) + xcr0 = (int) _xgetbv(0); +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + + return xcr0; +} + +#endif + +static inline int get_cpuid() +{ + int flags = 0; + int abcd[4]; + + run_cpuid(1, 0, abcd); + + if ((abcd[2] & bit_POPCNT) == bit_POPCNT) + flags |= bit_POPCNT; + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + + int osxsave_mask = (1 << 27); + + /* ensure OS supports extended processor state management */ + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return 0; + + int ymm_mask = XSTATE_SSE | XSTATE_YMM; + int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + + int xcr0 = get_xcr0(); + + if ((xcr0 & ymm_mask) == ymm_mask) + { + run_cpuid(7, 0, abcd); + + if ((abcd[1] & bit_AVX2) == 
bit_AVX2) + flags |= bit_AVX2; + + if ((xcr0 & zmm_mask) == zmm_mask) + { + if ((abcd[1] & bit_AVX512) == bit_AVX512) + flags |= bit_AVX512; + } + } + +#endif + + return flags; +} + +#endif /* cpuid */ + +#if defined(HAVE_AVX2) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) +{ + __m256i u = _mm256_xor_si256(a, b); + *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); + *l = _mm256_xor_si256(u, c); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline __m256i popcnt256(__m256i v) +{ + __m256i lookup1 = _mm256_setr_epi8( + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8 + ); + + __m256i lookup2 = _mm256_setr_epi8( + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0, + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0 + ); + + __m256i low_mask = _mm256_set1_epi8(0x0f); + __m256i lo = _mm256_and_si256(v, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); + __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); + __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); + + return _mm256_sad_epu8(popcnt1, popcnt2); +} + +/* + * AVX2 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). 
+ * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size) +{ + __m256i cnt = _mm256_setzero_si256(); + __m256i ones = _mm256_setzero_si256(); + __m256i twos = _mm256_setzero_si256(); + __m256i fours = _mm256_setzero_si256(); + __m256i eights = _mm256_setzero_si256(); + __m256i sixteens = _mm256_setzero_si256(); + __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsA, &fours, fours, foursA, foursB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsB, &fours, fours, foursA, foursB); + CSA256(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); + } + + cnt = _mm256_slli_epi64(cnt, 4); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); + cnt = 
_mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); + cnt = _mm256_add_epi64(cnt, popcnt256(ones)); + + for(; i < size; i++) + cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3]; +} + +#endif + +#if defined(HAVE_AVX512) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline __m512i popcnt512(__m512i v) +{ + __m512i m1 = _mm512_set1_epi8(0x55); + __m512i m2 = _mm512_set1_epi8(0x33); + __m512i m4 = _mm512_set1_epi8(0x0F); + __m512i vm = _mm512_and_si512(_mm512_srli_epi16(v, 1), m1); + __m512i t1 = _mm512_sub_epi8(v, vm); + __m512i tm = _mm512_and_si512(t1, m2); + __m512i tm2 = _mm512_and_si512(_mm512_srli_epi16(t1, 2), m2); + __m512i t2 = _mm512_add_epi8(tm, tm2); + __m512i tt = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)); + __m512i t3 = _mm512_and_si512(tt, m4); + + return _mm512_sad_epu8(t3, _mm512_setzero_si512()); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) +{ + *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); + *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); +} + +/* + * AVX512 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). 
+ * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline uint64_t popcnt_avx512(const __m512i* ptr, const uint64_t size) +{ + __m512i cnt = _mm512_setzero_si512(); + __m512i ones = _mm512_setzero_si512(); + __m512i twos = _mm512_setzero_si512(); + __m512i fours = _mm512_setzero_si512(); + __m512i eights = _mm512_setzero_si512(); + __m512i sixteens = _mm512_setzero_si512(); + __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 0), _mm512_loadu_si512(ptr + i + 1)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 2), _mm512_loadu_si512(ptr + i + 3)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 4), _mm512_loadu_si512(ptr + i + 5)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 6), _mm512_loadu_si512(ptr + i + 7)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsA, &fours, fours, foursA, foursB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 8), _mm512_loadu_si512(ptr + i + 9)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 10), _mm512_loadu_si512(ptr + i + 11)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 12), _mm512_loadu_si512(ptr + i + 13)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 14), _mm512_loadu_si512(ptr + i + 15)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsB, &fours, fours, foursA, foursB); + CSA512(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm512_add_epi64(cnt, popcnt512(sixteens)); + } + + cnt = _mm512_slli_epi64(cnt, 4); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(eights), 3)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(fours), 2)); + cnt = 
_mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(twos), 1)); + cnt = _mm512_add_epi64(cnt, popcnt512(ones)); + + for(; i < size; i++) + cnt = _mm512_add_epi64(cnt, popcnt512(_mm512_loadu_si512(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3] + + cnt64[4] + + cnt64[5] + + cnt64[6] + + cnt64[7]; +} + +#endif + +/* x86 CPUs */ +#if defined(X86_OR_X64) + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + +/* + * CPUID runtime checks are only enabled if this is needed. + * E.g. CPUID is disabled when a user compiles his + * code using -march=native on a CPU with AVX512. + */ +#if defined(HAVE_CPUID) + #if defined(__cplusplus) + /* C++11 thread-safe singleton */ + static const int cpuid = get_cpuid(); + #else + static int cpuid_ = -1; + int cpuid = cpuid_; + if (cpuid == -1) + { + cpuid = get_cpuid(); + + #if defined(_MSC_VER) + _InterlockedCompareExchange(&cpuid_, cpuid, -1); + #else + __sync_val_compare_and_swap(&cpuid_, -1, cpuid); + #endif + } + #endif +#endif + +#if defined(HAVE_AVX512) + #if defined(__AVX512__) || defined(__AVX512BW__) + /* AVX512 requires arrays >= 1024 bytes */ + if (i + 1024 <= size) + #else + if ((cpuid & bit_AVX512) && + i + 1024 <= size) + #endif + { + const __m512i* ptr512 = (const __m512i*)(ptr + i); + cnt += popcnt_avx512(ptr512, (size - i) / 64); + i = size - size % 64; + } +#endif + +#if defined(HAVE_AVX2) + #if defined(__AVX2__) + /* AVX2 requires arrays >= 512 bytes */ + if (i + 512 <= size) + #else + if ((cpuid & bit_AVX2) && + i + 512 <= size) + #endif + { + const __m256i* ptr256 = (const __m256i*)(ptr + i); + cnt += popcnt_avx2(ptr256, (size - i) / 32); + i = size - size % 32; + } +#endif + +#if defined(HAVE_POPCNT) + /* + * The user has compiled without -mpopcnt. 
+ * Unfortunately the MSVC compiler does not have + * a POPCNT macro so we cannot get rid of the + * runtime check for MSVC. + */ + #if !defined(__POPCNT__) + if (cpuid & bit_POPCNT) + #endif + { + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; + } +#endif + +#if !defined(HAVE_POPCNT) || \ + !defined(__POPCNT__) + /* + * Pure integer popcount algorithm. + * We use unaligned memory accesses here to improve performance. + */ + for (; i < size - size % 8; i += 8) + cnt += popcount64(*(const uint64_t*)(ptr + i)); + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +#endif +} + +#elif defined(__ARM_NEON) || \ + defined(__aarch64__) + +#include + +static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) +{ + return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + uint64_t chunk_size = 64; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= chunk_size) + { + uint64_t iters = size / chunk_size; + uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); + uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); + + do + { + uint8x16_t t0 = zero; + uint8x16_t t1 = zero; + uint8x16_t t2 = zero; + uint8x16_t t3 = zero; + + /* + * After every 31 iterations we need to add the + * temporary sums (t0, t1, t2, t3) to the total sum. + * We must ensure that the temporary sums <= 255 + * and 31 * 8 bits = 248 which is OK. + */ + uint64_t limit = (i + 31 < iters) ? 
i + 31 : iters; + + /* Each iteration processes 64 bytes */ + for (; i < limit; i++) + { + uint8x16x4_t input = vld4q_u8(ptr); + ptr += chunk_size; + + t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); + t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); + t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); + t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); + } + + sum = vpadalq(sum, t0); + sum = vpadalq(sum, t1); + sum = vpadalq(sum, t2); + sum = vpadalq(sum, t3); + } + while (i < iters); + + i = 0; + size %= chunk_size; + + uint64_t tmp[2]; + vst1q_u64(tmp, sum); + cnt += tmp[0]; + cnt += tmp[1]; + } + +#if defined(__ARM_FEATURE_UNALIGNED) + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); +#else + if (i + 8 <= size) + { + /* Align memory to an 8 byte boundary */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } +#endif + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +} + +/* all other CPUs */ +#else + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= 8) + { + /* + * Since we don't know whether this CPU architecture + * supports unaligned memory accesses we align + * memory to an 8 byte boundary. 
+ */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } + + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; +} + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LIBPOPCNT_H */ From c54833d0279d9541bad2d94b134d8c7ba3e073ed Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 21 Aug 2023 17:27:01 -0500 Subject: [PATCH 04/31] Use libpopcnt at all times --- include/bitlib/bit-algorithms/.count.hpp.swp | Bin 0 -> 20480 bytes include/bitlib/bit-algorithms/count.hpp | 59 ++++++++++--------- 2 files changed, 31 insertions(+), 28 deletions(-) create mode 100644 include/bitlib/bit-algorithms/.count.hpp.swp diff --git a/include/bitlib/bit-algorithms/.count.hpp.swp b/include/bitlib/bit-algorithms/.count.hpp.swp new file mode 100644 index 0000000000000000000000000000000000000000..80cef426b9147ce433c4d72c7f6d74197af42813 GIT binary patch literal 20480 zcmeI4du$v>9mj_@;gR$~6Q~eU%Q#k&z4)<)gy5XdC;1h{#da(|5>+ia?d{I_mhSCd z_hHAe(tw2e4@eCH37V3Bny7+QZG{j*4Mjo-5hVUX2-H6zK}1n_sv!OVD!w!OytSP> zM;0J7D}A!Q+1Z)-eP{MJvpYYfI&f-onjY&PAUHlq$ii=L8s9#;_u%lK2&r1n9OcRF zj-|TOE{nMS94}Cvb)C;L-S^Z{)8|Z?Rn@e0D=?T^HGK;|aq82oCC4@Wdc#w7C$Rng zdb8P4Ei#i-Kq|0Kfxxfz4L(Q?f8yifQSaW@MfdJmT$fEINCl(8J^_g1J-PFa=r@bbkL0f3AN`KY@p9LrpYkLXkP1izqyka_ zsen{KDj*e*3P=T{0#X5~z<)sj%^>7Q4-oRv2T?g*{^R%mHy$Qr4IBj@20wm?kZ*!7 zfgw-?w{g$U!4JSy@HKD(@UkXAFW3)01ond4sOKv97I+bS9XtPz5K!G?)U%z#zDV zk2t>zKjf$?9p1Ksswx_Qf2plCnAg;FMiH`sG(vH>%nw`}Coi!4A(4cv$4y(a%(I$r zIyMoBVeaq_o|O($&o_!i!}NU3)|pZ>UC-~Omge~d+SgK`S))o9viYW7(Q&<0bsVd# zENfQ43I)0r*E++NG@p5NrOs^X*BP~gMwPkLsnN5{b$GKqYI-R9*?@UIGxALP;>tP7ERUrojya8Hz?E?9&`e=dm06CxM14nC?V+D&U>nS}R!t=H)+N=X z{eb32D$|~{R<3^~RJK)KCYVHULZ5Wk^6G?$i2R58pYE_0vAmN7J=E$>fpTPiE< z8m0qrSMrv%wli}xzZf4+eMU)}#s5MH(SVFq6sbeG;^OWs2`mO0v=R#*urSEFCS99D(#bri-!JkyVM!K18n6)G!9018r-ndq2cuHD)Z1y43 z&uND0s4_%NpG%!FV2JkgAagXtcxn8=Gd#x{nrUMIg-=PCe)rnzcCk1oESe7*iqT8M 
zp?oUW6ke3>*q%?d0An`Fgu%q`pLLpnB?kByj|5YA+LWUiyto70tXyH?(;v4kva;2+ zpf|D>7ic-GYD%-2rx*pH3i?J=6{WR4=lsgNu31{uVhh?*=7Fb1>th1p<|n7eJJ)RM zJw^{4pk1a{frnkPnV}p|xGcT&=n+`5NoS7myxc)lA*!Ldt0Yz|)en!Q=oNORiJ=lb z$7MR0N+CuCAF$|NYCAqP8%>Kfm}=M05cZR3|RdAMWBI$~NbuXDv|jocrFxi@e~Nt@?1+i)5qy3Rar5~lMh zEL%rdURqY<6I-9Q_&I};?OeIXmYJ(rc~6eL7T$I65jEyA%$#E4iCqA^XxI)^Fll4X z%1y&*2j>fj`g<6&q>b@W34dosda)>`g|PU*<1<6 zeAaJieYVWhp?6$ba!X?R0-E7bbXYi|HQSG5t7x9bT)zSbyu>QYrdBmA(_d{_eI%uf zh`g9;JregDH%rJs#lNB}vZm6!t-|qylfm8diJ{a>MEI+M6WpVC5bsS<;xjy&SPhkQ z_hOAn9EY2~&~K zRZ*0{wEd>M;rj}__Y{5U<_H)d%87Tr-QH+`IWx@$K1vb)&g+Nc93ZDD!hiTO{g zJH#X08?N-k-xaLqP)O9rMY@1jlZi7;bSs}a;o?Uz=tq(ecvReBFTzwR#(jsq$lw2; z#ryr20e|`D1Ct*&qO=P>asen{KDj*e*3P=T{0#X5~fK)&#AQg}bY_9^_{xKu` z3Wx728-6?C@BcUP)#(N7E%Wz(7Flx(@BP07Z-MWEE8sF%1x0WS6o3LA1@9mp;P>Dq za1pG5JzzJufxZ80;1%#PxBzN^fhWLy;C1Z#*8l@2feQM;AF;&)N z%h_#k1zZ59!C`O^>;P|KzyAer9y|x?pagDWpZ^!&3t$=ygA(Whf5ATgb?`m#3b+Km z0u1m3I0zmBcd+Mw6}$)*!KcAN@HY1NUjvtc4o-m*_`?oDeg=LBu7Nkf=fG27Klm7U z9P9#rM_>F6{1yBL{2GY<+1`;ROZ(qbVB4R58K)uRG?L9QvGepFj?+j35yH(9h40u$ zpTd(!pZs=wZbM&Ha|wLAlOL*jmw1n;zPpBdsA}7Q4^`bH_6F%zx99bxNu`|@EQ zs+tPYHt<7LbDBoBr3_;sQHVqBQ>WRVKL3oH5F`Wm9V92iv z5kV1re~c@ZhL?l^Z*>Gb@c2DV_XEwcR;gz>D~Rv&>slD^7w>~6qoDO!k#Qy>G7|gK z9!*0AWt>T;Q7g&RnnZ}mI1?VjfVV7yH<=`!Ax^`aOp+$!QMM9~;_1w?5Q*pn2n7iP nLerQN#MG#DbE&?ft%kuU`6xT#LQU7`Yih2)8U=-zh${Iv4Z=Vd literal 0 HcmV?d00001 diff --git a/include/bitlib/bit-algorithms/count.hpp b/include/bitlib/bit-algorithms/count.hpp index f6cecc8..92356b6 100644 --- a/include/bitlib/bit-algorithms/count.hpp +++ b/include/bitlib/bit-algorithms/count.hpp @@ -13,6 +13,7 @@ #include // Project sources #include "bitlib/bit-iterator/bit.hpp" +#include "bitlib/bit-algorithms//libpopcnt.h" // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" @@ -56,37 +57,39 @@ count( result = _popcnt(first_value); ++it; } -#ifdef BITLIB_HWY - // ReduceSum not implemented for unsigned char - if constexpr (digits > 8) - { - // Align to boundary - for (; it != 
last.base() && !is_aligned(&(*it), 64); ++it) { - result += _popcnt(*it); - } +// The SIMD implementation here is actually slower than the standard +//#ifdef BITLIB_HWY + //// ReduceSum not implemented for unsigned char + //if constexpr (digits > 8) + //{ + //// Align to boundary + //for (; it != last.base() && !is_aligned(&(*it), 64); ++it) { + //result += _popcnt(*it); + //} - // SIMD - hn::ScalableTag d; - for (; std::distance(it, last.base()) >= hn::Lanes(d); it += hn::Lanes(d)) - { - const auto popcntV = hn::PopulationCount(hn::Load(d, &*it)); - result += hn::ReduceSum(d, popcntV); - } + //// SIMD + //hn::ScalableTag d; + //for (; std::distance(it, last.base()) >= hn::Lanes(d); it += hn::Lanes(d)) + //{ + //const auto popcntV = hn::PopulationCount(hn::Load(d, &*it)); + //result += hn::ReduceSum(d, popcntV); + //} - // Remaining - for (; it != last.base(); ++it) { - result += _popcnt(*it); - } - } else -#endif + //// Remaining + //for (; it != last.base(); ++it) { + //result += _popcnt(*it); + //} + //} else +//#endif { - result += std::transform_reduce( - it, - last.base(), - 0, - std::plus{}, - [](word_type word) {return _popcnt(word); } - ); + //result += std::transform_reduce( + //it, + //last.base(), + //0, + //std::plus{}, + //[](word_type word) {return popcnt(word); } + //); + result += popcnt(&*it, std::distance(it, last.base())); } if (last.position() != 0) { word_type last_value = *last.base() << (digits - last.position()); From 9a4fada456e507b628dbfb8f2c68cd01cb972686 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 21 Aug 2023 17:27:12 -0500 Subject: [PATCH 05/31] Add count benhcmarks --- benchmark/src/benchmark_main.cc | 16 ++++++++++++++++ benchmark/src/count_bench.hpp | 25 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index 3f7b3db..2f8e647 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -278,6 +278,14 @@ 
int main(int argc, char** argv) { BM_BitCount, "bit::count (small) (AA)", size_small); + register_word_containers( + BM_DynamicBitsetCount, + "dynamic_bitset::count (small)", + size_small); + register_word_containers( + BM_BitArrayCount, + "bitarray::count (small)", + size_small); register_bool_containers( BM_BoolCount, "std::count (small)", @@ -286,6 +294,14 @@ int main(int argc, char** argv) { BM_BitCount, "bit::count (huge) (AA)", size_huge); + register_word_containers( + BM_DynamicBitsetCount, + "dynamic_bitset::count (huge)", + size_huge); + register_word_containers( + BM_BitArrayCount, + "bitarray::count (huge)", + size_huge); register_bool_containers( BM_BoolCount, "std::count (huge)", diff --git a/benchmark/src/count_bench.hpp b/benchmark/src/count_bench.hpp index 2ef9507..a169d00 100644 --- a/benchmark/src/count_bench.hpp +++ b/benchmark/src/count_bench.hpp @@ -2,6 +2,8 @@ #include #include "test_utils.hpp" #include "bitlib/bit-algorithms/count.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitCount = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; @@ -19,6 +21,29 @@ auto BM_BitCount = [](benchmark::State& state, auto input) { }; +auto BM_BitArrayCount = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + benchmark::DoNotOptimize(bit_array_num_bits_set(bitarr)); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_DynamicBitsetCount = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = 
std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 1); + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1.count()); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolCount = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; From 6b7cef90c03c2ba95ab83e7c1399364fe29537aa Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 22 Aug 2023 13:37:34 -0500 Subject: [PATCH 06/31] Add SIMD --- include/bitlib/bit-algorithms/transform.hpp | 33 +++++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/include/bitlib/bit-algorithms/transform.hpp b/include/bitlib/bit-algorithms/transform.hpp index 5d3321e..4a71cf8 100644 --- a/include/bitlib/bit-algorithms/transform.hpp +++ b/include/bitlib/bit-algorithms/transform.hpp @@ -13,6 +13,7 @@ // ============================== PREAMBLE ================================== // // C++ standard library #include +#include // Project sources #include "bitlib/bit-iterator/bit.hpp" // Third-party libraries @@ -73,14 +74,40 @@ constexpr bit_iterator transform( advance(first, partial_bits_to_op); it++; } + auto firstIt = first.base(); if (remaining_bits_to_op > 0) { const bool is_first_aligned = first.position() == 0; //size_type words_to_op = ::std::ceil(remaining_bits_to_op / static_cast(digits)); // d_first will be aligned at this point if (is_first_aligned && remaining_bits_to_op > digits) { - auto N = ::std::distance(first.base(), last.base()); - it = std::transform(first.base(), last.base(), it, unary_op); - first += digits * N; + auto N = ::std::distance(firstIt, last.base()); +#ifdef BITLIB_HWY + if constexpr (std::is_same_v>) + { + // Align to 64 bit boundary + for (; firstIt != last.base() && !is_aligned(&*firstIt, 64); firstIt++, it++) { + *it = unary_op(*firstIt); + } + + bool out_is_aligned = is_aligned(&*it, 64); + + constexpr hn::ScalableTag d; + for (; std::distance(firstIt, last.base()) >= 
hn::Lanes(d); firstIt += hn::Lanes(d), it += hn::Lanes(d)) + { + const auto v = hn::Not(hn::Load(d, &*firstIt)); + if (out_is_aligned) + { + hn::Store(v, d, &*it); + } else { + hn::StoreU(v, d, &*it); + } + } + } +#endif + size_t std_dist = ::std::distance(firstIt, last.base()); + it = std::transform(firstIt, last.base(), it, unary_op); + firstIt += std_dist; + first = bit_iterator(firstIt); remaining_bits_to_op -= digits * N; } else { while (remaining_bits_to_op >= digits) { From 3d8a0033dbc988363ff20959975f3a67dbec7157 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 22 Aug 2023 13:38:05 -0500 Subject: [PATCH 07/31] Remove swp --- include/bitlib/bit-algorithms/.count.hpp.swp | Bin 20480 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 include/bitlib/bit-algorithms/.count.hpp.swp diff --git a/include/bitlib/bit-algorithms/.count.hpp.swp b/include/bitlib/bit-algorithms/.count.hpp.swp deleted file mode 100644 index 80cef426b9147ce433c4d72c7f6d74197af42813..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeI4du$v>9mj_@;gR$~6Q~eU%Q#k&z4)<)gy5XdC;1h{#da(|5>+ia?d{I_mhSCd z_hHAe(tw2e4@eCH37V3Bny7+QZG{j*4Mjo-5hVUX2-H6zK}1n_sv!OVD!w!OytSP> zM;0J7D}A!Q+1Z)-eP{MJvpYYfI&f-onjY&PAUHlq$ii=L8s9#;_u%lK2&r1n9OcRF zj-|TOE{nMS94}Cvb)C;L-S^Z{)8|Z?Rn@e0D=?T^HGK;|aq82oCC4@Wdc#w7C$Rng zdb8P4Ei#i-Kq|0Kfxxfz4L(Q?f8yifQSaW@MfdJmT$fEINCl(8J^_g1J-PFa=r@bbkL0f3AN`KY@p9LrpYkLXkP1izqyka_ zsen{KDj*e*3P=T{0#X5~z<)sj%^>7Q4-oRv2T?g*{^R%mHy$Qr4IBj@20wm?kZ*!7 zfgw-?w{g$U!4JSy@HKD(@UkXAFW3)01ond4sOKv97I+bS9XtPz5K!G?)U%z#zDV zk2t>zKjf$?9p1Ksswx_Qf2plCnAg;FMiH`sG(vH>%nw`}Coi!4A(4cv$4y(a%(I$r zIyMoBVeaq_o|O($&o_!i!}NU3)|pZ>UC-~Omge~d+SgK`S))o9viYW7(Q&<0bsVd# zENfQ43I)0r*E++NG@p5NrOs^X*BP~gMwPkLsnN5{b$GKqYI-R9*?@UIGxALP;>tP7ERUrojya8Hz?E?9&`e=dm06CxM14nC?V+D&U>nS}R!t=H)+N=X z{eb32D$|~{R<3^~RJK)KCYVHULZ5Wk^6G?$i2R58pYE_0vAmN7J=E$>fpTPiE< 
z8m0qrSMrv%wli}xzZf4+eMU)}#s5MH(SVFq6sbeG;^OWs2`mO0v=R#*urSEFCS99D(#bri-!JkyVM!K18n6)G!9018r-ndq2cuHD)Z1y43 z&uND0s4_%NpG%!FV2JkgAagXtcxn8=Gd#x{nrUMIg-=PCe)rnzcCk1oESe7*iqT8M zp?oUW6ke3>*q%?d0An`Fgu%q`pLLpnB?kByj|5YA+LWUiyto70tXyH?(;v4kva;2+ zpf|D>7ic-GYD%-2rx*pH3i?J=6{WR4=lsgNu31{uVhh?*=7Fb1>th1p<|n7eJJ)RM zJw^{4pk1a{frnkPnV}p|xGcT&=n+`5NoS7myxc)lA*!Ldt0Yz|)en!Q=oNORiJ=lb z$7MR0N+CuCAF$|NYCAqP8%>Kfm}=M05cZR3|RdAMWBI$~NbuXDv|jocrFxi@e~Nt@?1+i)5qy3Rar5~lMh zEL%rdURqY<6I-9Q_&I};?OeIXmYJ(rc~6eL7T$I65jEyA%$#E4iCqA^XxI)^Fll4X z%1y&*2j>fj`g<6&q>b@W34dosda)>`g|PU*<1<6 zeAaJieYVWhp?6$ba!X?R0-E7bbXYi|HQSG5t7x9bT)zSbyu>QYrdBmA(_d{_eI%uf zh`g9;JregDH%rJs#lNB}vZm6!t-|qylfm8diJ{a>MEI+M6WpVC5bsS<;xjy&SPhkQ z_hOAn9EY2~&~K zRZ*0{wEd>M;rj}__Y{5U<_H)d%87Tr-QH+`IWx@$K1vb)&g+Nc93ZDD!hiTO{g zJH#X08?N-k-xaLqP)O9rMY@1jlZi7;bSs}a;o?Uz=tq(ecvReBFTzwR#(jsq$lw2; z#ryr20e|`D1Ct*&qO=P>asen{KDj*e*3P=T{0#X5~fK)&#AQg}bY_9^_{xKu` z3Wx728-6?C@BcUP)#(N7E%Wz(7Flx(@BP07Z-MWEE8sF%1x0WS6o3LA1@9mp;P>Dq za1pG5JzzJufxZ80;1%#PxBzN^fhWLy;C1Z#*8l@2feQM;AF;&)N z%h_#k1zZ59!C`O^>;P|KzyAer9y|x?pagDWpZ^!&3t$=ygA(Whf5ATgb?`m#3b+Km z0u1m3I0zmBcd+Mw6}$)*!KcAN@HY1NUjvtc4o-m*_`?oDeg=LBu7Nkf=fG27Klm7U z9P9#rM_>F6{1yBL{2GY<+1`;ROZ(qbVB4R58K)uRG?L9QvGepFj?+j35yH(9h40u$ zpTd(!pZs=wZbM&Ha|wLAlOL*jmw1n;zPpBdsA}7Q4^`bH_6F%zx99bxNu`|@EQ zs+tPYHt<7LbDBoBr3_;sQHVqBQ>WRVKL3oH5F`Wm9V92iv z5kV1re~c@ZhL?l^Z*>Gb@c2DV_XEwcR;gz>D~Rv&>slD^7w>~6qoDO!k#Qy>G7|gK z9!*0AWt>T;Q7g&RnnZ}mI1?VjfVV7yH<=`!Ax^`aOp+$!QMM9~;_1w?5Q*pn2n7iP nLerQN#MG#DbE&?ft%kuU`6xT#LQU7`Yih2)8U=-zh${Iv4Z=Vd From 04af96202a82df1c2185a6048598422cd98eff53 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 22 Aug 2023 13:38:45 -0500 Subject: [PATCH 08/31] Add HWY* at top and bottom --- .../bit-algorithms/bit_algorithm_details.hpp | 2 +- include/bitlib/bit-algorithms/count.hpp | 4 ++++ include/bitlib/bit-algorithms/fill.hpp | 4 ++++ include/bitlib/bit-algorithms/find.hpp | 16 ++++++++++------ include/bitlib/bit-algorithms/rotate.hpp | 4 ++-- 
include/bitlib/bit-algorithms/shift.hpp | 6 +++++- 6 files changed, 26 insertions(+), 10 deletions(-) diff --git a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp index 2a06592..414fa54 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp @@ -105,7 +105,7 @@ constexpr bool is_within( // Get next len bits beginning at start and store them in a word of type T template -T get_word(bit_iterator first, T len=binary_digits::value) +T get_word(bit_iterator first, size_t len=binary_digits::value) { using native_word_type = typename bit_iterator::word_type; constexpr T digits = binary_digits::value; diff --git a/include/bitlib/bit-algorithms/count.hpp b/include/bitlib/bit-algorithms/count.hpp index 92356b6..27a909e 100644 --- a/include/bitlib/bit-algorithms/count.hpp +++ b/include/bitlib/bit-algorithms/count.hpp @@ -17,6 +17,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous @@ -113,6 +114,9 @@ count( } } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif // ========================================================================== // #endif // _COUNT_HPP_INCLUDED diff --git a/include/bitlib/bit-algorithms/fill.hpp b/include/bitlib/bit-algorithms/fill.hpp index 1481f90..70dcd9d 100644 --- a/include/bitlib/bit-algorithms/fill.hpp +++ b/include/bitlib/bit-algorithms/fill.hpp @@ -20,6 +20,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous #define is_aligned(POINTER, BYTE_COUNT) \ @@ -83,5 +84,8 @@ void fill(bit_iterator first, bit_iterator last, // ========================================================================== // } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif #endif // _FILL_HPP_INCLUDED // 
========================================================================== // diff --git a/include/bitlib/bit-algorithms/find.hpp b/include/bitlib/bit-algorithms/find.hpp index fed905a..b35583c 100644 --- a/include/bitlib/bit-algorithms/find.hpp +++ b/include/bitlib/bit-algorithms/find.hpp @@ -15,6 +15,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous @@ -98,13 +99,13 @@ constexpr bit_iterator find( } #endif - // Finish out the remainder with typical for loop - while (it != last.base()) { - if ((bv == bit1 && (*it == 0)) || (bv == bit0 && (*it == static_cast(-1)))) { - ++it; - continue; - } + if (bv == bit1) { + it = std::find_if(it, last.base(), [](word_type a) {return a != 0;}); + } else { + it = std::find_if(it, last.base(), [](word_type a) {return a != static_cast(-1);}); + } + if (it != last.base()) { size_type num_trailing_complementary_bits = (bv == bit0) ? _tzcnt(static_cast(~*it)) : _tzcnt(static_cast(*it)); @@ -123,6 +124,9 @@ constexpr bit_iterator find( // ========================================================================== // } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif #endif // _FIND_HPP_INCLUDED // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/rotate.hpp b/include/bitlib/bit-algorithms/rotate.hpp index b019d7b..5466f7d 100644 --- a/include/bitlib/bit-algorithms/rotate.hpp +++ b/include/bitlib/bit-algorithms/rotate.hpp @@ -239,13 +239,13 @@ bit_iterator rotate( // Single word subcases if (is_within(first, n_first)) { size_type k = distance(first, n_first); - word_type temp = get_word(first, k); + word_type temp = get_word(first, k); bit_iterator new_last = shift_left(first, last, k); write_word(temp, new_last, static_cast(k)); return new_last; } else if (is_within(n_first, last)) { size_type p = distance(n_first, last); - word_type temp = get_word(n_first, p); + word_type 
temp = get_word(n_first, p); auto new_last = shift_right(first, last, p); write_word(temp, first, static_cast(p)); return new_last; diff --git a/include/bitlib/bit-algorithms/shift.hpp b/include/bitlib/bit-algorithms/shift.hpp index 0ef14fe..a6b6bda 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -19,6 +19,7 @@ // Third-party libraries #ifdef BITLIB_HWY #include "hwy/highway.h" +HWY_BEFORE_NAMESPACE(); #endif // Miscellaneous #define is_aligned(POINTER, BYTE_COUNT) \ @@ -119,7 +120,7 @@ bit_iterator shift_left( } const hn::ScalableTag d; - for (; std::distance(it, new_last_base) >= hn::Lanes(d) + 10 + !is_last_aligned; it += hn::Lanes(d)) + for (; std::distance(it, new_last_base) >= hn::Lanes(d) + !is_last_aligned; it += hn::Lanes(d)) { const auto v = hn::ShiftRightSame(hn::Load(d, &*it), remaining_bitshifts); const auto v_plus1 = hn::ShiftLeftSame(hn::LoadU(d, &*(it+1)), digits - remaining_bitshifts); @@ -272,5 +273,8 @@ bit_iterator shift_right( // ========================================================================== // } // namespace bit +#ifdef BITLIB_HWY +HWY_AFTER_NAMESPACE(); +#endif #endif // _SHIFT_HPP_INCLUDED // ========================================================================== // From bdb64fe7eab953471bfb290d926953f3c4fe6d51 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 22 Aug 2023 13:39:00 -0500 Subject: [PATCH 09/31] Add more benchmarks --- benchmark/src/benchmark_main.cc | 101 +++++++++++++++++++++----- benchmark/src/find_bench.hpp | 31 +++++++- benchmark/src/rw_bench.hpp | 44 +++++++---- benchmark/src/transform_bench.hpp | 117 ++++++++++++++++++++++++------ 4 files changed, 239 insertions(+), 54 deletions(-) diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index 2f8e647..512f83e 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -32,6 +32,7 @@ #include "swap_ranges-bench.hpp" #include 
"transform_bench.hpp" #include "equal_bench.hpp" +#include "rw_bench.hpp" // Third party libraries #include #include @@ -99,12 +100,30 @@ int main(int argc, char** argv) { unsigned int size_small = 1 << 8; unsigned int size_medium = 1 << 8; unsigned int size_large = 1 << 16; - unsigned int size_huge = 1 << 22; + unsigned int size_huge = 1 << 25; + // Read/write benchmarks + register_word_containers( + BM_BitSet, + "bit::set (huge)", + size_huge); + register_word_containers( + BM_DynamicBitsetSet, + "dynamic_bitset::set (huge)", + size_huge); + register_word_containers( + BM_BitArraySet, + "bitarray::set (huge)", + size_huge); + register_bool_containers( + BM_BoolSet, + "std::set (huge)", + size_huge); + // Shift benchmarks register_word_containers( BM_BitShiftLeft, - "bit::shift_left (small) (AA)", + "bit::shift_left (small)", size_small); register_word_containers( BM_BitShiftLeft_UU, @@ -124,7 +143,7 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitShiftLeft, - "bit::shift_left (huge) (AA)", + "bit::shift_left (huge)", size_huge); register_word_containers( BM_BitShiftLeft_UU, @@ -136,7 +155,7 @@ int main(int argc, char** argv) { size_huge); register_word_containers( BM_BitArrayShiftLeft, - "bitarray::shift_left (huge) (AA)", + "bitarray::shift_left (huge) ", size_huge); register_bool_containers( BM_BoolShiftLeft, @@ -144,7 +163,7 @@ int main(int argc, char** argv) { size_huge); register_word_containers( BM_BitShiftRight, - "bit::shift_right (small) (AA)", + "bit::shift_right (small) ", size_small); register_word_containers( BM_BitShiftRight_UU, @@ -164,7 +183,7 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitShiftRight, - "bit::shift_right (huge) (AA)", + "bit::shift_right (huge) ", size_huge); register_word_containers( BM_BitShiftRight_UU, @@ -194,7 +213,7 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitReverse, - "bit::reverse (huge) (AA)", + "bit::reverse (huge) 
", size_huge); register_word_containers( BM_BitReverse_UU, @@ -208,48 +227,80 @@ int main(int argc, char** argv) { // transform benchmarks register_word_containers( BM_BitTransformUnaryAA, - "bit::transform(UnaryOp) (small) (AA)", + "bit::transform(UnaryOp) (small) ", size_small); register_word_containers( BM_BitTransformUnaryUU, "bit::transform(UnaryOp) (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetTransformUnary, + "dynamic_bitset::transform(UnaryOp) (small) ", + size_small); + register_word_containers( + BM_BitArrayTransformUnary, + "bitarray::transform(UnaryOp) (small) ", + size_small); register_bool_containers( BM_BoolTransformUnary, "std::transform(UnaryOp) (small)", size_small); register_word_containers( BM_BitTransformUnaryAA, - "bit::transform(UnaryOp) (huge) (AA)", + "bit::transform(UnaryOp) (huge) ", size_huge); register_word_containers( BM_BitTransformUnaryUU, "bit::transform(UnaryOp) (huge) (UU)", size_huge); + register_word_containers( + BM_DynamicBitsetTransformUnary, + "dynamic_bitset::transform(UnaryOp) (huge) ", + size_huge); + register_word_containers( + BM_BitArrayTransformUnary, + "bitarray::transform(UnaryOp) (huge) ", + size_huge); register_bool_containers( BM_BoolTransformUnary, "std::transform(UnaryOp) (huge)", size_huge); register_word_containers( BM_BitTransformBinaryAA, - "bit::transform(BinaryOp) (small) (AA)", + "bit::transform(BinaryOp) (small) ", size_small); register_word_containers( BM_BitTransformBinaryUU, "bit::transform(BinaryOp) (small) (UU)", size_small); + register_word_containers( + BM_DynamicBitsetTransformBinary, + "dynamic_bitset::transform(BinaryOp) (small) ", + size_small); + register_word_containers( + BM_BitArrayTransformBinary, + "bitarray::transform(BinaryOp) (small) ", + size_small); register_bool_containers( BM_BoolTransformBinary, "std::transform(BinaryOp) (small)", size_small); register_word_containers( BM_BitTransformBinaryAA, - "bit::transform(BinaryOp) (huge) (AA)", + 
"bit::transform(BinaryOp) (huge) ", size_huge); register_word_containers( BM_BitTransformBinaryUU, "bit::transform(BinaryOp) (huge) (UU)", size_huge); + register_word_containers( + BM_DynamicBitsetTransformBinary, + "dynamic_bitset::transform(BinaryOp) (huge) ", + size_huge); + register_word_containers( + BM_BitArrayTransformBinary, + "bitarray::transform(BinaryOp) (huge) ", + size_huge); register_bool_containers( BM_BoolTransformBinary, "std::transform(BinaryOp) (huge)", @@ -258,7 +309,7 @@ int main(int argc, char** argv) { // Rotate benchmarks register_word_containers( BM_BitRotate, - "bit::rotate (small) (ARA)", + "bit::rotate (small)", size_small); register_bool_containers( BM_BoolRotate, @@ -266,7 +317,7 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitRotate, - "bit::rotate (huge) (ARA)", + "bit::rotate (huge)", size_huge); register_bool_containers( BM_BoolRotate, @@ -276,7 +327,7 @@ int main(int argc, char** argv) { // Count benchmarks register_word_containers( BM_BitCount, - "bit::count (small) (AA)", + "bit::count (small) ", size_small); register_word_containers( BM_DynamicBitsetCount, @@ -292,7 +343,7 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCount, - "bit::count (huge) (AA)", + "bit::count (huge) ", size_huge); register_word_containers( BM_DynamicBitsetCount, @@ -310,7 +361,7 @@ int main(int argc, char** argv) { // swap_ranges benchmarks register_word_containers( BM_BitSwapRangesAA, - "bit::swap_ranges (small) (AA)", + "bit::swap_ranges (small) ", size_small); register_word_containers( BM_BitSwapRangesUU, @@ -322,7 +373,7 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitSwapRangesAA, - "bit::swap_ranges (huge) (AA)", + "bit::swap_ranges (huge) ", size_huge); register_word_containers( BM_BitSwapRangesUU, @@ -444,6 +495,14 @@ int main(int argc, char** argv) { BM_BitFind, "bit::find (small) (UU)", size_small); + register_word_containers( + 
BM_DynamicBitsetFind, + "dynamic_bitset::find (small)", + size_small); + register_word_containers( + BM_BitArrayFind, + "bitarray::find (small)", + size_small); register_bool_containers( BM_BoolFind, "std::find (small)", @@ -452,6 +511,14 @@ int main(int argc, char** argv) { BM_BitFind, "bit::find (huge) (UU)", size_huge); + register_word_containers( + BM_DynamicBitsetFind, + "dynamic_bitset::find (huge)", + size_huge); + register_word_containers( + BM_BitArrayFind, + "bitarray::find (huge)", + size_huge); register_bool_containers( BM_BoolFind, "std::find (huge)", diff --git a/benchmark/src/find_bench.hpp b/benchmark/src/find_bench.hpp index 8d49faa..050f8c9 100644 --- a/benchmark/src/find_bench.hpp +++ b/benchmark/src/find_bench.hpp @@ -1,6 +1,8 @@ #include #include #include "bitlib/bit-algorithms/find.hpp" +#include "bit_array.h" +#include "sul/dynamic_bitset.hpp" auto BM_BitFind = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; @@ -11,13 +13,40 @@ auto BM_BitFind = [](benchmark::State& state, auto input) { container_type bitcont(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - *(first + (bitcont.size() / 2) + 4) = bit::bit1; + *(first + total_bits / 2 + 4) = bit::bit1; for (auto _ : state) { benchmark::DoNotOptimize(bit::find(first + 2, last - 3, bit::bit1)); benchmark::ClobberMemory(); } }; +auto BM_BitArrayFind = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + bit_array_set_bit(bitarr, total_bits/2 + 4); + bit_index_t result; + for (auto _ : state) { + benchmark::DoNotOptimize(bit_array_find_first_set_bit(bitarr, &result)); + benchmark::ClobberMemory(); + } + 
bit_array_free(bitarr); +}; + +auto BM_DynamicBitsetFind = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + using iterator_type = typename container_type::iterator; + unsigned int total_bits = std::get<2>(input); + sul::dynamic_bitset<> bitset1(total_bits, 0); + bitset1[total_bits / 2 + 4] = 1; + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1.find_first()); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolFind = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; diff --git a/benchmark/src/rw_bench.hpp b/benchmark/src/rw_bench.hpp index d9baae2..2aac15d 100644 --- a/benchmark/src/rw_bench.hpp +++ b/benchmark/src/rw_bench.hpp @@ -1,6 +1,6 @@ #include #include "test_utils.hpp" -#include +#include "sul/dynamic_bitset.hpp" #include "bitlib/bitlib.hpp" auto BM_BitSet = [](benchmark::State& state, auto input) { @@ -12,11 +12,13 @@ auto BM_BitSet = [](benchmark::State& state, auto input) { auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - for (auto _ : state) + for (auto _ : state) { benchmark::DoNotOptimize(first1[total_bits/2] = bit::bit1); + benchmark::ClobberMemory(); + } }; -auto BM_CBitArrSet = [](benchmark::State& state, auto input) { +auto BM_BitArraySet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); @@ -24,15 +26,18 @@ auto BM_CBitArrSet = [](benchmark::State& state, auto input) { BIT_ARRAY* bitarr = bit_array_create(total_bits); for (auto _ : state) + { bit_array_set_bit(bitarr, total_bits/2); + benchmark::ClobberMemory(); + } }; -auto BM_BoostSet = [](benchmark::State& state, auto 
input) { +auto BM_DynamicBitsetSet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); - boost::dynamic_bitset x(total_bits); + sul::dynamic_bitset x(total_bits); container_type boolvec1 = make_random_container (total_bits); for (auto i = 0; i < total_bits; ++i) { x[i] = boolvec1[i]; @@ -40,6 +45,7 @@ auto BM_BoostSet = [](benchmark::State& state, auto input) { for (auto _ : state) { (x[total_bits/2] = true); + benchmark::ClobberMemory(); } }; @@ -50,9 +56,13 @@ auto BM_BoolSet = [](benchmark::State& state, auto input) { container_type boolvec1 = make_random_container (container_size); for (auto _ : state) + { benchmark::DoNotOptimize(boolvec1[container_size/2] = true); + benchmark::ClobberMemory(); + } }; + auto BM_BitGet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; @@ -62,11 +72,13 @@ auto BM_BitGet = [](benchmark::State& state, auto input) { auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - for (auto _ : state) + for (auto _ : state) { benchmark::DoNotOptimize(first1[total_bits/2]); + benchmark::ClobberMemory(); + } }; -auto BM_CBitArrGet = [](benchmark::State& state, auto input) { +auto BM_BitArrayGet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); @@ -74,15 +86,18 @@ auto BM_CBitArrGet = [](benchmark::State& state, auto input) { BIT_ARRAY* bitarr = bit_array_create(total_bits); for (auto _ : state) + { benchmark::DoNotOptimize(bit_array_get_bit(bitarr, total_bits/2)); + 
benchmark::ClobberMemory(); + } }; -auto BM_BoostGet = [](benchmark::State& state, auto input) { +auto BM_DynamicBitsetGet = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); - boost::dynamic_bitset x(total_bits); + sul::dynamic_bitset x(total_bits); container_type boolvec1 = make_random_container (total_bits); for (auto i = 0; i < total_bits; ++i) { x[i] = boolvec1[i]; @@ -90,6 +105,7 @@ auto BM_BoostGet = [](benchmark::State& state, auto input) { for (auto _ : state) { benchmark::DoNotOptimize(x[total_bits/2]); + benchmark::ClobberMemory(); } }; @@ -98,10 +114,10 @@ auto BM_BoolGet = [](benchmark::State& state, auto input) { unsigned int total_bits = std::get<2>(input); auto container_size = total_bits; container_type boolvec1 = make_random_container (container_size); - - for (auto _ : state){ - bool b; - benchmark::DoNotOptimize(b = boolvec1[container_size/2]); + bool x; + for (auto _ : state) + { + benchmark::DoNotOptimize(x = boolvec1[container_size/2]); + benchmark::ClobberMemory(); } }; - diff --git a/benchmark/src/transform_bench.hpp b/benchmark/src/transform_bench.hpp index eb91ca3..d543df6 100644 --- a/benchmark/src/transform_bench.hpp +++ b/benchmark/src/transform_bench.hpp @@ -1,27 +1,32 @@ #include +#include #include #include "test_utils.hpp" #include "bitlib/bitlib.hpp" +#include "sul/dynamic_bitset.hpp" +#include "bit_array.h" auto BM_BitTransformUnaryAA = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; using WordType = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); auto digits = bit::binary_digits::value; - auto container_size = total_bits / digits + 1; + auto container_size = total_bits / digits; auto bitvec1 = 
get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - auto bitvec2 = get_random_vec(container_size); - auto first2 = bit::bit_iterator(std::begin(bitvec2)); + auto last1 = bit::bit_iterator(std::end(bitvec1)); - auto unary_op = std::bit_not(); + constexpr auto unary_op = std::bit_not(); for (auto _ : state) - bit::transform( + { + benchmark::DoNotOptimize(bit::transform( + first1, + last1, first1, - first1 + total_bits, - first2, unary_op - ); + )); + benchmark::ClobberMemory(); + } }; auto BM_BitTransformUnaryUU = [](benchmark::State& state, auto input) { @@ -32,20 +37,49 @@ auto BM_BitTransformUnaryUU = [](benchmark::State& state, auto input) { auto container_size = total_bits / digits + 1; auto bitvec1 = get_random_vec(container_size); auto first1 = bit::bit_iterator(std::begin(bitvec1)); - auto bitvec2 = get_random_vec(container_size); - auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto unary_op = std::bit_not(); + constexpr auto unary_op = std::bit_not(); for (auto _ : state) - bit::transform( + { + benchmark::DoNotOptimize(bit::transform( first1 + 2, first1 + total_bits - 4, - first2 + 3, + first1 + 1, unary_op - ); + )); + benchmark::ClobberMemory(); + } }; +auto BM_BitArrayTransformUnary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + + for (auto _ : state) + { + bit_array_not(bitarr, bitarr); + benchmark::ClobberMemory(); + } +}; + +auto BM_DynamicBitsetTransformUnary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using WordType = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + auto digits = bit::binary_digits::value; + auto 
container_size = total_bits / digits + 1; + auto bitvec1 = get_random_vec(container_size); + sul::dynamic_bitset bitset1(total_bits, 1); + std::memcpy((char*)bitset1.data(), static_cast((bitvec1.data())), total_bits / 8); + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1.flip()); + benchmark::ClobberMemory(); + } +}; + auto BM_BoolTransformUnary = [](benchmark::State& state, auto input) { using container_type = typename std::tuple_element<0, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); @@ -58,12 +92,15 @@ auto BM_BoolTransformUnary = [](benchmark::State& state, auto input) { auto unary_op = [](bool b) {return !b;}; for (auto _ : state) + { std::transform( first1, first1 + total_bits, first2, unary_op ); + benchmark::ClobberMemory(); + } }; @@ -77,18 +114,19 @@ auto BM_BitTransformBinaryAA = [](benchmark::State& state, auto input) { auto first1 = bit::bit_iterator(std::begin(bitvec1)); auto bitvec2 = get_random_vec(container_size); auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto bitvec3 = get_random_vec(container_size); - auto first3 = bit::bit_iterator(std::begin(bitvec3)); auto binary_op = std::bit_and(); for (auto _ : state) + { bit::transform( first1, first1 + total_bits, first2, - first3, + first2, binary_op ); + benchmark::ClobberMemory(); + } }; auto BM_BitTransformBinaryUU = [](benchmark::State& state, auto input) { @@ -101,18 +139,51 @@ auto BM_BitTransformBinaryUU = [](benchmark::State& state, auto input) { auto first1 = bit::bit_iterator(std::begin(bitvec1)); auto bitvec2 = get_random_vec(container_size); auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto bitvec3 = get_random_vec(container_size); - auto first3 = bit::bit_iterator(std::begin(bitvec3)); auto binary_op = std::bit_and(); for (auto _ : state) + { bit::transform( first1 + 2, first1 + total_bits - 4, first2 + 3, - first3 + 1, + first2 + 1, binary_op ); + benchmark::ClobberMemory(); + } +}; + +auto BM_DynamicBitsetTransformBinary = 
[](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using WordType = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + auto digits = bit::binary_digits::value; + auto container_size = total_bits / digits + 1; + auto bitvec1 = get_random_vec(container_size); + auto bitvec2 = get_random_vec(container_size); + sul::dynamic_bitset bitset1(total_bits, 1); + sul::dynamic_bitset bitset2(total_bits, 1); + std::memcpy((char*)bitset1.data(), static_cast((bitvec1.data())), total_bits / 8); + std::memcpy((char*)bitset2.data(), static_cast((bitvec2.data())), total_bits / 8); + for (auto _ : state) { + benchmark::DoNotOptimize(bitset1 &= bitset2); + benchmark::ClobberMemory(); + } +}; + +auto BM_BitArrayTransformBinary = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + BIT_ARRAY* bitarr2 = bit_array_create(total_bits); + + for (auto _ : state) + { + bit_array_and(bitarr, bitarr, bitarr2); + benchmark::ClobberMemory(); + } }; @@ -125,17 +196,19 @@ auto BM_BoolTransformBinary = [](benchmark::State& state, auto input) { container_type boolvec3 = make_random_container (container_size); auto first1 = boolvec1.begin(); auto first2 = boolvec2.begin(); - auto first3 = boolvec3.begin(); auto binary_op = [](bool a, bool b) {return a && b;}; for (auto _ : state) + { std::transform( first1, first1 + total_bits, first2, - first3, + first2, binary_op ); + benchmark::ClobberMemory(); + } }; From cfd2d10b99cfa2e4433132696994b666980d2670 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 25 Aug 2023 11:45:45 -0500 Subject: [PATCH 10/31] Add yet another cast... 
--- include/bitlib/bit-algorithms/bit_algorithm_details.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp index 414fa54..59a62dc 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp @@ -123,7 +123,7 @@ T get_word(bit_iterator first, size_t len=binary_digits::value) // Fill up ret_word starting at bit [offset] using it // TODO define a mask and use the _bitblend that takes in the extra mask while (len > digits) { - ret_word = _bitblend( + ret_word = _bitblend( ret_word, static_cast(static_cast(*it) << offset), offset, @@ -134,7 +134,7 @@ T get_word(bit_iterator first, size_t len=binary_digits::value) len -= digits; } // Assign remaining len bits of last word - ret_word = _bitblend( + ret_word = _bitblend( ret_word, static_cast(static_cast(*it) << offset), offset, From ba2be225c471db056c8b138210ec37f72b8d1bdd Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 25 Aug 2023 11:57:19 -0500 Subject: [PATCH 11/31] Add comments --- include/bitlib/bit-algorithms/count.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/bitlib/bit-algorithms/count.hpp b/include/bitlib/bit-algorithms/count.hpp index 27a909e..1a804b1 100644 --- a/include/bitlib/bit-algorithms/count.hpp +++ b/include/bitlib/bit-algorithms/count.hpp @@ -83,14 +83,17 @@ count( //} else //#endif { + // std:: version //result += std::transform_reduce( //it, //last.base(), //0, //std::plus{}, - //[](word_type word) {return popcnt(word); } + //[](word_type word) {return _popcnt(word); } //); - result += popcnt(&*it, std::distance(it, last.base())); + + // libpopcnt + result += popcnt(&*it, (digits / 8) * std::distance(it, last.base())); } if (last.position() != 0) { word_type last_value = *last.base() << (digits - last.position()); From 
7f9a92079fe18520b4cd172687686394c0bada20 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 25 Aug 2023 11:57:35 -0500 Subject: [PATCH 12/31] Add extra word of padding for left shift --- include/bitlib/bit-algorithms/shift.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/bitlib/bit-algorithms/shift.hpp b/include/bitlib/bit-algorithms/shift.hpp index a6b6bda..dca5501 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -120,7 +120,7 @@ bit_iterator shift_left( } const hn::ScalableTag d; - for (; std::distance(it, new_last_base) >= hn::Lanes(d) + !is_last_aligned; it += hn::Lanes(d)) + for (; std::distance(it, new_last_base) >= hn::Lanes(d) + 1 + !is_last_aligned; it += hn::Lanes(d)) { const auto v = hn::ShiftRightSame(hn::Load(d, &*it), remaining_bitshifts); const auto v_plus1 = hn::ShiftLeftSame(hn::LoadU(d, &*(it+1)), digits - remaining_bitshifts); From 354c90a5d6a7ba3ce40a08db4446eef3b7f1c0f1 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sat, 26 Aug 2023 17:05:31 -0500 Subject: [PATCH 13/31] Make bit_and constexpr for benchmarking --- benchmark/src/transform_bench.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/src/transform_bench.hpp b/benchmark/src/transform_bench.hpp index d543df6..673d495 100644 --- a/benchmark/src/transform_bench.hpp +++ b/benchmark/src/transform_bench.hpp @@ -115,7 +115,7 @@ auto BM_BitTransformBinaryAA = [](benchmark::State& state, auto input) { auto bitvec2 = get_random_vec(container_size); auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto binary_op = std::bit_and(); + constexpr auto binary_op = std::bit_and(); for (auto _ : state) { bit::transform( @@ -140,7 +140,7 @@ auto BM_BitTransformBinaryUU = [](benchmark::State& state, auto input) { auto bitvec2 = get_random_vec(container_size); auto first2 = bit::bit_iterator(std::begin(bitvec2)); - auto binary_op = std::bit_and(); + constexpr auto 
binary_op = std::bit_and(); for (auto _ : state) { bit::transform( From 65ea3ae2fd53469c010ec2ec7d2aaddb33b95ad2 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 19 Dec 2023 14:32:33 -0600 Subject: [PATCH 14/31] Add reverse benchmarks --- benchmark/src/benchmark_main.cc | 12 ++++++++++++ benchmark/src/reverse_bench.hpp | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index 512f83e..b91a262 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -207,6 +207,10 @@ int main(int argc, char** argv) { BM_BitReverse_UU, "bit::reverse (small) (UU)", size_small); + register_word_containers( + BM_BitArrayReverse_UU, + "bitarray::reverse (small) (UU)", + size_small); register_bool_containers( BM_BoolReverse, "std::reverse (small)", @@ -219,6 +223,14 @@ int main(int argc, char** argv) { BM_BitReverse_UU, "bit::reverse (huge) (UU)", size_huge); + register_word_containers( + BM_BitArrayReverse, + "bitarray::reverse (huge)", + size_huge); + register_word_containers( + BM_BitArrayReverse_UU, + "bitarray::reverse (huge) (UU)", + size_huge); register_bool_containers( BM_BoolReverse, "std::reverse (huge)", diff --git a/benchmark/src/reverse_bench.hpp b/benchmark/src/reverse_bench.hpp index 9779f7e..b83a092 100644 --- a/benchmark/src/reverse_bench.hpp +++ b/benchmark/src/reverse_bench.hpp @@ -32,6 +32,30 @@ auto BM_BitReverse_UU = [](benchmark::State& state, auto input) { } }; +auto BM_BitArrayReverse = [](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + bit_array_reverse(bitarr); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + +auto BM_BitArrayReverse_UU = 
[](benchmark::State& state, auto input) { + using container_type = typename std::tuple_element<0, decltype(input)>::type; + using word_type = typename std::tuple_element<1, decltype(input)>::type; + unsigned int total_bits = std::get<2>(input); + BIT_ARRAY* bitarr = bit_array_create(total_bits); + for (auto _ : state) { + bit_array_reverse_region(bitarr, 2, total_bits - 5); + benchmark::ClobberMemory(); + } + bit_array_free(bitarr); +}; + auto BM_BoolReverse = [](benchmark::State& state, auto input) { using container_type = std::vector; using num_type = typename container_type::value_type; From 43b5b0e328eec4583e01a10e9c96e10af1a2fa4b Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 19 Dec 2023 17:14:49 -0600 Subject: [PATCH 15/31] Shift by non-word offset --- benchmark/src/shift_bench.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/src/shift_bench.hpp b/benchmark/src/shift_bench.hpp index 0a444cf..51b91bd 100644 --- a/benchmark/src/shift_bench.hpp +++ b/benchmark/src/shift_bench.hpp @@ -16,7 +16,7 @@ auto BM_BitShiftLeft = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - auto n = bit::distance(first, last) / 2; + auto n = bit::distance(first, last) / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_left(first, last, n)); benchmark::ClobberMemory(); @@ -46,7 +46,7 @@ auto BM_BitArrayShiftLeft = [](benchmark::State& state, auto input) { using word_type = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); BIT_ARRAY* bitarr = bit_array_create(total_bits); - auto n = total_bits / 2; + auto n = total_bits / 2 - 1; for (auto _ : state) { bit_array_shift_right(bitarr, n, 0); benchmark::ClobberMemory(); @@ -61,7 +61,7 @@ auto BM_DynamicBitsetShiftLeft = [](benchmark::State& state, auto input) 
{ using iterator_type = typename container_type::iterator; unsigned int total_bits = std::get<2>(input); sul::dynamic_bitset<> bitset1(total_bits, 1); - auto n = total_bits / 2; + auto n = total_bits / 2 - 1; for (auto _ : state) { bitset1 <<= n; benchmark::ClobberMemory(); @@ -91,7 +91,7 @@ auto BM_BitShiftRight = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - auto n = bit::distance(first, last) / 2; + auto n = bit::distance(first, last) / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_right(first, last, n)); benchmark::ClobberMemory(); @@ -120,7 +120,7 @@ auto BM_DynamicBitsetShiftRight = [](benchmark::State& state, auto input) { using iterator_type = typename container_type::iterator; unsigned int total_bits = std::get<2>(input); sul::dynamic_bitset<> bitset1(total_bits, 1); - auto n = total_bits / 2; + auto n = total_bits / 2 - 1; for (auto _ : state) { bitset1 >>= n; benchmark::ClobberMemory(); @@ -132,7 +132,7 @@ auto BM_BitArrayShiftRight = [](benchmark::State& state, auto input) { using word_type = typename std::tuple_element<1, decltype(input)>::type; unsigned int total_bits = std::get<2>(input); BIT_ARRAY* bitarr = bit_array_create(total_bits); - auto n = total_bits / 2; + auto n = total_bits / 2 - 1; for (auto _ : state) { bit_array_shift_right(bitarr, n, 0); benchmark::ClobberMemory(); From fb1a7fafa808d362b622fd4e2ef9aeff77c1b89e Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 19 Dec 2023 17:15:55 -0600 Subject: [PATCH 16/31] Adjust container sizes --- README.md | 2 +- benchmark/src/benchmark_main.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ab8a7ce..13a4332 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ Given that the majority of the library is focused on having the same interface a I used 
Google's [benchmark](https://github.com/google/benchmark) library for computing benchmarks. Each benchmark is formatted as `{bit, BitArray, std}::function` (size) [(alignment-tags)]. * `bit` is for this library, `BitArray` is for the popular C-based [BitArray library](https://github.com/noporpoise/BitArray), and`std` is the standard library operating on the infamous `vector`. -* (size) denotes the size of the container in bits. `small = 1 << 4`, `large = 1 << 16` +* (size) denotes the size of the container in bits. `small = 1 << 8`, `medium= 1 << 16`, `large = 1 << 24`, `huge = 1 << 31` * (alignment-tags) refers to the memory alignment of the bit-iterators. `U` means the iterator does not fall on a word boundary, `R` means the iterator is placed at random, and `A` means the iterator is aligned with a word boundary. For example, `bit::rotate (large) (ARA)` refers to our library's implementation of the `rotate` algorithm operating on a container of 65536 bits, where `first` and `last` are aligned but `n_first` is selected at random. 
diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index b91a262..2840cd5 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -98,9 +98,9 @@ void register_bool_containers(F test_lambda_f, std::string func_name, unsigned i //BENCHMARK_MAIN(); int main(int argc, char** argv) { unsigned int size_small = 1 << 8; - unsigned int size_medium = 1 << 8; - unsigned int size_large = 1 << 16; - unsigned int size_huge = 1 << 25; + unsigned int size_medium = 1 << 16; + unsigned int size_large = 1 << 24; + unsigned int size_huge = 1 << 31; // Read/write benchmarks register_word_containers( From 1797dffbeee8ab506b0156717fc37cbb8f7fe236 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 20 Dec 2023 11:31:12 -0600 Subject: [PATCH 17/31] Update benchmarks --- README.md | 189 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 13a4332..44ee88e 100644 --- a/README.md +++ b/README.md @@ -130,83 +130,126 @@ Given that the majority of the library is focused on having the same interface a # Performance Benchmarks I used Google's [benchmark](https://github.com/google/benchmark) library for computing benchmarks. Each benchmark is formatted as `{bit, BitArray, std}::function` (size) [(alignment-tags)]. -* `bit` is for this library, `BitArray` is for the popular C-based [BitArray library](https://github.com/noporpoise/BitArray), and`std` is the standard library operating on the infamous `vector`. + * `bit` is for this library, `BitArray` is for the popular C-based [BitArray library](https://github.com/noporpoise/BitArray), [dynamic_bitset](https://github.com/pinam45/dynamic_bitset) is a header-only library similar to Boost's dynamic_bitset, and`std` is the standard library operating on the infamous `vector`. * (size) denotes the size of the container in bits. 
`small = 1 << 8`, `medium= 1 << 16`, `large = 1 << 24`, `huge = 1 << 31` * (alignment-tags) refers to the memory alignment of the bit-iterators. `U` means the iterator does not fall on a word boundary, `R` means the iterator is placed at random, and `A` means the iterator is aligned with a word boundary. For example, `bit::rotate (large) (ARA)` refers to our library's implementation of the `rotate` algorithm operating on a container of 65536 bits, where `first` and `last` are aligned but `n_first` is selected at random. ``` -2022-05-04T16:54:22-05:00 -Running ./bin/bench -Run on (80 X 2899.73 MHz CPU s) +2023-12-19T17:56:41-06:00 +Running ./bin/bitlib-bench +Run on (64 X 1067.77 MHz CPU s) CPU Caches: - L1 Data 32 KiB (x40) - L1 Instruction 32 KiB (x40) - L2 Unified 1024 KiB (x40) - L3 Unified 28160 KiB (x2) -Load Average: 1.12, 0.98, 0.54 --------------------------------------------------------------------------------- -Benchmark Time CPU Iterations --------------------------------------------------------------------------------- -bit::shift_left (small) (AA) 4.79 ns 4.79 ns 146028612 -bit::shift_left (small) (UU) 3.72 ns 3.72 ns 187172020 -std::shift_left (small) 37.8 ns 37.8 ns 18507630 -bit::shift_left (large) (AA) 78.9 ns 78.9 ns 8887302 -bit::shift_left (large) (UU) 243 ns 243 ns 2887952 -std::shift_left (large) 156867 ns 156869 ns 4463 -bit::shift_right (small) (UU) 3.48 ns 3.48 ns 201058677 -std::shift_right (small) 35.7 ns 35.7 ns 19186367 -bit::shift_right (large) (AA) 68.3 ns 68.3 ns 10249245 -std::shift_right (large) 132458 ns 132461 ns 5276 -bit::reverse (small) (UU) 8.73 ns 8.73 ns 80176090 -std::reverse (small) 39.9 ns 39.9 ns 17545669 -bit::reverse (large) (AA) 842 ns 842 ns 830385 -bit::reverse (large) (UU) 1157 ns 1157 ns 605963 -std::reverse (large) 285799 ns 285792 ns 2456 -bit::transform(UnaryOp) (small) (AA) 5.22 ns 5.22 ns 134034538 -bit::transform(UnaryOp) (small) (UU) 6.28 ns 6.28 ns 111084155 -std::transform(UnaryOp) (small) 50.6 ns 50.6 
ns 13837852 -bit::transform(UnaryOp) (large) (AA) 238 ns 238 ns 2956037 -bit::transform(UnaryOp) (large) (UU) 2005 ns 2005 ns 349160 -std::transform(UnaryOp) (large) 192498 ns 192502 ns 3637 -bit::transform(BinaryOp) (small) (AA) 7.50 ns 7.50 ns 93300797 -bit::transform(BinaryOp) (small) (UU) 7.85 ns 7.85 ns 89176138 -std::transform(BinaryOp) (small) 37.1 ns 37.1 ns 18848167 -bit::transform(BinaryOp) (large) (AA) 345 ns 345 ns 2030257 -bit::transform(BinaryOp) (large) (UU) 12924 ns 12925 ns 54165 -std::transform(BinaryOp) (large) 619243 ns 619246 ns 1134 -bit::rotate (small) (ARA) 9.14 ns 9.14 ns 123732722 -std::rotate (small) 79.7 ns 79.7 ns 9138769 -bit::rotate (large) (ARA) 7617 ns 7617 ns 92147 -std::rotate (large) 582126 ns 582135 ns 1207 -bit::count (small) (AA) 2.29 ns 2.29 ns 299434270 -std::count (small) 15.2 ns 15.2 ns 45934612 -bit::count (large) (AA) 457 ns 457 ns 1533128 -std::count (large) 57501 ns 57501 ns 12174 -bit::swap_ranges (small) (AA) 6.76 ns 6.76 ns 103735181 -bit::swap_ranges (small) (UU) 5.43 ns 5.43 ns 128688535 -std::swap_ranges (small) 27.8 ns 27.8 ns 25309938 -bit::swap_ranges (large) (AA) 446 ns 446 ns 1570781 -bit::swap_ranges (large) (UU) 5496 ns 5496 ns 127033 -std::swap_ranges (large) 507092 ns 507093 ns 1380 -bit::copy (small) (UU) 6.22 ns 6.22 ns 110731355 -std::copy (small) 27.7 ns 27.7 ns 25261667 -bit::copy (large) (UU) 5367 ns 5367 ns 130292 -std::copy (large) 184520 ns 184523 ns 3794 -bit::equal (small) (UU) 3.64 ns 3.64 ns 193325012 -std::equal (small) 32.2 ns 32.2 ns 21650629 -bit::equal (large) (UU) 1799 ns 1799 ns 389158 -std::equal (large) 200078 ns 200080 ns 3499 -bit::move (small) (UU) 6.31 ns 6.31 ns 110834953 -std::move (small) 27.7 ns 27.7 ns 25270665 -bit::move (large) (UU) 5372 ns 5372 ns 130464 -std::move (large) 184090 ns 184094 ns 3803 -bit::copy_backward (small) (UU) 9.60 ns 9.60 ns 72952203 -std::copy_backward (small) 19.9 ns 19.9 ns 35227170 -bit::copy_backward (large) (UU) 7602 ns 7602 ns 92137 
-std::copy_backward (large) 431622 ns 431616 ns 1619 -bit::fill (small) (UU) 4.35 ns 4.35 ns 160834380 -std::fill (small) 2.35 ns 2.35 ns 297524146 -bit::fill (huge) (UU) 17138 ns 17137 ns 40748 -std::fill (huge) 11840 ns 11839 ns 59666 + L1 Data 32 KiB (x32) + L1 Instruction 32 KiB (x32) + L2 Unified 1024 KiB (x32) + L3 Unified 22528 KiB (x2) +Load Average: 0.70, 0.87, 0.70 +--------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations +--------------------------------------------------------------------------------------- +bit::set (large) 1.91 ns 1.91 ns 366779196 +dynamic_bitset::set (large) 2.36 ns 2.36 ns 296975883 +bitarray::set (large) 2.20 ns 2.20 ns 318761424 +std::set (large) 2.39 ns 2.39 ns 293167404 +bit::shift_left (small) 19.8 ns 19.8 ns 35519917 +bit::shift_left (small) (UU) 30.5 ns 30.5 ns 22984689 +dynamic_bitset::shift_left (small) 13.1 ns 13.1 ns 53628854 +bitarray::shift_left (small) 38.4 ns 38.4 ns 18168867 +std::shift_left (small) 579 ns 578 ns 1209283 +bit::shift_left (large) 160869 ns 160845 ns 4353 +bit::shift_left (large) (UU) 280267 ns 280229 ns 2505 +dynamic_bitset::shift_left (large) 143487 ns 143454 ns 4877 +bitarray::shift_left (large) 835992 ns 835930 ns 837 +std::shift_left (large) 40289125 ns 40287190 ns 17 +bit::shift_right (small) 27.8 ns 27.8 ns 25146901 +bit::shift_right (small) (UU) 31.1 ns 31.1 ns 22561913 +dynamic_bitset::shift_right (small) 12.2 ns 12.2 ns 57443996 +bitarray::shift_right (small) 38.8 ns 38.8 ns 18155925 +std::shift_right (small) 504 ns 504 ns 1392311 +bit::shift_right (large) 164210 ns 164191 ns 4264 +bit::shift_right (large) (UU) 292115 ns 292087 ns 2404 +dynamic_bitset::shift_right (large) 125191 ns 125160 ns 5591 +bitarray::shift_right (large) 836455 ns 836415 ns 837 +std::shift_right (large) 36904578 ns 36906143 ns 17 +bit::reverse (small) (UU) 30.4 ns 30.4 ns 23010493 +bitarray::reverse (small) (UU) 92.9 ns 92.9 ns 7566424 +std::reverse (small) 
416 ns 416 ns 1709223 +bit::reverse (large) 302243 ns 302246 ns 2314 +bit::reverse (large) (UU) 396252 ns 396251 ns 1766 +bitarray::reverse (large) 4180555 ns 4180640 ns 168 +bitarray::reverse (large) (UU) 5565145 ns 5565237 ns 126 +std::reverse (large) 71610824 ns 71613462 ns 10 +bit::transform(UnaryOp) (small) 7.73 ns 7.73 ns 90799823 +bit::transform(UnaryOp) (small) (UU) 16.7 ns 16.7 ns 41797825 +dynamic_bitset::transform(UnaryOp) (small) 3.79 ns 3.79 ns 178099711 +bitarray::transform(UnaryOp) (small) 8.15 ns 8.15 ns 86177059 +std::transform(UnaryOp) (small) 762 ns 762 ns 920469 +bit::transform(UnaryOp) (large) 89430 ns 89427 ns 7830 +bit::transform(UnaryOp) (large) (UU) 513673 ns 513652 ns 1363 +dynamic_bitset::transform(UnaryOp) (large) 90179 ns 90174 ns 7755 +bitarray::transform(UnaryOp) (large) 182288 ns 182278 ns 3806 +std::transform(UnaryOp) (large) 49393629 ns 49392276 ns 14 +bit::transform(BinaryOp) (small) 4.79 ns 4.79 ns 146268444 +bit::transform(BinaryOp) (small) (UU) 40.1 ns 40.1 ns 17465510 +dynamic_bitset::transform(BinaryOp) (small) 4.35 ns 4.35 ns 160471539 +bitarray::transform(BinaryOp) (small) 10.5 ns 10.5 ns 66739191 +std::transform(BinaryOp) (small) 837 ns 837 ns 834684 +bit::transform(BinaryOp) (large) 184508 ns 184491 ns 3796 +bit::transform(BinaryOp) (large) (UU) 2396570 ns 2396591 ns 292 +dynamic_bitset::transform(BinaryOp) (large) 183006 ns 182980 ns 3813 +bitarray::transform(BinaryOp) (large) 131178 ns 131171 ns 5348 +std::transform(BinaryOp) (large) 195492307 ns 195488596 ns 4 +bit::rotate (small) 121 ns 121 ns 10000000 +std::rotate (small) 1725 ns 1725 ns 467233 +bit::rotate (large) 1830057 ns 1830041 ns 377 +std::rotate (large) 149375227 ns 149373295 ns 5 +bit::count (small) 6.28 ns 6.28 ns 111995013 +dynamic_bitset::count (small) 8.12 ns 8.12 ns 87716832 +bitarray::count (small) 6.11 ns 6.11 ns 114586171 +std::count (small) 233 ns 233 ns 3000468 +bit::count (large) 86768 ns 86767 ns 8067 +dynamic_bitset::count (large) 86774 ns 86776 
ns 8068 +bitarray::count (large) 228298 ns 228300 ns 3066 +std::count (large) 14717449 ns 14717517 ns 48 +bit::swap_ranges (small) 8.03 ns 8.03 ns 85409308 +bit::swap_ranges (small) (UU) 19.0 ns 19.0 ns 36799054 +std::swap_ranges (small) 753 ns 753 ns 932645 +bit::swap_ranges (large) 206087 ns 206069 ns 3400 +bit::swap_ranges (large) (UU) 1416540 ns 1416482 ns 495 +std::swap_ranges (large) 128732217 ns 128736753 ns 5 +bit::copy (small) (UU) 22.6 ns 22.6 ns 30977614 +std::copy (small) 706 ns 706 ns 991971 +bit::copy (large) (UU) 1283021 ns 1282931 ns 546 +std::copy (large) 47291412 ns 47293406 ns 15 +bit::equal (small) (UU) 13.5 ns 13.5 ns 51665518 +std::equal (small) 887 ns 887 ns 789443 +bit::equal (large) (UU) 684564 ns 684588 ns 1023 +std::equal (large) 58741336 ns 58740796 ns 12 +bit::move (small) (UU) 24.9 ns 24.9 ns 28152253 +std::move (small) 705 ns 705 ns 993177 +bit::move (large) (UU) 1486436 ns 1486307 ns 471 +std::move (large) 47268916 ns 47269412 ns 15 +bit::copy_backward (small) (UU) 35.8 ns 35.8 ns 19180871 +std::copy_backward (small) 524 ns 524 ns 1336116 +bit::copy_backward (large) (UU) 1843335 ns 1843176 ns 381 +std::copy_backward (large) 110068625 ns 110069932 ns 6 +bit::fill (small) (UU) 6.81 ns 6.81 ns 103143199 +dynamic_bitset::fill (small) 3.55 ns 3.55 ns 198214175 +bitarray::fill (small) 13.9 ns 13.9 ns 50233774 +std::fill (small) 9.57 ns 9.57 ns 73133048 +bit::fill (large) (UU) 95661 ns 95650 ns 7326 +dynamic_bitset::fill (large) 102146 ns 102146 ns 6851 +bitarray::fill (large) 72462 ns 72462 ns 9615 +std::fill (large) 72955 ns 72955 ns 9741 +bit::find (small) (UU) 3.22 ns 3.22 ns 217967844 +dynamic_bitset::find (small) 3.05 ns 3.05 ns 229824606 +bitarray::find (small) 7.15 ns 7.15 ns 94973526 +std::find (small) 100 ns 100 ns 6992893 +bit::find (large) (UU) 27810 ns 27808 ns 25202 +dynamic_bitset::find (large) 64434 ns 64437 ns 10870 +bitarray::find (large) 62305 ns 62298 ns 11220 +std::find (large) 6376779 ns 6376904 ns 110 +``` + From 
eb4234ae28a432a5d04b56b3b4aae13f3478f295 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 20 Dec 2023 11:31:52 -0600 Subject: [PATCH 18/31] Revert to large --- benchmark/src/benchmark_main.cc | 224 ++++++++++++++++---------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index 2840cd5..f58bf9c 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -105,20 +105,20 @@ int main(int argc, char** argv) { // Read/write benchmarks register_word_containers( BM_BitSet, - "bit::set (huge)", - size_huge); + "bit::set (large)", + size_large); register_word_containers( BM_DynamicBitsetSet, - "dynamic_bitset::set (huge)", - size_huge); + "dynamic_bitset::set (large)", + size_large); register_word_containers( BM_BitArraySet, - "bitarray::set (huge)", - size_huge); + "bitarray::set (large)", + size_large); register_bool_containers( BM_BoolSet, - "std::set (huge)", - size_huge); + "std::set (large)", + size_large); // Shift benchmarks register_word_containers( @@ -143,24 +143,24 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitShiftLeft, - "bit::shift_left (huge)", - size_huge); + "bit::shift_left (large)", + size_large); register_word_containers( BM_BitShiftLeft_UU, - "bit::shift_left (huge) (UU)", - size_huge); + "bit::shift_left (large) (UU)", + size_large); register_word_containers( BM_DynamicBitsetShiftLeft, - "dynamic_bitset::shift_left (huge)", - size_huge); + "dynamic_bitset::shift_left (large)", + size_large); register_word_containers( BM_BitArrayShiftLeft, - "bitarray::shift_left (huge) ", - size_huge); + "bitarray::shift_left (large) ", + size_large); register_bool_containers( BM_BoolShiftLeft, - "std::shift_left (huge)", - size_huge); + "std::shift_left (large)", + size_large); register_word_containers( BM_BitShiftRight, "bit::shift_right (small) ", @@ -183,24 +183,24 @@ int main(int argc, char** argv) { 
size_small); register_word_containers( BM_BitShiftRight, - "bit::shift_right (huge) ", - size_huge); + "bit::shift_right (large) ", + size_large); register_word_containers( BM_BitShiftRight_UU, - "bit::shift_right (huge) (UU)", - size_huge); + "bit::shift_right (large) (UU)", + size_large); register_word_containers( BM_DynamicBitsetShiftRight, - "dynamic_bitset::shift_right (huge)", - size_huge); + "dynamic_bitset::shift_right (large)", + size_large); register_word_containers( BM_BitArrayShiftRight, - "bitarray::shift_right (huge)", - size_huge); + "bitarray::shift_right (large)", + size_large); register_bool_containers( BM_BoolShiftRight, - "std::shift_right (huge)", - size_huge); + "std::shift_right (large)", + size_large); // Reverse benchmarks register_word_containers( @@ -217,24 +217,24 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitReverse, - "bit::reverse (huge) ", - size_huge); + "bit::reverse (large) ", + size_large); register_word_containers( BM_BitReverse_UU, - "bit::reverse (huge) (UU)", - size_huge); + "bit::reverse (large) (UU)", + size_large); register_word_containers( BM_BitArrayReverse, - "bitarray::reverse (huge)", - size_huge); + "bitarray::reverse (large)", + size_large); register_word_containers( BM_BitArrayReverse_UU, - "bitarray::reverse (huge) (UU)", - size_huge); + "bitarray::reverse (large) (UU)", + size_large); register_bool_containers( BM_BoolReverse, - "std::reverse (huge)", - size_huge); + "std::reverse (large)", + size_large); // transform benchmarks register_word_containers( @@ -259,24 +259,24 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitTransformUnaryAA, - "bit::transform(UnaryOp) (huge) ", - size_huge); + "bit::transform(UnaryOp) (large) ", + size_large); register_word_containers( BM_BitTransformUnaryUU, - "bit::transform(UnaryOp) (huge) (UU)", - size_huge); + "bit::transform(UnaryOp) (large) (UU)", + size_large); register_word_containers( 
BM_DynamicBitsetTransformUnary, - "dynamic_bitset::transform(UnaryOp) (huge) ", - size_huge); + "dynamic_bitset::transform(UnaryOp) (large) ", + size_large); register_word_containers( BM_BitArrayTransformUnary, - "bitarray::transform(UnaryOp) (huge) ", - size_huge); + "bitarray::transform(UnaryOp) (large) ", + size_large); register_bool_containers( BM_BoolTransformUnary, - "std::transform(UnaryOp) (huge)", - size_huge); + "std::transform(UnaryOp) (large)", + size_large); register_word_containers( BM_BitTransformBinaryAA, "bit::transform(BinaryOp) (small) ", @@ -299,24 +299,24 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitTransformBinaryAA, - "bit::transform(BinaryOp) (huge) ", - size_huge); + "bit::transform(BinaryOp) (large) ", + size_large); register_word_containers( BM_BitTransformBinaryUU, - "bit::transform(BinaryOp) (huge) (UU)", - size_huge); + "bit::transform(BinaryOp) (large) (UU)", + size_large); register_word_containers( BM_DynamicBitsetTransformBinary, - "dynamic_bitset::transform(BinaryOp) (huge) ", - size_huge); + "dynamic_bitset::transform(BinaryOp) (large) ", + size_large); register_word_containers( BM_BitArrayTransformBinary, - "bitarray::transform(BinaryOp) (huge) ", - size_huge); + "bitarray::transform(BinaryOp) (large) ", + size_large); register_bool_containers( BM_BoolTransformBinary, - "std::transform(BinaryOp) (huge)", - size_huge); + "std::transform(BinaryOp) (large)", + size_large); // Rotate benchmarks register_word_containers( @@ -329,12 +329,12 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitRotate, - "bit::rotate (huge)", - size_huge); + "bit::rotate (large)", + size_large); register_bool_containers( BM_BoolRotate, - "std::rotate (huge)", - size_huge); + "std::rotate (large)", + size_large); // Count benchmarks register_word_containers( @@ -355,20 +355,20 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCount, - "bit::count (huge) ", - 
size_huge); + "bit::count (large) ", + size_large); register_word_containers( BM_DynamicBitsetCount, - "dynamic_bitset::count (huge)", - size_huge); + "dynamic_bitset::count (large)", + size_large); register_word_containers( BM_BitArrayCount, - "bitarray::count (huge)", - size_huge); + "bitarray::count (large)", + size_large); register_bool_containers( BM_BoolCount, - "std::count (huge)", - size_huge); + "std::count (large)", + size_large); // swap_ranges benchmarks register_word_containers( @@ -385,16 +385,16 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitSwapRangesAA, - "bit::swap_ranges (huge) ", - size_huge); + "bit::swap_ranges (large) ", + size_large); register_word_containers( BM_BitSwapRangesUU, - "bit::swap_ranges (huge) (UU)", - size_huge); + "bit::swap_ranges (large) (UU)", + size_large); register_bool_containers( BM_BoolSwapRanges, - "std::swap_ranges (huge)", - size_huge); + "std::swap_ranges (large)", + size_large); // copy benchmarks register_word_containers( @@ -407,12 +407,12 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCopy, - "bit::copy (huge) (UU)", - size_huge); + "bit::copy (large) (UU)", + size_large); register_bool_containers( BM_BoolCopy, - "std::copy (huge)", - size_huge); + "std::copy (large)", + size_large); // Equal benchmarks register_word_containers( @@ -425,12 +425,12 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitEqual, - "bit::equal (huge) (UU)", - size_huge); + "bit::equal (large) (UU)", + size_large); register_bool_containers( BM_BoolEqual, - "std::equal (huge)", - size_huge); + "std::equal (large)", + size_large); // move benchmarks register_word_containers( @@ -443,12 +443,12 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitMove, - "bit::move (huge) (UU)", - size_huge); + "bit::move (large) (UU)", + size_large); register_bool_containers( BM_BoolMove, - "std::move (huge)", - size_huge); + 
"std::move (large)", + size_large); // copy_backward benchmarks register_word_containers( @@ -461,12 +461,12 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitCopyBackward, - "bit::copy_backward (huge) (UU)", - size_huge); + "bit::copy_backward (large) (UU)", + size_large); register_bool_containers( BM_BoolCopyBackward, - "std::copy_backward (huge)", - size_huge); + "std::copy_backward (large)", + size_large); // fill benchmarks register_word_containers( @@ -487,20 +487,20 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitFill, - "bit::fill (huge) (UU)", - size_huge); + "bit::fill (large) (UU)", + size_large); register_bool_containers( BM_DynamicBitsetFill, - "dynamic_bitset::fill (huge)", - size_huge); + "dynamic_bitset::fill (large)", + size_large); register_bool_containers( BM_BitArrayFill, - "bitarray::fill (huge)", - size_huge); + "bitarray::fill (large)", + size_large); register_bool_containers( BM_BoolFill, - "std::fill (huge)", - size_huge); + "std::fill (large)", + size_large); // find benchmarks register_word_containers( @@ -521,38 +521,38 @@ int main(int argc, char** argv) { size_small); register_word_containers( BM_BitFind, - "bit::find (huge) (UU)", - size_huge); + "bit::find (large) (UU)", + size_large); register_word_containers( BM_DynamicBitsetFind, - "dynamic_bitset::find (huge)", - size_huge); + "dynamic_bitset::find (large)", + size_large); register_word_containers( BM_BitArrayFind, - "bitarray::find (huge)", - size_huge); + "bitarray::find (large)", + size_large); register_bool_containers( BM_BoolFind, - "std::find (huge)", - size_huge); + "std::find (large)", + size_large); //// Search benchmarks //register_word_containers( //BM_BitSearch, //"Search_Bit_Large", - //size_huge); + //size_large); //register_bool_containers( //BM_BoolSearch, //"Search_Bool_Large", - //size_huge); + //size_large); //register_word_containers( //BM_BitSearch_WorstCase, //"Search_Bit_Large_WorstCase", - 
//size_huge); + //size_large); //register_bool_containers( //BM_BoolSearch_WorstCase, //"Search_Bool_Large_WorstCase", - //size_huge); + //size_large); benchmark::Initialize(&argc, argv); benchmark::RunSpecifiedBenchmarks(); } From 775c0aa480c491ebdb5789928d8d31abbf7e9ff4 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 22 Apr 2024 15:49:34 -0500 Subject: [PATCH 19/31] Add GTest:: prefix to libs --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 07ab0a8..8c92d54 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,7 +23,7 @@ endif() # specify test-specific libraries include_directories(${googletest_SOURCE_DIR}/googletest/include/gtest src/utils) -target_link_libraries(bitlib-tests PUBLIC gtest gtest_main -pthread -lgcov --coverage) +target_link_libraries(bitlib-tests PUBLIC GTest::gtest GTest::gtest_main -pthread -lgcov --coverage) set(BITLIB_GTEST_REPEAT 1) From 24b5789efa76221250fa4e51d3f10af1da2aa2f4 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 23 Apr 2024 14:48:27 -0500 Subject: [PATCH 20/31] Add profile option --- CMakeLists.txt | 5 ++++ profile/.CMakeLists.txt.swp | Bin 0 -> 12288 bytes profile/CMakeLists.txt | 15 +++++++++++ profile/src/main.cpp | 50 ++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100644 profile/.CMakeLists.txt.swp create mode 100644 profile/CMakeLists.txt create mode 100644 profile/src/main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b887e70..bf82cf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ option(BITLIB_HWY "Build with google highway SIMD extensions" OFF) option(BITLIB_BENCHMARK "Build bitlib benchmarks" OFF) option(BITLIB_EXAMPLE "Build bitlib examples" OFF) option(BITLIB_TEST "Build bitlib tests" OFF) +option(BITLIB_PROFILE "Buid simple example for profiling" OFF) option(BITLIB_COVERAGE "Compute test coverage" OFF) if (BITLIB_HWY) @@ -64,3 
+65,7 @@ if(BITLIB_TEST) add_subdirectory(test) endif() +if(BITLIB_PROFILE) + add_subdirectory(profile) +endif() + diff --git a/profile/.CMakeLists.txt.swp b/profile/.CMakeLists.txt.swp new file mode 100644 index 0000000000000000000000000000000000000000..9ce599819033d074fedcf4d430d31bdf9067d92a GIT binary patch literal 12288 zcmeI&&x+GP90%~JdR<-gE@OAWR7kU~u;{UW;%?ZqDQV);gM>^nX~$-hkeR@iW%b|_ z_zL_3EFkqfXAmqXfC4Ch0w{n2D1ZVefC4Ch0=HB^ z2$4w@rm5z2+*8vVQzld@n7+Q-P5b89vxXhda)z|sA}TDWTm?~nS-{QjATN~2GpTJ2 z8u?w@>QUV}ruuXmGpb({JPtD@q840M&!V)7c@T>dZj|Sx;8K%iWK>F|vPBN4EO;c6 z1!X*oW|v_(*H=$OL}6=XSdSnijN^b`@u*TEOZoN|4zEY9^V+h_VC?vA&m7;hNPSsd zJGGrI-D;!Fdi`hZs3@BC20C#}S0{yaBv)E*&^b4Qu5a1>z?+Or>hknLs9B#gD@*d_ zrn#c55-E&a6#VaDu0Qmwff+cyH}bupZ@FgAbKD6%yk5@>!0x1OxG|W>HnZiGDk?=c MObePPs}&^q1I@m?g#Z8m literal 0 HcmV?d00001 diff --git a/profile/CMakeLists.txt b/profile/CMakeLists.txt new file mode 100644 index 0000000..2efc67f --- /dev/null +++ b/profile/CMakeLists.txt @@ -0,0 +1,15 @@ +# set output directory of builds +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +# set build type +set(CMAKE_BUILD_TYPE RelWithDebInfo) + +# Add targets +file(GLOB PROFILE_SOURCES "src/*.cpp") +add_executable(bitlib-profile ${PROFILE_SOURCES}) + +# specify benchmark-specific libraries +include_directories(src/utils) + +target_compile_options(bitlib-profile PUBLIC -O2 -ggdb -Wpedantic) +install(TARGETS bitlib-profile DESTINATION .) 
diff --git a/profile/src/main.cpp b/profile/src/main.cpp new file mode 100644 index 0000000..90a8e45 --- /dev/null +++ b/profile/src/main.cpp @@ -0,0 +1,50 @@ +// =============================== TEST ROOT ================================ // +// Project: The Experimental Bit Algorithms Library +// Name: test_root.cc +// Description: Brings in all of the test headers into an object to be linked +// with the test main +// Creator: Vincent Reverdy +// Contributor(s): Bryce Kille [2019] +// License: BSD 3-Clause License +// ========================================================================== // + + + +// ============================== PREAMBLE ================================== // +// C++ standard library +#include +#include +#include +#include +#include +// Project sources +#include "bitlib/bitlib.hpp" +#include "test_utils.hpp" +// Third party libraries +#include +#include +#include +#include +#include +#include +#include +// ========================================================================== // + + + +int main() +{ + using container_type = std::vector; + const int container_size = 1 << 24; + container_type bitcont = make_random_container(container_size); + auto first = bit::bit_iterator(std::begin(bitcont)); + auto last = bit::bit_iterator(std::end(bitcont)); + auto n = 1 << 10; + for (int i = 0; i < 100; i++) + { + if (i % 10 == 0) + std::cerr << i << "\n"; + bit::shift_left(first + 2, last, n + 4); + } + return 0; +} From 0db6aa9f7e011dc85f7f100ff54ce3b9fade9863 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 23 Apr 2024 14:48:45 -0500 Subject: [PATCH 21/31] Increase large --- benchmark/src/benchmark_main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index f58bf9c..0f584db 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -99,7 +99,7 @@ void register_bool_containers(F test_lambda_f, std::string func_name, unsigned i 
int main(int argc, char** argv) { unsigned int size_small = 1 << 8; unsigned int size_medium = 1 << 16; - unsigned int size_large = 1 << 24; + unsigned int size_large = 1 << 26; unsigned int size_huge = 1 << 31; // Read/write benchmarks From 60c7bb0e5201d1318a0c551a14feda9f0e776a2d Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 23 Apr 2024 14:49:20 -0500 Subject: [PATCH 22/31] Do not fill when shifting --- .../bitlib/bit-algorithms/bit_algorithm_details.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp index 59a62dc..da6a0dc 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp @@ -278,18 +278,17 @@ void write_word(src_type src, bit_iterator dst_bit_it, // Shifts the range [first, last) to the left by n, filling the empty // bits with 0 -// NOT OPTIMIZED. Will be replaced with std::shift eventually. 
-template -ForwardIt word_shift_left(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n +template +RandomAccessIt word_shift_left(RandomAccessIt first, + RandomAccessIt last, + typename RandomAccessIt::difference_type n ) { if (n <= 0) return last; if (n >= distance(first, last)) return first; - ForwardIt mid = first + n; + RandomAccessIt mid = first + n; auto ret = std::move(mid, last, first); - std::fill(ret, last, 0); + //std::fill(ret, last, 0); return ret; } From f1e4c34b94fcf47dea4cb8bf690a03aceeb13122 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 23 Apr 2024 14:49:32 -0500 Subject: [PATCH 23/31] Working improved shift_left --- include/bitlib/bit-algorithms/shift.hpp | 184 +++++++++++++++++------- 1 file changed, 131 insertions(+), 53 deletions(-) diff --git a/include/bitlib/bit-algorithms/shift.hpp b/include/bitlib/bit-algorithms/shift.hpp index dca5501..57b7744 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -56,19 +56,25 @@ bit_iterator shift_left( // Types and constants using word_type = typename bit_iterator::word_type; using size_type = typename bit_iterator::size_type; + using difference_type = typename bit_iterator::difference_type; constexpr size_type digits = binary_digits::value; // Initialization - auto d = distance(first, last); + auto d = bit::distance(first, last); const bool is_first_aligned = first.position() == 0; const bool is_last_aligned = last.position() == 0; + auto middle = first + n; // Out of range cases if (n <= 0) return last; - if (n >= d) return first; - + if (n >= d) + { + //bit::fill(first, last, bit::bit0); + return first; + } // Single word case + // Triggered if all relevant bits are in first.base() if (std::next(first.base(), is_last_aligned) == last.base()) { *first.base() = _bitblend( *first.base(), @@ -88,73 +94,145 @@ bit_iterator shift_left( ); } - // More initialization - size_type word_shifts = n / digits; - size_type 
remaining_bitshifts = n - digits*(word_shifts); - + // Triggered if all remaining bits can fit in a word + if (d - n <= digits) + { + word_type new_word = get_word(middle, d - n); + write_word(new_word, first, d - n); + first += d - n; + return first; + } // Multiple word case word_type first_value = *first.base(); word_type last_value = !is_last_aligned ? *last.base() : 0; - // Shift words to the left using std::shift - RandomAccessIt new_last_base = STD_SHIFT_LEFT(first.base(), - last.base(), - word_shifts - ); - if (!is_last_aligned) { - // Mask out-of-range bits so that we don't incorporate them - *last.base() &= (static_cast(1) << last.position()) - 1; - *new_last_base = *last.base(); - if (word_shifts > 0) { - *last.base() = 0; + // Align first + if (!is_first_aligned) + { + if (first.position() >= middle.position()) + { + *first.base() = _bitblend( + *first.base(), + (*middle.base()) << (first.position() - middle.position()), + first.position(), + digits - first.position() + ); + } + else + { + const int n1 = digits - middle.position(); + const int n2 = digits - first.position() - n1; + *first.base() = _bitblend( + *first.base(), + (*middle.base()) >> (middle.position() - first.position()), + first.position(), + n1 + ); + *first.base() = _bitblend( + *first.base(), + (*std::next(middle.base())) << (digits - n2), + first.position() + n1, + n2 + ); } + const int shifted = std::min(d - n, (digits - first.position())); + first += shifted; + middle += shifted; } - // Shift bit sequence to the lsb - if (remaining_bitshifts) { - RandomAccessIt it = first.base(); - -#ifdef BITLIB_HWY - // Align to 64 bit boundary - for (; std::next(it, is_last_aligned) != new_last_base && !is_aligned(&*it, 64); it++) { - *it = _shrd(*it, *std::next(it), remaining_bitshifts); + if (middle.base() == last.base()) + { + const int bits_left = last.position() - middle.position(); + if (bits_left > 0) + { + *first.base() = _bitblend( + *first.base(), + *middle.base() >> middle.position(), 
+ 0, + bits_left + ); + first += bits_left; } + //bit::fill(first, last, bit::bit0); + return first; + } - const hn::ScalableTag d; - for (; std::distance(it, new_last_base) >= hn::Lanes(d) + 1 + !is_last_aligned; it += hn::Lanes(d)) + // More initialization + d = bit::distance(first, last); + const size_type word_shifts = n / digits; + const size_type offset = middle.position(); + + // At this point, first is aligned + // Can we juse use std::shift? + if (offset == 0) + { + first = bit::bit_iterator( + STD_SHIFT_LEFT(first.base(), + last.base(), + word_shifts), + 0 + ); + if (!is_last_aligned) { - const auto v = hn::ShiftRightSame(hn::Load(d, &*it), remaining_bitshifts); - const auto v_plus1 = hn::ShiftLeftSame(hn::LoadU(d, &*(it+1)), digits - remaining_bitshifts); - hn::Store(v | v_plus1, d, &*it); + write_word(*last.base(), first, last.position()); + first += last.position(); } + //bit::fill(first, last, bit::bit0); + return first; + } + + // Shift bit sequence to the lsb +#ifdef BITLIB_HWY + // Align to 64 bit boundary + while (std::next(middle.base()) < last.base() && !is_aligned(&*first.base(), 64)) { + *first.base() = _shrd(*middle.base(), *std::next(middle.base()), offset); + first += digits; + middle += digits; + } + + const hn::ScalableTag d_tag; + while (std::distance(middle.base(), last.base()) >= hn::Lanes(d_tag) + 10 + !is_last_aligned) + { + const auto v = hn::ShiftRightSame(hn::LoadU(d_tag, &*middle.base()), offset); + const auto v_plus1 = hn::ShiftLeftSame(hn::LoadU(d_tag, &*(middle.base()+1)), digits - offset); + hn::Store(v | v_plus1, d_tag, &*first.base()); + first += hn::Lanes(d_tag)*digits; + middle += hn::Lanes(d_tag)*digits; + } #endif - // _shrd all words except the last - for (; std::next(it, is_last_aligned) != new_last_base; ++it) { - *it = _shrd(*it, *std::next(it), remaining_bitshifts); - } - // For the last word simply right shift - *it >>= remaining_bitshifts; + auto first_base = first.base(); + auto middle_base = middle.base(); + + 
while (std::next(middle_base) < last.base()) { + *first_base = _shrd(*middle_base, *std::next(middle_base), offset); + first_base++; + middle_base++;; } - // Blend bits of the first element - if (!is_first_aligned) { + first = bit_iterator(first_base, 0); + middle = bit_iterator(middle_base, middle.position()); + + // If middle is now penultimate word + if (std::next(middle.base()) == last.base()) + { *first.base() = _bitblend( - first_value, *first.base(), - first.position(), - digits - first.position() + *middle.base() >> offset, + 0, + digits - offset ); + first += digits - offset; + middle += digits - offset; } - // Blend bits of the last element - if (!is_last_aligned) { - *last.base() = _bitblend( - *last.base(), - last_value, - last.position(), - digits - last.position() - ); + + if (!is_last_aligned) + { + const difference_type bits_left = last.position() - middle.position(); + const word_type new_word = get_word(middle, bits_left); + write_word(new_word, first, bits_left); + first += bits_left; } - //TODO is this more or less inefficient than having a latent iterator? - bit_iterator d_last = next(first, d-n); - return d_last; + + //bit::fill(first, last, bit::bit0); + return first; } template From 2e54ccf42e265ce1d0aff5263c72861892d04ea1 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 17:09:25 -0500 Subject: [PATCH 24/31] Improved right shift. 
tests passing --- include/bitlib/bit-algorithms/shift.hpp | 150 ++++++++++++++---------- profile/.CMakeLists.txt.swp | Bin 12288 -> 0 bytes 2 files changed, 89 insertions(+), 61 deletions(-) delete mode 100644 profile/.CMakeLists.txt.swp diff --git a/include/bitlib/bit-algorithms/shift.hpp b/include/bitlib/bit-algorithms/shift.hpp index 57b7744..af87187 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -152,6 +152,10 @@ bit_iterator shift_left( ); first += bits_left; } + // https://en.cppreference.com/w/cpp/algorithm/shift + // "Elements that are in the original range but not the new range + // are left in a valid but unspecified state." + // //bit::fill(first, last, bit::bit0); return first; } @@ -162,7 +166,6 @@ bit_iterator shift_left( const size_type offset = middle.position(); // At this point, first is aligned - // Can we juse use std::shift? if (offset == 0) { first = bit::bit_iterator( @@ -176,6 +179,10 @@ bit_iterator shift_left( write_word(*last.base(), first, last.position()); first += last.position(); } + // https://en.cppreference.com/w/cpp/algorithm/shift + // "Elements that are in the original range but not the new range + // are left in a valid but unspecified state." + // //bit::fill(first, last, bit::bit0); return first; } @@ -249,7 +256,8 @@ bit_iterator shift_right( const bool is_first_aligned = first.position() == 0; const bool is_last_aligned = last.position() == 0; constexpr auto digits = binary_digits::value; - auto d = distance(first, last); + auto d = bit::distance(first, last); + bit_iterator middle = last - n; // Out of range cases if (n <= 0) return first; @@ -267,83 +275,103 @@ bit_iterator shift_right( first.position(), (is_last_aligned ? 
digits : last.position()) - first.position() ); - return bit_iterator( - first.base(), - first.position() + n - ); + return first + n; + } + + // Align last + if (last.position() != 0) + { + const size_type bits_to_align = std::min( + last.position(), + bit::distance(first, middle)); + const word_type word_to_write = get_word( + middle - bits_to_align, + bits_to_align); + write_word( + word_to_write, + last - bits_to_align, + bits_to_align); + middle -= bits_to_align; + last -= bits_to_align; + + // Nothing left to do + if (middle == first) + return first + n; } // More initialization - size_type word_shifts = n / digits; - size_type remaining_bitshifts = n - digits*(word_shifts); + const size_type word_shifts = n / digits; + const size_type offset = middle.position(); - // Multiple word case - word_type first_value = *first.base(); - word_type last_value = !is_last_aligned ? *last.base() : 0; - word_type mask = is_first_aligned ? - static_cast(-1) - : - static_cast( - (static_cast(1) << (digits - first.position())) - 1 - ) << first.position(); - *first.base() = *first.base() & mask; - // Shift words to the right - RandomAccessIt new_first_base = STD_SHIFT_RIGHT( - first.base(), - std::next( - last.base(), - !is_last_aligned), - word_shifts - ); - bit_iterator d_first(new_first_base, first.position()); // Shift bit sequence to the msb - if (remaining_bitshifts) { - auto it = is_last_aligned ? last.base() - 1 : last.base(); + if (offset == 0) { + auto new_first = bit::bit_iterator( + STD_SHIFT_RIGHT( + first.base(), + last.base(), + word_shifts), + first.position() + ); + // https://en.cppreference.com/w/cpp/algorithm/shift + // "Elements that are in the original range but not the new range + // are left in a valid but unspecified state." 
+ // + //bit::fill(first, new_first, bit::bit0); + return first + n; + } + if (bit::distance(first, middle) >= digits) + { #ifdef BITLIB_HWY // Align to 64 bit boundary const hn::ScalableTag d; - for (; it != new_first_base && !is_aligned(&*(it - hn::Lanes(d) + 1), 64); it--) { - *it = _shld(*it, *(it - 1), remaining_bitshifts); + while (std::prev(middle.base()) > first.base() && !is_aligned(&*(last.base() - hn::Lanes(d)), 64)) { + *std::prev(last.base()) = _shrd(*std::prev(middle.base()), *middle.base(), offset); + last -= digits; + middle -= digits; } - for (; std::distance(new_first_base, it) >= hn::Lanes(d); it -= hn::Lanes(d)) + while (std::distance(first.base(), middle.base()) > hn::Lanes(d) + 1) { - const auto v = hn::ShiftLeftSame( - hn::Load(d, &*(it - hn::Lanes(d) + 1)), - remaining_bitshifts); - const auto v_plus1 = hn::ShiftRightSame( - hn::LoadU(d, &*(it - hn::Lanes(d))), - digits - remaining_bitshifts); - hn::Store(v | v_plus1, d, &*(it - hn::Lanes(d) + 1)); + const auto v = hn::ShiftRightSame( + hn::LoadU(d, &*(middle.base() - hn::Lanes(d))), + offset); + const auto v_plus1 = hn::ShiftLeftSame( + hn::LoadU(d, &*(middle.base() - hn::Lanes(d) + 1)), + digits - offset); + hn::Store(v | v_plus1, d, &*(last.base() - hn::Lanes(d))); + + last -= digits * hn::Lanes(d); + middle -= digits * hn::Lanes(d); } #endif - for(; it != new_first_base; --it) { - *it = _shld(*it, *(it - 1), remaining_bitshifts); + auto last_base_prev = std::prev(last.base()); + auto middle_base_prev = std::prev(middle.base()); + + while (middle_base_prev + (first.position() <= middle.position()) > first.base()) { + *last_base_prev = _shrd(*middle_base_prev, *std::next(middle_base_prev), offset); + last_base_prev--; + middle_base_prev--; } - *it <<= remaining_bitshifts; - } - // Blend bits of the first element - if (!is_first_aligned) { - *first.base() = _bitblend( - first_value, - *first.base(), - first.position(), - digits - first.position() - ); + + last = 
bit_iterator(std::next(last_base_prev), last.position()); + middle = bit_iterator(std::next(middle_base_prev), middle.position()); } - // Blend bits of the last element - if (!is_last_aligned) { - *last.base() = _bitblend( - *last.base(), - last_value, - last.position(), - digits - last.position() - ); + + if (first.position() != middle.position()) + { + const size_type bits_to_align = bit::distance(first, middle); + const word_type word_to_write = get_word( + first, + bits_to_align); + write_word( + word_to_write, + last - bits_to_align, + bits_to_align); } - advance(d_first, remaining_bitshifts); - return d_first; + + return first + n; } // -------------------------------------------------------------------------- // diff --git a/profile/.CMakeLists.txt.swp b/profile/.CMakeLists.txt.swp deleted file mode 100644 index 9ce599819033d074fedcf4d430d31bdf9067d92a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI&&x+GP90%~JdR<-gE@OAWR7kU~u;{UW;%?ZqDQV);gM>^nX~$-hkeR@iW%b|_ z_zL_3EFkqfXAmqXfC4Ch0w{n2D1ZVefC4Ch0=HB^ z2$4w@rm5z2+*8vVQzld@n7+Q-P5b89vxXhda)z|sA}TDWTm?~nS-{QjATN~2GpTJ2 z8u?w@>QUV}ruuXmGpb({JPtD@q840M&!V)7c@T>dZj|Sx;8K%iWK>F|vPBN4EO;c6 z1!X*oW|v_(*H=$OL}6=XSdSnijN^b`@u*TEOZoN|4zEY9^V+h_VC?vA&m7;hNPSsd zJGGrI-D;!Fdi`hZs3@BC20C#}S0{yaBv)E*&^b4Qu5a1>z?+Or>hknLs9B#gD@*d_ zrn#c55-E&a6#VaDu0Qmwff+cyH}bupZ@FgAbKD6%yk5@>!0x1OxG|W>HnZiGDk?=c MObePPs}&^q1I@m?g#Z8m From 5ae65fed2420641dedd7dbacce43d29d257727d4 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 17:36:43 -0500 Subject: [PATCH 25/31] Update benchmarks --- README.md | 213 ++++++++++++++++++++++++++---------------------------- 1 file changed, 102 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 44ee88e..7d5168b 100644 --- a/README.md +++ b/README.md @@ -137,119 +137,110 @@ I used Google's [benchmark](https://github.com/google/benchmark) library for com For example, `bit::rotate (large) (ARA)` refers to our 
library's implementation of the `rotate` algorithm operating on a container of 65536 bits, where `first` and `last` are aligned but `n_first` is selected at random. ``` -2023-12-19T17:56:41-06:00 -Running ./bin/bitlib-bench -Run on (64 X 1067.77 MHz CPU s) -CPU Caches: - L1 Data 32 KiB (x32) - L1 Instruction 32 KiB (x32) - L2 Unified 1024 KiB (x32) - L3 Unified 22528 KiB (x2) -Load Average: 0.70, 0.87, 0.70 --------------------------------------------------------------------------------------- Benchmark Time CPU Iterations --------------------------------------------------------------------------------------- -bit::set (large) 1.91 ns 1.91 ns 366779196 -dynamic_bitset::set (large) 2.36 ns 2.36 ns 296975883 -bitarray::set (large) 2.20 ns 2.20 ns 318761424 -std::set (large) 2.39 ns 2.39 ns 293167404 -bit::shift_left (small) 19.8 ns 19.8 ns 35519917 -bit::shift_left (small) (UU) 30.5 ns 30.5 ns 22984689 -dynamic_bitset::shift_left (small) 13.1 ns 13.1 ns 53628854 -bitarray::shift_left (small) 38.4 ns 38.4 ns 18168867 -std::shift_left (small) 579 ns 578 ns 1209283 -bit::shift_left (large) 160869 ns 160845 ns 4353 -bit::shift_left (large) (UU) 280267 ns 280229 ns 2505 -dynamic_bitset::shift_left (large) 143487 ns 143454 ns 4877 -bitarray::shift_left (large) 835992 ns 835930 ns 837 -std::shift_left (large) 40289125 ns 40287190 ns 17 -bit::shift_right (small) 27.8 ns 27.8 ns 25146901 -bit::shift_right (small) (UU) 31.1 ns 31.1 ns 22561913 -dynamic_bitset::shift_right (small) 12.2 ns 12.2 ns 57443996 -bitarray::shift_right (small) 38.8 ns 38.8 ns 18155925 -std::shift_right (small) 504 ns 504 ns 1392311 -bit::shift_right (large) 164210 ns 164191 ns 4264 -bit::shift_right (large) (UU) 292115 ns 292087 ns 2404 -dynamic_bitset::shift_right (large) 125191 ns 125160 ns 5591 -bitarray::shift_right (large) 836455 ns 836415 ns 837 -std::shift_right (large) 36904578 ns 36906143 ns 17 -bit::reverse (small) (UU) 30.4 ns 30.4 ns 23010493 -bitarray::reverse (small) (UU) 92.9 ns 92.9 ns 
7566424 -std::reverse (small) 416 ns 416 ns 1709223 -bit::reverse (large) 302243 ns 302246 ns 2314 -bit::reverse (large) (UU) 396252 ns 396251 ns 1766 -bitarray::reverse (large) 4180555 ns 4180640 ns 168 -bitarray::reverse (large) (UU) 5565145 ns 5565237 ns 126 -std::reverse (large) 71610824 ns 71613462 ns 10 -bit::transform(UnaryOp) (small) 7.73 ns 7.73 ns 90799823 -bit::transform(UnaryOp) (small) (UU) 16.7 ns 16.7 ns 41797825 -dynamic_bitset::transform(UnaryOp) (small) 3.79 ns 3.79 ns 178099711 -bitarray::transform(UnaryOp) (small) 8.15 ns 8.15 ns 86177059 -std::transform(UnaryOp) (small) 762 ns 762 ns 920469 -bit::transform(UnaryOp) (large) 89430 ns 89427 ns 7830 -bit::transform(UnaryOp) (large) (UU) 513673 ns 513652 ns 1363 -dynamic_bitset::transform(UnaryOp) (large) 90179 ns 90174 ns 7755 -bitarray::transform(UnaryOp) (large) 182288 ns 182278 ns 3806 -std::transform(UnaryOp) (large) 49393629 ns 49392276 ns 14 -bit::transform(BinaryOp) (small) 4.79 ns 4.79 ns 146268444 -bit::transform(BinaryOp) (small) (UU) 40.1 ns 40.1 ns 17465510 -dynamic_bitset::transform(BinaryOp) (small) 4.35 ns 4.35 ns 160471539 -bitarray::transform(BinaryOp) (small) 10.5 ns 10.5 ns 66739191 -std::transform(BinaryOp) (small) 837 ns 837 ns 834684 -bit::transform(BinaryOp) (large) 184508 ns 184491 ns 3796 -bit::transform(BinaryOp) (large) (UU) 2396570 ns 2396591 ns 292 -dynamic_bitset::transform(BinaryOp) (large) 183006 ns 182980 ns 3813 -bitarray::transform(BinaryOp) (large) 131178 ns 131171 ns 5348 -std::transform(BinaryOp) (large) 195492307 ns 195488596 ns 4 -bit::rotate (small) 121 ns 121 ns 10000000 -std::rotate (small) 1725 ns 1725 ns 467233 -bit::rotate (large) 1830057 ns 1830041 ns 377 -std::rotate (large) 149375227 ns 149373295 ns 5 -bit::count (small) 6.28 ns 6.28 ns 111995013 -dynamic_bitset::count (small) 8.12 ns 8.12 ns 87716832 -bitarray::count (small) 6.11 ns 6.11 ns 114586171 -std::count (small) 233 ns 233 ns 3000468 -bit::count (large) 86768 ns 86767 ns 8067 
-dynamic_bitset::count (large) 86774 ns 86776 ns 8068 -bitarray::count (large) 228298 ns 228300 ns 3066 -std::count (large) 14717449 ns 14717517 ns 48 -bit::swap_ranges (small) 8.03 ns 8.03 ns 85409308 -bit::swap_ranges (small) (UU) 19.0 ns 19.0 ns 36799054 -std::swap_ranges (small) 753 ns 753 ns 932645 -bit::swap_ranges (large) 206087 ns 206069 ns 3400 -bit::swap_ranges (large) (UU) 1416540 ns 1416482 ns 495 -std::swap_ranges (large) 128732217 ns 128736753 ns 5 -bit::copy (small) (UU) 22.6 ns 22.6 ns 30977614 -std::copy (small) 706 ns 706 ns 991971 -bit::copy (large) (UU) 1283021 ns 1282931 ns 546 -std::copy (large) 47291412 ns 47293406 ns 15 -bit::equal (small) (UU) 13.5 ns 13.5 ns 51665518 -std::equal (small) 887 ns 887 ns 789443 -bit::equal (large) (UU) 684564 ns 684588 ns 1023 -std::equal (large) 58741336 ns 58740796 ns 12 -bit::move (small) (UU) 24.9 ns 24.9 ns 28152253 -std::move (small) 705 ns 705 ns 993177 -bit::move (large) (UU) 1486436 ns 1486307 ns 471 -std::move (large) 47268916 ns 47269412 ns 15 -bit::copy_backward (small) (UU) 35.8 ns 35.8 ns 19180871 -std::copy_backward (small) 524 ns 524 ns 1336116 -bit::copy_backward (large) (UU) 1843335 ns 1843176 ns 381 -std::copy_backward (large) 110068625 ns 110069932 ns 6 -bit::fill (small) (UU) 6.81 ns 6.81 ns 103143199 -dynamic_bitset::fill (small) 3.55 ns 3.55 ns 198214175 -bitarray::fill (small) 13.9 ns 13.9 ns 50233774 -std::fill (small) 9.57 ns 9.57 ns 73133048 -bit::fill (large) (UU) 95661 ns 95650 ns 7326 -dynamic_bitset::fill (large) 102146 ns 102146 ns 6851 -bitarray::fill (large) 72462 ns 72462 ns 9615 -std::fill (large) 72955 ns 72955 ns 9741 -bit::find (small) (UU) 3.22 ns 3.22 ns 217967844 -dynamic_bitset::find (small) 3.05 ns 3.05 ns 229824606 -bitarray::find (small) 7.15 ns 7.15 ns 94973526 -std::find (small) 100 ns 100 ns 6992893 -bit::find (large) (UU) 27810 ns 27808 ns 25202 -dynamic_bitset::find (large) 64434 ns 64437 ns 10870 -bitarray::find (large) 62305 ns 62298 ns 11220 -std::find 
(large) 6376779 ns 6376904 ns 110 +bit::set (large) 1.90 ns 1.90 ns 367974893 +dynamic_bitset::set (large) 2.37 ns 2.37 ns 296837879 +bitarray::set (large) 2.19 ns 2.19 ns 319133940 +std::set (large) 2.39 ns 2.39 ns 293135332 +bit::shift_left (small) 26.8 ns 26.8 ns 25929070 +bit::shift_left (small) (UU) 22.4 ns 22.4 ns 31233265 +dynamic_bitset::shift_left (small) 13.1 ns 13.1 ns 53627207 +bitarray::shift_left (small) 38.2 ns 38.2 ns 18339126 +std::shift_left (small) 345 ns 345 ns 2029283 +bit::shift_left (large) 371224 ns 371211 ns 1886 +bit::shift_left (large) (UU) 371536 ns 371530 ns 1880 +dynamic_bitset::shift_left (large) 638896 ns 638880 ns 1097 +bitarray::shift_left (large) 3156273 ns 3156003 ns 222 +std::shift_left (large) 105227752 ns 105223527 ns 7 +bit::shift_right (small) 26.9 ns 26.9 ns 25976563 +bit::shift_right (small) (UU) 39.3 ns 39.3 ns 17962533 +dynamic_bitset::shift_right (small) 12.2 ns 12.2 ns 57419526 +bitarray::shift_right (small) 38.1 ns 38.1 ns 18325350 +std::shift_right (small) 504 ns 504 ns 1386280 +bit::shift_right (large) 413297 ns 413269 ns 1693 +bit::shift_right (large) (UU) 413692 ns 413655 ns 1682 +dynamic_bitset::shift_right (large) 557287 ns 557305 ns 1257 +bitarray::shift_right (large) 3156463 ns 3156516 ns 222 +std::shift_right (large) 210100788 ns 210083631 ns 3 +bit::reverse (small) (UU) 43.4 ns 43.4 ns 16112098 +bitarray::reverse (small) (UU) 95.1 ns 95.1 ns 7387177 +std::reverse (small) 419 ns 419 ns 1677069 +bit::reverse (large) 1245260 ns 1245160 ns 563 +bit::reverse (large) (UU) 1800771 ns 1800680 ns 389 +bitarray::reverse (large) 16899481 ns 16898587 ns 41 +bitarray::reverse (large) (UU) 22719408 ns 22720393 ns 31 +std::reverse (large) 293563397 ns 293542850 ns 2 +bit::transform(UnaryOp) (small) 8.75 ns 8.75 ns 80079214 +bit::transform(UnaryOp) (small) (UU) 16.6 ns 16.6 ns 42254961 +dynamic_bitset::transform(UnaryOp) (small) 4.00 ns 4.00 ns 169219246 +bitarray::transform(UnaryOp) (small) 8.39 ns 8.39 ns 83877004 
+std::transform(UnaryOp) (small) 763 ns 763 ns 917975 +bit::transform(UnaryOp) (large) 373982 ns 373950 ns 1853 +bit::transform(UnaryOp) (large) (UU) 2059234 ns 2059268 ns 339 +dynamic_bitset::transform(UnaryOp) (large) 379368 ns 379368 ns 1805 +bitarray::transform(UnaryOp) (large) 739552 ns 739544 ns 881 +std::transform(UnaryOp) (large) 197977698 ns 197969224 ns 4 +bit::transform(BinaryOp) (small) 4.38 ns 4.38 ns 160002060 +bit::transform(BinaryOp) (small) (UU) 42.1 ns 42.1 ns 16549758 +dynamic_bitset::transform(BinaryOp) (small) 4.36 ns 4.36 ns 160692979 +bitarray::transform(BinaryOp) (small) 10.7 ns 10.7 ns 66178974 +std::transform(BinaryOp) (small) 855 ns 855 ns 832115 +bit::transform(BinaryOp) (large) 763642 ns 763574 ns 912 +bit::transform(BinaryOp) (large) (UU) 10966202 ns 10966406 ns 64 +dynamic_bitset::transform(BinaryOp) (large) 758617 ns 758574 ns 906 +bitarray::transform(BinaryOp) (large) 518286 ns 518267 ns 1177 +std::transform(BinaryOp) (large) 802270688 ns 802303941 ns 1 +bit::rotate (small) 131 ns 131 ns 16525922 +std::rotate (small) 1782 ns 1782 ns 417293 +bit::rotate (large) 7333284 ns 7333170 ns 96 +std::rotate (large) 514697313 ns 514718779 ns 1 +bit::count (small) 8.14 ns 8.14 ns 86522765 +dynamic_bitset::count (small) 6.29 ns 6.29 ns 108878018 +bitarray::count (small) 5.47 ns 5.47 ns 133692569 +std::count (small) 234 ns 234 ns 2997782 +bit::count (large) 365194 ns 365159 ns 1919 +dynamic_bitset::count (large) 365279 ns 365269 ns 1919 +bitarray::count (large) 917302 ns 917185 ns 764 +std::count (large) 58934071 ns 58931785 ns 12 +bit::swap_ranges (small) 9.58 ns 9.57 ns 73128377 +bit::swap_ranges (small) (UU) 19.7 ns 19.7 ns 35498474 +std::swap_ranges (small) 756 ns 756 ns 912041 +bit::swap_ranges (large) 852205 ns 852241 ns 821 +bit::swap_ranges (large) (UU) 5691899 ns 5692145 ns 123 +std::swap_ranges (large) 522198664 ns 522161939 ns 1 +bit::copy (small) (UU) 25.0 ns 25.0 ns 28200772 +std::copy (small) 707 ns 707 ns 990757 +bit::copy (large) 
(UU) 5952278 ns 5951729 ns 116 +std::copy (large) 189551338 ns 189554366 ns 4 +bit::equal (small) (UU) 13.1 ns 13.1 ns 53616228 +std::equal (small) 886 ns 886 ns 790035 +bit::equal (large) (UU) 1960399 ns 1960375 ns 357 +std::equal (large) 234389098 ns 234398907 ns 3 +bit::move (small) (UU) 23.5 ns 23.5 ns 29764745 +std::move (small) 706 ns 706 ns 992054 +bit::move (large) (UU) 5135837 ns 5135619 ns 136 +std::move (large) 188961979 ns 188953500 ns 4 +bit::copy_backward (small) (UU) 39.0 ns 39.0 ns 17977387 +std::copy_backward (small) 527 ns 527 ns 1313265 +bit::copy_backward (large) (UU) 9163333 ns 9163038 ns 76 +std::copy_backward (large) 444362971 ns 444350668 ns 2 +bit::fill (small) (UU) 6.48 ns 6.48 ns 108934237 +dynamic_bitset::fill (small) 4.79 ns 4.79 ns 146205764 +bitarray::fill (small) 14.5 ns 14.5 ns 48030428 +std::fill (small) 9.15 ns 9.15 ns 76612702 +bit::fill (large) (UU) 440400 ns 440396 ns 1590 +dynamic_bitset::fill (large) 429375 ns 429359 ns 1631 +bitarray::fill (large) 369732 ns 369736 ns 1964 +std::fill (large) 356517 ns 356488 ns 1894 +bit::find (small) (UU) 3.10 ns 3.10 ns 228714994 +dynamic_bitset::find (small) 3.05 ns 3.05 ns 229830138 +bitarray::find (small) 7.38 ns 7.38 ns 99039746 +std::find (small) 110 ns 110 ns 6311725 +bit::find (large) (UU) 182002 ns 182006 ns 3850 +dynamic_bitset::find (large) 259896 ns 259908 ns 2696 +bitarray::find (large) 252434 ns 252445 ns 2774 +std::find (large) 28570723 ns 28567762 ns 25 ``` From aa204782c3862d7d96a821b06ab2fd0515e4da11 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 17:52:47 -0500 Subject: [PATCH 26/31] Fix word_shift_right bug --- .../bit-algorithms/bit_algorithm_details.hpp | 51 +++---------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp index da6a0dc..c49e022 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp +++ 
b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp @@ -288,7 +288,6 @@ RandomAccessIt word_shift_left(RandomAccessIt first, if (n >= distance(first, last)) return first; RandomAccessIt mid = first + n; auto ret = std::move(mid, last, first); - //std::fill(ret, last, 0); return ret; } @@ -296,51 +295,17 @@ RandomAccessIt word_shift_left(RandomAccessIt first, // Shifts the range [first, right) to the left by n, filling the empty // bits with 0 // NOT OPTIMIZED. Will be replaced with std::shift eventually. -template -ForwardIt word_shift_right_dispatch(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n, - std::forward_iterator_tag -) { - auto d = distance(first, last); - if (n <= 0) return first; - if (n >= d) return last; - ForwardIt it = first; - std::advance(it, d-n); - std::rotate(first, it, last); - it = first; - std::advance(it, n); - std::fill(first, it, 0); - return std::next(first, n); -} - -template -ForwardIt word_shift_right_dispatch(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n, - std::random_access_iterator_tag -) { +template +RandomAccessIt word_shift_right(RandomAccessIt first, + RandomAccessIt last, + typename RandomAccessIt::difference_type n +) +{ auto d = distance(first, last); if (n <= 0) return first; if (n >= d) return last; - ForwardIt it = first; - std::advance(it, d-n); - auto ret = std::copy_backward(first, it, last); - std::fill(first, ret, 0); - return ret; -} - -template -ForwardIt word_shift_right(ForwardIt first, - ForwardIt last, - typename ForwardIt::difference_type n -) -{ - return word_shift_right_dispatch( - first, - last, - n, - typename std::iterator_traits::iterator_category()); + std::move_backward(first, last-n, last); + return std::next(first, n); } // returns a word consisting of all one bits From 2e47baa80b0d97d3ea7fe12cd9706670e3b3ffb7 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 18:22:44 -0500 Subject: [PATCH 27/31] Fix repeat test --- 
test/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8c92d54..e84ae52 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -25,7 +25,9 @@ endif() include_directories(${googletest_SOURCE_DIR}/googletest/include/gtest src/utils) target_link_libraries(bitlib-tests PUBLIC GTest::gtest GTest::gtest_main -pthread -lgcov --coverage) -set(BITLIB_GTEST_REPEAT 1) +if (NOT BITLIB_GTEST_REPEAT) + set(BITLIB_GTEST_REPEAT 1) +endif() enable_testing() gtest_discover_tests( From 69dc3e9889261761289e58e27016e34d37884be0 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 18:53:14 -0500 Subject: [PATCH 28/31] Minor shift changes --- benchmark/src/shift_bench.hpp | 16 ++++++++-------- include/bitlib/bit-algorithms/shift.hpp | 17 ++++++++++------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/benchmark/src/shift_bench.hpp b/benchmark/src/shift_bench.hpp index 51b91bd..8f38383 100644 --- a/benchmark/src/shift_bench.hpp +++ b/benchmark/src/shift_bench.hpp @@ -16,7 +16,7 @@ auto BM_BitShiftLeft = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - auto n = bit::distance(first, last) / 2 - 1; + auto n = total_bits / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_left(first, last, n)); benchmark::ClobberMemory(); @@ -34,7 +34,7 @@ auto BM_BitShiftLeft_UU = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); bit::bit_iterator first = bit::bit_iterator(bitcont.begin()) + 1; bit::bit_iterator last = bit::bit_iterator(bitcont.end()) - 1; - auto n = bit::distance(first, last) / 2 + 6; + auto n = total_bits / 2 + 3; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_left(first, last, n)); benchmark::ClobberMemory(); @@ -75,7 +75,7 
@@ auto BM_BoolShiftLeft = [](benchmark::State& state, auto input) { container_type cont = make_random_container(container_size); auto first = cont.begin(); auto last = cont.end(); - auto n = std::distance(first, last) / 2 + 6; + auto n = std::distance(first, last) / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::word_shift_left(first, last, n)); benchmark::ClobberMemory(); @@ -91,7 +91,7 @@ auto BM_BitShiftRight = [](benchmark::State& state, auto input) { container_type bitcont = make_random_container(container_size); auto first = bit::bit_iterator(std::begin(bitcont)); auto last = bit::bit_iterator(std::end(bitcont)); - auto n = bit::distance(first, last) / 2 - 1; + auto n = total_bits / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_right(first, last, n)); benchmark::ClobberMemory(); @@ -105,9 +105,9 @@ auto BM_BitShiftRight_UU = [](benchmark::State& state, auto input) { auto digits = bit::binary_digits::value; auto container_size = ceil(float(total_bits) / digits); container_type bitcont = make_random_container(container_size); - auto first = bit::bit_iterator(std::begin(bitcont)) + 2; - auto last = bit::bit_iterator(std::end(bitcont)) - 3; - auto n = bit::distance(first, last) / 2 + 6; + auto first = bit::bit_iterator(std::begin(bitcont)) + 1; + auto last = bit::bit_iterator(std::end(bitcont)) - 1; + auto n = total_bits / 2 + 3; for (auto _ : state) { benchmark::DoNotOptimize(bit::shift_right(first, last, n)); benchmark::ClobberMemory(); @@ -147,7 +147,7 @@ auto BM_BoolShiftRight = [](benchmark::State& state, auto input) { container_type cont = make_random_container(container_size); auto first = cont.begin(); auto last = cont.end(); - auto n = std::distance(first, last) / 2 + 6; + auto n = std::distance(first, last) / 2 - 1; for (auto _ : state) { benchmark::DoNotOptimize(bit::word_shift_right(first, last, n)); benchmark::ClobberMemory(); diff --git a/include/bitlib/bit-algorithms/shift.hpp 
b/include/bitlib/bit-algorithms/shift.hpp index af87187..081188f 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -88,10 +88,7 @@ bit_iterator shift_left( first.position(), (is_last_aligned ? digits : last.position()) - first.position() ); - return bit_iterator( - first.base(), - first.position() + d - n - ); + return first + d - n; } // Triggered if all remaining bits can fit in a word @@ -99,8 +96,7 @@ bit_iterator shift_left( { word_type new_word = get_word(middle, d - n); write_word(new_word, first, d - n); - first += d - n; - return first; + return first + d - n; } // Multiple word case word_type first_value = *first.base(); @@ -349,7 +345,14 @@ bit_iterator shift_right( auto last_base_prev = std::prev(last.base()); auto middle_base_prev = std::prev(middle.base()); - while (middle_base_prev + (first.position() <= middle.position()) > first.base()) { + while (middle_base_prev > first.base()) { + *last_base_prev = _shrd(*middle_base_prev, *std::next(middle_base_prev), offset); + last_base_prev--; + middle_base_prev--; + } + + if (first.position() <= middle.position()) + { *last_base_prev = _shrd(*middle_base_prev, *std::next(middle_base_prev), offset); last_base_prev--; middle_base_prev--; From 986ab7646021407ce1c1f8a867cac752a700edfd Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 18:54:59 -0500 Subject: [PATCH 29/31] Rename header --- profile/src/main.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/profile/src/main.cpp b/profile/src/main.cpp index 90a8e45..9cd05bd 100644 --- a/profile/src/main.cpp +++ b/profile/src/main.cpp @@ -1,10 +1,7 @@ -// =============================== TEST ROOT ================================ // +// ================================ PROFILE ================================= // // Project: The Experimental Bit Algorithms Library -// Name: test_root.cc -// Description: Brings in all of the test headers into an object to be linked 
-// with the test main -// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] +// Description: Used for profiling specific functions/algorithms +// Creator: Bryce Kille // License: BSD 3-Clause License // ========================================================================== // From 5b22949553312a35f45c7051a21a963f821f8778 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 19:05:15 -0500 Subject: [PATCH 30/31] Update contributors for bit-algorithms and bit-containers --- benchmark/src/benchmark_main.cc | 5 ++--- include/bitlib/bit-algorithms/bit_algorithm.hpp | 1 - include/bitlib/bit-algorithms/bit_algorithm_details.hpp | 2 -- include/bitlib/bit-algorithms/copy.hpp | 1 - include/bitlib/bit-algorithms/copy_backward.hpp | 1 - include/bitlib/bit-algorithms/debug_utils.hpp | 1 - include/bitlib/bit-algorithms/equal.hpp | 1 - include/bitlib/bit-algorithms/fill.hpp | 5 ++--- include/bitlib/bit-algorithms/move.hpp | 1 - include/bitlib/bit-algorithms/reverse.hpp | 1 - include/bitlib/bit-algorithms/rotate.hpp | 1 - include/bitlib/bit-algorithms/shift.hpp | 1 - include/bitlib/bit-algorithms/swap_ranges.hpp | 1 - include/bitlib/bit-algorithms/transform.hpp | 2 -- include/bitlib/bit-algorithms/type_traits.hpp | 1 - include/bitlib/bit-containers/bit-containers.hpp | 2 -- include/bitlib/bit-containers/bit_vector.hpp | 3 +-- test/src/fixtures.hpp | 2 +- test/src/test-rotate.cpp | 3 +-- test/src/vector_test.cpp | 2 +- 20 files changed, 8 insertions(+), 29 deletions(-) diff --git a/benchmark/src/benchmark_main.cc b/benchmark/src/benchmark_main.cc index 0f584db..a8cad2d 100644 --- a/benchmark/src/benchmark_main.cc +++ b/benchmark/src/benchmark_main.cc @@ -1,10 +1,9 @@ // =============================== TEST ROOT ================================ // // Project: The Experimental Bit Algorithms Library -// Name: test_root.cc +// Name: benchmark_main.cc // Description: Brings in all of the test headers into an object to be linked // with the test main 
-// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/bit_algorithm.hpp b/include/bitlib/bit-algorithms/bit_algorithm.hpp index 9e51c5c..51cc661 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm.hpp @@ -2,7 +2,6 @@ // Project: The C++ Bit Library // Name: bit_algorithm.hpp // Description: Optimized versions of algorithms for bit manipulation -// Creator: Vincent Reverdy // Contributor(s): Vincent Reverdy [2015-2017] // Maghav Kumar [2016-2017] // Bryce Kille [2019] diff --git a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp index c49e022..984c499 100644 --- a/include/bitlib/bit-algorithms/bit_algorithm_details.hpp +++ b/include/bitlib/bit-algorithms/bit_algorithm_details.hpp @@ -2,9 +2,7 @@ // Project: The Experimental Bit Algorithms Library // Name: bit_algorithm_details.hpp // Description: A set of utilities to assist in writing algorithms -// Creator: Vincent Reverdy // Contributor(s): Vincent Reverdy [2019] -// Collin Gress [2019] // Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/copy.hpp b/include/bitlib/bit-algorithms/copy.hpp index 7bdea6f..2d50751 100644 --- a/include/bitlib/bit-algorithms/copy.hpp +++ b/include/bitlib/bit-algorithms/copy.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: copy.hpp // Description: Implementation of copy, copy_if, copy_n and copy_backward -// Creator: Vincent Reverdy // Contributor: Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git 
a/include/bitlib/bit-algorithms/copy_backward.hpp b/include/bitlib/bit-algorithms/copy_backward.hpp index 971fb91..55bd6f6 100644 --- a/include/bitlib/bit-algorithms/copy_backward.hpp +++ b/include/bitlib/bit-algorithms/copy_backward.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: copy_backward.hpp // Description: bit_iterator overloads for std::copy_backward -// Creator: Vincent Reverdy // Contributor(s): // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/debug_utils.hpp b/include/bitlib/bit-algorithms/debug_utils.hpp index a621bc3..74e590d 100644 --- a/include/bitlib/bit-algorithms/debug_utils.hpp +++ b/include/bitlib/bit-algorithms/debug_utils.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: debug_utils.hpp // Description: Utilities useful for debugging -// Creator: Vincent Reverdy // Contributor: Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/equal.hpp b/include/bitlib/bit-algorithms/equal.hpp index b4c5e47..e3fb144 100644 --- a/include/bitlib/bit-algorithms/equal.hpp +++ b/include/bitlib/bit-algorithms/equal.hpp @@ -1,7 +1,6 @@ // ================================= EQUAL =================================== // // Project: The Experimental Bit Algorithms Library // Name: equal.hpp -// Creator: Vincent Reverdy // Contributor: Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/fill.hpp b/include/bitlib/bit-algorithms/fill.hpp index 70dcd9d..08f9c61 100644 --- a/include/bitlib/bit-algorithms/fill.hpp +++ b/include/bitlib/bit-algorithms/fill.hpp @@ -2,9 +2,8 @@ // Project: The Experimental Bit Algorithms Library // Name: fill.hpp // Description: 
bit_iterator overloads for std::fill -// Creator: Vincent Reverdy -// Contributor(s): Vincent Reverdy [2019] -// Bryce Kille [2019] +// Contributor(s): Bryce Kille +// Vincent Reverdy [2019] // License: BSD 3-Clause License // ========================================================================== // #ifndef _FILL_HPP_INCLUDED diff --git a/include/bitlib/bit-algorithms/move.hpp b/include/bitlib/bit-algorithms/move.hpp index def87ef..401cbfd 100644 --- a/include/bitlib/bit-algorithms/move.hpp +++ b/include/bitlib/bit-algorithms/move.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: move.hpp // Description: bit_iterator overloads for std::move -// Creator: Vincent Reverdy // Contributor(s): // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/reverse.hpp b/include/bitlib/bit-algorithms/reverse.hpp index 3cc2173..37dacb7 100644 --- a/include/bitlib/bit-algorithms/reverse.hpp +++ b/include/bitlib/bit-algorithms/reverse.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: copy.hpp // Description: Implementation of reverse -// Creator: Vincent Reverdy // Contributor: Vincent Reverdy [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/rotate.hpp b/include/bitlib/bit-algorithms/rotate.hpp index 5466f7d..f9509ba 100644 --- a/include/bitlib/bit-algorithms/rotate.hpp +++ b/include/bitlib/bit-algorithms/rotate.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: rotate.hpp // Description: bit_iterator overloads for std::rotate -// Creator: Vincent Reverdy // Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/shift.hpp 
b/include/bitlib/bit-algorithms/shift.hpp index 081188f..e598e0c 100644 --- a/include/bitlib/bit-algorithms/shift.hpp +++ b/include/bitlib/bit-algorithms/shift.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: shift.hpp // Description: Implementation of shift_left and shift_right -// Creator: Vincent Reverdy // Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/swap_ranges.hpp b/include/bitlib/bit-algorithms/swap_ranges.hpp index 799ea47..67cd5a0 100644 --- a/include/bitlib/bit-algorithms/swap_ranges.hpp +++ b/include/bitlib/bit-algorithms/swap_ranges.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: swap_ranges.hpp // Description: bit_iterator overloads for std::swap_ranges -// Creator: Vincent Reverdy // Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-algorithms/transform.hpp b/include/bitlib/bit-algorithms/transform.hpp index 4a71cf8..8857788 100644 --- a/include/bitlib/bit-algorithms/transform.hpp +++ b/include/bitlib/bit-algorithms/transform.hpp @@ -2,8 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: transform.hpp // Description: bit_iterator overloads for std::transform -// Creator: Vincent Reverdy -// Contributor(s): // License: BSD 3-Clause License // ========================================================================== // #ifndef _TRANSFORM_HPP_INCLUDED diff --git a/include/bitlib/bit-algorithms/type_traits.hpp b/include/bitlib/bit-algorithms/type_traits.hpp index 2ddbe07..0cd1fb2 100644 --- a/include/bitlib/bit-algorithms/type_traits.hpp +++ b/include/bitlib/bit-algorithms/type_traits.hpp @@ -2,7 +2,6 @@ // Project: The Experimental Bit Algorithms Library // Name: type_traits.hpp // Description: Type 
traits for bits -// Creator: Vincent Reverdy // Contributor(s): Vincent Reverdy [2019] // License: BSD 3-Clause License // ========================================================================== // diff --git a/include/bitlib/bit-containers/bit-containers.hpp b/include/bitlib/bit-containers/bit-containers.hpp index d2fbc37..bbbd540 100644 --- a/include/bitlib/bit-containers/bit-containers.hpp +++ b/include/bitlib/bit-containers/bit-containers.hpp @@ -2,8 +2,6 @@ // Project: The Bit Algorithms Library // Name: bit-containers.hpp // Description: Brings in all of the container headers together -// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] // License: BSD 3-Clause License // ========================================================================== // #ifndef _BIT_CONTAINERS_HPP_INCLUDED diff --git a/include/bitlib/bit-containers/bit_vector.hpp b/include/bitlib/bit-containers/bit_vector.hpp index f68d076..f971c49 100644 --- a/include/bitlib/bit-containers/bit_vector.hpp +++ b/include/bitlib/bit-containers/bit_vector.hpp @@ -2,8 +2,7 @@ // Project: The Experimental Bit Algorithms Library // \file bit_vector.hpp // Description: Implementation of bit_vector -// Creator: Vincent Reverdy -// Contributor: Bryce Kille [2019] +// Contributor: Bryce Kille // License: BSD 3-Clause License // ========================================================================== // #ifndef _BIT_VECTOR_HPP_INCLUDED diff --git a/test/src/fixtures.hpp b/test/src/fixtures.hpp index 1c9e00f..b8cd594 100644 --- a/test/src/fixtures.hpp +++ b/test/src/fixtures.hpp @@ -1,7 +1,7 @@ // =============================== FIXTURES ================================= // // Project: The Experimental Bit Algorithms Library // Description: Fixtures for testing -// Creator: Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // #ifndef _FIXTURES_HPP_INCLUDED diff --git 
a/test/src/test-rotate.cpp b/test/src/test-rotate.cpp index fdc9373..8ae3a55 100644 --- a/test/src/test-rotate.cpp +++ b/test/src/test-rotate.cpp @@ -2,8 +2,7 @@ // Project: The Experimental Bit Algorithms Library // Name: rotate.hpp // Description: Tests for rotate algorithms -// Creator: Vincent Reverdy -// Contributor(s): Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // diff --git a/test/src/vector_test.cpp b/test/src/vector_test.cpp index c7dff71..666b031 100644 --- a/test/src/vector_test.cpp +++ b/test/src/vector_test.cpp @@ -1,7 +1,7 @@ // =============================== FIXTURES ================================= // // Project: The Experimental Bit Algorithms Library // Description: Fixtures for testing -// Creator: Bryce Kille [2019] +// Contributor(s): Bryce Kille // License: BSD 3-Clause License // ========================================================================== // From 007d7721db43025ea00868bd5098acaf4c6f8567 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 5 Jun 2024 19:07:13 -0500 Subject: [PATCH 31/31] Update version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf82cf3..4668a90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.14) # set the project name -project(Bit-Vector VERSION 0.1.1) +project(Bit-Vector VERSION 0.3.0) # set output directory of builds #set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)