diff --git a/extra/neon/iszero/README.md b/extra/neon/iszero/README.md index 3124489c..159e7a29 100644 --- a/extra/neon/iszero/README.md +++ b/extra/neon/iszero/README.md @@ -9,115 +9,39 @@ cmake --build build Potential result: ``` - sudo ./build/benchmark +sudo ./build/benchmark +Password: loaded db: a15 (Apple A15) # check: 58823 58823 58823 58823 Trial 1 -veq_non_zero_max : 4.94 GB/s 308.5 Ma/s 3.24 ns/d 3.52 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.94 GB/s 308.5 Ma/s 3.24 ns/d 3.53 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.93 GB/s 308.0 Ma/s 3.25 ns/d 3.52 GHz 11.44 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.3 Ma/s 4.29 ns/d 3.57 GHz 15.29 c/d 10.24 i/d 0.96 c/b 0.64 i/b 0.67 i/c +veq_non_zero_max : 4.92 GB/s 307.7 Ma/s 3.25 ns/d 3.52 GHz 11.45 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c +veq_non_zero_mov : 4.60 GB/s 287.7 Ma/s 3.48 ns/d 3.52 GHz 12.24 c/d 11.24 i/d 0.77 c/b 0.70 i/b 0.92 i/c +veq_non_zero_narrow : 4.90 GB/s 306.1 Ma/s 3.27 ns/d 3.54 GHz 11.56 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c +veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c Trial 2 -veq_non_zero_max : 4.92 GB/s 307.7 Ma/s 3.25 ns/d 3.53 GHz 11.47 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.91 GB/s 307.0 Ma/s 3.26 ns/d 3.52 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.91 GB/s 306.8 Ma/s 3.26 ns/d 3.52 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c +veq_non_zero_max : 4.89 GB/s 305.6 Ma/s 3.27 ns/d 3.52 GHz 11.53 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c +veq_non_zero_mov : 6.48 GB/s 405.2 Ma/s 2.47 ns/d 3.23 GHz 7.98 c/d 11.27 i/d 0.50 c/b 0.70 i/b 1.41 i/c +veq_non_zero_narrow : 4.88 GB/s 305.2 Ma/s 3.28 ns/d 3.53 GHz 11.55 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c Trial 3 
-veq_non_zero_max : 4.92 GB/s 307.7 Ma/s 3.25 ns/d 3.54 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.75 GB/s 297.0 Ma/s 3.37 ns/d 3.46 GHz 11.63 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.97 i/c -veq_non_zero_narrow : 4.90 GB/s 306.4 Ma/s 3.26 ns/d 3.54 GHz 11.54 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 4 -veq_non_zero_max : 4.88 GB/s 305.0 Ma/s 3.28 ns/d 3.52 GHz 11.55 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_mov : 4.90 GB/s 306.4 Ma/s 3.26 ns/d 3.52 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.89 GB/s 305.5 Ma/s 3.27 ns/d 3.52 GHz 11.53 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 5 -veq_non_zero_max : 4.92 GB/s 307.2 Ma/s 3.25 ns/d 3.54 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.93 GB/s 308.4 Ma/s 3.24 ns/d 3.52 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.83 GB/s 301.7 Ma/s 3.32 ns/d 3.52 GHz 11.68 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.96 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 6 -veq_non_zero_max : 4.91 GB/s 306.9 Ma/s 3.26 ns/d 3.52 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.70 GB/s 293.7 Ma/s 3.40 ns/d 3.52 GHz 12.00 c/d 11.24 i/d 0.75 c/b 0.70 i/b 0.94 i/c -veq_non_zero_narrow : 4.83 GB/s 301.9 Ma/s 3.31 ns/d 3.52 GHz 11.67 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.96 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 7 -veq_non_zero_max : 4.89 GB/s 305.9 Ma/s 3.27 ns/d 3.53 GHz 11.54 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_mov : 4.92 GB/s 307.4 Ma/s 3.25 ns/d 3.52 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c 
-veq_non_zero_narrow : 4.95 GB/s 309.3 Ma/s 3.23 ns/d 3.52 GHz 11.39 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.99 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 8 -veq_non_zero_max : 4.87 GB/s 304.3 Ma/s 3.29 ns/d 3.53 GHz 11.60 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_mov : 4.93 GB/s 308.1 Ma/s 3.25 ns/d 3.54 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.78 GB/s 298.6 Ma/s 3.35 ns/d 3.52 GHz 11.80 c/d 11.24 i/d 0.74 c/b 0.70 i/b 0.95 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 9 -veq_non_zero_max : 4.88 GB/s 305.3 Ma/s 3.28 ns/d 3.52 GHz 11.54 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_mov : 4.92 GB/s 307.4 Ma/s 3.25 ns/d 3.52 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.75 GB/s 296.9 Ma/s 3.37 ns/d 3.52 GHz 11.87 c/d 11.24 i/d 0.74 c/b 0.70 i/b 0.95 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 10 -veq_non_zero_max : 4.92 GB/s 307.8 Ma/s 3.25 ns/d 3.53 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.96 GB/s 309.8 Ma/s 3.23 ns/d 3.52 GHz 11.38 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.99 i/c -veq_non_zero_narrow : 4.66 GB/s 291.0 Ma/s 3.44 ns/d 3.52 GHz 12.10 c/d 11.24 i/d 0.76 c/b 0.70 i/b 0.93 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -☁ iszero [master] ⚡ clang-format -i benchmarks/benchmark.cpp -☁ iszero [master] ⚡ cmake --build build -[ 50%] Building CXX object CMakeFiles/benchmark.dir/benchmarks/benchmark.cpp.o -[100%] Linking CXX executable benchmark -[100%] Built target benchmark -☁ iszero [master] ⚡ sudo ./build/benchmark -loaded db: a15 (Apple A15) -# check: 58823 58823 58823 58823 +veq_non_zero_max : 5.55 GB/s 347.1 Ma/s 2.88 ns/d 3.38 GHz 9.73 c/d 11.24 i/d 0.61 c/b 0.70 
i/b 1.15 i/c +veq_non_zero_mov : 4.88 GB/s 304.8 Ma/s 3.28 ns/d 3.53 GHz 11.58 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c +veq_non_zero_narrow : 7.71 GB/s 481.7 Ma/s 2.08 ns/d 3.45 GHz 7.15 c/d 11.24 i/d 0.45 c/b 0.70 i/b 1.57 i/c +veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.56 GHz 15.25 c/d 10.24 i/d 0.95 c/b 0.64 i/b 0.67 i/c +branchy Trial 1 -veq_non_zero_max : 4.86 GB/s 303.9 Ma/s 3.29 ns/d 3.52 GHz 11.59 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_mov : 4.79 GB/s 299.4 Ma/s 3.34 ns/d 3.52 GHz 11.76 c/d 11.24 i/d 0.74 c/b 0.70 i/b 0.96 i/c -veq_non_zero_narrow : 4.87 GB/s 304.3 Ma/s 3.29 ns/d 3.52 GHz 11.58 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c +veq_non_zero_max : 23.53 GB/s 1470.6 Ma/s 0.68 ns/d 3.59 GHz 2.44 c/d 17.24 i/d 0.15 c/b 1.08 i/b 7.06 i/c +veq_non_zero_mov : 22.32 GB/s 1395.0 Ma/s 0.72 ns/d 3.60 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c +veq_non_zero_narrow : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.59 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c +veq_non_zero_float : 26.30 GB/s 1643.5 Ma/s 0.61 ns/d 3.61 GHz 2.20 c/d 15.11 i/d 0.14 c/b 0.94 i/b 6.88 i/c Trial 2 -veq_non_zero_max : 7.39 GB/s 462.0 Ma/s 2.16 ns/d 2.83 GHz 6.12 c/d 11.24 i/d 0.38 c/b 0.70 i/b 1.84 i/c -veq_non_zero_mov : 4.91 GB/s 307.2 Ma/s 3.26 ns/d 3.54 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.94 GB/s 308.9 Ma/s 3.24 ns/d 3.54 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.10 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c +veq_non_zero_max : 23.53 GB/s 1470.6 Ma/s 0.68 ns/d 3.60 GHz 2.45 c/d 17.24 i/d 0.15 c/b 1.08 i/b 7.03 i/c +veq_non_zero_mov : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.60 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c +veq_non_zero_narrow : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.60 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c 
+veq_non_zero_float : 26.30 GB/s 1643.5 Ma/s 0.61 ns/d 3.62 GHz 2.20 c/d 15.11 i/d 0.14 c/b 0.94 i/b 6.87 i/c Trial 3 -veq_non_zero_max : 4.94 GB/s 308.7 Ma/s 3.24 ns/d 3.52 GHz 11.41 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.94 GB/s 308.6 Ma/s 3.24 ns/d 3.52 GHz 11.42 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.92 GB/s 307.5 Ma/s 3.25 ns/d 3.53 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.3 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 4 -veq_non_zero_max : 4.91 GB/s 306.8 Ma/s 3.26 ns/d 3.52 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.86 GB/s 303.7 Ma/s 3.29 ns/d 3.54 GHz 11.64 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.97 i/c -veq_non_zero_narrow : 4.93 GB/s 308.1 Ma/s 3.25 ns/d 3.54 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.66 GHz 15.70 c/d 10.24 i/d 0.98 c/b 0.64 i/b 0.65 i/c -Trial 5 -veq_non_zero_max : 4.94 GB/s 308.7 Ma/s 3.24 ns/d 3.52 GHz 11.42 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.88 GB/s 305.2 Ma/s 3.28 ns/d 3.53 GHz 11.58 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_narrow : 4.91 GB/s 306.9 Ma/s 3.26 ns/d 3.53 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 6 -veq_non_zero_max : 4.91 GB/s 307.0 Ma/s 3.26 ns/d 3.53 GHz 11.50 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.90 GB/s 306.5 Ma/s 3.26 ns/d 3.53 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.90 GB/s 306.1 Ma/s 3.27 ns/d 3.52 GHz 11.52 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 7 -veq_non_zero_max : 4.70 GB/s 293.6 Ma/s 3.41 ns/d 3.52 GHz 12.01 c/d 11.24 i/d 0.75 c/b 0.70 i/b 0.94 i/c 
-veq_non_zero_mov : 4.89 GB/s 305.8 Ma/s 3.27 ns/d 3.53 GHz 11.53 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_narrow : 4.92 GB/s 307.5 Ma/s 3.25 ns/d 3.53 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 8 -veq_non_zero_max : 4.89 GB/s 305.9 Ma/s 3.27 ns/d 3.52 GHz 11.52 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.93 GB/s 308.4 Ma/s 3.24 ns/d 3.53 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.91 GB/s 307.0 Ma/s 3.26 ns/d 3.53 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 9 -veq_non_zero_max : 4.79 GB/s 299.6 Ma/s 3.34 ns/d 3.52 GHz 11.76 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.96 i/c -veq_non_zero_mov : 4.86 GB/s 303.9 Ma/s 3.29 ns/d 3.52 GHz 11.59 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c -veq_non_zero_narrow : 4.94 GB/s 309.0 Ma/s 3.24 ns/d 3.23 GHz 10.46 c/d 11.24 i/d 0.65 c/b 0.70 i/b 1.07 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c -Trial 10 -veq_non_zero_max : 4.89 GB/s 305.8 Ma/s 3.27 ns/d 3.52 GHz 11.52 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_mov : 4.94 GB/s 308.4 Ma/s 3.24 ns/d 3.54 GHz 11.47 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_narrow : 4.91 GB/s 306.8 Ma/s 3.26 ns/d 3.52 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c -veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c +veq_non_zero_max : 23.55 GB/s 1472.1 Ma/s 0.68 ns/d 3.60 GHz 2.45 c/d 17.24 i/d 0.15 c/b 1.08 i/b 7.05 i/c +veq_non_zero_mov : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.59 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.28 i/c +veq_non_zero_narrow : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.59 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c +veq_non_zero_float 
: 26.27 GB/s 1641.6 Ma/s 0.61 ns/d 3.61 GHz 2.20 c/d 15.11 i/d 0.14 c/b 0.94 i/b 6.87 i/c
 ```
diff --git a/extra/neon/iszero/benchmarks/benchmark.cpp b/extra/neon/iszero/benchmarks/benchmark.cpp
index 4be71dd4..3ce5a0bb 100644
--- a/extra/neon/iszero/benchmarks/benchmark.cpp
+++ b/extra/neon/iszero/benchmarks/benchmark.cpp
@@ -39,6 +39,24 @@ template <typename F> int scan(uint8_t *input, size_t length, F f) {
   return result;
 }
 
+// Walks the input in 16-byte steps, counting every block visited. Whenever f
+// returns a truthy value for the block at offset i, the block at offset i + 1
+// is loaded as well and f's result for it is added to the count, so the work
+// done per step depends on the data. The i + 16 + 1 <= length bound keeps the
+// 16-byte load at input + i + 1 inside the buffer.
+template <typename F> int branchyscan(uint8_t *input, size_t length, F f) {
+  int result = 0;
+  for (size_t i = 0; i + 16 + 1 <= length; i += 16) {
+    uint8x16_t v = vld1q_u8(input + i);
+    result++;
+    if (f(v)) {
+      v = vld1q_u8(input + i + 1);
+      result += f(v);
+    }
+  }
+  return result;
+}
+
 void pretty_print(size_t volume, size_t bytes, std::string name,
                   event_aggregate agg) {
   printf("%-40s : ", name.c_str());
@@ -71,7 +89,7 @@ int main(int argc, char **argv) {
          scan(data.data(), data.size(), veq_non_zero_mov),
          scan(data.data(), data.size(), veq_non_zero_narrow),
          scan(data.data(), data.size(), veq_non_zero_float));
-  for (size_t trial = 0; trial < 10; trial++) {
+  for (size_t trial = 0; trial < 3; trial++) {
     printf("Trial %zu\n", trial + 1);
 
     pretty_print(count, volume, "veq_non_zero_max", bench([&data, &counter]() {
@@ -92,4 +110,29 @@ int main(int argc, char **argv) {
                              scan(data.data(), data.size(), veq_non_zero_float);
                  }));
   }
+  printf("branchy\n");
+
+  // Second pass: the same four kernels, driven through branchyscan().
+  for (size_t trial = 0; trial < 3; trial++) {
+    printf("Trial %zu\n", trial + 1);
+
+    pretty_print(count, volume, "veq_non_zero_max", bench([&data, &counter]() {
+      counter = counter + branchyscan(data.data(), data.size(),
+                                      veq_non_zero_max);
+    }));
+    pretty_print(count, volume, "veq_non_zero_mov", bench([&data, &counter]() {
+      counter = counter + branchyscan(data.data(), data.size(),
+                                      veq_non_zero_mov);
+    }));
+    pretty_print(
+        count, volume, "veq_non_zero_narrow", bench([&data, &counter]() {
+          counter = counter + branchyscan(data.data(), data.size(),
+                                          veq_non_zero_narrow);
+        }));
+    pretty_print(count, volume, "veq_non_zero_float",
+                 bench([&data, &counter]() {
+                   counter = counter + branchyscan(data.data(), data.size(),
+                                                   veq_non_zero_float);
+                 }));
+  }
 }