Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed Jan 12, 2025
1 parent 2bdf333 commit 0051123
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 103 deletions.
128 changes: 26 additions & 102 deletions extra/neon/iszero/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,115 +9,39 @@ cmake --build build

Potential result:
```
sudo ./build/benchmark
sudo ./build/benchmark
Password:
loaded db: a15 (Apple A15)
# check: 58823 58823 58823 58823
Trial 1
veq_non_zero_max : 4.94 GB/s 308.5 Ma/s 3.24 ns/d 3.52 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.94 GB/s 308.5 Ma/s 3.24 ns/d 3.53 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.93 GB/s 308.0 Ma/s 3.25 ns/d 3.52 GHz 11.44 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.3 Ma/s 4.29 ns/d 3.57 GHz 15.29 c/d 10.24 i/d 0.96 c/b 0.64 i/b 0.67 i/c
veq_non_zero_max : 4.92 GB/s 307.7 Ma/s 3.25 ns/d 3.52 GHz 11.45 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.60 GB/s 287.7 Ma/s 3.48 ns/d 3.52 GHz 12.24 c/d 11.24 i/d 0.77 c/b 0.70 i/b 0.92 i/c
veq_non_zero_narrow : 4.90 GB/s 306.1 Ma/s 3.27 ns/d 3.54 GHz 11.56 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 2
veq_non_zero_max : 4.92 GB/s 307.7 Ma/s 3.25 ns/d 3.53 GHz 11.47 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.91 GB/s 307.0 Ma/s 3.26 ns/d 3.52 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.91 GB/s 306.8 Ma/s 3.26 ns/d 3.52 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_max : 4.89 GB/s 305.6 Ma/s 3.27 ns/d 3.52 GHz 11.53 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_mov : 6.48 GB/s 405.2 Ma/s 2.47 ns/d 3.23 GHz 7.98 c/d 11.27 i/d 0.50 c/b 0.70 i/b 1.41 i/c
veq_non_zero_narrow : 4.88 GB/s 305.2 Ma/s 3.28 ns/d 3.53 GHz 11.55 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 3
veq_non_zero_max : 4.92 GB/s 307.7 Ma/s 3.25 ns/d 3.54 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.75 GB/s 297.0 Ma/s 3.37 ns/d 3.46 GHz 11.63 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.97 i/c
veq_non_zero_narrow : 4.90 GB/s 306.4 Ma/s 3.26 ns/d 3.54 GHz 11.54 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 4
veq_non_zero_max : 4.88 GB/s 305.0 Ma/s 3.28 ns/d 3.52 GHz 11.55 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_mov : 4.90 GB/s 306.4 Ma/s 3.26 ns/d 3.52 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.89 GB/s 305.5 Ma/s 3.27 ns/d 3.52 GHz 11.53 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 5
veq_non_zero_max : 4.92 GB/s 307.2 Ma/s 3.25 ns/d 3.54 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.93 GB/s 308.4 Ma/s 3.24 ns/d 3.52 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.83 GB/s 301.7 Ma/s 3.32 ns/d 3.52 GHz 11.68 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.96 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 6
veq_non_zero_max : 4.91 GB/s 306.9 Ma/s 3.26 ns/d 3.52 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.70 GB/s 293.7 Ma/s 3.40 ns/d 3.52 GHz 12.00 c/d 11.24 i/d 0.75 c/b 0.70 i/b 0.94 i/c
veq_non_zero_narrow : 4.83 GB/s 301.9 Ma/s 3.31 ns/d 3.52 GHz 11.67 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.96 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 7
veq_non_zero_max : 4.89 GB/s 305.9 Ma/s 3.27 ns/d 3.53 GHz 11.54 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_mov : 4.92 GB/s 307.4 Ma/s 3.25 ns/d 3.52 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.95 GB/s 309.3 Ma/s 3.23 ns/d 3.52 GHz 11.39 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.99 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 8
veq_non_zero_max : 4.87 GB/s 304.3 Ma/s 3.29 ns/d 3.53 GHz 11.60 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_mov : 4.93 GB/s 308.1 Ma/s 3.25 ns/d 3.54 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.78 GB/s 298.6 Ma/s 3.35 ns/d 3.52 GHz 11.80 c/d 11.24 i/d 0.74 c/b 0.70 i/b 0.95 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 9
veq_non_zero_max : 4.88 GB/s 305.3 Ma/s 3.28 ns/d 3.52 GHz 11.54 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_mov : 4.92 GB/s 307.4 Ma/s 3.25 ns/d 3.52 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.75 GB/s 296.9 Ma/s 3.37 ns/d 3.52 GHz 11.87 c/d 11.24 i/d 0.74 c/b 0.70 i/b 0.95 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 10
veq_non_zero_max : 4.92 GB/s 307.8 Ma/s 3.25 ns/d 3.53 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.96 GB/s 309.8 Ma/s 3.23 ns/d 3.52 GHz 11.38 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.99 i/c
veq_non_zero_narrow : 4.66 GB/s 291.0 Ma/s 3.44 ns/d 3.52 GHz 12.10 c/d 11.24 i/d 0.76 c/b 0.70 i/b 0.93 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
☁ iszero [master] ⚡ clang-format -i benchmarks/benchmark.cpp
☁ iszero [master] ⚡ cmake --build build
[ 50%] Building CXX object CMakeFiles/benchmark.dir/benchmarks/benchmark.cpp.o
[100%] Linking CXX executable benchmark
[100%] Built target benchmark
☁ iszero [master] ⚡ sudo ./build/benchmark
loaded db: a15 (Apple A15)
# check: 58823 58823 58823 58823
veq_non_zero_max : 5.55 GB/s 347.1 Ma/s 2.88 ns/d 3.38 GHz 9.73 c/d 11.24 i/d 0.61 c/b 0.70 i/b 1.15 i/c
veq_non_zero_mov : 4.88 GB/s 304.8 Ma/s 3.28 ns/d 3.53 GHz 11.58 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_narrow : 7.71 GB/s 481.7 Ma/s 2.08 ns/d 3.45 GHz 7.15 c/d 11.24 i/d 0.45 c/b 0.70 i/b 1.57 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.56 GHz 15.25 c/d 10.24 i/d 0.95 c/b 0.64 i/b 0.67 i/c
branchy
Trial 1
veq_non_zero_max : 4.86 GB/s 303.9 Ma/s 3.29 ns/d 3.52 GHz 11.59 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_mov : 4.79 GB/s 299.4 Ma/s 3.34 ns/d 3.52 GHz 11.76 c/d 11.24 i/d 0.74 c/b 0.70 i/b 0.96 i/c
veq_non_zero_narrow : 4.87 GB/s 304.3 Ma/s 3.29 ns/d 3.52 GHz 11.58 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
veq_non_zero_max : 23.53 GB/s 1470.6 Ma/s 0.68 ns/d 3.59 GHz 2.44 c/d 17.24 i/d 0.15 c/b 1.08 i/b 7.06 i/c
veq_non_zero_mov : 22.32 GB/s 1395.0 Ma/s 0.72 ns/d 3.60 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c
veq_non_zero_narrow : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.59 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c
veq_non_zero_float : 26.30 GB/s 1643.5 Ma/s 0.61 ns/d 3.61 GHz 2.20 c/d 15.11 i/d 0.14 c/b 0.94 i/b 6.88 i/c
Trial 2
veq_non_zero_max : 7.39 GB/s 462.0 Ma/s 2.16 ns/d 2.83 GHz 6.12 c/d 11.24 i/d 0.38 c/b 0.70 i/b 1.84 i/c
veq_non_zero_mov : 4.91 GB/s 307.2 Ma/s 3.26 ns/d 3.54 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.94 GB/s 308.9 Ma/s 3.24 ns/d 3.54 GHz 11.46 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.10 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
veq_non_zero_max : 23.53 GB/s 1470.6 Ma/s 0.68 ns/d 3.60 GHz 2.45 c/d 17.24 i/d 0.15 c/b 1.08 i/b 7.03 i/c
veq_non_zero_mov : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.60 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c
veq_non_zero_narrow : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.60 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c
veq_non_zero_float : 26.30 GB/s 1643.5 Ma/s 0.61 ns/d 3.62 GHz 2.20 c/d 15.11 i/d 0.14 c/b 0.94 i/b 6.87 i/c
Trial 3
veq_non_zero_max : 4.94 GB/s 308.7 Ma/s 3.24 ns/d 3.52 GHz 11.41 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.94 GB/s 308.6 Ma/s 3.24 ns/d 3.52 GHz 11.42 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.92 GB/s 307.5 Ma/s 3.25 ns/d 3.53 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.3 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 4
veq_non_zero_max : 4.91 GB/s 306.8 Ma/s 3.26 ns/d 3.52 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.86 GB/s 303.7 Ma/s 3.29 ns/d 3.54 GHz 11.64 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.97 i/c
veq_non_zero_narrow : 4.93 GB/s 308.1 Ma/s 3.25 ns/d 3.54 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.66 GHz 15.70 c/d 10.24 i/d 0.98 c/b 0.64 i/b 0.65 i/c
Trial 5
veq_non_zero_max : 4.94 GB/s 308.7 Ma/s 3.24 ns/d 3.52 GHz 11.42 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.88 GB/s 305.2 Ma/s 3.28 ns/d 3.53 GHz 11.58 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_narrow : 4.91 GB/s 306.9 Ma/s 3.26 ns/d 3.53 GHz 11.49 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 6
veq_non_zero_max : 4.91 GB/s 307.0 Ma/s 3.26 ns/d 3.53 GHz 11.50 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.90 GB/s 306.5 Ma/s 3.26 ns/d 3.53 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.90 GB/s 306.1 Ma/s 3.27 ns/d 3.52 GHz 11.52 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 7
veq_non_zero_max : 4.70 GB/s 293.6 Ma/s 3.41 ns/d 3.52 GHz 12.01 c/d 11.24 i/d 0.75 c/b 0.70 i/b 0.94 i/c
veq_non_zero_mov : 4.89 GB/s 305.8 Ma/s 3.27 ns/d 3.53 GHz 11.53 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_narrow : 4.92 GB/s 307.5 Ma/s 3.25 ns/d 3.53 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 8
veq_non_zero_max : 4.89 GB/s 305.9 Ma/s 3.27 ns/d 3.52 GHz 11.52 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.93 GB/s 308.4 Ma/s 3.24 ns/d 3.53 GHz 11.43 c/d 11.24 i/d 0.71 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.91 GB/s 307.0 Ma/s 3.26 ns/d 3.53 GHz 11.51 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 9
veq_non_zero_max : 4.79 GB/s 299.6 Ma/s 3.34 ns/d 3.52 GHz 11.76 c/d 11.24 i/d 0.73 c/b 0.70 i/b 0.96 i/c
veq_non_zero_mov : 4.86 GB/s 303.9 Ma/s 3.29 ns/d 3.52 GHz 11.59 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.97 i/c
veq_non_zero_narrow : 4.94 GB/s 309.0 Ma/s 3.24 ns/d 3.23 GHz 10.46 c/d 11.24 i/d 0.65 c/b 0.70 i/b 1.07 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
Trial 10
veq_non_zero_max : 4.89 GB/s 305.8 Ma/s 3.27 ns/d 3.52 GHz 11.52 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_mov : 4.94 GB/s 308.4 Ma/s 3.24 ns/d 3.54 GHz 11.47 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_narrow : 4.91 GB/s 306.8 Ma/s 3.26 ns/d 3.52 GHz 11.48 c/d 11.24 i/d 0.72 c/b 0.70 i/b 0.98 i/c
veq_non_zero_float : 3.73 GB/s 233.2 Ma/s 4.29 ns/d 3.52 GHz 15.09 c/d 10.24 i/d 0.94 c/b 0.64 i/b 0.68 i/c
veq_non_zero_max : 23.55 GB/s 1472.1 Ma/s 0.68 ns/d 3.60 GHz 2.45 c/d 17.24 i/d 0.15 c/b 1.08 i/b 7.05 i/c
veq_non_zero_mov : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.59 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.28 i/c
veq_non_zero_narrow : 22.30 GB/s 1393.6 Ma/s 0.72 ns/d 3.59 GHz 2.58 c/d 16.18 i/d 0.16 c/b 1.01 i/b 6.27 i/c
veq_non_zero_float : 26.27 GB/s 1641.6 Ma/s 0.61 ns/d 3.61 GHz 2.20 c/d 15.11 i/d 0.14 c/b 0.94 i/b 6.87 i/c
```
39 changes: 38 additions & 1 deletion extra/neon/iszero/benchmarks/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@ template <typename F> int scan(uint8_t *input, size_t length, F f) {
return result;
}

template <typename F> int branchyscan(uint8_t *input, size_t length, F f) {
int result = 0;
for (size_t i = 0; i + 16 + 1 <= length; i += 16) {
uint8x16_t v = vld1q_u8(input + i);
result++;
if (f(v)) {
v = vld1q_u8(input + i + 1);
result += f(v);
}
}
return result;
}

void pretty_print(size_t volume, size_t bytes, std::string name,
event_aggregate agg) {
printf("%-40s : ", name.c_str());
Expand Down Expand Up @@ -71,7 +84,7 @@ int main(int argc, char **argv) {
scan(data.data(), data.size(), veq_non_zero_mov),
scan(data.data(), data.size(), veq_non_zero_narrow),
scan(data.data(), data.size(), veq_non_zero_float));
for (size_t trial = 0; trial < 10; trial++) {
for (size_t trial = 0; trial < 3; trial++) {
printf("Trial %zu\n", trial + 1);

pretty_print(count, volume, "veq_non_zero_max", bench([&data, &counter]() {
Expand All @@ -92,4 +105,28 @@ int main(int argc, char **argv) {
scan(data.data(), data.size(), veq_non_zero_float);
}));
}
printf("branchy\n");

for (size_t trial = 0; trial < 3; trial++) {
printf("Trial %zu\n", trial + 1);

pretty_print(count, volume, "veq_non_zero_max", bench([&data, &counter]() {
counter = counter + branchyscan(data.data(), data.size(),
veq_non_zero_max);
}));
pretty_print(count, volume, "veq_non_zero_mov", bench([&data, &counter]() {
counter = counter + branchyscan(data.data(), data.size(),
veq_non_zero_mov);
}));
pretty_print(
    count, volume, "veq_non_zero_narrow", bench([&data, &counter]() {
      // Measure the narrow kernel so the timing matches the printed label;
      // this previously passed veq_non_zero_mov, duplicating the row above.
      counter = counter +
                branchyscan(data.data(), data.size(), veq_non_zero_narrow);
    }));
pretty_print(count, volume, "veq_non_zero_float",
bench([&data, &counter]() {
counter = counter + branchyscan(data.data(), data.size(),
veq_non_zero_float);
}));
}
}

0 comments on commit 0051123

Please sign in to comment.