Merge pull request #15 from Jille/benchmark

Add a script to benchmark and add to the README
bwesterb · Jul 22, 2024 · a9e80c1 · a9e80c1
2 parents 3b33e86 + 14b5ce4
commit a9e80c1
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 10 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+# Benchmark output files
+asm
+naive
+purego
diff --git a/README.md b/README.md
@@ -19,3 +19,37 @@ func main() {
 ```
 
 Makes use of AVX2 on AMD64 and NEON on ARM64.
+
+## Benchmarks
+
+Created using `./benchmark.sh`.
+
+This compares three implementations of each benchmarked operation:
+
+* `naive` is a simple for loop doing one byte at a time.
+* `purego` is our set of slightly optimized versions that work on uint64s instead of bytes.
+* `asm` is the set of AVX2 implementations, which are the reason to use this library.
+
+```
+goos: linux
+goarch: amd64
+pkg: github.com/bwesterb/go-and
+cpu: 13th Gen Intel(R) Core(TM) i9-13900
+          │    naive     │                purego                │                 asm                 │
+          │    sec/op    │    sec/op     vs base                │   sec/op     vs base                │
+And-32      273.05µ ± 5%    64.48µ ± 2%  -76.39% (p=0.000 n=10)   21.88µ ± 1%  -91.99% (p=0.000 n=10)
+Or-32       274.70µ ± 6%    64.36µ ± 1%  -76.57% (p=0.000 n=10)   21.81µ ± 1%  -92.06% (p=0.000 n=10)
+AndNot-32   310.78µ ± 2%    71.01µ ± 2%  -77.15% (p=0.000 n=10)   21.83µ ± 1%  -92.98% (p=0.000 n=10)
+Memset-32   167.77µ ± 0%   167.55µ ± 0%   -0.13% (p=0.002 n=10)   15.88µ ± 1%  -90.53% (p=0.000 n=10)
+Popcnt-32   126.84µ ± 0%    71.42µ ± 1%  -43.69% (p=0.000 n=10)   32.48µ ± 1%  -74.40% (p=0.000 n=10)
+geomean      218.3µ         81.18µ       -62.82%                  22.18µ       -89.84%
+
+          │    naive     │                 purego                 │                   asm                   │
+          │     B/s      │      B/s       vs base                 │      B/s       vs base                  │
+And-32      3.411Gi ± 5%   14.444Gi ± 2%  +323.45% (p=0.000 n=10)   42.560Gi ± 1%  +1147.72% (p=0.000 n=10)
+Or-32       3.391Gi ± 7%   14.470Gi ± 1%  +326.78% (p=0.000 n=10)   42.708Gi ± 1%  +1159.61% (p=0.000 n=10)
+AndNot-32   2.997Gi ± 2%   13.116Gi ± 2%  +337.68% (p=0.000 n=10)   42.665Gi ± 1%  +1323.72% (p=0.000 n=10)
+Memset-32   5.551Gi ± 0%    5.559Gi ± 0%    +0.13% (p=0.002 n=10)   58.642Gi ± 1%   +956.36% (p=0.000 n=10)
+Popcnt-32   7.342Gi ± 0%   13.040Gi ± 1%   +77.60% (p=0.000 n=10)   28.677Gi ± 1%   +290.57% (p=0.000 n=10)
+geomean     4.266Gi         11.47Gi       +168.93%                   41.98Gi        +884.14%
+```
diff --git a/and_test.go b/and_test.go
@@ -8,7 +8,25 @@ import (
 	"testing"
 )
 
-func testAgainstGeneric(t *testing.T, fancy, generic func(dst, a, b []byte), size int) {
+func andNaive(dst, a, b []byte) {
+	for i := range dst {
+		dst[i] = a[i] & b[i]
+	}
+}
+
+func orNaive(dst, a, b []byte) {
+	for i := range dst {
+		dst[i] = a[i] | b[i]
+	}
+}
+
+func andNotNaive(dst, a, b []byte) {
+	for i := range dst {
+		dst[i] = (^a[i]) & b[i]
+	}
+}
+
+func testAgainst(t *testing.T, fancy, generic func(dst, a, b []byte), size int) {
 	a := make([]byte, size)
 	b := make([]byte, size)
 	c1 := make([]byte, size)
@@ -25,32 +43,38 @@ func testAgainstGeneric(t *testing.T, fancy, generic func(dst, a, b []byte), siz
 	}
 }
 
-func TestAndAgainstGeneric(t *testing.T) {
+func TestAnd(t *testing.T) {
 	for i := 0; i < 20; i++ {
 		size := 1 << i
-		testAgainstGeneric(t, And, andGeneric, size)
+		testAgainst(t, And, andNaive, size)
+		testAgainst(t, andGeneric, andNaive, size)
 		for j := 0; j < 10; j++ {
-			testAgainstGeneric(t, And, andGeneric, size+rand.IntN(100))
+			testAgainst(t, And, andNaive, size+rand.IntN(100))
+			testAgainst(t, andGeneric, andNaive, size+rand.IntN(100))
 		}
 	}
 }
 
-func TestOrAgainstGeneric(t *testing.T) {
+func TestOr(t *testing.T) {
 	for i := 0; i < 20; i++ {
 		size := 1 << i
-		testAgainstGeneric(t, Or, orGeneric, size)
+		testAgainst(t, Or, orNaive, size)
+		testAgainst(t, orGeneric, orNaive, size)
 		for j := 0; j < 10; j++ {
-			testAgainstGeneric(t, Or, orGeneric, size+rand.IntN(100))
+			testAgainst(t, Or, orNaive, size+rand.IntN(100))
+			testAgainst(t, orGeneric, orNaive, size+rand.IntN(100))
 		}
 	}
 }
 
-func TestAndNotAgainstGeneric(t *testing.T) {
+func TestAndNot(t *testing.T) {
 	for i := 0; i < 20; i++ {
 		size := 1 << i
-		testAgainstGeneric(t, AndNot, andNotGeneric, size)
+		testAgainst(t, AndNot, andNotNaive, size)
+		testAgainst(t, andNotGeneric, andNotNaive, size)
 		for j := 0; j < 10; j++ {
-			testAgainstGeneric(t, AndNot, andNotGeneric, size+rand.IntN(100))
+			testAgainst(t, AndNot, andNotNaive, size+rand.IntN(100))
+			testAgainst(t, andNotGeneric, andNotNaive, size+rand.IntN(100))
 		}
 	}
 }
@@ -79,6 +103,18 @@ func BenchmarkAndGeneric(b *testing.B) {
 	}
 }
 
+func BenchmarkAndNaive(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	bb := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		andNaive(a, a, bb)
+	}
+}
+
 func BenchmarkOr(b *testing.B) {
 	b.StopTimer()
 	size := 1000000
@@ -103,6 +139,18 @@ func BenchmarkOrGeneric(b *testing.B) {
 	}
 }
 
+func BenchmarkOrNaive(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	bb := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		orNaive(a, a, bb)
+	}
+}
+
 func BenchmarkAndNot(b *testing.B) {
 	b.StopTimer()
 	size := 1000000
@@ -126,3 +174,15 @@ func BenchmarkAndNotGeneric(b *testing.B) {
 		andNotGeneric(a, a, bb)
 	}
 }
+
+func BenchmarkAndNotNaive(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	bb := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		andNotNaive(a, a, bb)
+	}
+}
diff --git a/benchmark.sh b/benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+set -ex
+
+# Limiting the name after "Benchmark" to .{2,6} is a hack to skip the .+Generic and .+Naive benchmarks
+go test -run=^# -count=10 -bench="^Benchmark.{2,6}$" | tee asm
+go test -run=^# -count=10 -bench="^Benchmark.{2,6}$" -tags purego | tee purego
+go test -run=^# -count=10 -bench="^Benchmark.{2,6}Naive$" | sed --unbuffered 's/Naive//g' | tee naive
+go run golang.org/x/perf/cmd/benchstat@latest naive purego asm
diff --git a/memset_test.go b/memset_test.go
@@ -5,6 +5,12 @@ import (
 	"testing"
 )
 
+func memsetNaive(dst []byte, b byte) {
+	for i := range dst {
+		dst[i] = b
+	}
+}
+
 func testMemset(t *testing.T, size int) {
 	a := make([]byte, size)
 	Memset(a, 0xff)
@@ -46,3 +52,14 @@ func BenchmarkMemsetGeneric(b *testing.B) {
 		memsetGeneric(a, 0xff)
 	}
 }
+
+func BenchmarkMemsetNaive(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		memsetNaive(a, 0xff)
+	}
+}
diff --git a/popcnt_test.go b/popcnt_test.go
@@ -1,10 +1,19 @@
 package and
 
 import (
+	"math/bits"
 	"math/rand/v2"
 	"testing"
 )
 
+func popcntNaive(a []byte) int {
+	var ret int
+	for i := range a {
+		ret += bits.OnesCount8(a[i])
+	}
+	return ret
+}
+
 func testPopcntAgainstGeneric(t *testing.T, size int) {
 	a := make([]byte, size)
 	rng := rand.New(rand.NewPCG(0, 0))
@@ -49,3 +58,14 @@ func BenchmarkPopcntGeneric(b *testing.B) {
 		_ = popcntGeneric(a)
 	}
 }
+
+func BenchmarkPopcntNaive(b *testing.B) {
+	b.StopTimer()
+	size := 1000000
+	a := make([]byte, size)
+	b.SetBytes(int64(size))
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		_ = popcntNaive(a)
+	}
+}