Skip to content

Commit

Permalink
Call VZEROALL for a 21% benchmark improvement
Browse files Browse the repository at this point in the history
```
goos: linux
goarch: amd64
pkg: github.com/bwesterb/go-and
cpu: 13th Gen Intel(R) Core(TM) i9-13900
          │     asm     │          with_vzeroall/asm          │
          │   sec/op    │   sec/op     vs base                │
And-32      631.2n ± 2%   518.4n ± 1%  -17.87% (p=0.000 n=10)
Or-32       626.4n ± 1%   515.0n ± 1%  -17.79% (p=0.000 n=10)
Xor-32      631.6n ± 1%   518.6n ± 1%  -17.88% (p=0.000 n=10)
AndNot-32   635.2n ± 2%   517.8n ± 1%  -18.49% (p=0.000 n=10)

          │     asm      │          with_vzeroall/asm           │
          │     B/s      │     B/s       vs base                │
And-32      47.21Gi ± 2%   57.49Gi ± 1%  +21.77% (p=0.000 n=10)
Or-32       47.58Gi ± 1%   57.87Gi ± 1%  +21.63% (p=0.000 n=10)
Xor-32      47.19Gi ± 1%   57.47Gi ± 1%  +21.78% (p=0.000 n=10)
AndNot-32   46.92Gi ± 2%   57.56Gi ± 1%  +22.68% (p=0.000 n=10)
```
  • Loading branch information
Jille committed Oct 15, 2024
1 parent 2c43005 commit d184c57
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 14 deletions.
28 changes: 14 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,23 +38,23 @@ pkg: github.com/bwesterb/go-and
cpu: 13th Gen Intel(R) Core(TM) i9-13900
│ naive │ purego │ asm │
│ sec/op │ sec/op vs base │ sec/op vs base │
-And-32 8162.0n ± 5% 2034.5n ± 1% -75.07% (p=0.000 n=10) 631.2n ± 2% -92.27% (p=0.000 n=10)
-Or-32 9751.5n ± 8% 2104.5n ± 3% -78.42% (p=0.000 n=10) 626.4n ± 1% -93.58% (p=0.000 n=10)
-Xor-32 8112.5n ± 3% 2029.0n ± 0% -74.99% (p=0.000 n=10) 631.6n ± 1% -92.22% (p=0.000 n=10)
-AndNot-32 10685.5n ± 4% 2292.0n ± 2% -78.55% (p=0.000 n=10) 635.2n ± 2% -94.06% (p=0.000 n=10)
-Memset-32 167.96µ ± 0% 57.54µ ± 1% -65.74% (p=0.000 n=10) 15.83µ ± 1% -90.57% (p=0.000 n=10)
-Popcnt-32 132.15µ ± 1% 71.63µ ± 1% -45.80% (p=0.000 n=10) 33.86µ ± 6% -74.38% (p=0.000 n=10)
-geomean 23.13µ 6.592µ -71.50% 2.097µ -90.93%
+And-32 8162.0n ± 5% 2034.5n ± 1% -75.07% (p=0.000 n=10) 518.4n ± 1% -93.65% (p=0.000 n=10)
+Or-32 9751.5n ± 8% 2104.5n ± 3% -78.42% (p=0.000 n=10) 515.0n ± 1% -94.72% (p=0.000 n=10)
+Xor-32 8112.5n ± 3% 2029.0n ± 0% -74.99% (p=0.000 n=10) 518.6n ± 1% -93.61% (p=0.000 n=10)
+AndNot-32 10685.5n ± 4% 2292.0n ± 2% -78.55% (p=0.000 n=10) 517.8n ± 1% -95.15% (p=0.000 n=10)
+Memset-32 167.96µ ± 0% 57.54µ ± 1% -65.74% (p=0.000 n=10) 15.65µ ± 1% -90.68% (p=0.000 n=10)
+Popcnt-32 132.15µ ± 1% 71.63µ ± 1% -45.80% (p=0.000 n=10) 36.51µ ± 2% -72.37% (p=0.000 n=10)
+geomean 23.13µ 6.592µ -71.50% 1.857µ -91.97%
│ naive │ purego │ asm │
│ B/s │ B/s vs base │ B/s vs base │
-And-32 3.651Gi ± 5% 14.649Gi ± 1% +301.20% (p=0.000 n=10) 47.212Gi ± 2% +1193.01% (p=0.000 n=10)
-Or-32 3.057Gi ± 8% 14.163Gi ± 3% +363.37% (p=0.000 n=10) 47.580Gi ± 1% +1456.63% (p=0.000 n=10)
-Xor-32 3.674Gi ± 3% 14.690Gi ± 0% +299.88% (p=0.000 n=10) 47.190Gi ± 1% +1184.58% (p=0.000 n=10)
-AndNot-32 2.789Gi ± 4% 13.003Gi ± 2% +366.21% (p=0.000 n=10) 46.916Gi ± 2% +1582.18% (p=0.000 n=10)
-Memset-32 5.545Gi ± 0% 16.187Gi ± 1% +191.91% (p=0.000 n=10) 58.816Gi ± 1% +960.69% (p=0.000 n=10)
-Popcnt-32 7.048Gi ± 1% 13.002Gi ± 1% +84.48% (p=0.000 n=10) 27.506Gi ± 6% +290.28% (p=0.000 n=10)
-geomean 4.058Gi 14.24Gi +250.89% 44.76Gi +1002.97%
+And-32 3.651Gi ± 5% 14.649Gi ± 1% +301.20% (p=0.000 n=10) 57.488Gi ± 1% +1474.44% (p=0.000 n=10)
+Or-32 3.057Gi ± 8% 14.163Gi ± 3% +363.37% (p=0.000 n=10) 57.872Gi ± 1% +1793.33% (p=0.000 n=10)
+Xor-32 3.674Gi ± 3% 14.690Gi ± 0% +299.88% (p=0.000 n=10) 57.469Gi ± 1% +1464.38% (p=0.000 n=10)
+AndNot-32 2.789Gi ± 4% 13.003Gi ± 2% +366.21% (p=0.000 n=10) 57.558Gi ± 1% +1963.74% (p=0.000 n=10)
+Memset-32 5.545Gi ± 0% 16.187Gi ± 1% +191.91% (p=0.000 n=10) 59.504Gi ± 1% +973.11% (p=0.000 n=10)
+Popcnt-32 7.048Gi ± 1% 13.002Gi ± 1% +84.48% (p=0.000 n=10) 25.507Gi ± 2% +261.92% (p=0.000 n=10)
+geomean 4.058Gi 14.24Gi +250.89% 50.56Gi +1145.76%
```

### Apple M2 Pro
Expand Down
10 changes: 10 additions & 0 deletions and_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func andAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -98,6 +99,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func orAVX2(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -146,6 +148,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func orAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -194,6 +197,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func xorAVX2(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -242,6 +246,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func xorAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -290,6 +295,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func andNotAVX2(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -338,6 +344,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func andNotAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -386,6 +393,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func popcntAsm(a *byte, l uint64) int
Expand Down Expand Up @@ -438,6 +446,7 @@ loop:
ADDQ $0x00000020, AX
SUBQ $0x00000001, CX
JNZ loop
VZEROALL
RET

// func memsetAVX(dst *byte, l uint64, b byte)
Expand All @@ -454,6 +463,7 @@ loop:
ADDQ $0x00000010, AX
SUBQ $0x00000001, CX
JNZ loop
VZEROALL
RET

DATA zeroes<>+0(SB)/4, $0x00000000
Expand Down
2 changes: 2 additions & 0 deletions internal/asm/src.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func gen(name string, op func(Op, Op, Op), avxLevel AVXLevel, doc string) {
SUBQ(U32(1), l)
JNZ(LabelRef("loop"))

VZEROALL()
RET()
}

Expand Down Expand Up @@ -175,5 +176,6 @@ func genMemset(avxLevel AVXLevel) {
SUBQ(U32(1), l)
JNZ(LabelRef("loop"))

VZEROALL()
RET()
}

0 comments on commit d184c57

Please sign in to comment.