Skip to content

Commit

Permalink
arm64: popcnt
Browse files Browse the repository at this point in the history
  • Loading branch information
bwesterb committed Jul 24, 2024
1 parent f004f10 commit 32aafe0
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 2 deletions.
12 changes: 10 additions & 2 deletions and_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ func andNEON(dst, a, b *byte, len uint64)
// orNEON is declared without a Go body, so its implementation is supplied
// outside Go source.
// NOTE(review): presumably the bitwise-OR counterpart of andNEON, writing
// a OR b into dst, with len counted in blocks rather than bytes — confirm
// against the assembly implementation.
//
//go:noescape
func orNEON(dst, a, b *byte, len uint64)

// popcntNEON returns the number of set bits in the len*256 bytes starting
// at a. It is declared without a Go body; the implementation is in assembly.
//
// len is a count of 256-byte blocks, not a byte length. It must be at least
// 1: the assembly loop processes one full block before it tests the counter,
// so a zero count would read past the intended range.
//
//go:noescape
func popcntNEON(a *byte, len uint64) uint64

func and(dst, a, b []byte) {
l := uint64(len(a)) >> 8
if l != 0 {
Expand All @@ -32,8 +35,13 @@ func andNot(dst, a, b []byte) {
}

func popcnt(a []byte) int {
// TODO: Write a NEON version for this
return popcntGeneric(a)
ret := 0
l := uint64(len(a)) >> 8
if l != 0 {
ret = int(popcntNEON(&a[0], l))
l <<= 8
}
return ret + popcntGeneric(a[l:])
}

func memset(dst []byte, b byte) {
Expand Down
62 changes: 62 additions & 0 deletions and_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,65 @@ loop:
CBNZ R3, loop

RET

// func popcntNEON(a *byte, len uint64) uint64
//
// Counts the set bits in len consecutive 256-byte blocks starting at a and
// returns the total. len is a block count, not a byte length.
//
// The loop body runs before the counter is tested, so len must be >= 1.
//
// Frame: no locals ($0); 24 bytes of arguments and result
// (a at 0, len at 8, ret at 16).
TEXT ·popcntNEON(SB), NOSPLIT, $0-24
	MOVD a+0(FP), R1   // R1 = read pointer
	MOVD len+8(FP), R2 // R2 = remaining 256-byte blocks

	VEOR V16.B16, V16.B16, V16.B16 // V16 = 0, running total

loop:
	// Load one 256-byte block into V0..V15 (4 loads of 64 bytes each);
	// the .P suffix post-increments R1 by 64 after every load.
	VLD1.P 64(R1), [ V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1.P 64(R1), [ V4.B16, V5.B16, V6.B16, V7.B16]
	VLD1.P 64(R1), [ V8.B16, V9.B16, V10.B16, V11.B16]
	VLD1.P 64(R1), [V12.B16, V13.B16, V14.B16, V15.B16]

	// Per-byte population count: every byte lane becomes a value in 0..8.
	VCNT V0.B16, V0.B16
	VCNT V1.B16, V1.B16
	VCNT V2.B16, V2.B16
	VCNT V3.B16, V3.B16
	VCNT V4.B16, V4.B16
	VCNT V5.B16, V5.B16
	VCNT V6.B16, V6.B16
	VCNT V7.B16, V7.B16
	VCNT V8.B16, V8.B16
	VCNT V9.B16, V9.B16
	VCNT V10.B16, V10.B16
	VCNT V11.B16, V11.B16
	VCNT V12.B16, V12.B16
	VCNT V13.B16, V13.B16
	VCNT V14.B16, V14.B16
	VCNT V15.B16, V15.B16

	// Pairwise reduction tree: add the 16 vectors lane by lane into V0.
	// A byte lane holds at most 16*8 = 128 afterwards, so the 8-bit adds
	// cannot overflow.
	VADD V0.B16, V1.B16, V0.B16
	VADD V2.B16, V3.B16, V2.B16
	VADD V4.B16, V5.B16, V4.B16
	VADD V6.B16, V7.B16, V6.B16
	VADD V8.B16, V9.B16, V8.B16
	VADD V10.B16, V11.B16, V10.B16
	VADD V12.B16, V13.B16, V12.B16
	VADD V14.B16, V15.B16, V14.B16

	VADD V0.B16, V2.B16, V0.B16
	VADD V4.B16, V6.B16, V4.B16
	VADD V8.B16, V10.B16, V8.B16
	VADD V12.B16, V14.B16, V12.B16

	VADD V0.B16, V4.B16, V0.B16
	VADD V8.B16, V12.B16, V8.B16

	VADD V0.B16, V8.B16, V0.B16

	// Sum the 16 byte lanes into one widened 16-bit value
	// (at most 16*128 = 2048 for the block).
	VUADDLV V0.B16, V0
	// Add across the eight halfword lanes of V0.
	// NOTE(review): the widening reduction above should already leave the
	// full block total in the low halfword with the other lanes zero, in
	// which case this step does not change the value — confirm before
	// removing it.
	VADDV V0.H8, V0

	// Accumulate this block's count into the 64-bit running total.
	VADD V16.D2, V0.D2, V16.D2

	// One block done; keep going while any remain.
	SUBS $1, R2, R2
	CBNZ R2, loop

	// The total is in the low 64-bit lane of V16.
	VMOV V16.D[0], R0
	MOVD R0, ret+16(FP)

	RET

0 comments on commit 32aafe0

Please sign in to comment.