diff --git a/and_arm64.go b/and_arm64.go
index 55f8d5e..579b272 100644
--- a/and_arm64.go
+++ b/and_arm64.go
@@ -8,6 +8,9 @@ func andNEON(dst, a, b *byte, len uint64)
 //go:noescape
 func orNEON(dst, a, b *byte, len uint64)
 
+//go:noescape
+func popcntNEON(a *byte, len uint64) uint64
+
 func and(dst, a, b []byte) {
 	l := uint64(len(a)) >> 8
 	if l != 0 {
@@ -32,8 +35,13 @@ func andNot(dst, a, b []byte) {
 }
 
 func popcnt(a []byte) int {
-	// TODO: Write a NEON version for this
-	return popcntGeneric(a)
+	ret := 0
+	l := uint64(len(a)) >> 8 // number of whole 256-byte blocks
+	if l != 0 {
+		ret = int(popcntNEON(&a[0], l))
+		l <<= 8 // bytes consumed by the NEON loop
+	}
+	return ret + popcntGeneric(a[l:]) // count any tail generically
 }
 
 func memset(dst []byte, b byte) {
diff --git a/and_arm64.s b/and_arm64.s
index f337784..6362bec 100644
--- a/and_arm64.s
+++ b/and_arm64.s
@@ -95,3 +95,65 @@ loop:
 	CBNZ	R3, loop
 
 	RET
+
+// func popcntNEON(a *byte, len uint64) uint64
+TEXT ·popcntNEON(SB), NOSPLIT, $0-24
+	MOVD	a+0(FP), R1
+	MOVD	len+8(FP), R2
+
+	VEOR	V16.B16, V16.B16, V16.B16 // zero the running total
+
+loop:
+	VLD1.P	64(R1), [ V0.B16, V1.B16, V2.B16, V3.B16] // load one 256-byte block
+	VLD1.P	64(R1), [ V4.B16, V5.B16, V6.B16, V7.B16]
+	VLD1.P	64(R1), [ V8.B16, V9.B16, V10.B16, V11.B16]
+	VLD1.P	64(R1), [V12.B16, V13.B16, V14.B16, V15.B16]
+
+	VCNT	V0.B16, V0.B16 // per-byte population counts
+	VCNT	V1.B16, V1.B16
+	VCNT	V2.B16, V2.B16
+	VCNT	V3.B16, V3.B16
+	VCNT	V4.B16, V4.B16
+	VCNT	V5.B16, V5.B16
+	VCNT	V6.B16, V6.B16
+	VCNT	V7.B16, V7.B16
+	VCNT	V8.B16, V8.B16
+	VCNT	V9.B16, V9.B16
+	VCNT	V10.B16, V10.B16
+	VCNT	V11.B16, V11.B16
+	VCNT	V12.B16, V12.B16
+	VCNT	V13.B16, V13.B16
+	VCNT	V14.B16, V14.B16
+	VCNT	V15.B16, V15.B16
+
+	VADD	V0.B16, V1.B16, V0.B16 // pairwise tree reduction; lanes top out at 16*8 = 128, no overflow
+	VADD	V2.B16, V3.B16, V2.B16
+	VADD	V4.B16, V5.B16, V4.B16
+	VADD	V6.B16, V7.B16, V6.B16
+	VADD	V8.B16, V9.B16, V8.B16
+	VADD	V10.B16, V11.B16, V10.B16
+	VADD	V12.B16, V13.B16, V12.B16
+	VADD	V14.B16, V15.B16, V14.B16
+
+	VADD	V0.B16, V2.B16, V0.B16
+	VADD	V4.B16, V6.B16, V4.B16
+	VADD	V8.B16, V10.B16, V8.B16
+	VADD	V12.B16, V14.B16, V12.B16
+
+	VADD	V0.B16, V4.B16, V0.B16
+	VADD	V8.B16, V12.B16, V8.B16
+
+	VADD	V0.B16, V8.B16, V0.B16
+
+	VUADDLV	V0.B16, V0 // widening sum of the 16 byte lanes
+	VADDV	V0.H8, V0
+
+	VADD	V16.D2, V0.D2, V16.D2 // accumulate the block count
+
+	SUBS	$1, R2, R2
+	CBNZ	R2, loop
+
+	VMOV	V16.D[0], R0
+	MOVD	R0, ret+16(FP)
+
+	RET
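
As a quick sanity check for the new path, a test along these lines could sit next to the files above. This is a minimal sketch, not part of the change: the package name is a placeholder, and it assumes popcnt and popcntGeneric are visible in the same package, with input sizes chosen to straddle the 256-byte block boundary.

//go:build arm64

package bits // placeholder: match the package of and_arm64.go

import "testing"

// Cross-check the NEON-accelerated popcnt against the pure-Go fallback
// at sizes on both sides of the 256-byte block boundary.
func TestPopcntNEON(t *testing.T) {
	for _, n := range []int{0, 1, 255, 256, 257, 511, 512, 4096, 4097} {
		a := make([]byte, n)
		for i := range a {
			a[i] = byte(i*37 + 11) // deterministic, non-uniform fill
		}
		if got, want := popcnt(a), popcntGeneric(a); got != want {
			t.Errorf("len=%d: popcnt() = %d, popcntGeneric() = %d", n, got, want)
		}
	}
}

var sinkPopcnt int

// Rough throughput number for the NEON path on a 64 KiB input.
func BenchmarkPopcnt(b *testing.B) {
	a := make([]byte, 1<<16)
	b.SetBytes(int64(len(a)))
	for i := 0; i < b.N; i++ {
		sinkPopcnt = popcnt(a)
	}
}

The b.SetBytes call makes go test -bench report MB/s, which is the interesting figure for a kernel like this.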