diff --git a/and_arm64.go b/and_arm64.go
index 55f8d5e..579b272 100644
--- a/and_arm64.go
+++ b/and_arm64.go
@@ -8,6 +8,9 @@ func andNEON(dst, a, b *byte, len uint64)
 //go:noescape
 func orNEON(dst, a, b *byte, len uint64)
 
+//go:noescape
+func popcntNEON(a *byte, len uint64) uint64
+
 func and(dst, a, b []byte) {
 	l := uint64(len(a)) >> 8
 	if l != 0 {
@@ -32,8 +35,13 @@ func andNot(dst, a, b []byte) {
 }
 
 func popcnt(a []byte) int {
-	// TODO: Write a NEON version for this
-	return popcntGeneric(a)
+	ret := 0
+	l := uint64(len(a)) >> 8 // number of whole 256-byte blocks
+	if l != 0 {
+		ret = int(popcntNEON(&a[0], l))
+		l <<= 8 // bytes consumed by the NEON loop
+	}
+	return ret + popcntGeneric(a[l:]) // count any tail generically
 }
 
 func memset(dst []byte, b byte) {
diff --git a/and_arm64.s b/and_arm64.s
index f337784..6362bec 100644
--- a/and_arm64.s
+++ b/and_arm64.s
@@ -95,3 +95,65 @@ loop:
 	CBNZ	R3, loop
 
 	RET
+
+// func popcntNEON(a *byte, len uint64) uint64
+TEXT ·popcntNEON(SB), NOSPLIT, $0-24
+	MOVD	a+0(FP), R1
+	MOVD	len+8(FP), R2
+
+	VEOR	V16.B16, V16.B16, V16.B16 // zero the running total
+
+loop:
+	VLD1.P	64(R1), [ V0.B16, V1.B16, V2.B16, V3.B16] // load one 256-byte block
+	VLD1.P	64(R1), [ V4.B16, V5.B16, V6.B16, V7.B16]
+	VLD1.P	64(R1), [ V8.B16, V9.B16, V10.B16, V11.B16]
+	VLD1.P	64(R1), [V12.B16, V13.B16, V14.B16, V15.B16]
+
+	VCNT	V0.B16, V0.B16 // per-byte population counts
+	VCNT	V1.B16, V1.B16
+	VCNT	V2.B16, V2.B16
+	VCNT	V3.B16, V3.B16
+	VCNT	V4.B16, V4.B16
+	VCNT	V5.B16, V5.B16
+	VCNT	V6.B16, V6.B16
+	VCNT	V7.B16, V7.B16
+	VCNT	V8.B16, V8.B16
+	VCNT	V9.B16, V9.B16
+	VCNT	V10.B16, V10.B16
+	VCNT	V11.B16, V11.B16
+	VCNT	V12.B16, V12.B16
+	VCNT	V13.B16, V13.B16
+	VCNT	V14.B16, V14.B16
+	VCNT	V15.B16, V15.B16
+
+	VADD	V0.B16, V1.B16, V0.B16 // pairwise tree reduction; lanes top out at 16*8 = 128, no overflow
+	VADD	V2.B16, V3.B16, V2.B16
+	VADD	V4.B16, V5.B16, V4.B16
+	VADD	V6.B16, V7.B16, V6.B16
+	VADD	V8.B16, V9.B16, V8.B16
+	VADD	V10.B16, V11.B16, V10.B16
+	VADD	V12.B16, V13.B16, V12.B16
+	VADD	V14.B16, V15.B16, V14.B16
+
+	VADD	V0.B16, V2.B16, V0.B16
+	VADD	V4.B16, V6.B16, V4.B16
+	VADD	V8.B16, V10.B16, V8.B16
+	VADD	V12.B16, V14.B16, V12.B16
+
+	VADD	V0.B16, V4.B16, V0.B16
+	VADD	V8.B16, V12.B16, V8.B16
+
+	VADD	V0.B16, V8.B16, V0.B16
+
+	VUADDLV	V0.B16, V0 // widening sum of the 16 byte lanes
+	VADDV	V0.H8, V0
+
+	VADD	V16.D2, V0.D2, V16.D2 // accumulate the block count
+
+	SUBS	$1, R2, R2
+	CBNZ	R2, loop
+
+	VMOV	V16.D[0], R0
+	MOVD	R0, ret+16(FP)
+
+	RET
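
As a quick sanity check for the new path, a test along these lines could sit next to the files above. This is a minimal sketch, not part of the change: the package name is a placeholder, and it assumes popcnt and popcntGeneric are visible in the same package, with input sizes chosen to straddle the 256-byte block boundary.

//go:build arm64

package bits // placeholder: match the package of and_arm64.go

import "testing"

// Cross-check the NEON-accelerated popcnt against the pure-Go fallback
// at sizes on both sides of the 256-byte block boundary.
func TestPopcntNEON(t *testing.T) {
	for _, n := range []int{0, 1, 255, 256, 257, 511, 512, 4096, 4097} {
		a := make([]byte, n)
		for i := range a {
			a[i] = byte(i*37 + 11) // deterministic, non-uniform fill
		}
		if got, want := popcnt(a), popcntGeneric(a); got != want {
			t.Errorf("len=%d: popcnt() = %d, popcntGeneric() = %d", n, got, want)
		}
	}
}

var sinkPopcnt int

// Rough throughput number for the NEON path on a 64 KiB input.
func BenchmarkPopcnt(b *testing.B) {
	a := make([]byte, 1<<16)
	b.SetBytes(int64(len(a)))
	for i := 0; i < b.N; i++ {
		sinkPopcnt = popcnt(a)
	}
}

The b.SetBytes call makes go test -bench report MB/s, which is the interesting figure for a kernel like this.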