Skip to content

Commit

Permalink
Fix unaligned memory access on ARM
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Dec 31, 2019
1 parent ac90010 commit 043a99f
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion libpopcnt.h
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,16 @@ static inline uint64_t popcnt(const void* data, uint64_t size)

#include <arm_neon.h>

/*
 * Consume leading bytes one at a time until *p sits on an
 * 8-byte boundary or no bytes remain.
 *
 * p    - in/out: current read position; advanced past each
 *        byte that is consumed here.
 * size - in/out: number of bytes still to process; reduced
 *        by one per consumed byte.
 * cnt  - in/out: running bit count; the popcnt64() result of
 *        each consumed byte is added to it.
 *
 * If *p is already a multiple of 8, or *size is 0, nothing
 * is modified.
 */
static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt)
{
  /* Low 3 address bits non-zero <=> not 8-byte aligned */
  while (*size != 0 && ((uintptr_t) *p & 7) != 0)
  {
    const uint8_t byte = **p;

    *cnt += popcnt64(byte);
    *size -= 1;
    *p += 1;
  }
}

static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t)
{
return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t)));
Expand Down Expand Up @@ -772,9 +782,16 @@ static inline uint64_t popcnt(const void* data, uint64_t size)
}

size %= chunk_size;
cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
align_8(&ptr, &size, &cnt);
const uint64_t* ptr64 = (const uint64_t*) ptr;
uint64_t iters = size / 8;

for (uint64_t i = 0; i < iters; i++)
cnt += popcnt64(ptr64[i]);

ptr += size - size % 8;
size = size % 8;

for (uint64_t i = 0; i < size; i++)
cnt += popcnt64(ptr[i]);

Expand Down

0 comments on commit 043a99f

Please sign in to comment.