Skip to content

Commit

Permalink
+add NEON optimizations of function DescrIntDecode32f (Decode32f4).
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 27, 2023
1 parent 630b4cc commit 6626a70
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 22 deletions.
8 changes: 8 additions & 0 deletions src/Simd/SimdDescrIntCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,10 @@ namespace Simd
const uint8x8_t E7_SHFL0 = SIMD_VEC_SETR_PI8(0x1, 0x3, 0x5, 0x7, 0x9, 0xB, 0xD, 0);
const uint8x8_t E7_SHFL1 = SIMD_VEC_SETR_PI8(0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE, 0);

const int32x4_t C4_SHL0 = SIMD_VEC_SETR_EPI32(0, -4, -8, -12);
const int32x4_t C4_SHL1 = SIMD_VEC_SETR_EPI32(-16, -20, -24, -28);
const uint32x4_t C4_AND = SIMD_VEC_SET1_EPI32(0x0F);

const uint8x8_t C5_TBL0 = SIMD_VEC_SETR_PI8(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2);
const uint8x8_t C5_TBL1 = SIMD_VEC_SETR_PI8(0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4);
//const __m128i C5_SHFL1 = SIMD_MM_SETR_EPI8(0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9);
Expand All @@ -443,6 +447,10 @@ namespace Simd
//const __m128i C7_SHFL1 = SIMD_MM_SETR_EPI8(0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xA, 0xA, 0xB, 0xB, 0xC, 0xC, 0xD, 0xD, 0xD);
const uint16x8_t C7_MULLO = SIMD_VEC_SETR_EPI16(2, 4, 8, 16, 32, 64, 128, 256);

const int32x4_t C7_SHL0 = SIMD_VEC_SETR_EPI32(0, -7, -14, -21);
const int32x4_t C7_SHL1 = SIMD_VEC_SETR_EPI32(-4, -11, -18, -25);
const uint32x4_t C7_AND = SIMD_VEC_SET1_EPI32(0x7F);

//-------------------------------------------------------------------------------------------------

template<int bits> uint8x8_t LoadLast8(const uint8_t* src)
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdNeonDescrInt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ namespace Simd
_encode32f = GetEncode32f(_depth);
_encode16f = GetEncode16f(_depth);

if (_depth >= 5) _decode32f = GetDecode32f(_depth);
_decode32f = GetDecode32f(_depth);
if (_depth >= 9) _decode16f = GetDecode16f(_depth);

//_cosineDistance = GetCosineDistance(_depth);
Expand Down
71 changes: 50 additions & 21 deletions src/Simd/SimdNeonDescrIntDec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,41 @@ namespace Simd
#ifdef SIMD_NEON_ENABLE
namespace Neon
{
static void Decode32f4(const uint8_t* src, float scale, float shift, size_t size, float* dst)
{
assert(size % 8 == 0);
float32x4_t _scale = vdupq_n_f32(scale);
float32x4_t _shift = vdupq_n_f32(shift);
if (Aligned(dst))
{
for (size_t i = 0; i < size; i += 8)
{
uint32x4_t s0 = vdupq_n_u32(*(uint32_t*)(src + 0));
uint32x4_t u0 = vandq_u32(vshlq_u32(s0, C4_SHL0), C4_AND);
uint32x4_t u1 = vandq_u32(vshlq_u32(s0, C4_SHL1), C4_AND);
float32x4_t d0 = vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u0));
float32x4_t d1 = vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u1));
Store<true>(dst + 0, d0);
Store<true>(dst + 4, d1);
src += 4;
dst += 8;
}
}
else
{
for (size_t i = 0; i < size; i += 8)
{
uint32x4_t s0 = vdupq_n_u32(*(uint32_t*)(src + 0));
uint32x4_t u0 = vandq_u32(vshlq_u32(s0, C4_SHL0), C4_AND);
Store<false>(dst + 0, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u0)));
uint32x4_t u1 = vandq_u32(vshlq_u32(s0, C4_SHL1), C4_AND);
Store<false>(dst + 4, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u1)));
src += 4;
dst += 8;
}
}
}

static void Decode32f5(const uint8_t* src, float scale, float shift, size_t size, float* dst)
{
assert(size % 8 == 0);
Expand Down Expand Up @@ -122,40 +157,34 @@ namespace Simd
assert(size % 8 == 0);
float32x4_t _scale = vdupq_n_f32(scale);
float32x4_t _shift = vdupq_n_f32(shift);
size_t size8 = AlignLo(size - 1, 8), i = 0;
if (Aligned(dst))
{
for (; i < size8; i += 8)
for (size_t i = 0; i < size; i += 8)
{
uint8x8_t s7 = LoadHalf<false>(src);
uint16x8_t u16 = vshrq_n_u16(vmulq_u16((uint16x8_t)Shuffle(s7, C7_TBL0, C7_TBL1), C7_MULLO), 9);
Store<true>(dst + 0, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<0>(u16))));
Store<true>(dst + 4, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<1>(u16))));
uint32x4_t s0 = vdupq_n_u32(*(uint32_t*)(src + 0));
uint32x4_t u0 = vandq_u32(vshlq_u32(s0, C7_SHL0), C7_AND);
Store<true>(dst + 0, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u0)));
uint32x4_t s1 = vdupq_n_u32(*(uint32_t*)(src + 3));
uint32x4_t u1 = vandq_u32(vshlq_u32(s1, C7_SHL1), C7_AND);
Store<true>(dst + 4, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u1)));
src += 7;
dst += 8;
}
}
else
{
for (; i < size8; i += 8)
for (size_t i = 0; i < size; i += 8)
{
uint8x8_t s7 = LoadHalf<false>(src);
uint16x8_t u16 = vshrq_n_u16(vmulq_u16((uint16x8_t)Shuffle(s7, C7_TBL0, C7_TBL1), C7_MULLO), 9);
Store<false>(dst + 0, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<0>(u16))));
Store<false>(dst + 4, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<1>(u16))));
uint32x4_t s0 = vdupq_n_u32(*(uint32_t*)(src + 0));
uint32x4_t u0 = vandq_u32(vshlq_u32(s0, C7_SHL0), C7_AND);
Store<false>(dst + 0, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u0)));
uint32x4_t s1 = vdupq_n_u32(*(uint32_t*)(src + 3));
uint32x4_t u1 = vandq_u32(vshlq_u32(s1, C7_SHL1), C7_AND);
Store<false>(dst + 4, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(u1)));
src += 7;
dst += 8;
}
}
for (; i < size; i += 8)
{
uint8x8_t s7 = LoadLast8<7>(src);
uint16x8_t u16 = vshrq_n_u16(vmulq_u16((uint16x8_t)Shuffle(s7, C7_TBL0, C7_TBL1), C7_MULLO), 9);
Store<false>(dst + 0, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<0>(u16))));
Store<false>(dst + 4, vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<1>(u16))));
src += 7;
dst += 8;
}
}

static void Decode32f8(const uint8_t* src, float scale, float shift, size_t size, float* dst)
Expand Down Expand Up @@ -194,7 +223,7 @@ namespace Simd
{
switch (depth)
{
//case 4: return Decode32f4;
case 4: return Decode32f4;
case 5: return Decode32f5;
case 6: return Decode32f6;
case 7: return Decode32f7;
Expand Down

0 comments on commit 6626a70

Please sign in to comment.