Skip to content

Commit

Permalink
+add NEON optimizations of function DescrIntDecode16f (Decode16f8).
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 27, 2023
1 parent 6626a70 commit 7b7a0fd
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 6 deletions.
1 change: 1 addition & 0 deletions docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ <h5>New features</h5>
<li>NEON optimizations of function DescrIntEncode32f.</li>
<li>NEON optimizations of function DescrIntEncode16f.</li>
<li>NEON optimizations of function DescrIntDecode32f.</li>
<li>NEON optimizations of function DescrIntDecode16f.</li>
</ul>
<h5>Bug fixing</h5>
<ul>
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdNeonDescrInt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ namespace Simd
_encode16f = GetEncode16f(_depth);

_decode32f = GetDecode32f(_depth);
if (_depth >= 9) _decode16f = GetDecode16f(_depth);
if (_depth >= 8) _decode16f = GetDecode16f(_depth);

//_cosineDistance = GetCosineDistance(_depth);
//_macroCosineDistancesDirect = GetMacroCosineDistancesDirect(_depth);
Expand Down
34 changes: 33 additions & 1 deletion src/Simd/SimdNeonDescrIntDec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,38 @@ namespace Simd

//-------------------------------------------------------------------------------------------------

static void Decode16f8(const uint8_t* src, float scale, float shift, size_t size, uint16_t * dst)
{
assert(size % 8 == 0);
float32x4_t _scale = vdupq_n_f32(scale);
float32x4_t _shift = vdupq_n_f32(shift);
size_t i = 0;
if (Aligned(src) && Aligned(dst))
{
for (; i < size; i += 8)
{
uint16x8_t u16 = vmovl_u8(LoadHalf<true>(src));
Store<true>(dst + 0, (uint16x4_t)vcvt_f16_f32(vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<0>(u16)))));
Store<true>(dst + 4, (uint16x4_t)vcvt_f16_f32(vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<1>(u16)))));
src += 8;
dst += 8;
}
}
else
{
for (; i < size; i += 8)
{
uint16x8_t u16 = vmovl_u8(LoadHalf<false>(src));
Store<false>(dst + 0, (uint16x4_t)vcvt_f16_f32(vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<0>(u16)))));
Store<false>(dst + 4, (uint16x4_t)vcvt_f16_f32(vmlaq_f32(_shift, _scale, vcvtq_f32_u32(UnpackU16<1>(u16)))));
src += 8;
dst += 8;
}
}
}

//-------------------------------------------------------------------------------------------------

Base::DescrInt::Decode32fPtr GetDecode32f(size_t depth)
{
switch (depth)
Expand All @@ -240,7 +272,7 @@ namespace Simd
//case 5: return Decode16f5;
//case 6: return Decode16f6;
//case 7: return Decode16f7;
//case 8: return Decode16f8;
case 8: return Decode16f8;
default: assert(0); return NULL;
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/Test/TestDescrInt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,10 +453,10 @@ namespace Test
result = result && DescrIntDecode16fAutoTest(FUNC_DI(Simd::Avx512bw::DescrIntInit), FUNC_DI(SimdDescrIntInit));
#endif

//#ifdef SIMD_NEON_ENABLE
// if (Simd::Neon::Enable)
// result = result && DescrIntDecode16fAutoTest(FUNC_DI(Simd::Neon::DescrIntInit), FUNC_DI(SimdDescrIntInit));
//#endif
#ifdef SIMD_NEON_ENABLE
if (Simd::Neon::Enable)
result = result && DescrIntDecode16fAutoTest(FUNC_DI(Simd::Neon::DescrIntInit), FUNC_DI(SimdDescrIntInit));
#endif

return result;
}
Expand Down

0 comments on commit 7b7a0fd

Please sign in to comment.