Skip to content

Commit

Permalink
+add AVX-512BW optimizations of function BgrToYuv420pV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 18, 2023
1 parent 52bd93b commit fa7b55b
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 31 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ <h3 id="R130">October X, 2023 (version X.X.130)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation, SSE4.1, AVX2 optimizations of function BgrToYuv420pV2.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function BgrToYuv420pV2.</li>
</ul>
<ul>
<li>Error in AVX-512BW optimizations of function SynetSoftmaxLayerForward.</li>
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx512bw.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ namespace Simd

void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);

// AVX-512BW variant of BgrToYuv420pV2. yuvType selects the conversion standard
// (SimdYuvBt601, SimdYuvBt709, SimdYuvBt2020 or SimdYuvTrect871).
// Note: unlike BgrToYuv420p, bgrStride is passed before width and height.
void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType);

void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);

void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
Expand Down
91 changes: 89 additions & 2 deletions src/Simd/SimdAvx512bwBgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "Simd/SimdMemory.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdConversion.h"
#include "Simd/SimdYuvToBgr.h"

namespace Simd
{
Expand Down Expand Up @@ -138,7 +139,7 @@ namespace Simd
BgrToYuv420p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
}

//-----------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------

SIMD_INLINE void Average16(__m512i a[2][2])
{
Expand Down Expand Up @@ -204,7 +205,7 @@ namespace Simd
BgrToYuv422p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
}

//-----------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------

SIMD_INLINE __m512i ConvertY16(__m512i b16_r16[2], __m512i g16_1[2])
{
Expand Down Expand Up @@ -262,6 +263,92 @@ namespace Simd
else
BgrToYuv444p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
}

//-------------------------------------------------------------------------------------------------

// Loads prepared BGR data from byte offsets 0 and 48 of bgr (tails[0] and tails[1] are
// forwarded to the respective load), converts both parts with BgrToY16<T> and returns the result.
// The Hadd32-combined prepared components of the two parts are written to b16_r16 and g16_1
// so that the caller can reuse them later.
template <class T, bool mask> SIMD_INLINE __m512i LoadAndConvertBgrToY16V2(const uint8_t* bgr, __m512i& b16_r16, __m512i& g16_1, const __mmask64* tails)
{
    __m512i blueRed[2], greenOne[2];
    LoadPreparedBgr16<false, mask>(bgr, blueRed[0], greenOne[0], tails);
    LoadPreparedBgr16<false, mask>(bgr + 48, blueRed[1], greenOne[1], tails + 1);

    // Horizontal sums of neighbouring parts are handed back to the caller.
    b16_r16 = Hadd32(blueRed[0], blueRed[1]);
    g16_1 = Hadd32(greenOne[0], greenOne[1]);

    return BgrToY16<T>(blueRed, greenOne);
}

// Runs LoadAndConvertBgrToY16V2 on two consecutive 96-byte spans of bgr (using tails[0..1]
// and tails[2..3]) and packs both results with Permuted2Pack16iTo8u.
// b16_r16[i] / g16_1[i] receive the combined components produced for span i.
template <class T, bool mask> SIMD_INLINE __m512i LoadAndConvertBgrToY8V2(const uint8_t* bgr, __m512i b16_r16[2], __m512i g16_1[2], const __mmask64* tails)
{
    const __m512i y16Lo = LoadAndConvertBgrToY16V2<T, mask>(bgr, b16_r16[0], g16_1[0], tails);
    const __m512i y16Hi = LoadAndConvertBgrToY16V2<T, mask>(bgr + 2 * 48, b16_r16[1], g16_1[1], tails + 2);
    return Permuted2Pack16iTo8u(y16Lo, y16Hi);
}

// Converts one step of two adjacent image rows: writes 2 * A bytes of Y for each of the two rows
// and A bytes for each of U and V (fewer when mask is true; the tails masks then limit
// loads and stores).
// tails[0..7] are used by the BGR loads, tails[8..9] by the Y stores, tails[10] by the U/V stores.
template <class T, bool mask> SIMD_INLINE void BgrToYuv420pV2(const uint8_t* bgr0, size_t bgrStride, uint8_t* y0, size_t yStride, uint8_t* u, uint8_t* v, const __mmask64* tails)
{
    const uint8_t* const bgr1 = bgr0 + bgrStride;
    uint8_t* const y1 = y0 + yStride;

    // Indexed as [row][half of the step][part within the half].
    __m512i blueRed[2][2][2], greenOne[2][2][2];

    // Luma of the upper row.
    const __m512i y00 = LoadAndConvertBgrToY8V2<T, mask>(bgr0, blueRed[0][0], greenOne[0][0], tails);
    Store<false, mask>(y0, y00, tails[8]);
    const __m512i y01 = LoadAndConvertBgrToY8V2<T, mask>(bgr0 + 3 * A, blueRed[0][1], greenOne[0][1], tails + 4);
    Store<false, mask>(y0 + A, y01, tails[9]);

    // Luma of the lower row.
    const __m512i y10 = LoadAndConvertBgrToY8V2<T, mask>(bgr1, blueRed[1][0], greenOne[1][0], tails);
    Store<false, mask>(y1, y10, tails[8]);
    const __m512i y11 = LoadAndConvertBgrToY8V2<T, mask>(bgr1 + 3 * A, blueRed[1][1], greenOne[1][1], tails + 4);
    Store<false, mask>(y1 + A, y11, tails[9]);

    // Combine the two rows; the results are read from the row-0 slots below.
    Average16(blueRed[0][0][0], blueRed[1][0][0]);
    Average16(blueRed[0][0][1], blueRed[1][0][1]);
    Average16(blueRed[0][1][0], blueRed[1][1][0]);
    Average16(blueRed[0][1][1], blueRed[1][1][1]);

    Average16(greenOne[0][0][0], greenOne[1][0][0]);
    Average16(greenOne[0][0][1], greenOne[1][0][1]);
    Average16(greenOne[0][1][0], greenOne[1][1][0]);
    Average16(greenOne[0][1][1], greenOne[1][1][1]);

    // Chroma from the combined components.
    const __m512i u16Lo = BgrToU16<T>(blueRed[0][0], greenOne[0][0]);
    const __m512i u16Hi = BgrToU16<T>(blueRed[0][1], greenOne[0][1]);
    Store<false, mask>(u, Permuted2Pack16iTo8u(u16Lo, u16Hi), tails[10]);

    const __m512i v16Lo = BgrToV16<T>(blueRed[0][0], greenOne[0][0]);
    const __m512i v16Hi = BgrToV16<T>(blueRed[0][1], greenOne[0][1]);
    Store<false, mask>(v, Permuted2Pack16iTo8u(v16Lo, v16Hi), tails[10]);
}

// Converts a BGR image to three planes: a full-resolution Y plane and U/V planes with one
// sample per 2x2 pixel block. T supplies the conversion constants (e.g. Base::Bt601).
//   bgr, bgrStride     - source image and its row stride in bytes.
//   width, height      - image size in pixels; both are expected to be even (asserted).
//   y/u/v, *Stride     - destination planes and their row strides in bytes.
// Images narrower than two pixels are ignored.
template <class T> void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride)
{
    assert((width % 2 == 0) && (height % 2 == 0));

    // From here on width counts chroma columns (pixel pairs).
    width /= 2;
    // Without this guard 'width - 1' below wraps around for an empty row, widthA becomes huge
    // and the unmasked loop runs far past the end of the buffers.
    if (width == 0)
        return;

    // widthA is derived from width - 1, so the last 1..A chroma columns always take the masked path.
    size_t widthA = AlignLo(width - 1, A);
    size_t tail = width - widthA;

    __mmask64 tails[11];
    // Masks for the eight BGR loads of the tail: 6 source bytes per chroma column, 48 bytes per load.
    for (size_t i = 0; i < 8; ++i)
        tails[i] = TailMask64(tail * 6 - 48 * i) & 0x0000FFFFFFFFFFFF;
    // Masks for the two Y stores of the tail: 2 luma bytes per chroma column.
    for (size_t i = 0; i < 2; ++i)
        tails[8 + i] = TailMask64(tail * 2 - A * i);
    // Mask for the U and V stores of the tail.
    tails[10] = TailMask64(tail);

    // Each iteration consumes two source rows and produces two Y rows and one U/V row.
    for (size_t row = 0; row < height; row += 2)
    {
        size_t col = 0;
        for (; col < widthA; col += A)
            BgrToYuv420pV2<T, false>(bgr + col * 6, bgrStride, y + col * 2, yStride, u + col, v + col, tails);
        if (tail)
            BgrToYuv420pV2<T, true>(bgr + col * 6, bgrStride, y + col * 2, yStride, u + col, v + col, tails);
        y += 2 * yStride;
        u += uStride;
        v += vStride;
        bgr += 2 * bgrStride;
    }
}

// Entry point of the AVX-512BW implementation: picks the template instantiation that
// matches yuvType. An unsupported yuvType triggers assert(0) and converts nothing.
void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
    switch (yuvType)
    {
    case SimdYuvBt601:
        BgrToYuv420pV2<Base::Bt601>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    case SimdYuvBt709:
        BgrToYuv420pV2<Base::Bt709>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    case SimdYuvBt2020:
        BgrToYuv420pV2<Base::Bt2020>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    case SimdYuvTrect871:
        BgrToYuv420pV2<Base::Trect871>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    default:
        assert(0);
        return;
    }
}
}
#endif// SIMD_AVX512BW_ENABLE
}
18 changes: 0 additions & 18 deletions src/Simd/SimdAvx512bwBgraToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,24 +353,6 @@ namespace Simd

//-------------------------------------------------------------------------------------------------

// Applies BgrToY32<T> to both halves of the input, packs the two results with PackI32ToI16,
// adds T::Y_LO to every 16-bit lane and passes the sum through SaturateI16ToU8.
template <class T> SIMD_INLINE __m512i BgrToY16(__m512i b16_r16[2], __m512i g16_1[2])
{
    static const __m512i Y_LO = SIMD_MM512_SET1_EPI16(T::Y_LO);
    const __m512i y32Lo = BgrToY32<T>(b16_r16[0], g16_1[0]);
    const __m512i y32Hi = BgrToY32<T>(b16_r16[1], g16_1[1]);
    const __m512i y16 = PackI32ToI16(y32Lo, y32Hi);
    return SaturateI16ToU8(_mm512_add_epi16(Y_LO, y16));
}

// Applies BgrToU32<T> to both halves of the input, packs the two results with PackI32ToI16,
// adds T::UV_Z to every 16-bit lane and passes the sum through SaturateI16ToU8.
template <class T> SIMD_INLINE __m512i BgrToU16(__m512i b16_r16[2], __m512i g16_1[2])
{
    static const __m512i UV_Z = SIMD_MM512_SET1_EPI16(T::UV_Z);
    const __m512i u32Lo = BgrToU32<T>(b16_r16[0], g16_1[0]);
    const __m512i u32Hi = BgrToU32<T>(b16_r16[1], g16_1[1]);
    const __m512i u16 = PackI32ToI16(u32Lo, u32Hi);
    return SaturateI16ToU8(_mm512_add_epi16(UV_Z, u16));
}

// Applies BgrToV32<T> to both halves of the input, packs the two results with PackI32ToI16,
// adds T::UV_Z to every 16-bit lane and passes the sum through SaturateI16ToU8.
template <class T> SIMD_INLINE __m512i BgrToV16(__m512i b16_r16[2], __m512i g16_1[2])
{
    static const __m512i UV_Z = SIMD_MM512_SET1_EPI16(T::UV_Z);
    const __m512i v32Lo = BgrToV32<T>(b16_r16[0], g16_1[0]);
    const __m512i v32Hi = BgrToV32<T>(b16_r16[1], g16_1[1]);
    const __m512i v16 = PackI32ToI16(v32Lo, v32Hi);
    return SaturateI16ToU8(_mm512_add_epi16(UV_Z, v16));
}

template <class T, bool tail> SIMD_INLINE void BgraToYuv444pV2(const uint8_t* bgra, uint8_t* y, uint8_t* u, uint8_t* v, const __mmask64* tails)
{
__m512i _b16_r16[2][2], _g16_1[2][2];
Expand Down
10 changes: 5 additions & 5 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1596,11 +1596,11 @@ SIMD_API void SimdBgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t wi
uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
SIMD_EMPTY();
//#ifdef SIMD_AVX512BW_ENABLE
// if (Avx512bw::Enable)
// Avx512bw::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Avx512bw::Enable)
Avx512bw::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
else
#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::DA)
Avx2::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
Expand Down
21 changes: 21 additions & 0 deletions src/Simd/SimdYuvToBgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,13 @@ namespace Simd
BgrToY32<T>(UnpackU16<1>(b16, r16), UnpackU16<1>(g16, K16_0001)))));
}

// Applies BgrToY32<T> to both halves of the input, packs the two results with
// _mm512_packs_epi32 (no lane permutation is applied here), adds T::Y_LO to every
// 16-bit lane and passes the sum through SaturateI16ToU8.
template<class T> SIMD_INLINE __m512i BgrToY16(__m512i b16_r16[2], __m512i g16_1[2])
{
    static const __m512i Y_LO = SIMD_MM512_SET1_EPI16(T::Y_LO);
    const __m512i y32Lo = BgrToY32<T>(b16_r16[0], g16_1[0]);
    const __m512i y32Hi = BgrToY32<T>(b16_r16[1], g16_1[1]);
    const __m512i y16 = _mm512_packs_epi32(y32Lo, y32Hi);
    return SaturateI16ToU8(_mm512_add_epi16(Y_LO, y16));
}

template<class T> SIMD_INLINE __m512i BgrToY8(__m512i b8, __m512i g8, __m512i r8)
{
return _mm512_packus_epi16(
Expand All @@ -702,6 +709,13 @@ namespace Simd
BgrToU32<T>(UnpackU16<1>(b16, r16), UnpackU16<1>(g16, K16_0001)))));
}

// Applies BgrToU32<T> to both halves of the input, packs the two results with
// _mm512_packs_epi32 (no lane permutation is applied here), adds T::UV_Z to every
// 16-bit lane and passes the sum through SaturateI16ToU8.
template<class T> SIMD_INLINE __m512i BgrToU16(__m512i b16_r16[2], __m512i g16_1[2])
{
    static const __m512i UV_Z = SIMD_MM512_SET1_EPI16(T::UV_Z);
    const __m512i u32Lo = BgrToU32<T>(b16_r16[0], g16_1[0]);
    const __m512i u32Hi = BgrToU32<T>(b16_r16[1], g16_1[1]);
    const __m512i u16 = _mm512_packs_epi32(u32Lo, u32Hi);
    return SaturateI16ToU8(_mm512_add_epi16(UV_Z, u16));
}

template<class T> SIMD_INLINE __m512i BgrToU8(__m512i b8, __m512i g8, __m512i r8)
{
return _mm512_packus_epi16(
Expand All @@ -724,6 +738,13 @@ namespace Simd
BgrToV32<T>(UnpackU16<1>(b16, r16), UnpackU16<1>(g16, K16_0001)))));
}

// Applies BgrToV32<T> to both halves of the input, packs the two results with
// _mm512_packs_epi32 (no lane permutation is applied here), adds T::UV_Z to every
// 16-bit lane and passes the sum through SaturateI16ToU8.
template<class T> SIMD_INLINE __m512i BgrToV16(__m512i b16_r16[2], __m512i g16_1[2])
{
    static const __m512i UV_Z = SIMD_MM512_SET1_EPI16(T::UV_Z);
    const __m512i v32Lo = BgrToV32<T>(b16_r16[0], g16_1[0]);
    const __m512i v32Hi = BgrToV32<T>(b16_r16[1], g16_1[1]);
    const __m512i v16 = _mm512_packs_epi32(v32Lo, v32Hi);
    return SaturateI16ToU8(_mm512_add_epi16(UV_Z, v16));
}

template<class T> SIMD_INLINE __m512i BgrToV8(__m512i b8, __m512i g8, __m512i r8)
{
return _mm512_packus_epi16(
Expand Down
10 changes: 5 additions & 5 deletions src/Test/TestAnyToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -515,11 +515,11 @@ namespace Test
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx2::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx512bw::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
//#endif
//
#ifdef SIMD_AVX512BW_ENABLE
if (Simd::Avx512bw::Enable)
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx512bw::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
#endif

//#ifdef SIMD_NEON_ENABLE
// if (Simd::Neon::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Neon::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
Expand Down

0 comments on commit fa7b55b

Please sign in to comment.