Skip to content

Commit

Permalink
+add SSE4.1 optimizations of function BgrToYuv422pV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 18, 2023
1 parent 2062da4 commit 49cf217
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 21 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ <h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function BgrToYuv420pV2.</li>
<li>Base implementation of function BgrToYuv422pV2.</li>
<li>Base implementation, SSE4.1 optimizations of function BgrToYuv422pV2.</li>
</ul>
<ul>
<li>Error in AVX-512BW optimizations of function SynetSoftmaxLayerForward.</li>
Expand Down
22 changes: 12 additions & 10 deletions src/Simd/SimdAvx512bwBgraToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,9 @@ namespace Simd
LoadPreparedBgra16<false, tail>(bgra + 2 * A, _b16_r16[1][0], _g16_1[1][0], tails + 2);
LoadPreparedBgra16<false, tail>(bgra + 3 * A, _b16_r16[1][1], _g16_1[1][1], tails + 3);

Store<false, tail>(y, PackI16ToU8(BgrToY16<T>(_b16_r16[0], _g16_1[0]), BgrToY16<T>(_b16_r16[1], _g16_1[1])), tails[4]);
Store<false, tail>(u, PackI16ToU8(BgrToU16<T>(_b16_r16[0], _g16_1[0]), BgrToU16<T>(_b16_r16[1], _g16_1[1])), tails[4]);
Store<false, tail>(v, PackI16ToU8(BgrToV16<T>(_b16_r16[0], _g16_1[0]), BgrToV16<T>(_b16_r16[1], _g16_1[1])), tails[4]);
Store<false, tail>(y, Permuted2Pack16iTo8u(BgrToY16<T>(_b16_r16[0], _g16_1[0]), BgrToY16<T>(_b16_r16[1], _g16_1[1])), tails[4]);
Store<false, tail>(u, Permuted2Pack16iTo8u(BgrToU16<T>(_b16_r16[0], _g16_1[0]), BgrToU16<T>(_b16_r16[1], _g16_1[1])), tails[4]);
Store<false, tail>(v, Permuted2Pack16iTo8u(BgrToV16<T>(_b16_r16[0], _g16_1[0]), BgrToV16<T>(_b16_r16[1], _g16_1[1])), tails[4]);
}

template <class T> void BgraToYuv444pV2(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
Expand Down Expand Up @@ -420,7 +420,9 @@ namespace Simd

// Produces one vector of 8-bit Y values from two BGRA spans:
// LoadAndBgrToY16 is invoked at bgra + 0 * A (with tails + 0) and at bgra + 2 * A (with tails + 2),
// and the two 16-bit results are packed together into the returned vector.
// b16_r16[i] / g16_1[i] are handed to LoadAndBgrToY16 for span i
// (presumably filled there with intermediate channel data for later U/V computation - TODO confirm).
// NOTE(review): as shown, the first return statement makes the three statements after it unreachable.
// This looks like the "before" (PackI16ToU8) and "after" (Permuted2Pack16iTo8u) lines of a diff hunk
// displayed without +/- markers - confirm which variant is actually present in the file.
template <class T, bool tail> SIMD_INLINE __m512i LoadAndBgrToY8(const uint8_t* bgra, __m512i b16_r16[2], __m512i g16_1[2], const __mmask64* tails)
{
return PackI16ToU8(LoadAndBgrToY16<T, tail>(bgra + 0 * A, b16_r16[0], g16_1[0], tails + 0), LoadAndBgrToY16<T, tail>(bgra + 2 * A, b16_r16[1], g16_1[1], tails + 2));
// First span: 16-bit Y values for the data at bgra + 0 * A.
__m512i lo = LoadAndBgrToY16<T, tail>(bgra + 0 * A, b16_r16[0], g16_1[0], tails + 0);
// Second span: 16-bit Y values for the data at bgra + 2 * A.
__m512i hi = LoadAndBgrToY16<T, tail>(bgra + 2 * A, b16_r16[1], g16_1[1], tails + 2);
return Permuted2Pack16iTo8u(lo, hi);
}

template <class T, bool tail> SIMD_INLINE void BgraToYuv422pV2(const uint8_t* bgra, uint8_t* y, uint8_t* u, uint8_t* v, const __mmask64* tails)
Expand All @@ -432,8 +434,8 @@ namespace Simd
Average16(_b16_r16);
Average16(_g16_1);

Store<false, tail>(u, PackI16ToU8(BgrToU16<T>(_b16_r16[0], _g16_1[0]), BgrToU16<T>(_b16_r16[1], _g16_1[1])), tails[10]);
Store<false, tail>(v, PackI16ToU8(BgrToV16<T>(_b16_r16[0], _g16_1[0]), BgrToV16<T>(_b16_r16[1], _g16_1[1])), tails[10]);
Store<false, tail>(u, Permuted2Pack16iTo8u(BgrToU16<T>(_b16_r16[0], _g16_1[0]), BgrToU16<T>(_b16_r16[1], _g16_1[1])), tails[10]);
Store<false, tail>(v, Permuted2Pack16iTo8u(BgrToV16<T>(_b16_r16[0], _g16_1[0]), BgrToV16<T>(_b16_r16[1], _g16_1[1])), tails[10]);
}

template <class T> void BgraToYuv422pV2(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* y, size_t yStride,
Expand Down Expand Up @@ -500,8 +502,8 @@ namespace Simd
Average16(_g16_1[0][1][0], _g16_1[1][1][0]);
Average16(_g16_1[0][1][1], _g16_1[1][1][1]);

Store<false, tail>(u, PackI16ToU8(BgrToU16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToU16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
Store<false, tail>(v, PackI16ToU8(BgrToV16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToV16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
Store<false, tail>(u, Permuted2Pack16iTo8u(BgrToU16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToU16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
Store<false, tail>(v, Permuted2Pack16iTo8u(BgrToV16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToV16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
}

template <class T> void BgraToYuv420pV2(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* y, size_t yStride,
Expand Down Expand Up @@ -590,8 +592,8 @@ namespace Simd
Average16(_g16_1[0][1][0], _g16_1[1][1][0]);
Average16(_g16_1[0][1][1], _g16_1[1][1][1]);

Store<false, mask>(u, PackI16ToU8(BgrToU16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToU16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
Store<false, mask>(v, PackI16ToU8(BgrToV16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToV16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
Store<false, mask>(u, Permuted2Pack16iTo8u(BgrToU16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToU16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
Store<false, mask>(v, Permuted2Pack16iTo8u(BgrToV16<T>(_b16_r16[0][0], _g16_1[0][0]), BgrToV16<T>(_b16_r16[0][1], _g16_1[0][1])), tails[10]);
}

template <class T> void BgraToYuva420pV2(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* y, size_t yStride,
Expand Down
10 changes: 5 additions & 5 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1664,11 +1664,11 @@ SIMD_API void SimdBgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t wi
// Avx2::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
//#ifdef SIMD_SSE41_ENABLE
// if (Sse41::Enable && width >= Sse41::DA)
// Sse41::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
#ifdef SIMD_SSE41_ENABLE
if (Sse41::Enable && width >= Sse41::DA)
Sse41::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
else
#endif
//#ifdef SIMD_NEON_ENABLE
// if (Neon::Enable && width >= Neon::DA)
// Neon::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdSse41.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ namespace Simd

void BgrToYuv422p(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);

void BgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType);

void BgrToYuv444p(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);

void Binarization(const uint8_t* src, size_t srcStride, size_t width, size_t height,
Expand Down
63 changes: 63 additions & 0 deletions src/Simd/SimdSse41BgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,69 @@ namespace Simd
default:
assert(0);
}
#endif
}

//-------------------------------------------------------------------------------------------------

// Converts one block of packed BGR pixels to planar Y, U and V.
// Two unaligned 128-bit stores go to y (at vector offsets 0 and 1) and one each to u and v.
// The second BGR group is read 3 vectors after the first.
// T selects the conversion coefficients used by BgrToY8 / BgrToU16 / BgrToV16.
template <class T> SIMD_INLINE void BgrToYuv422pV2(const uint8_t* bgr, uint8_t* y, uint8_t* u, uint8_t* v)
{
    __m128i b[2], g[2], r[2];
    __m128i* const src = (__m128i*)bgr;
    __m128i* const dstY = (__m128i*)y;

    // Luma is computed at full resolution, straight from the loaded channels.
    LoadBgr<false>(src + 0, b[0], g[0], r[0]);
    const __m128i y0 = BgrToY8<T>(b[0], g[0], r[0]);
    _mm_storeu_si128(dstY + 0, y0);

    LoadBgr<false>(src + 3, b[1], g[1], r[1]);
    const __m128i y1 = BgrToY8<T>(b[1], g[1], r[1]);
    _mm_storeu_si128(dstY + 1, y1);

    // Chroma is computed from channels that Average16 has reduced in place
    // (presumably averaging horizontally adjacent pixels into 16-bit lanes - confirm against Average16).
    Average16(b[0]);
    Average16(b[1]);
    Average16(g[0]);
    Average16(g[1]);
    Average16(r[0]);
    Average16(r[1]);

    const __m128i u8 = _mm_packus_epi16(BgrToU16<T>(b[0], g[0], r[0]), BgrToU16<T>(b[1], g[1], r[1]));
    _mm_storeu_si128((__m128i*)u, u8);

    const __m128i v8 = _mm_packus_epi16(BgrToV16<T>(b[0], g[0], r[0]), BgrToV16<T>(b[1], g[1], r[1]));
    _mm_storeu_si128((__m128i*)v, v8);
}

// Runs the per-block kernel over a whole image, one row at a time.
// Each row is covered in steps of DA luma samples; if width is not a multiple of DA,
// one extra kernel call is made on the last DA samples of the row (overlapping the previous step).
// The u and v pointers advance by their strides on every row, like y.
// Requires an even width of at least DA (asserted).
template <class T> void BgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride)
{
    assert((width % 2 == 0) && (width >= DA));

    const size_t bodyWidth = AlignLo(width, DA);
    const bool hasTail = (bodyWidth != width);

    for (size_t row = 0; row < height; ++row, y += yStride, u += uStride, v += vStride, bgr += bgrStride)
    {
        // Separate offsets per plane: Y advances by DA, U/V by A, BGR by A * 6 per kernel call.
        size_t offBgr = 0, offUV = 0;
        for (size_t offY = 0; offY < bodyWidth; offY += DA)
        {
            BgrToYuv422pV2<T>(bgr + offBgr, y + offY, u + offUV, v + offUV);
            offUV += A;
            offBgr += A * 6;
        }
        if (hasTail)
        {
            // Re-process the final DA luma samples so the right edge is fully written.
            const size_t tailY = width - DA;
            BgrToYuv422pV2<T>(bgr + tailY * 3, y + tailY, u + tailY / 2, v + tailY / 2);
        }
    }
}

// SSE4.1 entry point: converts a BGR image to planar Y, U, V with the 422pV2 kernels,
// picking the coefficient set from yuvType.
//   bgr, bgrStride                  - source image and its row stride.
//   width, height                   - image size; the templated implementation asserts an even width >= DA.
//   y/u/v, yStride/uStride/vStride  - destination planes and their row strides.
//   yuvType                         - one of SimdYuvBt601, SimdYuvBt709, SimdYuvBt2020, SimdYuvTrect871;
//                                     any other value trips assert(0) and converts nothing.
void BgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
    // Release builds with MSVC <= 1900 bypass the SSE4.1 kernels and use the base implementation
    // (presumably a compiler workaround - confirm). This must be the 422 base routine:
    // the 420 variant would write a different chroma layout than callers of this function expect.
    Base::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
#else
    switch (yuvType)
    {
    case SimdYuvBt601: BgrToYuv422pV2<Base::Bt601>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride); break;
    case SimdYuvBt709: BgrToYuv422pV2<Base::Bt709>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride); break;
    case SimdYuvBt2020: BgrToYuv422pV2<Base::Bt2020>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride); break;
    case SimdYuvTrect871: BgrToYuv422pV2<Base::Trect871>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride); break;
    default:
        assert(0);
    }
#endif
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/Test/TestAnyToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,11 +534,11 @@ namespace Test

result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Base::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));

//#ifdef SIMD_SSE41_ENABLE
// if (Simd::Sse41::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Sse41::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
//#endif
//
#ifdef SIMD_SSE41_ENABLE
if (Simd::Sse41::Enable)
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Sse41::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
#endif

//#ifdef SIMD_AVX2_ENABLE
// if (Simd::Avx2::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Avx2::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
Expand Down

0 comments on commit 49cf217

Please sign in to comment.