Skip to content

Commit

Permalink
+add SSE4.1 optimizations of function BgrToYuv420pV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 14, 2023
1 parent 70a6eb4 commit ab58297
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 12 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ <h3 id="R130">October X, 2023 (version X.X.130)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation of function BgrToYuv420pV2.</li>
<li>Base implementation, SSE4.1 optimizations of function BgrToYuv420pV2.</li>
</ul>
<ul>
<li>Error in AVX-512BW optimizations of function SynetSoftmaxLayerForward.</li>
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdBaseBgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ namespace Simd
uint8_t* y1 = y0 + yStride;

y0[0] = BgrToY<YuvType>(bgr0[0], bgr0[1], bgr0[2]);
y0[1] = BgrToY<YuvType>(bgr0[2], bgr0[4], bgr0[5]);
y0[1] = BgrToY<YuvType>(bgr0[3], bgr0[4], bgr0[5]);
y1[0] = BgrToY<YuvType>(bgr1[0], bgr1[1], bgr1[2]);
y1[1] = BgrToY<YuvType>(bgr1[3], bgr1[4], bgr1[5]);

Expand Down
10 changes: 5 additions & 5 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1606,11 +1606,11 @@ SIMD_API void SimdBgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t wi
// Avx2::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
//#ifdef SIMD_SSE41_ENABLE
// if (Sse41::Enable && width >= Sse41::DA)
// Sse41::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
#ifdef SIMD_SSE41_ENABLE
if (Sse41::Enable && width >= Sse41::DA)
Sse41::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
else
#endif
//#ifdef SIMD_NEON_ENABLE
// if (Neon::Enable && width >= Neon::DA)
// Neon::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdSse41.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ namespace Simd

void BgrToYuv420p(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);

// SSE4.1 variant of the BGR -> planar YUV 4:2:0 conversion ("V2" interface).
// Unlike BgrToYuv420p declared above, bgrStride is passed before width/height,
// and the YUV standard is selected at run time through yuvType.
void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType);

void BgrToYuv422p(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);

void BgrToYuv444p(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);
Expand Down
74 changes: 74 additions & 0 deletions src/Simd/SimdSse41BgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include "Simd/SimdMemory.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdConversion.h"
#include "Simd/SimdYuvToBgr.h"
#include "Simd/SimdBase.h"

namespace Simd
{
Expand Down Expand Up @@ -228,6 +230,78 @@ namespace Simd
else
BgrToYuv444p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
}

//-------------------------------------------------------------------------------------------------

// Converts one tile of two adjacent image rows from interleaved BGR to planar YUV 4:2:0.
//
// T        - conversion traits type forwarded to BgrToY8 / BgrToU16 / BgrToV16.
// bgr0     - first BGR row of the tile; the second row is at bgr0 + bgrStride.
// y0       - first luma row of the tile; the second row is at y0 + yStride.
// u, v     - chroma outputs; one unaligned 16-byte store is issued to each.
//
// Per row, two unaligned 16-byte luma stores are issued (32 luma bytes per row).
template <class T> SIMD_INLINE void BgrToYuv420pV2(const uint8_t* bgr0, size_t bgrStride, uint8_t* y0, size_t yStride, uint8_t* u, uint8_t* v)
{
    __m128i* const srcTop = (__m128i*)bgr0;
    __m128i* const srcBottom = (__m128i*)(bgr0 + bgrStride);
    __m128i* const lumaTop = (__m128i*)y0;
    __m128i* const lumaBottom = (__m128i*)(y0 + yStride);

    // Naming: Top/Bot = row of the tile, Lo/Hi = first/second half of the tile width.
    __m128i bTopLo, gTopLo, rTopLo;
    __m128i bTopHi, gTopHi, rTopHi;
    __m128i bBotLo, gBotLo, rBotLo;
    __m128i bBotHi, gBotHi, rBotHi;

    // Each half starts 3 vectors after the previous one
    // (presumably LoadBgr consumes 3 x 16 bytes = 16 BGR pixels -- confirm against LoadBgr).
    LoadBgr<false>(srcTop + 0, bTopLo, gTopLo, rTopLo);
    _mm_storeu_si128(lumaTop + 0, BgrToY8<T>(bTopLo, gTopLo, rTopLo));

    LoadBgr<false>(srcTop + 3, bTopHi, gTopHi, rTopHi);
    _mm_storeu_si128(lumaTop + 1, BgrToY8<T>(bTopHi, gTopHi, rTopHi));

    LoadBgr<false>(srcBottom + 0, bBotLo, gBotLo, rBotLo);
    _mm_storeu_si128(lumaBottom + 0, BgrToY8<T>(bBotLo, gBotLo, rBotLo));

    LoadBgr<false>(srcBottom + 3, bBotHi, gBotHi, rBotHi);
    _mm_storeu_si128(lumaBottom + 1, BgrToY8<T>(bBotHi, gBotHi, rBotHi));

    // Combine the two rows channel by channel for chroma subsampling
    // (presumably Average16 yields 2x2 averages in 16-bit lanes -- confirm against Average16).
    const __m128i bAvgLo = Average16(bTopLo, bBotLo);
    const __m128i bAvgHi = Average16(bTopHi, bBotHi);
    const __m128i gAvgLo = Average16(gTopLo, gBotLo);
    const __m128i gAvgHi = Average16(gTopHi, gBotHi);
    const __m128i rAvgLo = Average16(rTopLo, rBotLo);
    const __m128i rAvgHi = Average16(rTopHi, rBotHi);

    // Pack the two 16-bit halves down to bytes (unsigned saturation) and write the chroma.
    const __m128i uLo = BgrToU16<T>(bAvgLo, gAvgLo, rAvgLo);
    const __m128i uHi = BgrToU16<T>(bAvgHi, gAvgHi, rAvgHi);
    _mm_storeu_si128((__m128i*)u, _mm_packus_epi16(uLo, uHi));

    const __m128i vLo = BgrToV16<T>(bAvgLo, gAvgLo, rAvgLo);
    const __m128i vHi = BgrToV16<T>(bAvgHi, gAvgHi, rAvgHi);
    _mm_storeu_si128((__m128i*)v, _mm_packus_epi16(vLo, vHi));
}

// Converts a whole BGR image to planar YUV 4:2:0 by walking it in tiles of
// DA columns by 2 rows and delegating each tile to the inline kernel above.
//
// Preconditions (checked by assert only): width and height are even,
// width >= DA and height >= 2.
//
// When width is not a multiple of DA, the last DA columns of every row pair
// are converted once more so that the right edge is covered; this overlaps
// columns that the main loop has already written.
template <class T> void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride)
{
    assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2));

    const size_t alignedWidth = AlignLo(width, DA);
    const size_t fullBlocks = alignedWidth / DA;
    const bool needTail = (width != alignedWidth);

    // Offsets of the overlapping tail tile, measured in luma columns.
    const size_t tailLuma = width - DA;
    const size_t tailChroma = tailLuma / 2;
    const size_t tailBgr = tailLuma * 3;

    for (size_t row = 0; row < height;
        row += 2, y += 2 * yStride, u += uStride, v += vStride, bgr += 2 * bgrStride)
    {
        // Per block: DA luma columns, A chroma columns, A * 6 BGR bytes.
        for (size_t block = 0; block < fullBlocks; ++block)
        {
            const size_t offsLuma = block * DA;
            const size_t offsChroma = block * A;
            const size_t offsBgr = block * (A * 6);
            BgrToYuv420pV2<T>(bgr + offsBgr, bgrStride, y + offsLuma, yStride, u + offsChroma, v + offsChroma);
        }
        if (needTail)
            BgrToYuv420pV2<T>(bgr + tailBgr, bgrStride, y + tailLuma, yStride, u + tailChroma, v + tailChroma);
    }
}

// Run-time entry point: picks the template instantiation that matches yuvType
// (BT.601, BT.709, BT.2020 or T-REC-871) and runs the conversion.
// An unrecognised yuvType trips assert(0); with asserts disabled nothing is written.
void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
    // NOTE(review): release builds with MSVC <= 1900 are routed to the Base implementation;
    // presumably a workaround for a compiler problem -- confirm before removing.
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
    Base::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
#else
    if (yuvType == SimdYuvBt601)
    {
        BgrToYuv420pV2<Base::Bt601>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    }
    else if (yuvType == SimdYuvBt709)
    {
        BgrToYuv420pV2<Base::Bt709>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    }
    else if (yuvType == SimdYuvBt2020)
    {
        BgrToYuv420pV2<Base::Bt2020>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    }
    else if (yuvType == SimdYuvTrect871)
    {
        BgrToYuv420pV2<Base::Trect871>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    }
    else
    {
        assert(0);
    }
#endif
}
}
#endif
}
10 changes: 5 additions & 5 deletions src/Test/TestAnyToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,11 +505,11 @@ namespace Test

result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Base::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));

//#ifdef SIMD_SSE41_ENABLE
// if (Simd::Sse41::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Sse41::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
//#endif
//
#ifdef SIMD_SSE41_ENABLE
if (Simd::Sse41::Enable)
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Sse41::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
#endif

//#ifdef SIMD_AVX2_ENABLE
// if (Simd::Avx2::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx2::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
Expand Down

0 comments on commit ab58297

Please sign in to comment.