Skip to content

Commit

Permalink
+add AVX2 optimizations of function BgrToYuv422pV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 18, 2023
1 parent 49cf217 commit 657e54c
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 12 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ <h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function BgrToYuv420pV2.</li>
<li>Base implementation, SSE4.1 optimizations of function BgrToYuv422pV2.</li>
<li>Base implementation, SSE4.1, AVX2 optimizations of function BgrToYuv422pV2.</li>
</ul>
<ul>
<li>Error in AVX-512BW optimizations of function SynetSoftmaxLayerForward.</li>
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ namespace Simd

void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);

// AVX2 variant of the BGR -> planar YUV 4:2:2 conversion (V2 API).
// bgr/bgrStride  - source image and the per-row pointer increment applied to it (uint8_t units).
// width, height  - image size in pixels; the implementation asserts that width is even and >= DA.
// y/u/v + strides - destination planes; every row is written, with Y at full width and
//                   U/V at half width (chroma is subsampled horizontally only).
// yuvType        - selects the conversion coefficients; the implementation dispatches on
//                   SimdYuvBt601, SimdYuvBt709, SimdYuvBt2020 and SimdYuvTrect871.
void BgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType);

void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);

void Binarization(const uint8_t * src, size_t srcStride, size_t width, size_t height,
Expand Down
63 changes: 63 additions & 0 deletions src/Simd/SimdAvx2BgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,69 @@ namespace Simd
default:
assert(0);
}
#endif
}

//-------------------------------------------------------------------------------------------------

// Converts one horizontal block of a single row: reads six 256-bit vectors of
// interleaved BGR, writes two vectors of Y and one vector each of U and V.
// T supplies the colour-conversion coefficients used by BgrToY8/BgrToU16/BgrToV16.
// All memory accesses are unaligned.
template <class T> SIMD_INLINE void BgrToYuv422pV2(const uint8_t* bgr, uint8_t* y, uint8_t* u, uint8_t* v)
{
    __m256i* const src = (__m256i*)bgr;
    __m256i* const dstY = (__m256i*)y;

    // First half of the block: split the channels and emit luma right away.
    __m256i b0, g0, r0;
    LoadBgr<false>(src + 0, b0, g0, r0);
    _mm256_storeu_si256(dstY + 0, BgrToY8<T>(b0, g0, r0));

    // Second half of the block (three source vectors further on).
    __m256i b1, g1, r1;
    LoadBgr<false>(src + 3, b1, g1, r1);
    _mm256_storeu_si256(dstY + 1, BgrToY8<T>(b1, g1, r1));

    // Luma is already stored, so the channel registers can now be reduced
    // in place for the horizontally subsampled chroma.
    Average16(b0);
    Average16(g0);
    Average16(r0);
    Average16(b1);
    Average16(g1);
    Average16(r1);

    // Chroma: 16-bit results of both halves are packed into one 8-bit vector per plane.
    __m256i chromaLo = BgrToU16<T>(b0, g0, r0);
    __m256i chromaHi = BgrToU16<T>(b1, g1, r1);
    _mm256_storeu_si256((__m256i*)u, PackI16ToU8(chromaLo, chromaHi));

    chromaLo = BgrToV16<T>(b0, g0, r0);
    chromaHi = BgrToV16<T>(b1, g1, r1);
    _mm256_storeu_si256((__m256i*)v, PackI16ToU8(chromaLo, chromaHi));
}

// Row driver for the block kernel above: walks every image row, converting
// DA luma pixels per kernel call. When width is not a multiple of DA, the last
// DA pixels of the row are converted once more from offset (width - DA), which
// overlaps the part already processed instead of needing a scalar tail.
// Requires an even width of at least DA (checked by assert only).
template <class T> void BgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride)
{
    assert((width % 2 == 0) && (width >= DA));

    const size_t alignedWidth = AlignLo(width, DA);
    const bool hasTail = (alignedWidth != width);
    const size_t tailY = width - DA; // only used when hasTail is true

    for (size_t line = 0; line < height; ++line)
    {
        size_t offY = 0, offUV = 0, offBgr = 0;
        while (offY < alignedWidth)
        {
            BgrToYuv422pV2<T>(bgr + offBgr, y + offY, u + offUV, v + offUV);
            offY += DA;
            offUV += A;
            offBgr += A * 6;
        }

        if (hasTail)
            BgrToYuv422pV2<T>(bgr + tailY * 3, y + tailY, u + tailY / 2, v + tailY / 2);

        bgr += bgrStride;
        y += yStride;
        u += uStride;
        v += vStride;
    }
}

// Public AVX2 entry point: picks the templated implementation that matches
// yuvType. Optimised (NDEBUG) x86 builds with MSVC up to _MSC_VER 1900 are routed to the
// Base implementation instead. An unknown yuvType trips assert(0) and
// otherwise does nothing.
void BgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
    Base::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
#else
    if (yuvType == SimdYuvBt601)
        BgrToYuv422pV2<Base::Bt601>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    else if (yuvType == SimdYuvBt709)
        BgrToYuv422pV2<Base::Bt709>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    else if (yuvType == SimdYuvBt2020)
        BgrToYuv422pV2<Base::Bt2020>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    else if (yuvType == SimdYuvTrect871)
        BgrToYuv422pV2<Base::Trect871>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
    else
        assert(0);
#endif
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1659,11 +1659,11 @@ SIMD_API void SimdBgrToYuv422pV2(const uint8_t* bgr, size_t bgrStride, size_t wi
// Avx512bw::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
//#ifdef SIMD_AVX2_ENABLE
// if (Avx2::Enable && width >= Avx2::DA)
// Avx2::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::DA)
Avx2::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
else
#endif
#ifdef SIMD_SSE41_ENABLE
if (Sse41::Enable && width >= Sse41::DA)
Sse41::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdSse41BgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ namespace Simd
uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
Base::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
Base::BgrToYuv422pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
#else
switch (yuvType)
{
Expand Down
10 changes: 5 additions & 5 deletions src/Test/TestAnyToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,11 +539,11 @@ namespace Test
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Sse41::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
#endif

//#ifdef SIMD_AVX2_ENABLE
// if (Simd::Avx2::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Avx2::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
//#endif
//
#ifdef SIMD_AVX2_ENABLE
if (Simd::Avx2::Enable)
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Avx2::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 1, FUNC_YUV2(Simd::Avx512bw::BgrToYuv422pV2), FUNC_YUV2(SimdBgrToYuv422pV2));
Expand Down

0 comments on commit 657e54c

Please sign in to comment.