Skip to content

Commit

Permalink
+add AVX2 optimizations of function BgrToYuv420pV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 18, 2023
1 parent ab58297 commit 52bd93b
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 11 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ <h3 id="R130">October X, 2023 (version X.X.130)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation, SSE4.1 optimizations of function BgrToYuv420pV2.</li>
<li>Base implementation, SSE4.1, AVX2 optimizations of function BgrToYuv420pV2.</li>
</ul>
<ul>
<li>Error in AVX-512BW optimizations of function SynetSoftmaxLayerForward.</li>
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ namespace Simd

void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);

// AVX2 conversion of an interleaved BGR image to planar YUV 4:2:0 (separate y, u and v planes).
// Note the argument order: bgrStride precedes width/height here, unlike BgrToYuv420p above.
// yuvType selects the conversion standard (BT.601, BT.709, BT.2020 or T-REC-871).
// The implementation asserts that width and height are even, width >= DA and height >= 2.
void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
uint8_t* y, size_t yStride, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType);

void BgrToYuv422p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);

void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride);
Expand Down
74 changes: 74 additions & 0 deletions src/Simd/SimdAvx2BgrToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include "Simd/SimdMemory.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdConversion.h"
#include "Simd/SimdYuvToBgr.h"
#include "Simd/SimdBase.h"

namespace Simd
{
Expand Down Expand Up @@ -237,6 +239,78 @@ namespace Simd
else
BgrToYuv444p<false>(bgr, width, height, bgrStride, y, yStride, u, uStride, v, vStride);
}

//-------------------------------------------------------------------------------------------------

// Converts one tile spanning two adjacent image rows from interleaved BGR to planar YUV 4:2:0.
//
// T         - conversion traits forwarded to BgrToY8 / BgrToU16 / BgrToV16.
// bgr0      - start of the tile in the upper BGR row; the lower row is bgr0 + bgrStride.
// y0        - start of the tile in the upper Y row; the lower row is y0 + yStride.
// u, v      - destinations for the tile's chroma; one 256-bit vector is written to each.
//
// Per row, BGR is read at vector offsets 0 and 3 and two 256-bit vectors of Y are written.
// All loads and stores are unaligned.
template <class T> SIMD_INLINE void BgrToYuv420pV2(const uint8_t* bgr0, size_t bgrStride, uint8_t* y0, size_t yStride, uint8_t* u, uint8_t* v)
{
    const uint8_t* const srcRow[2] = { bgr0, bgr0 + bgrStride };
    uint8_t* const lumaRow[2] = { y0, y0 + yStride };

    // Deinterleaved channels, indexed as [row][half of the tile].
    __m256i b[2][2], g[2][2], r[2][2];

    // Luma is produced at full resolution: upper row first, left half before right half.
    for (int row = 0; row < 2; ++row)
    {
        for (int half = 0; half < 2; ++half)
        {
            // Each half starts 3 vectors further into the BGR row
            // (presumably LoadBgr consumes 3 vectors per call - confirm in SimdStore.h).
            LoadBgr<false>((__m256i*)srcRow[row] + 3 * half, b[row][half], g[row][half], r[row][half]);
            _mm256_storeu_si256((__m256i*)lumaRow[row] + half, BgrToY8<T>(b[row][half], g[row][half], r[row][half]));
        }
    }

    // Chroma is subsampled: the two rows of every channel are merged with Average16,
    // and the merged values replace the upper-row slots.
    for (int half = 0; half < 2; ++half)
    {
        b[0][half] = Average16(b[0][half], b[1][half]);
        g[0][half] = Average16(g[0][half], g[1][half]);
        r[0][half] = Average16(r[0][half], r[1][half]);
    }

    // Both halves of 16-bit chroma are packed into a single vector of bytes per plane.
    const __m256i uLeft = BgrToU16<T>(b[0][0], g[0][0], r[0][0]);
    const __m256i uRight = BgrToU16<T>(b[0][1], g[0][1], r[0][1]);
    _mm256_storeu_si256((__m256i*)u, PackI16ToU8(uLeft, uRight));

    const __m256i vLeft = BgrToV16<T>(b[0][0], g[0][0], r[0][0]);
    const __m256i vRight = BgrToV16<T>(b[0][1], g[0][1], r[0][1]);
    _mm256_storeu_si256((__m256i*)v, PackI16ToU8(vLeft, vRight));
}

// Converts a whole interleaved BGR image to planar YUV 4:2:0 using conversion traits T.
//
// The image is walked two rows at a time in tiles of DA pixels. When width is not a
// multiple of DA, one extra tile anchored at the right edge (starting at width - DA)
// covers the remainder; it overlaps the last regular tile.
//
// Preconditions (checked by assert): width and height are even, width >= DA, height >= 2.
template <class T> void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride)
{
    assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2));

    const size_t alignedWidth = AlignLo(width, DA);
    const bool hasTail = (alignedWidth != width);
    // Start (in pixels) of the right-edge tile; only used when hasTail is true.
    const size_t tailY = width - DA;

    for (size_t row = 0; row < height; row += 2)
    {
        // Offsets into the Y, chroma and BGR rows; they advance together, one tile per step.
        size_t offY = 0, offUV = 0, offBgr = 0;
        while (offY < alignedWidth)
        {
            BgrToYuv420pV2<T>(bgr + offBgr, bgrStride, y + offY, yStride, u + offUV, v + offUV);
            offY += DA;
            offUV += A;
            offBgr += A * 6;
        }

        if (hasTail)
        {
            // 3 bytes of BGR per pixel; one chroma sample per 2 pixels horizontally.
            BgrToYuv420pV2<T>(bgr + tailY * 3, bgrStride, y + tailY, yStride, u + tailY / 2, v + tailY / 2);
        }

        // Two source/luma rows are consumed per single chroma row.
        bgr += 2 * bgrStride;
        y += 2 * yStride;
        u += uStride;
        v += vStride;
    }
}

// Entry point of the AVX2 BGR -> YUV 4:2:0 planar conversion: picks the template
// instantiation that matches yuvType. An unknown yuvType trips assert(0) and converts nothing.
void BgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* y, size_t yStride,
    uint8_t* u, size_t uStride, uint8_t* v, size_t vStride, SimdYuvType yuvType)
{
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
    // 32-bit release builds with MSVC up to version 1900 use the scalar Base implementation.
    // NOTE(review): presumably a workaround for a compiler problem - confirm the reason.
    Base::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
#else
    switch (yuvType)
    {
    case SimdYuvBt601:
        BgrToYuv420pV2<Base::Bt601>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    case SimdYuvBt709:
        BgrToYuv420pV2<Base::Bt709>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    case SimdYuvBt2020:
        BgrToYuv420pV2<Base::Bt2020>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    case SimdYuvTrect871:
        BgrToYuv420pV2<Base::Trect871>(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride);
        return;
    default:
        break;
    }
    // Reached only for a yuvType that has no matching conversion above.
    assert(0);
#endif
}
}
#endif// SIMD_AVX2_ENABLE
}
10 changes: 5 additions & 5 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1601,11 +1601,11 @@ SIMD_API void SimdBgrToYuv420pV2(const uint8_t* bgr, size_t bgrStride, size_t wi
// Avx512bw::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
//#ifdef SIMD_AVX2_ENABLE
// if (Avx2::Enable && width >= Avx2::DA)
// Avx2::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
// else
//#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::DA)
Avx2::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
else
#endif
#ifdef SIMD_SSE41_ENABLE
if (Sse41::Enable && width >= Sse41::DA)
Sse41::BgrToYuv420pV2(bgr, bgrStride, width, height, y, yStride, u, uStride, v, vStride, yuvType);
Expand Down
10 changes: 5 additions & 5 deletions src/Test/TestAnyToYuv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,11 +510,11 @@ namespace Test
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Sse41::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
#endif

//#ifdef SIMD_AVX2_ENABLE
// if (Simd::Avx2::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx2::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
//#endif
//
#ifdef SIMD_AVX2_ENABLE
if (Simd::Avx2::Enable)
result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx2::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && AnyToYuvV2AutoTest(View::Bgr24, 2, 2, FUNC_YUV2(Simd::Avx512bw::BgrToYuv420pV2), FUNC_YUV2(SimdBgrToYuv420pV2));
Expand Down

0 comments on commit 52bd93b

Please sign in to comment.