Skip to content

Commit

Permalink
+add SSE4.1 optimizations of class ImageJpegLoader (part3: function: …
Browse files Browse the repository at this point in the history
…JpegIdctBlock).
  • Loading branch information
ermig1979 committed Nov 12, 2024
1 parent d2af1d2 commit be0e009
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 6 deletions.
8 changes: 4 additions & 4 deletions src/Simd/SimdBaseImageLoadJpeg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ namespace Simd
}
}

static void JpegIdctBlock(uint8_t* dst, int stride, short src[64])
static void JpegIdctBlock(const int16_t* src, uint8_t* dst, int stride)
{
int buf[64];
for (int i = 0; i < 8; ++i)
Expand Down Expand Up @@ -558,7 +558,7 @@ namespace Simd
int ha = z->img_comp[n].ha;
if (!JpegDecodeBlock(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->huff_ac[ha].fast_ac, n, z->dequant[z->img_comp[n].tq]))
return 0;
z->idctBlock(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
z->idctBlock(data, z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2);
if (--z->todo <= 0)
{
if (z->code_bits < 24)
Expand Down Expand Up @@ -589,7 +589,7 @@ namespace Simd
int ha = z->img_comp[n].ha;
if (!JpegDecodeBlock(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->huff_ac[ha].fast_ac, n, z->dequant[z->img_comp[n].tq]))
return 0;
z->idctBlock(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data);
z->idctBlock(data, z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2);
}
}
}
Expand Down Expand Up @@ -691,7 +691,7 @@ namespace Simd
const uint16_t* dequant = z->dequant[z->img_comp[n].tq];
for (int k = 0; k < 64; ++k)
data[k] *= dequant[k];
z->idctBlock(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
z->idctBlock(data, z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdImageLoadJpeg.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ namespace Simd

//-------------------------------------------------------------------------------------------------

typedef void (*IdctBlockPtr)(uint8_t* out, int out_stride, short data[64]);
typedef void (*IdctBlockPtr)(const int16_t * src, uint8_t* dst, int stride);
typedef uint8_t* (*ResampleRowPtr)(uint8_t* out, const uint8_t* in0, const uint8_t* in1, int w, int hs);
typedef void (*YuvToRgbRowPtr)(uint8_t* out, const uint8_t* y, const uint8_t* pcb, const uint8_t* pcr, int count, int step);
typedef void (*YuvToBgrPtr)(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType);
Expand Down
155 changes: 154 additions & 1 deletion src/Simd/SimdSse41ImageLoadJpeg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,159 @@ namespace Simd
#ifdef SIMD_SSE41_ENABLE
namespace Sse41
{
static void JpegIdctBlock(const int16_t* src, uint8_t* dst, int stride)
{
#define jpeg__f2f(x) ((int) (((x) * 4096 + 0.5)))

__m128i row0, row1, row2, row3, row4, row5, row6, row7;
__m128i tmp;

#define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

#define dct_rot(out0,out1, x,y,c0,c1) \
__m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
__m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

#define dct_widen(out, in) \
__m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
__m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

#define dct_wadd(out, a, b) \
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)

#define dct_wsub(out, a, b) \
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

#define dct_bfly32o(out0, out1, a,b,bias,s) \
{ \
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
dct_wadd(sum, abiased, b); \
dct_wsub(dif, abiased, b); \
out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
}

#define dct_interleave8(a, b) \
tmp = a; \
a = _mm_unpacklo_epi8(a, b); \
b = _mm_unpackhi_epi8(tmp, b)

#define dct_interleave16(a, b) \
tmp = a; \
a = _mm_unpacklo_epi16(a, b); \
b = _mm_unpackhi_epi16(tmp, b)

#define dct_pass(bias,shift) \
{ \
dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
__m128i sum04 = _mm_add_epi16(row0, row4); \
__m128i dif04 = _mm_sub_epi16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
__m128i sum17 = _mm_add_epi16(row1, row7); \
__m128i sum35 = _mm_add_epi16(row3, row5); \
dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
dct_wadd(x4, y0o, y4o); \
dct_wadd(x5, y1o, y5o); \
dct_wadd(x6, y2o, y5o); \
dct_wadd(x7, y3o, y4o); \
dct_bfly32o(row0,row7, x0,x7,bias,shift); \
dct_bfly32o(row1,row6, x1,x6,bias,shift); \
dct_bfly32o(row2,row5, x2,x5,bias,shift); \
dct_bfly32o(row3,row4, x3,x4,bias,shift); \
}

__m128i rot0_0 = dct_const(jpeg__f2f(0.5411961f), jpeg__f2f(0.5411961f) + jpeg__f2f(-1.847759065f));
__m128i rot0_1 = dct_const(jpeg__f2f(0.5411961f) + jpeg__f2f(0.765366865f), jpeg__f2f(0.5411961f));
__m128i rot1_0 = dct_const(jpeg__f2f(1.175875602f) + jpeg__f2f(-0.899976223f), jpeg__f2f(1.175875602f));
__m128i rot1_1 = dct_const(jpeg__f2f(1.175875602f), jpeg__f2f(1.175875602f) + jpeg__f2f(-2.562915447f));
__m128i rot2_0 = dct_const(jpeg__f2f(-1.961570560f) + jpeg__f2f(0.298631336f), jpeg__f2f(-1.961570560f));
__m128i rot2_1 = dct_const(jpeg__f2f(-1.961570560f), jpeg__f2f(-1.961570560f) + jpeg__f2f(3.072711026f));
__m128i rot3_0 = dct_const(jpeg__f2f(-0.390180644f) + jpeg__f2f(2.053119869f), jpeg__f2f(-0.390180644f));
__m128i rot3_1 = dct_const(jpeg__f2f(-0.390180644f), jpeg__f2f(-0.390180644f) + jpeg__f2f(1.501321110f));

__m128i bias_0 = _mm_set1_epi32(512);
__m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
row0 = _mm_load_si128((const __m128i*) (src + 0 * 8));
row1 = _mm_load_si128((const __m128i*) (src + 1 * 8));
row2 = _mm_load_si128((const __m128i*) (src + 2 * 8));
row3 = _mm_load_si128((const __m128i*) (src + 3 * 8));
row4 = _mm_load_si128((const __m128i*) (src + 4 * 8));
row5 = _mm_load_si128((const __m128i*) (src + 5 * 8));
row6 = _mm_load_si128((const __m128i*) (src + 6 * 8));
row7 = _mm_load_si128((const __m128i*) (src + 7 * 8));
dct_pass(bias_0, 10);

{
dct_interleave16(row0, row4);
dct_interleave16(row1, row5);
dct_interleave16(row2, row6);
dct_interleave16(row3, row7);

dct_interleave16(row0, row2);
dct_interleave16(row1, row3);
dct_interleave16(row4, row6);
dct_interleave16(row5, row7);

dct_interleave16(row0, row1);
dct_interleave16(row2, row3);
dct_interleave16(row4, row5);
dct_interleave16(row6, row7);
}

dct_pass(bias_1, 17);
{
__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
__m128i p1 = _mm_packus_epi16(row2, row3);
__m128i p2 = _mm_packus_epi16(row4, row5);
__m128i p3 = _mm_packus_epi16(row6, row7);

dct_interleave8(p0, p2); // a0e0a1e1...
dct_interleave8(p1, p3); // c0g0c1g1...

dct_interleave8(p0, p1); // a0c0e0g0...
dct_interleave8(p2, p3); // b0d0f0h0...

dct_interleave8(p0, p2); // a0b0c0d0...
dct_interleave8(p1, p3); // a4b4c4d4...

// store
_mm_storel_epi64((__m128i*) dst, p0); dst += stride;
_mm_storel_epi64((__m128i*) dst, _mm_shuffle_epi32(p0, 0x4e)); dst += stride;
_mm_storel_epi64((__m128i*) dst, p2); dst += stride;
_mm_storel_epi64((__m128i*) dst, _mm_shuffle_epi32(p2, 0x4e)); dst += stride;
_mm_storel_epi64((__m128i*) dst, p1); dst += stride;
_mm_storel_epi64((__m128i*) dst, _mm_shuffle_epi32(p1, 0x4e)); dst += stride;
_mm_storel_epi64((__m128i*) dst, p3); dst += stride;
_mm_storel_epi64((__m128i*) dst, _mm_shuffle_epi32(p3, 0x4e));
}

#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}

//-------------------------------------------------------------------------------------------------

#define jpeg__div4(x) ((uint8_t) ((x) >> 2))
#define jpeg__div16(x) ((uint8_t) ((x) >> 4))

Expand Down Expand Up @@ -267,7 +420,7 @@ namespace Simd
ImageJpegLoader::ImageJpegLoader(const ImageLoaderParam& param)
: Base::ImageJpegLoader(param)
{
//_context->idctBlock = JpegIdctBlock;
_context->idctBlock = JpegIdctBlock;
_context->resampleRowHv2 = JpegResampleRowHv2;
if (_param.format == SimdPixelFormatGray8)
_context->rgbaToAny = Sse41::RgbaToGray;
Expand Down

0 comments on commit be0e009

Please sign in to comment.