From 45548d65a8e6a78cdbd8a3f3fd2750e0ee8f656f Mon Sep 17 00:00:00 2001 From: awxkee Date: Fri, 13 Oct 2023 23:27:33 +1100 Subject: [PATCH] some HDR improvements --- Sources/avifc/Color/Colorspace.h | 28 +++++++---- Sources/avifc/Color/HLG.hpp | 2 +- Sources/avifc/Color/PQ.hpp | 5 +- Sources/avifc/HDRColorTransfer.mm | 4 +- Sources/avifc/NEMath.h | 9 +--- Sources/avifc/ToneMap/ClampToneMapper.cpp | 10 ++-- Sources/avifc/ToneMap/DragoToneMapper.cpp | 8 +-- Sources/avifc/ToneMap/HableToneMapper.cpp | 1 + .../avifc/ToneMap/LogarithmicToneMapper.cpp | 22 ++++---- .../avifc/ToneMap/LogarithmicToneMapper.hpp | 4 +- Sources/avifc/ToneMap/Rec2408ToneMapper.cpp | 50 ++++++------------- Sources/avifc/ToneMap/Rec2408ToneMapper.hpp | 8 --- .../avifc/ToneMap/ReinhardJodieToneMapper.cpp | 19 +++---- .../avifc/ToneMap/ReinhardJodieToneMapper.hpp | 2 +- Sources/avifc/ToneMap/ReinhardToneMapper.cpp | 18 +++---- 15 files changed, 79 insertions(+), 111 deletions(-) diff --git a/Sources/avifc/Color/Colorspace.h b/Sources/avifc/Color/Colorspace.h index b0f634e..23b21d8 100644 --- a/Sources/avifc/Color/Colorspace.h +++ b/Sources/avifc/Color/Colorspace.h @@ -44,9 +44,10 @@ static const float DisplayP3Primaries[3][2] = { { 0.740, 0.270 }, { 0.220, 0.780 static const float Rec2020LumaPrimaries[3] = {0.2627f, 0.6780f, 0.0593f}; static const float Rec709LumaPrimaries[3] = {0.2126f, 0.7152f, 0.0722f}; -static const float DisplayP3LumaPrimaries = 80; -static const float Rec709WhitePointNits = 100; +static const float DisplayP3LumaPrimaries[3] = {0.2290f, 0.6917f, 0.0793f}; /* Y row of the Display P3 (D65) RGB->XYZ matrix; 0.299/0.587/0.114 are the BT.601 weights */ static const float DisplayP3WhitePointNits = 80; +static const float Rec709WhitePointNits = 100; +static const float Rec2020WhitePointNits = 203; static const float IlluminantD65[2] = { 0.3127, 0.3290 }; @@ -157,7 +158,8 @@ class ColorSpaceMatrix { const float32x4_t row2 = { matrix[3], matrix[4], matrix[5], 0.0f }; const float32x4_t row3 = { matrix[6], matrix[7], matrix[8], 0.0f }; - return vaddq_f32(vaddq_f32(vmulq_f32(v, 
row1), vmulq_f32(v, row2)), vmulq_f32(v, row3)); + float32x4_t r = { vaddvq_f32(vmulq_f32(v, row1)), vaddvq_f32(vmulq_f32(v, row2)), vaddvq_f32(vmulq_f32(v, row3)), 0.0f }; + return r; } inline float32x4_t operator*(const float32x4_t v) { @@ -165,14 +167,15 @@ class ColorSpaceMatrix { const float32x4_t row2 = { matrix[3], matrix[4], matrix[5], 0.0f }; const float32x4_t row3 = { matrix[6], matrix[7], matrix[8], 0.0f }; - return vaddq_f32(vaddq_f32(vmulq_f32(v, row1), vmulq_f32(v, row2)), vmulq_f32(v, row3)); + float32x4_t r = { vaddvq_f32(vmulq_f32(v, row1)), vaddvq_f32(vmulq_f32(v, row2)), vaddvq_f32(vmulq_f32(v, row3)), 0.0f }; + return r; } inline float32x4x4_t operator*(const float32x4x4_t v) { - const float32x4_t r1 = vaddq_f32(vaddq_f32(vmulq_f32(v.val[0], row1), vmulq_f32(v.val[0], row2)), vmulq_f32(v.val[1], row3)); - const float32x4_t r2 = vaddq_f32(vaddq_f32(vmulq_f32(v.val[1], row1), vmulq_f32(v.val[1], row2)), vmulq_f32(v.val[1], row3)); - const float32x4_t r3 = vaddq_f32(vaddq_f32(vmulq_f32(v.val[2], row1), vmulq_f32(v.val[2], row2)), vmulq_f32(v.val[2], row3)); - const float32x4_t r4 = vaddq_f32(vaddq_f32(vmulq_f32(v.val[3], row1), vmulq_f32(v.val[3], row2)), vmulq_f32(v.val[3], row3)); + const float32x4_t r1 = { vaddvq_f32(vmulq_f32(v.val[0], row1)), vaddvq_f32(vmulq_f32(v.val[0], row2)), vaddvq_f32(vmulq_f32(v.val[0], row3)), 0.0f }; + const float32x4_t r2 = { vaddvq_f32(vmulq_f32(v.val[1], row1)), vaddvq_f32(vmulq_f32(v.val[1], row2)), vaddvq_f32(vmulq_f32(v.val[1], row3)), 0.0f }; + const float32x4_t r3= { vaddvq_f32(vmulq_f32(v.val[2], row1)), vaddvq_f32(vmulq_f32(v.val[2], row2)), vaddvq_f32(vmulq_f32(v.val[2], row3)), 0.0f }; + const float32x4_t r4 = { vaddvq_f32(vmulq_f32(v.val[3], row1)), vaddvq_f32(vmulq_f32(v.val[3], row2)), vaddvq_f32(vmulq_f32(v.val[3], row3)), 0.0f }; float32x4x4_t r = { r1, r2, r3, r4 }; return r; } @@ -360,9 +363,12 @@ class ColorSpaceProfile { float whitePointNits; }; -static ColorSpaceProfile* rec2020Profile = new 
ColorSpaceProfile(Rec2020Primaries, IlluminantD65, Rec2020LumaPrimaries, 203); -static ColorSpaceProfile* rec709Profile = new ColorSpaceProfile(Rec709Primaries, IlluminantD65, Rec709LumaPrimaries, 100); -static ColorSpaceProfile* displayP3Profile = new ColorSpaceProfile(DisplayP3Primaries, IlluminantD65, Rec709LumaPrimaries, 80); +static ColorSpaceProfile* rec2020Profile = new ColorSpaceProfile(Rec2020Primaries, IlluminantD65, + Rec2020LumaPrimaries, Rec2020WhitePointNits); +static ColorSpaceProfile* rec709Profile = new ColorSpaceProfile(Rec709Primaries, IlluminantD65, + Rec709LumaPrimaries, Rec709WhitePointNits); +static ColorSpaceProfile* displayP3Profile = new ColorSpaceProfile(DisplayP3Primaries, IlluminantD65, + DisplayP3LumaPrimaries, DisplayP3WhitePointNits); template T lerp(const T& a, const T& b, float t) { diff --git a/Sources/avifc/Color/HLG.hpp b/Sources/avifc/Color/HLG.hpp index a58fdeb..e06f7ef 100644 --- a/Sources/avifc/Color/HLG.hpp +++ b/Sources/avifc/Color/HLG.hpp @@ -46,7 +46,7 @@ static inline float32x4_t HLGToLinear(const float32x4_t v) { const float32x4_t vDivVec = vrecpeq_f32(vdupq_n_f32(a)); float32x4_t high = vdivq_f32(vaddq_f32(vexpq_f32(vmulq_f32(vsubq_f32(v, vdupq_n_f32(c)), vDivVec)), vdupq_n_f32(b)), vdupq_n_f32(12.0f)); - float32x4_t low = vmulq_f32(vmulq_f32(v, v), vdupq_n_f32(1.0f/3.0f)); + float32x4_t low = vmulq_n_f32(vmulq_f32(v, v), 1.0f/3.0f); low = vbslq_f32(mask, vdupq_n_f32(0), low); high = vbslq_f32(maskHigh, vdupq_n_f32(0), high); diff --git a/Sources/avifc/Color/PQ.hpp b/Sources/avifc/Color/PQ.hpp index b060c31..828389e 100644 --- a/Sources/avifc/Color/PQ.hpp +++ b/Sources/avifc/Color/PQ.hpp @@ -37,7 +37,7 @@ const static float m1 = (2610.0f / 4096.0f) / 4.0f; const static float m2 = (2523.0f / 4096.0f) * 128.0f; const static float32x4_t c1 = vdupq_n_f32(3424.0f / 4096.0f); const static float32x4_t c2 = vdupq_n_f32((2413.0f / 4096.0f) * 32.0f); -const static float32x4_t c3 = vdupq_n_f32((2392.0f / 4096.0f) * 32.0f); 
+const static float c3 = (2392.0f / 4096.0f) * 32.0f; const static float m2Power = 1.0f / m2; const static float m1Power = 1.0f / m1; @@ -46,10 +46,9 @@ static inline float32x4_t ToLinearPQ(const float32x4_t v, const float sdrReferen const float32x4_t rv = vmaxq_f32(v, zeros); float32x4_t p = vpowq_f32(rv, m2Power); const float lumaScale = 10000.0f / sdrReferencePoint; - return vcopysignq_f32(vmulq_n_f32(vpowq_f32(vdivq_f32(vmaxq_f32(vsubq_f32(p, c1), zeros), vmlsq_f32(c2, c3, p)), m1Power), + return vcopysignq_f32(vmulq_n_f32(vpowq_f32(vdivq_f32(vmaxq_f32(vsubq_f32(p, c1), zeros), vmlsq_n_f32(c2, p, c3)), m1Power), /* exact divide: the vrecpeq_f32 estimate error would be amplified by the 1/m1 power */ lumaScale), rv); } - #endif static float ToLinearPQ(float v, const float sdrReferencePoint) { diff --git a/Sources/avifc/HDRColorTransfer.mm b/Sources/avifc/HDRColorTransfer.mm index 20c98e7..06c769d 100644 --- a/Sources/avifc/HDRColorTransfer.mm +++ b/Sources/avifc/HDRColorTransfer.mm @@ -49,6 +49,8 @@ #import "ToneMap/ReinhardToneMapper.hpp" #import "ToneMap/ClampToneMapper.hpp" #import "ToneMap/ReinhardJodieToneMapper.hpp" +#import "ToneMap/HableToneMapper.hpp" +#import "ToneMap/DragoToneMapper.hpp" #import "half.hpp" #import "Color/Gamma.hpp" #import "Color/PQ.hpp" @@ -308,7 +310,7 @@ +(void)transferNEONF16:(nonnull uint8_t*)data stride:(int)stride width:(int)widt auto ptr16 = reinterpret_cast(ptr + y * stride); int x; - for (x = 0; x + 8 < width / 2; x += 8) { + for (x = 0; x + 8 < width; x += 8) { if (components == 4) { float16x8x4_t rgbVector = vld4q_f16(reinterpret_cast(ptr16)); diff --git a/Sources/avifc/NEMath.h b/Sources/avifc/NEMath.h index 02fb797..dcb1c0c 100644 --- a/Sources/avifc/NEMath.h +++ b/Sources/avifc/NEMath.h @@ -271,13 +271,6 @@ static inline float32x4_t vcopysignq_f32(const float32x4_t dst, const float32x4_ return vbslq_f32(mask, vnegq_f32(dst), dst); } -__attribute__((always_inline)) -static inline float vsumq_f32(const float32x4_t v) { -// float32x2_t r = vadd_f32(vget_high_f32(v), vget_low_f32(v)); -// return 
vget_lane_f32(vpadd_f32(r, r), 0); - return vaddvq_f32(v); -} - __attribute__((always_inline)) static inline float32x2_t vsumq_f32x2(const float32x4_t v, const float32x4_t v1) { // float32x2_t r = vadd_f32(vget_high_f32(v), vget_low_f32(v)); @@ -325,7 +318,7 @@ __attribute__((always_inline)) static inline float vsumq_f16(const float16x8_t v) { const float32x4_t low = vcvt_f32_f16(vget_low_f16(v)); const float32x4_t high = vcvt_f32_f16(vget_high_f16(v)); - return vsumq_f32(vaddq_f32(high, low)); + return vaddvq_f32(vaddq_f32(high, low)); } __attribute__((always_inline)) diff --git a/Sources/avifc/ToneMap/ClampToneMapper.cpp b/Sources/avifc/ToneMap/ClampToneMapper.cpp index 0ca2484..d55a87f 100644 --- a/Sources/avifc/ToneMap/ClampToneMapper.cpp +++ b/Sources/avifc/ToneMap/ClampToneMapper.cpp @@ -59,7 +59,7 @@ void ClampToneMapper::Execute(float& r, float& g, float &b) { float32x4_t ClampToneMapper::Execute(const float32x4_t m) { const float32x4_t v = vmulq_n_f32(m, exposure); - const float Lin = vsumq_f32(vmulq_f32(v, vLumaVec)); + const float Lin = vaddvq_f32(vmulq_f32(v, vLumaVec)); if (Lin == 0) { return v; } @@ -89,10 +89,10 @@ float32x4x4_t ClampToneMapper::Execute(const float32x4x4_t m) { const float32x4_t Lout = vclampq_n_f32(vmulq_f32(Lin, vrecpeq_f32(vdupq_n_f32(Lmax_))), 0.0f, 1.0f); const float32x4_t scale = vdivq_f32(Lout, Lin); const float32x4x4_t r = { - vmulq_n_f32(exposured.val[0], vgetq_lane_f32(scale, 0)), - vmulq_n_f32(exposured.val[1], vgetq_lane_f32(scale, 1)), - vmulq_n_f32(exposured.val[2], vgetq_lane_f32(scale, 2)), - vmulq_n_f32(exposured.val[3], vgetq_lane_f32(scale, 3)) + vmulq_laneq_f32(exposured.val[0], scale, 0), + vmulq_laneq_f32(exposured.val[1], scale, 1), + vmulq_laneq_f32(exposured.val[2], scale, 2), + vmulq_laneq_f32(exposured.val[3], scale, 3) }; return r; } diff --git a/Sources/avifc/ToneMap/DragoToneMapper.cpp b/Sources/avifc/ToneMap/DragoToneMapper.cpp index adb734b..abae539 100644 --- 
a/Sources/avifc/ToneMap/DragoToneMapper.cpp +++ b/Sources/avifc/ToneMap/DragoToneMapper.cpp @@ -68,15 +68,9 @@ void DragoToneMapper::Execute(float& r, float& g, float &b) { #if __arm64__ -__attribute__((always_inline)) -static inline float vsumq_f32Drago(const float32x4_t v) { - float32x2_t r = vadd_f32(vget_high_f32(v), vget_low_f32(v)); - return vget_lane_f32(vpadd_f32(r, r), 0); -} - float32x4_t DragoToneMapper::Execute(const float32x4_t m) { const float32x4_t v = vmulq_n_f32(m, exposure); - const float Lin = vsumq_f32Drago(vmulq_n_f32(vmulq_f32(v, vLumaVec), exposure)); + const float Lin = vaddvq_f32(vmulq_n_f32(vmulq_f32(v, vLumaVec), exposure)); if (Lin == 0) { return v; } diff --git a/Sources/avifc/ToneMap/HableToneMapper.cpp b/Sources/avifc/ToneMap/HableToneMapper.cpp index 4ae9adc..f57a716 100644 --- a/Sources/avifc/ToneMap/HableToneMapper.cpp +++ b/Sources/avifc/ToneMap/HableToneMapper.cpp @@ -24,6 +24,7 @@ // #include "HableToneMapper.hpp" +#include "NEMath.h" #if defined(__clang__) #pragma clang fp contract(fast) exceptions(ignore) reassociate(on) diff --git a/Sources/avifc/ToneMap/LogarithmicToneMapper.cpp b/Sources/avifc/ToneMap/LogarithmicToneMapper.cpp index c7d7c7d..09b1e40 100644 --- a/Sources/avifc/ToneMap/LogarithmicToneMapper.cpp +++ b/Sources/avifc/ToneMap/LogarithmicToneMapper.cpp @@ -59,8 +59,8 @@ void LogarithmicToneMapper::Execute(float& r, float& g, float &b) { float32x4_t LogarithmicToneMapper::Execute(const float32x4_t m) { const float32x4_t v = vmulq_n_f32(m, exposure); - const float Lin = vsumq_f32(vmulq_f32(v, vLumaVec)); - const float Lout = vgetq_lane_f32(vdivq_f32(vlog10q_f32(vdupq_n_f32(fabsf_c(1.0 + curve * Lin))), vDenVec), 0); + const float Lin = vaddvq_f32(vmulq_f32(v, vLumaVec)); + const float Lout = vgetq_lane_f32(vmulq_f32(vlog10q_f32(vdupq_n_f32(fabsf_c(1.0 + curve * Lin))), vDenVec), 0); const float scale = Lout / Lin; if (scale == 1) { return v; @@ -76,21 +76,21 @@ float32x4x4_t LogarithmicToneMapper::Execute(const 
float32x4x4_t m) { vmulq_n_f32(m.val[3], exposure), }; float32x4_t Lin = { - vsumq_f32(vmulq_f32(exposured.val[0], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[1], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[2], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[3], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[0], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[1], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[2], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[3], vLumaVec)), }; Lin = vsetq_if_f32(Lin, 0.0f, 1.0f); const float32x4_t Lout = vsetq_if_f32( - vdivq_f32(vlog10q_f32(vabsq_f32(vmlaq_f32(vdupq_n_f32(1.0f), vdupq_n_f32(curve), Lin))), vDenVec), + vmulq_f32(vlog10q_f32(vabsq_f32(vmlaq_f32(vdupq_n_f32(1.0f), vdupq_n_f32(curve), Lin))), vDenVec), 0.0f, 1.0f); const float32x4_t scale = vdivq_f32(Lout, Lin); float32x4x4_t r = { - vmulq_n_f32(exposured.val[0], vgetq_lane_f32(scale, 0)), - vmulq_n_f32(exposured.val[1], vgetq_lane_f32(scale, 1)), - vmulq_n_f32(exposured.val[2], vgetq_lane_f32(scale, 2)), - vmulq_n_f32(exposured.val[3], vgetq_lane_f32(scale, 3)) + vmulq_laneq_f32(exposured.val[0], scale, 0), + vmulq_laneq_f32(exposured.val[1], scale, 1), + vmulq_laneq_f32(exposured.val[2], scale, 2), + vmulq_laneq_f32(exposured.val[3], scale, 3) }; return r; } diff --git a/Sources/avifc/ToneMap/LogarithmicToneMapper.hpp b/Sources/avifc/ToneMap/LogarithmicToneMapper.hpp index f5479b0..6973144 100644 --- a/Sources/avifc/ToneMap/LogarithmicToneMapper.hpp +++ b/Sources/avifc/ToneMap/LogarithmicToneMapper.hpp @@ -47,7 +47,7 @@ class LogarithmicToneMapper: public ToneMapper { den = log10(1.0 + curve * Lmax_); #if __arm64__ vLumaVec = { lumaVec[0], lumaVec[1], lumaVec[2], 0.0f }; - vDenVec = vdupq_n_f32(den); + vDenVec = vdupq_n_f32(1.0f /den); #endif } @@ -56,7 +56,7 @@ class LogarithmicToneMapper: public ToneMapper { den = log10(1.0 + curve * Lmax_); #if __arm64__ vLumaVec = { lumaVec[0], lumaVec[1], lumaVec[2], 0.0f }; - vDenVec = vdupq_n_f32(den); + 
vDenVec = vdupq_n_f32(1.0f / den); #endif } diff --git a/Sources/avifc/ToneMap/Rec2408ToneMapper.cpp b/Sources/avifc/ToneMap/Rec2408ToneMapper.cpp index 6edd0d6..46f8570 100644 --- a/Sources/avifc/ToneMap/Rec2408ToneMapper.cpp +++ b/Sources/avifc/ToneMap/Rec2408ToneMapper.cpp @@ -31,49 +31,29 @@ using namespace std; #if __arm64__ -float Rec2408ToneMapper::SDR(float Lin) { - const float c1 = 107 / 128; - const float c2 = 2413 / 128; - const float c3 = 2392 / 128; - const float m1 = 1305 / 8192; - const float m2 = 2523 / 32; - const float v = pow(Lin / 10000, m1); - return pow((c1 + c2 * v) / (1 + c3 * v), m2); -} - -float32x4_t Rec2408ToneMapper::SDR(float32x4_t Lin) { - const float c1 = 107 / 128; - const float c2 = 2413 / 128; - const float c3 = 2392 / 128; - const float m1 = 1305 / 8192; - const float m2 = 2523 / 32; - const float32x4_t v = vpowq_f32(vdivq_f32(Lin, vdupq_n_f32(10000)), m1); - return vpowq_f32(vdivq_f32(vmlaq_f32(vdupq_n_f32(c1), vdupq_n_f32(c2), v), vmlaq_f32(vdupq_n_f32(1), vdupq_n_f32(c3), v)), m2); -} - float32x4x4_t Rec2408ToneMapper::Execute(const float32x4x4_t m) { - const float32x4x4_t lumas = { - vmulq_f32(m.val[0], luma), - vmulq_f32(m.val[1], luma), - vmulq_f32(m.val[2], luma), - vmulq_f32(m.val[3], luma), + const float32x4_t lc = luma; + const float32x4_t Lin = { + vaddvq_f32(vmulq_f32(m.val[0], lc)), + vaddvq_f32(vmulq_f32(m.val[1], lc)), + vaddvq_f32(vmulq_f32(m.val[2], lc)), + vaddvq_f32(vmulq_f32(m.val[3], lc)), }; - const float32x4_t Lin = vsumq_f32x4(lumas.val[0], lumas.val[1], lumas.val[2], lumas.val[3]); - const float32x4_t Lout = vdivq_f32(vmlaq_f32(this->ones, this->aVec, Lin), - vmlaq_f32(this->ones, this->bVec, Lin)); - + const float32x4_t ones = vdupq_n_f32(1.f); + const float32x4_t Lout = vdivq_f32(vmlaq_n_f32(ones, Lin, this->a), + vmlaq_n_f32(ones, Lin, this->b)); /* exact divide, as before this patch; vrecpeq_f32 alone is a low-precision estimate */ float32x4x4_t r = { - vmulq_n_f32(m.val[0], vgetq_lane_f32(Lout, 0)), - vmulq_n_f32(m.val[1], vgetq_lane_f32(Lout, 1)), - vmulq_n_f32(m.val[2], 
vgetq_lane_f32(Lout, 2)), - vmulq_n_f32(m.val[3], vgetq_lane_f32(Lout, 3)) + vmulq_laneq_f32(m.val[0], Lout, 0), + vmulq_laneq_f32(m.val[1], Lout, 1), + vmulq_laneq_f32(m.val[2], Lout, 2), + vmulq_laneq_f32(m.val[3], Lout, 3) }; - + return r; } float32x4_t Rec2408ToneMapper::Execute(const float32x4_t m) { - const float Lin = vsumq_f32(vmulq_f32(m, this->luma)); + const float Lin = vaddvq_f32(vmulq_f32(m, this->luma)); if (Lin == 0) { return m; } diff --git a/Sources/avifc/ToneMap/Rec2408ToneMapper.hpp b/Sources/avifc/ToneMap/Rec2408ToneMapper.hpp index 8d7e09e..2804759 100644 --- a/Sources/avifc/ToneMap/Rec2408ToneMapper.hpp +++ b/Sources/avifc/ToneMap/Rec2408ToneMapper.hpp @@ -46,9 +46,6 @@ class Rec2408ToneMapper: public ToneMapper { this->b = 1.0f / (displayMaxBrightness/whitePoint); memcpy(this->lumaCoefficients, lumaCoefficients, sizeof(float)*3); #if __arm64__ - this->aVec = vdupq_n_f32(a); - this->bVec = vdupq_n_f32(b); - this->ones = vdupq_n_f32(1.f); this->luma = { lumaCoefficients[0], lumaCoefficients[1], lumaCoefficients[2], 0.0f }; #endif } @@ -63,13 +60,8 @@ class Rec2408ToneMapper: public ToneMapper { float Ld; float a; float b; - float SDR(float Lin); float lumaCoefficients[3]; #if __arm64__ - float32x4_t SDR(float32x4_t Lin); - float32x4_t aVec; - float32x4_t bVec; - float32x4_t ones; float32x4_t luma; #endif }; diff --git a/Sources/avifc/ToneMap/ReinhardJodieToneMapper.cpp b/Sources/avifc/ToneMap/ReinhardJodieToneMapper.cpp index 25441af..36ff71d 100644 --- a/Sources/avifc/ToneMap/ReinhardJodieToneMapper.cpp +++ b/Sources/avifc/ToneMap/ReinhardJodieToneMapper.cpp @@ -79,7 +79,7 @@ void ReinhardJodieToneMapper::Execute(float& r, float& g, float& b) { float32x4_t ReinhardJodieToneMapper::Execute(const float32x4_t m) { const float32x4_t v = vmulq_n_f32(m, exposure); - const float luma = vsumq_f32(vmulq_f32(v, vLumaVec)); + const float luma = vaddvq_f32(vmulq_f32(v, vLumaVec)); const float32x4_t tv = vdivq_f32(v, vaddq_f32(vdupq_n_f32(1.0f), v)); const 
float32x4_t in = vdivq_f32(v, vdupq_n_f32(1.0f + luma)); @@ -94,24 +94,25 @@ float32x4x4_t ReinhardJodieToneMapper::Execute(const float32x4x4_t m) { vmulq_n_f32(m.val[3], exposure), }; float32x4_t Lin = { - vsumq_f32(vmulq_f32(exposured.val[0], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[1], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[2], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[3], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[0], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[1], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[2], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[3], vLumaVec)), }; Lin = vaddq_f32(Lin, vdupq_n_f32(1.0f)); + Lin = vdivq_f32(vdupq_n_f32(1.0f), Lin); /* exact 1/(1+luma) so this path matches the single-vector Execute, which divides exactly */ const float32x4_t tv1 = vdivq_f32(exposured.val[0], vaddq_f32(vdupq_n_f32(1.0f), exposured.val[0])); - const float32x4_t in1 = vdivq_f32(exposured.val[0], vdupq_n_f32(vgetq_lane_f32(Lin, 0))); + const float32x4_t in1 = vmulq_laneq_f32(exposured.val[0], Lin, 0); const float32x4_t tv2 = vdivq_f32(exposured.val[1], vaddq_f32(vdupq_n_f32(1.0f), exposured.val[1])); - const float32x4_t in2 = vdivq_f32(exposured.val[1], vdupq_n_f32(vgetq_lane_f32(Lin, 1))); + const float32x4_t in2 = vmulq_laneq_f32(exposured.val[1], Lin, 1); const float32x4_t tv3 = vdivq_f32(exposured.val[2], vaddq_f32(vdupq_n_f32(1.0f), exposured.val[2])); - const float32x4_t in3 = vdivq_f32(exposured.val[2], vdupq_n_f32(vgetq_lane_f32(Lin, 2))); + const float32x4_t in3 = vmulq_laneq_f32(exposured.val[2], Lin, 2); const float32x4_t tv4 = vdivq_f32(exposured.val[3], vaddq_f32(vdupq_n_f32(1.0f), exposured.val[3])); - const float32x4_t in4 = vdivq_f32(exposured.val[3], vdupq_n_f32(vgetq_lane_f32(Lin, 3))); + const float32x4_t in4 = vmulq_laneq_f32(exposured.val[3], Lin, 3); const float32x4x4_t res = { lerpNEON(in1, tv1, tv1), diff --git a/Sources/avifc/ToneMap/ReinhardJodieToneMapper.hpp b/Sources/avifc/ToneMap/ReinhardJodieToneMapper.hpp index 4df22ed..a8f635b 100644 --- a/Sources/avifc/ToneMap/ReinhardJodieToneMapper.hpp 
+++ b/Sources/avifc/ToneMap/ReinhardJodieToneMapper.hpp @@ -34,7 +34,7 @@ class ReinhardJodieToneMapper: public ToneMapper { public: - ReinhardJodieToneMapper(const bool extended = true): lumaVec { 0.2126, 0.7152, 0.0722 }, lumaMaximum(1.0f), exposure(1.2f) { + ReinhardJodieToneMapper(const bool extended = true): lumaVec { 0.2126, 0.7152, 0.0722 }, lumaMaximum(1.0f), exposure(1.0f) { useExtended = extended; #if __arm64__ vLumaVec = { lumaVec[0], lumaVec[1], lumaVec[2], 0.0f }; diff --git a/Sources/avifc/ToneMap/ReinhardToneMapper.cpp b/Sources/avifc/ToneMap/ReinhardToneMapper.cpp index b02233c..fdaf1f7 100644 --- a/Sources/avifc/ToneMap/ReinhardToneMapper.cpp +++ b/Sources/avifc/ToneMap/ReinhardToneMapper.cpp @@ -75,7 +75,7 @@ void ReinhardToneMapper::Execute(float& r, float& g, float& b) { float32x4_t ReinhardToneMapper::Execute(const float32x4_t m) { const float32x4_t v = vmulq_n_f32(m, exposure); - const float luma = vsumq_f32(vmulq_f32(v, vLumaVec)); + const float luma = vaddvq_f32(vmulq_f32(v, vLumaVec)); if (luma == 0) { return m; } @@ -95,19 +95,19 @@ float32x4x4_t ReinhardToneMapper::Execute(const float32x4x4_t m) { vmulq_n_f32(m.val[3], exposure), }; float32x4_t Lin = { - vsumq_f32(vmulq_f32(exposured.val[0], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[1], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[2], vLumaVec)), - vsumq_f32(vmulq_f32(exposured.val[3], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[0], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[1], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[2], vLumaVec)), + vaddvq_f32(vmulq_f32(exposured.val[3], vLumaVec)), }; Lin = vsetq_if_f32(Lin, 0.0f, 1.0f); const float32x4_t Lout = vsetq_if_f32(reinhardNEON(Lin, lumaMaximum, useExtended), 0.0f, 1.0f); const float32x4_t scale = vdivq_f32(Lout, Lin); float32x4x4_t r = { - vmulq_n_f32(exposured.val[0], vgetq_lane_f32(scale, 0)), - vmulq_n_f32(exposured.val[1], vgetq_lane_f32(scale, 1)), - vmulq_n_f32(exposured.val[2], vgetq_lane_f32(scale, 2)), 
- vmulq_n_f32(exposured.val[3], vgetq_lane_f32(scale, 3)), + vmulq_laneq_f32(exposured.val[0], scale, 0), + vmulq_laneq_f32(exposured.val[1], scale, 1), + vmulq_laneq_f32(exposured.val[2], scale, 2), + vmulq_laneq_f32(exposured.val[3], scale, 3) }; return r; }