From c7e20c87d3d58e31b169cdb1ea51481f07298804 Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Sat, 16 Nov 2024 08:49:33 +0000 Subject: [PATCH 1/5] Various masked operations --- g3doc/quick_reference.md | 37 +++++++++++ hwy/ops/arm_sve-inl.h | 60 ++++++++++++++++++ hwy/ops/generic_ops-inl.h | 50 +++++++++++++++ hwy/tests/logical_test.cc | 23 +++++++ hwy/tests/reduction_test.cc | 120 ++++++++++++++++++++++++++++++++++++ hwy/tests/table_test.cc | 113 +++++++++++++++++++++++++++++++++ 6 files changed, 403 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 842903d0a3..7c320b4509 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1125,6 +1125,9 @@ types, and on SVE/RVV. * V **AndNot**(V a, V b): returns `~a[i] & b[i]`. +* V **MaskedOrOrZero**(M m, V a, V b): returns `a[i] || b[i]` + or `zero` if `m[i]` is false. + The following three-argument functions may be more efficient than assembling them from 2-argument functions: @@ -2351,6 +2354,22 @@ The following `ReverseN` must not be called if `Lanes(D()) < N`: must be in the range `[0, 2 * Lanes(d))` but need not be unique. The index type `TI` must be an integer of the same size as `TFromD`. +* V **TableLookupLanesOr**(M m, V a, V b, unspecified) returns the + result of `TableLookupLanes(a, unspecified)` where `m[i]` is true, and returns + `b[i]` where `m[i]` is false. + +* V **TableLookupLanesOrZero**(M m, V a, unspecified) returns + the result of `TableLookupLanes(a, unspecified)` where `m[i]` is true, and + returns zero where `m[i]` is false. + +* V **TwoTablesLookupLanesOr**(D d, M m, V a, V b, unspecified) + returns the result of `TwoTablesLookupLanes(V a, V b, unspecified)` where + `m[i]` is true, and `a[i]` where `m[i]` is false. + +* V **TwoTablesLookupLanesOrZero**(D d, M m, V a, V b, unspecified) + returns the result of `TwoTablesLookupLanes(V a, V b, unspecified)` where + `m[i]` is true, and zero where `m[i]` is false. + * V **Per4LaneBlockShuffle**<size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0>(V v) does a per 4-lane block shuffle of `v` if `Lanes(DFromV())` is greater than or equal to 4 or a shuffle of the @@ -2491,6 +2510,24 @@ more efficient on some targets. * T **ReduceMin**(D, V v): returns the minimum of all lanes. * T **ReduceMax**(D, V v): returns the maximum of all lanes. +### Masked reductions + +**Note**: Horizontal operations (across lanes of the same vector) such as +reductions are slower than normal SIMD operations and are typically used outside +critical loops. + +All ops in this section ignore lanes where `mask=false`. These are equivalent +to, and potentially more efficient than, `GetLane(SumOfLanes(d, +IfThenElseZero(m, v)))` etc. The result is implementation-defined when all mask +elements are false. + +* T **MaskedReduceSum**(D, M m, V v): returns the sum of all lanes + where `m[i]` is `true`. +* T **MaskedReduceMin**(D, M m, V v): returns the minimum of all + lanes where `m[i]` is `true`. +* T **MaskedReduceMax**(D, M m, V v): returns the maximum of all + lanes where `m[i]` is `true`. + ### Crypto Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`: diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 9420b965e6..3abe34d058 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -219,6 +219,11 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ return sv##OP##_##CHAR##BITS(v); \ } +#define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_m(b, m, a); \ + } #define HWY_SVE_RETV_ARGMV(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \ return sv##OP##_##CHAR##BITS##_x(m, v); \ @@ -260,6 +265,17 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ return sv##OP##_##CHAR##BITS##_x(m, a, b); \ } +#define HWY_SVE_RETV_ARGMVV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_m(m, a, b); \ + } +// User-specified mask. Mask=false value is zero. +#define HWY_SVE_RETV_ARGMVVZ(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_z(m, a, b); \ + } #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) \ @@ -763,6 +779,9 @@ HWY_API V Or(const V a, const V b) { return BitCast(df, Or(BitCast(du, a), BitCast(du, b))); } +// ------------------------------ MaskedOrOrZero +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVVZ, MaskedOrOrZero, orr) + // ------------------------------ Xor namespace detail { @@ -1725,6 +1744,25 @@ HWY_API TFromD ReduceMax(D d, VFromD v) { return detail::MaxOfLanesM(detail::MakeMask(d), v); } +#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR +#undef HWY_NATIVE_MASKED_REDUCE_SCALAR +#else +#define HWY_NATIVE_MASKED_REDUCE_SCALAR +#endif + +template +HWY_API TFromD MaskedReduceSum(D /*d*/, M m, VFromD v) { + return detail::SumOfLanesM(m, v); +} +template +HWY_API TFromD MaskedReduceMin(D /*d*/, M m, VFromD v) { + return detail::MinOfLanesM(m, v); +} +template +HWY_API TFromD MaskedReduceMax(D /*d*/, M m, VFromD v) { + return detail::MaxOfLanesM(m, v); +} + // ------------------------------ SumOfLanes template @@ -5056,6 +5094,23 @@ HWY_API V IfNegativeThenElse(V v, V yes, V no) { static_assert(IsSigned>(), "Only works for signed/float"); return IfThenElse(IsNegative(v), yes, no); } +// ------------------------------ IfNegativeThenNegOrUndefIfZero + +#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG +#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG +#else +#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG +#endif + +#define HWY_SVE_NEG_IF(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_m(v, IsNegative(mask), v); \ + } + +HWY_SVE_FOREACH_IF(HWY_SVE_NEG_IF, IfNegativeThenNegOrUndefIfZero, neg) + +#undef HWY_SVE_NEG_IF // ------------------------------ AverageRound (ShiftRight) @@ -6587,6 +6642,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount, #undef HWY_SVE_IF_NOT_EMULATED_D #undef HWY_SVE_PTRUE #undef HWY_SVE_RETV_ARGMVV +#undef HWY_SVE_RETV_ARGMVVZ #undef HWY_SVE_RETV_ARGMV_Z #undef HWY_SVE_RETV_ARGMV #undef HWY_SVE_RETV_ARGPV @@ -6594,7 +6650,11 @@ HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount, #undef HWY_SVE_RETV_ARGPVV #undef HWY_SVE_RETV_ARGV #undef HWY_SVE_RETV_ARGVN +#undef HWY_SVE_RETV_ARGMV +#undef HWY_SVE_RETV_ARGMV_M +#undef HWY_SVE_RETV_ARGMV_Z #undef HWY_SVE_RETV_ARGVV +#undef HWY_SVE_RETV_ARGMVV_M #undef HWY_SVE_RETV_ARGVVV #undef HWY_SVE_RETV_ARGMVVV #undef HWY_SVE_T diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 375f707a5a..8e586ea913 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -1013,6 +1013,28 @@ HWY_API TFromD ReduceMax(D d, VFromD v) { } #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8 +#if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR +#undef HWY_NATIVE_MASKED_REDUCE_SCALAR +#else +#define HWY_NATIVE_MASKED_REDUCE_SCALAR +#endif + +template +HWY_API TFromD MaskedReduceSum(D d, M m, VFromD v) { + return ReduceSum(d, IfThenElseZero(m, v)); +} +template +HWY_API TFromD MaskedReduceMin(D d, M m, VFromD v) { + return ReduceMin(d, IfThenElse(m, v, MaxOfLanes(d, v))); +} +template +HWY_API TFromD MaskedReduceMax(D d, M m, VFromD v) { + return ReduceMax(d, IfThenElseZero(m, v)); +} + +#endif // HWY_NATIVE_MASKED_REDUCE_SCALAR + // ------------------------------ IsEitherNaN #if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_IS_EITHER_NAN @@ -6713,6 +6735,30 @@ HWY_API V ReverseBits(V v) { } #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 +// ------------------------------ TableLookupLanesOr +template +HWY_API V TableLookupLanesOr(M m, V a, V b, IndicesFromD> idx) { + return IfThenElse(m, TableLookupLanes(a, idx), b); +} + +// ------------------------------ TableLookupLanesOrZero +template +HWY_API V TableLookupLanesOrZero(M m, V a, IndicesFromD> idx) { + return IfThenElseZero(m, TableLookupLanes(a, idx)); +} + +// ------------------------------ TwoTablesLookupLanesOr +template +HWY_API V TwoTablesLookupLanesOr(D d, M m, V a, V b, IndicesFromD idx) { + return IfThenElse(m, TwoTablesLookupLanes(d, a, b, idx), a); +} + +// ------------------------------ TwoTablesLookupLanesOrZero +template +HWY_API V TwoTablesLookupLanesOrZero(D d, M m, V a, V b, IndicesFromD idx) { + return IfThenElse(m, TwoTablesLookupLanes(d, a, b, idx), Zero(d)); +} + // ------------------------------ Per4LaneBlockShuffle #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE)) @@ -7568,6 +7614,10 @@ HWY_API V BitShuffle(V v, VI idx) { #endif // HWY_NATIVE_BITSHUFFLE +template +HWY_API V MaskedOrOrZero(M m, V a, V b) { + return IfThenElseZero(m, Or(a, b)); +} // ------------------------------ AllBits1/AllBits0 #if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ALLONES diff --git a/hwy/tests/logical_test.cc b/hwy/tests/logical_test.cc index 31882ec9eb..94fbaccbd4 100644 --- a/hwy/tests/logical_test.cc +++ b/hwy/tests/logical_test.cc @@ -146,6 +146,28 @@ HWY_NOINLINE void TestAllTestBit() { ForIntegerTypes(ForPartialVectors()); } +struct TestMaskedOrOrZero { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const MFromD all_true = MaskTrue(d); + const auto v1 = Iota(d, 1); + const auto v2 = Iota(d, 2); + + HWY_ASSERT_VEC_EQ(d, Or(v2, v1), MaskedOrOrZero(all_true, v1, v2)); + + const MFromD first_five = FirstN(d, 5); + const Vec v0 = Zero(d); + + const Vec v1_exp = IfThenElse(first_five, Or(v2, v1), v0); + + HWY_ASSERT_VEC_EQ(d, v1_exp, MaskedOrOrZero(first_five, v1, v2)); + } +}; + +HWY_NOINLINE void TestAllMaskedLogical() { + ForAllTypes(ForPartialVectors()); +} + struct TestAllBits { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -185,6 +207,7 @@ HWY_BEFORE_TEST(HwyLogicalTest); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskedLogical); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllBits); HWY_AFTER_TEST(); diff --git a/hwy/tests/reduction_test.cc b/hwy/tests/reduction_test.cc index fffc4a7873..fd35f645f6 100644 --- a/hwy/tests/reduction_test.cc +++ b/hwy/tests/reduction_test.cc @@ -352,6 +352,122 @@ HWY_NOINLINE void TestAllSumsOf8() { ForGEVectors<64, TestSumsOf8>()(uint8_t()); } +struct TestMaskedReduceSum { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + const Vec v2 = Iota(d, 2); + + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + T expected = 0; + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + if (bool_lanes[i]) { + expected += ConvertScalarTo(i + 2); + } + } + + const Vec mask_i = Load(d, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + + // If all elements are disabled the result is implementation defined + if (AllFalse(d, mask)) { + continue; + } + + HWY_ASSERT_EQ(expected, MaskedReduceSum(d, mask, v2)); + } + } +}; + +HWY_NOINLINE void TestAllMaskedReduceSum() { + ForAllTypes(ForPartialVectors()); +} + +struct TestMaskedReduceMin { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + const Vec v2 = Iota(d, 2); + + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + T expected = + ConvertScalarTo(N + 3); // larger than any values in the vector + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + if (bool_lanes[i]) { + if (expected > ConvertScalarTo(i + 2)) { + expected = ConvertScalarTo(i + 2); + } + } + } + + const Vec mask_i = Load(d, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + + // If all elements are disabled the result is implementation defined + if (AllFalse(d, mask)) { + continue; + } + + HWY_ASSERT_EQ(expected, MaskedReduceMin(d, mask, v2)); + } + } +}; + +HWY_NOINLINE void TestAllMaskedReduceMin() { + ForAllTypes(ForPartialVectors()); +} + +struct TestMaskedReduceMax { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + const Vec v2 = Iota(d, 2); + + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + T expected = 0; + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + if (bool_lanes[i]) { + if (expected < ConvertScalarTo(i + 2)) { + expected = ConvertScalarTo(i + 2); + } + } + } + + const Vec mask_i = Load(d, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + + // If all elements are disabled the result is implementation defined + if (AllFalse(d, mask)) { + continue; + } + + HWY_ASSERT_EQ(expected, MaskedReduceMax(d, mask, v2)); + } + } +}; + +HWY_NOINLINE void TestAllMaskedReduceMax() { + ForAllTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -367,6 +483,10 @@ HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf2); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf4); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8); + +HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMaskedReduceSum); +HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMaskedReduceMin); +HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMaskedReduceMax); HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/hwy/tests/table_test.cc b/hwy/tests/table_test.cc index 09fdd7eaf6..eb5b1a8644 100644 --- a/hwy/tests/table_test.cc +++ b/hwy/tests/table_test.cc @@ -103,6 +103,59 @@ HWY_NOINLINE void TestAllTableLookupLanes() { ForAllTypes(ForPartialVectors()); } +struct TestTableLookupLanesOr { + template +#if HWY_TARGET != HWY_SCALARWE + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const RebindToSigned di; + using TI = TFromD; + + const size_t N = Lanes(d); + // Select indices from N-1 counting down + auto indices = IndicesFromVec( + d, Sub(Set(di, ConvertScalarTo(N - 1)), Iota(di, 0))); + + auto expected = AllocateAligned(N); + auto expected_zero = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(expected && expected_zero && bool_lanes); + + const auto v1 = Iota(d, 5); + const auto v2 = Iota(d, 8); + + RandomState rng; + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + + if (bool_lanes[i]) { + expected[i] = ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] + expected_zero[i] = + ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] + } else { + expected[i] = ConvertScalarTo(i + 8); // v2[i] + expected_zero[i] = ConvertScalarTo(0); + } + } + + const Vec mask_i = Load(d, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + HWY_ASSERT_VEC_EQ(d, expected.get(), + TableLookupLanesOr(mask, v1, v2, indices)); + HWY_ASSERT_VEC_EQ(d, expected_zero.get(), + TableLookupLanesOrZero(mask, v1, indices)); +#else + (void) d; +#endif + } + } +}; + +HWY_NOINLINE void TestAllTableLookupLanesOr() { + ForAllTypes(ForPartialVectors()); +} + struct TestTwoTablesLookupLanes { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -194,6 +247,64 @@ HWY_NOINLINE void TestAllTwoTablesLookupLanes() { ForAllTypes(ForPartialVectors()); } +struct TestTwoTablesLookupLanesOr { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const RebindToSigned di; + using TI = TFromD; + + const size_t N = Lanes(d); + // Select indices from N-1 counting down + auto idx_lower = Sub(Set(di, ConvertScalarTo(N - 1)), Iota(di, 0)); + auto idx_upper = Add(idx_lower, Set(di, ConvertScalarTo(N))); + auto indices = IndicesFromVec(d, OddEven(idx_upper, idx_lower)); + + auto expected = AllocateAligned(N); + auto expected_zero = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(expected && expected_zero && bool_lanes); + + const auto v1 = Iota(d, 5); + const auto v2 = Iota(d, 8); + + RandomState rng; + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + + if (bool_lanes[i]) { + if (i % 2) { + expected[i] = + ConvertScalarTo(N - i + 8 - 1); // v2[N-1, N-2, ...] + expected_zero[i] = + ConvertScalarTo(N - i + 8 - 1); // v2[N-1, N-2, ...] + } else { + expected[i] = + ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] + expected_zero[i] = + ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] + } + } else { + expected[i] = ConvertScalarTo(i + 5); // v1[i] + expected_zero[i] = ConvertScalarTo(0); + } + } + + const Vec mask_i = Load(d, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + HWY_ASSERT_VEC_EQ(d, expected.get(), + TwoTablesLookupLanesOr(d, mask, v1, v2, indices)); + HWY_ASSERT_VEC_EQ(d, expected_zero.get(), + TwoTablesLookupLanesOrZero(d, mask, v1, v2, indices)); + } + } +}; + +HWY_NOINLINE void TestAllTwoTablesLookupLanesOr() { + ForAllTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -205,7 +316,9 @@ namespace hwy { namespace { HWY_BEFORE_TEST(HwyTableTest); HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTableLookupLanes); +HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTableLookupLanesOr); HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTwoTablesLookupLanes); +HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTwoTablesLookupLanesOr); HWY_AFTER_TEST(); } // namespace } // namespace hwy From 3c3fc48331d06bd13560d794d3917c3a1fc8139b Mon Sep 17 00:00:00 2001 From: Will Barber Date: Tue, 28 Jan 2025 10:51:41 +0000 Subject: [PATCH 2/5] Fix review comments Remove OrZero suffix and fix MaskedOr docs Update naming of masked table lookups to follow convention Optimise MaskedReduceMin/Max Add TODOs Remove the masked table lookups To be added alongside the platform specialisations Remove unused macros Rename HWY_SVE_RETV_ARGMVVZ to follow convention --- g3doc/quick_reference.md | 18 +----- hwy/ops/arm_sve-inl.h | 31 ++--------- hwy/ops/generic_ops-inl.h | 30 +--------- hwy/ops/rvv-inl.h | 2 + hwy/tests/logical_test.cc | 8 +-- hwy/tests/table_test.cc | 113 -------------------------------------- 6 files changed, 15 insertions(+), 187 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 7c320b4509..2366c782cb 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1125,7 +1125,7 @@ types, and on SVE/RVV. * V **AndNot**(V a, V b): returns `~a[i] & b[i]`. -* V **MaskedOrOrZero**(M m, V a, V b): returns `a[i] || b[i]` +* V **MaskedOr**(M m, V a, V b): returns `a[i] | b[i]` or `zero` if `m[i]` is false. The following three-argument functions may be more efficient than assembling @@ -2354,22 +2354,6 @@ The following `ReverseN` must not be called if `Lanes(D()) < N`: must be in the range `[0, 2 * Lanes(d))` but need not be unique. The index type `TI` must be an integer of the same size as `TFromD`. -* V **TableLookupLanesOr**(M m, V a, V b, unspecified) returns the - result of `TableLookupLanes(a, unspecified)` where `m[i]` is true, and returns - `b[i]` where `m[i]` is false. - -* V **TableLookupLanesOrZero**(M m, V a, unspecified) returns - the result of `TableLookupLanes(a, unspecified)` where `m[i]` is true, and - returns zero where `m[i]` is false. - -* V **TwoTablesLookupLanesOr**(D d, M m, V a, V b, unspecified) - returns the result of `TwoTablesLookupLanes(V a, V b, unspecified)` where - `m[i]` is true, and `a[i]` where `m[i]` is false. - -* V **TwoTablesLookupLanesOrZero**(D d, M m, V a, V b, unspecified) - returns the result of `TwoTablesLookupLanes(V a, V b, unspecified)` where - `m[i]` is true, and zero where `m[i]` is false. - * V **Per4LaneBlockShuffle**<size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0>(V v) does a per 4-lane block shuffle of `v` if `Lanes(DFromV())` is greater than or equal to 4 or a shuffle of the diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 3abe34d058..32c520b1bb 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -219,11 +219,6 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ return sv##OP##_##CHAR##BITS(v); \ } -#define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_m(b, m, a); \ - } #define HWY_SVE_RETV_ARGMV(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \ return sv##OP##_##CHAR##BITS##_x(m, v); \ @@ -265,13 +260,8 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ return sv##OP##_##CHAR##BITS##_x(m, a, b); \ } -#define HWY_SVE_RETV_ARGMVV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_m(m, a, b); \ - } // User-specified mask. Mask=false value is zero. -#define HWY_SVE_RETV_ARGMVVZ(BASE, CHAR, BITS, HALF, NAME, OP) \ +#define HWY_SVE_RETV_ARGMVV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) \ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ return sv##OP##_##CHAR##BITS##_z(m, a, b); \ @@ -284,13 +274,6 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) return sv##OP##_##CHAR##BITS(a, b, c); \ } -#define HWY_SVE_RETV_ARGMVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ - HWY_SVE_V(BASE, BITS) c) { \ - return sv##OP##_##CHAR##BITS##_x(m, a, b, c); \ - } - // ------------------------------ Lanes namespace detail { @@ -779,8 +762,8 @@ HWY_API V Or(const V a, const V b) { return BitCast(df, Or(BitCast(du, a), BitCast(du, b))); } -// ------------------------------ MaskedOrOrZero -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVVZ, MaskedOrOrZero, orr) +// ------------------------------ MaskedOr +HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedOr, orr) // ------------------------------ Xor @@ -1697,6 +1680,7 @@ namespace detail { return sv##OP##_##CHAR##BITS(pg, v); \ } +// TODO: Remove SumOfLanesM in favor of using MaskedReduceSum HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv) HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv) @@ -6642,7 +6626,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount, #undef HWY_SVE_IF_NOT_EMULATED_D #undef HWY_SVE_PTRUE #undef HWY_SVE_RETV_ARGMVV -#undef HWY_SVE_RETV_ARGMVVZ +#undef HWY_SVE_RETV_ARGMVV_Z #undef HWY_SVE_RETV_ARGMV_Z #undef HWY_SVE_RETV_ARGMV #undef HWY_SVE_RETV_ARGPV @@ -6650,13 +6634,8 @@ HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount, #undef HWY_SVE_RETV_ARGPVV #undef HWY_SVE_RETV_ARGV #undef HWY_SVE_RETV_ARGVN -#undef HWY_SVE_RETV_ARGMV -#undef HWY_SVE_RETV_ARGMV_M -#undef HWY_SVE_RETV_ARGMV_Z #undef HWY_SVE_RETV_ARGVV -#undef HWY_SVE_RETV_ARGMVV_M #undef HWY_SVE_RETV_ARGVVV -#undef HWY_SVE_RETV_ARGMVVV #undef HWY_SVE_T #undef HWY_SVE_UNDEFINED #undef HWY_SVE_V diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 8e586ea913..e7018a3cf1 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -1026,11 +1026,11 @@ HWY_API TFromD MaskedReduceSum(D d, M m, VFromD v) { } template HWY_API TFromD MaskedReduceMin(D d, M m, VFromD v) { - return ReduceMin(d, IfThenElse(m, v, MaxOfLanes(d, v))); + return ReduceMin(d, IfThenElse(m, v, Set(d, hwy::HighestValue>()))); } template HWY_API TFromD MaskedReduceMax(D d, M m, VFromD v) { - return ReduceMax(d, IfThenElseZero(m, v)); + return ReduceMax(d, IfThenElse(m, v, Set(d, hwy::LowestValue>()))); } #endif // HWY_NATIVE_MASKED_REDUCE_SCALAR @@ -6735,30 +6735,6 @@ HWY_API V ReverseBits(V v) { } #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 -// ------------------------------ TableLookupLanesOr -template -HWY_API V TableLookupLanesOr(M m, V a, V b, IndicesFromD> idx) { - return IfThenElse(m, TableLookupLanes(a, idx), b); -} - -// ------------------------------ TableLookupLanesOrZero -template -HWY_API V TableLookupLanesOrZero(M m, V a, IndicesFromD> idx) { - return IfThenElseZero(m, TableLookupLanes(a, idx)); -} - -// ------------------------------ TwoTablesLookupLanesOr -template -HWY_API V TwoTablesLookupLanesOr(D d, M m, V a, V b, IndicesFromD idx) { - return IfThenElse(m, TwoTablesLookupLanes(d, a, b, idx), a); -} - -// ------------------------------ TwoTablesLookupLanesOrZero -template -HWY_API V TwoTablesLookupLanesOrZero(D d, M m, V a, V b, IndicesFromD idx) { - return IfThenElse(m, TwoTablesLookupLanes(d, a, b, idx), Zero(d)); -} - // ------------------------------ Per4LaneBlockShuffle #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE)) @@ -7615,7 +7591,7 @@ HWY_API V BitShuffle(V v, VI idx) { #endif // HWY_NATIVE_BITSHUFFLE template -HWY_API V MaskedOrOrZero(M m, V a, V b) { +HWY_API V MaskedOr(M m, V a, V b) { return IfThenElseZero(m, Or(a, b)); } // ------------------------------ AllBits1/AllBits0 diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h index 31232f7ee9..9c09c135ba 100644 --- a/hwy/ops/rvv-inl.h +++ b/hwy/ops/rvv-inl.h @@ -4755,6 +4755,8 @@ HWY_API T ReduceMax(D d, const VFromD v) { #undef HWY_RVV_REDUCE +// TODO: add MaskedReduceSum/Min/Max + // ------------------------------ SumOfLanes template diff --git a/hwy/tests/logical_test.cc b/hwy/tests/logical_test.cc index 94fbaccbd4..45ffe99d6a 100644 --- a/hwy/tests/logical_test.cc +++ b/hwy/tests/logical_test.cc @@ -146,26 +146,26 @@ HWY_NOINLINE void TestAllTestBit() { ForIntegerTypes(ForPartialVectors()); } -struct TestMaskedOrOrZero { +struct TestMaskedOr { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const MFromD all_true = MaskTrue(d); const auto v1 = Iota(d, 1); const auto v2 = Iota(d, 2); - HWY_ASSERT_VEC_EQ(d, Or(v2, v1), MaskedOrOrZero(all_true, v1, v2)); + HWY_ASSERT_VEC_EQ(d, Or(v2, v1), MaskedOr(all_true, v1, v2)); const MFromD first_five = FirstN(d, 5); const Vec v0 = Zero(d); const Vec v1_exp = IfThenElse(first_five, Or(v2, v1), v0); - HWY_ASSERT_VEC_EQ(d, v1_exp, MaskedOrOrZero(first_five, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v1_exp, MaskedOr(first_five, v1, v2)); } }; HWY_NOINLINE void TestAllMaskedLogical() { - ForAllTypes(ForPartialVectors()); + ForAllTypes(ForPartialVectors()); } struct TestAllBits { diff --git a/hwy/tests/table_test.cc b/hwy/tests/table_test.cc index eb5b1a8644..09fdd7eaf6 100644 --- a/hwy/tests/table_test.cc +++ b/hwy/tests/table_test.cc @@ -103,59 +103,6 @@ HWY_NOINLINE void TestAllTableLookupLanes() { ForAllTypes(ForPartialVectors()); } -struct TestTableLookupLanesOr { - template -#if HWY_TARGET != HWY_SCALARWE - HWY_NOINLINE void operator()(T /*unused*/, D d) { - const RebindToSigned di; - using TI = TFromD; - - const size_t N = Lanes(d); - // Select indices from N-1 counting down - auto indices = IndicesFromVec( - d, Sub(Set(di, ConvertScalarTo(N - 1)), Iota(di, 0))); - - auto expected = AllocateAligned(N); - auto expected_zero = AllocateAligned(N); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(expected && expected_zero && bool_lanes); - - const auto v1 = Iota(d, 5); - const auto v2 = Iota(d, 8); - - RandomState rng; - - for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); - - if (bool_lanes[i]) { - expected[i] = ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] - expected_zero[i] = - ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] - } else { - expected[i] = ConvertScalarTo(i + 8); // v2[i] - expected_zero[i] = ConvertScalarTo(0); - } - } - - const Vec mask_i = Load(d, bool_lanes.get()); - const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); - HWY_ASSERT_VEC_EQ(d, expected.get(), - TableLookupLanesOr(mask, v1, v2, indices)); - HWY_ASSERT_VEC_EQ(d, expected_zero.get(), - TableLookupLanesOrZero(mask, v1, indices)); -#else - (void) d; -#endif - } - } -}; - -HWY_NOINLINE void TestAllTableLookupLanesOr() { - ForAllTypes(ForPartialVectors()); -} - struct TestTwoTablesLookupLanes { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -247,64 +194,6 @@ HWY_NOINLINE void TestAllTwoTablesLookupLanes() { ForAllTypes(ForPartialVectors()); } -struct TestTwoTablesLookupLanesOr { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - const RebindToSigned di; - using TI = TFromD; - - const size_t N = Lanes(d); - // Select indices from N-1 counting down - auto idx_lower = Sub(Set(di, ConvertScalarTo(N - 1)), Iota(di, 0)); - auto idx_upper = Add(idx_lower, Set(di, ConvertScalarTo(N))); - auto indices = IndicesFromVec(d, OddEven(idx_upper, idx_lower)); - - auto expected = AllocateAligned(N); - auto expected_zero = AllocateAligned(N); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(expected && expected_zero && bool_lanes); - - const auto v1 = Iota(d, 5); - const auto v2 = Iota(d, 8); - - RandomState rng; - - for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); - - if (bool_lanes[i]) { - if (i % 2) { - expected[i] = - ConvertScalarTo(N - i + 8 - 1); // v2[N-1, N-2, ...] - expected_zero[i] = - ConvertScalarTo(N - i + 8 - 1); // v2[N-1, N-2, ...] - } else { - expected[i] = - ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] - expected_zero[i] = - ConvertScalarTo(N - i + 5 - 1); // v1[N-1, N-2, ...] - } - } else { - expected[i] = ConvertScalarTo(i + 5); // v1[i] - expected_zero[i] = ConvertScalarTo(0); - } - } - - const Vec mask_i = Load(d, bool_lanes.get()); - const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); - HWY_ASSERT_VEC_EQ(d, expected.get(), - TwoTablesLookupLanesOr(d, mask, v1, v2, indices)); - HWY_ASSERT_VEC_EQ(d, expected_zero.get(), - TwoTablesLookupLanesOrZero(d, mask, v1, v2, indices)); - } - } -}; - -HWY_NOINLINE void TestAllTwoTablesLookupLanesOr() { - ForAllTypes(ForPartialVectors()); -} - } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -316,9 +205,7 @@ namespace hwy { namespace { HWY_BEFORE_TEST(HwyTableTest); HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTableLookupLanes); -HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTableLookupLanesOr); HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTwoTablesLookupLanes); -HWY_EXPORT_AND_TEST_P(HwyTableTest, TestAllTwoTablesLookupLanesOr); HWY_AFTER_TEST(); } // namespace } // namespace hwy From 4702505feb2009e05c327f9e897961b3b6295180 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Wed, 29 Jan 2025 16:37:37 +0000 Subject: [PATCH 3/5] Fix bool_lanes typing --- hwy/tests/reduction_test.cc | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/hwy/tests/reduction_test.cc b/hwy/tests/reduction_test.cc index fd35f645f6..df204fc003 100644 --- a/hwy/tests/reduction_test.cc +++ b/hwy/tests/reduction_test.cc @@ -357,23 +357,25 @@ struct TestMaskedReduceSum { HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; + using TI = MakeSigned; + const Rebind di; const Vec v2 = Iota(d, 2); const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); HWY_ASSERT(bool_lanes); for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { T expected = 0; for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); if (bool_lanes[i]) { expected += ConvertScalarTo(i + 2); } } - const Vec mask_i = Load(d, bool_lanes.get()); - const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + const auto mask_i = Load(di, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(di))); // If all elements are disabled the result is implementation defined if (AllFalse(d, mask)) { @@ -394,17 +396,19 @@ struct TestMaskedReduceMin { HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; + using TI = MakeSigned; + const Rebind di; const Vec v2 = Iota(d, 2); const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); HWY_ASSERT(bool_lanes); for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { T expected = ConvertScalarTo(N + 3); // larger than any values in the vector for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); if (bool_lanes[i]) { if (expected > ConvertScalarTo(i + 2)) { expected = ConvertScalarTo(i + 2); @@ -412,8 +416,8 @@ struct TestMaskedReduceMin { } } - const Vec mask_i = Load(d, bool_lanes.get()); - const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + const auto mask_i = Load(di, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(di))); // If all elements are disabled the result is implementation defined if (AllFalse(d, mask)) { @@ -434,16 +438,18 @@ struct TestMaskedReduceMax { HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; + using TI = MakeSigned; + const Rebind di; const Vec v2 = Iota(d, 2); const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); HWY_ASSERT(bool_lanes); for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { T expected = 0; for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); if (bool_lanes[i]) { if (expected < ConvertScalarTo(i + 2)) { expected = ConvertScalarTo(i + 2); @@ -451,8 +457,8 @@ struct TestMaskedReduceMax { } } - const Vec mask_i = Load(d, bool_lanes.get()); - const Mask mask = RebindMask(d, Gt(mask_i, Zero(d))); + const auto mask_i = Load(di, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(di))); // If all elements are disabled the result is implementation defined if (AllFalse(d, mask)) { From 61552baaa8bc3089258aa11f0af800696522f477 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Thu, 30 Jan 2025 14:30:20 +0000 Subject: [PATCH 4/5] Fix MaskedReduceMin/Max for floating point values --- hwy/ops/generic_ops-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index e7018a3cf1..4c1e09509a 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -1026,11 +1026,11 @@ HWY_API TFromD MaskedReduceSum(D d, M m, VFromD v) { } template HWY_API TFromD MaskedReduceMin(D d, M m, VFromD v) { - return ReduceMin(d, IfThenElse(m, v, Set(d, hwy::HighestValue>()))); + return ReduceMin(d, IfThenElse(m, v, Set(d, hwy::PositiveInfOrHighestValue >()))); } template HWY_API TFromD MaskedReduceMax(D d, M m, VFromD v) { - return ReduceMax(d, IfThenElse(m, v, Set(d, hwy::LowestValue>()))); + return ReduceMax(d, IfThenElse(m, v, Set(d, hwy::NegativeInfOrLowestValue>()))); } #endif // HWY_NATIVE_MASKED_REDUCE_SCALAR From af4183dab6d7a10240a2ee9439b9071710f9344d Mon Sep 17 00:00:00 2001 From: Will Barber Date: Mon, 3 Feb 2025 14:55:35 +0000 Subject: [PATCH 5/5] Fix rebase issue --- hwy/ops/arm_sve-inl.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 32c520b1bb..d753605fa6 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -274,6 +274,13 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) return sv##OP##_##CHAR##BITS(a, b, c); \ } +#define HWY_SVE_RETV_ARGMVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ + HWY_SVE_V(BASE, BITS) c) { \ + return sv##OP##_##CHAR##BITS##_x(m, a, b, c); \ + } + // ------------------------------ Lanes namespace detail { @@ -6636,6 +6643,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount, #undef HWY_SVE_RETV_ARGVN #undef HWY_SVE_RETV_ARGVV #undef HWY_SVE_RETV_ARGVVV +#undef HWY_SVE_RETV_ARGMVVV #undef HWY_SVE_T #undef HWY_SVE_UNDEFINED #undef HWY_SVE_V