Drop FMA intrinsic

Danila Kutenin pointed out: "Technically speaking, _mm_fmadd_ps is not an SSE extension, this was introduced with fma extension which took place even after AVX." To clarify the purpose of SSE2NEON, this patch drops the existing FMA implementation. The instruction vfmaq_f32, standing for "fused floating-point multiply-accumulate", is only available for VFPv4+. Thus, for Armv7-A targets, we have to take the following cases into consideration: * VFPv3, which is implemented on Cortex-R4, R5, Cortex-A9 * VFPv4, which is implemented on the A15 and Cortex-A7, or later. According to the ACLE spec[1], "__ARM_FEATURE_FMA" is defined to 1 if the hardware floating-point architecture supports fused floating-point multiply-accumulate. Related: #82 [1] https://developer.arm.com/architectures/system-architectures/software-standards/acle
DLTcollab · Jun 5, 2021 · ab1ceea · ab1ceea
1 parent d44d259
commit ab1ceea
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 29 deletions.
diff --git a/sse2neon.h b/sse2neon.h
@@ -359,8 +359,6 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
 // SSE4.2
 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
-// FMA
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128, __m128, __m128);
 
 /* Backwards compatibility for compilers with lack of specific type support */
 
@@ -6034,7 +6032,13 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
-    return _mm_fmadd_ps(b, mask, a);
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
 }
 
 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
@@ -8012,24 +8016,6 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 }
 #endif
 
-/* FMA */
-
-// Computes the fused multiple add product of 32-bit floating point numbers.
-//
-// Return Value
-// Multiplies A and B, and adds C to the temporary result before returning it.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
-                                            vreinterpretq_f32_m128(b),
-                                            vreinterpretq_f32_m128(a)));
-#else
-    return _mm_add_ps(_mm_mul_ps(a, b), c);
-#endif
-}
-
 /* Others */
 
 // Perform a carry-less multiplication of two 64-bit integers, selected from a

diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -8776,12 +8776,6 @@ result_t test_mm_aeskeygenassist_si128(const SSE2NEONTestImpl &impl,
     return validate128(resultReference, resultIntrinsic);
 }
 
-/* FMA */
-result_t test_mm_fmadd_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
-{
-    return TEST_UNIMPL;
-}
-
 /* Others */
 result_t test_mm_clmulepi64_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 {

diff --git a/tests/impl.h b/tests/impl.h
@@ -526,8 +526,6 @@
     TYPE(mm_aesenc_si128)          \
     TYPE(mm_aesenclast_si128)      \
     TYPE(mm_aeskeygenassist_si128) \
-    /* FMA */                      \
-    TYPE(mm_fmadd_ps)              \
     /* Others */                   \
     TYPE(mm_clmulepi64_si128)      \
     TYPE(mm_popcnt_u32)            \