From 0bed7c1596d0daf7e846585a01df8c9dbfb50a1e Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Fri, 14 Feb 2025 17:23:16 +0200
Subject: [PATCH] Math: FIR: Optimize filter core function for HiFi5

This patch optimizes the function fir_32x16_2x_hifi5().

- The (4x) quad-MAC with AE_MULAFD32X16X2_FIR_HH() and
  AE_MULAFD32X16X2_FIR_HL() is replaced with an 8x MAC intrinsic
  AE_MULA2Q32X16_FIR_H().
- Since the 8x MAC does not support fractional multiplication, a
  shift left by one is added to adjust the format to Q17.47.
- The single-sample saturate and round of each output is replaced
  with an instruction that rounds two 64 bit accumulators at once.

WIP - Currently the MCPS saving with FIR EQ and TDFB components
seems much smaller, only 0.2 MCPS.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/math/fir_hifi5.c | 77 +++++++++++++++++---------------------------
 1 file changed, 30 insertions(+), 47 deletions(-)

diff --git a/src/math/fir_hifi5.c b/src/math/fir_hifi5.c
index 4d9a93d250e5..30c8854179bb 100644
--- a/src/math/fir_hifi5.c
+++ b/src/math/fir_hifi5.c
@@ -85,11 +85,6 @@ void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
 }
 EXPORT_SYMBOL(fir_get_lrshifts);
 
-/* HiFi EP has the follow number of reqisters that should not be exceeded
- * 4x 56 bit registers in register file Q
- * 8x 48 bit registers in register file P
- */
-
 void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
 {
 	/* This function uses
@@ -163,31 +158,26 @@ void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
 }
 EXPORT_SYMBOL(fir_32x16);
 
-/* HiFi EP has the follow number of reqisters that should not be exceeded
- * 4x 56 bit registers in register file Q
- * 8x 48 bit registers in register file P
- */
-
 void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
 		  ae_int32 *y0, ae_int32 *y1, int shift)
 {
 	/* This function uses
-	 * 2x 56 bit registers Q,
-	 * 4x 48 bit registers P
+	 * 7x 64 bit AE registers
 	 * 3x integers
 	 * 2x address pointers,
 	 */
-	ae_f64 a;
-	ae_f64 b;
 	ae_valign u;
+	ae_f64 a = AE_ZERO64();
+	ae_f64 b = AE_ZERO64();
 	ae_f32x2 d0;
 	ae_f32x2 d1;
+	ae_f32x2 d2;
 	ae_f16x4 coefs;
-	int i;
-	ae_f32x2 *dp;
+	ae_int32x2 *dp;
 	ae_f16x4 *coefp = fir->coef;
 	const int taps_div_4 = fir->taps >> 2;
 	const int inc = 2 * sizeof(int32_t);
+	int i;
 
 	/* Bypass samples if taps count is zero. */
 	if (!taps_div_4) {
@@ -198,21 +188,14 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
 
 	/* Write samples to delay */
 	AE_S32_L_XC(x0, fir->rwp, -sizeof(int32_t));
-	dp = (ae_f32x2 *)fir->rwp;
+	dp = (ae_int32x2 *)fir->rwp;
 	AE_S32_L_XC(x1, fir->rwp, -sizeof(int32_t));
 
-	/* Note: If the next function is converted to handle two samples
-	 * per call the data load can be done with single instruction
-	 * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
-	 */
-	a = AE_ZERO64();
-	b = AE_ZERO64();
-
 	/* Prime the coefficients stream */
 	u = AE_LA64_PP(coefp);
 
-	/* Load two data samples and pack to d0 to data2_h and
-	 * d1 to data2_l.
+	/* Load the two newest samples first, then proceed
+	 * to older input samples in the delay line.
 	 */
 	AE_L32X2_XC(d0, dp, inc);
 	for (i = 0; i < taps_div_4; i++) {
@@ -222,34 +205,34 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
 		 */
 		AE_LA16X4_IP(coefs, u, coefp);
 
-		/* Load two data samples. Upper part d1_h is x[n+1] and
-		 * lower part d1_l is x[n].
+		/* Load four more data samples (two pairs).
+		 * d0.H is x[n] the newest sample
+		 * d0.L is x[n-1]
+		 * d1.H is x[n-2]
+		 * d1.L is x[n-3]
+		 * d2.H is x[n-4]
 		 */
 		AE_L32X2_XC(d1, dp, inc);
+		AE_L32X2_XC(d2, dp, inc);
 
-		/* Quad MAC (HH)
-		 * b += d0_h * coefs_3 + d0_l * coefs_2
-		 * a += d0_l * coefs_3 + d1_h * coefs_2
+		/* Calculate four FIR taps for current (x1 -> b) and previous input (x0 -> a)
+		 * b = b  + d0.H * c.3  + d0.L * c.2  + d1.H * c.1  + d1.L * c.0
+		 * a = a  + d0.L * c.3  + d1.H * c.2  + d1.L * c.1  + d2.H * c.0
 		 */
-		AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
-		d0 = d1;
-
-		/* Repeat the same for next two taps and increase coefp. */
-		AE_L32X2_XC(d1, dp, inc);
+		AE_MULA2Q32X16_FIR_H(b, a, d0, d1, d2, coefs);
 
-		/* Quad MAC (HL)
-		 * b += d0_h * coefs_1 + d0_l * coefs_0
-		 * a += d0_l * coefs_1 + d1_h * coefs_0
-		 */
-		AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
-		d0 = d1;
+		/* Prepare for next four taps, d2 overlaps to next loop iteration as d0 */
+		d0 = d2;
 	}
 
-	/* Do scaling shifts and store sample. */
-	b = AE_SLAA64S(b, shift);
-	a = AE_SLAA64S(a, shift);
-	AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
-	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
+	/* Shift left by one to adjust the Q1.31 x Q1.15 -> Q2.46 products to
+	 * Q17.47 format for the rounding, then store the output samples.
+	 */
+	b = AE_SLAA64S(b, shift + 1);
+	a = AE_SLAA64S(a, shift + 1);
+	d0 = AE_ROUND32X2F48SASYM(b, a);
+	AE_S32_H_I(d0, (ae_int32 *)y1, 0);
+	AE_S32_L_I(d0, (ae_int32 *)y0, 0);
 }
 EXPORT_SYMBOL(fir_32x16_2x);