From 0bed7c1596d0daf7e846585a01df8c9dbfb50a1e Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Fri, 14 Feb 2025 17:23:16 +0200 Subject: [PATCH] Math: FIR: Optimize filter core function for HiFi5 This patch optimizes the function fir_32x16_2x_hifi5(). - The (4x) quad-MAC with AE_MULAFD32X16X2_FIR_HH() and AE_MULAFD32X16X2_FIR_HL() is replaced with a 8x MAC intrinsic AE_MULA2Q32X16_FIR_H(). - Since the 8x MAC is not supporting fractions, a shift left by one is added to adjust the format to Q17.47. - The output sample single saturation and round is replaced with instruction that rounds two 64 bit accumulators. WIP - Currently the MCPS saving with FIR EQ and TDFB components seems much smaller, only 0.2 MCPS. Signed-off-by: Seppo Ingalsuo --- src/math/fir_hifi5.c | 77 +++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 47 deletions(-) diff --git a/src/math/fir_hifi5.c b/src/math/fir_hifi5.c index 4d9a93d250e5..30c8854179bb 100644 --- a/src/math/fir_hifi5.c +++ b/src/math/fir_hifi5.c @@ -85,11 +85,6 @@ void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift, } EXPORT_SYMBOL(fir_get_lrshifts); -/* HiFi EP has the follow number of reqisters that should not be exceeded - * 4x 56 bit registers in register file Q - * 8x 48 bit registers in register file P - */ - void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift) { /* This function uses @@ -163,31 +158,26 @@ void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift) } EXPORT_SYMBOL(fir_32x16); -/* HiFi EP has the follow number of reqisters that should not be exceeded - * 4x 56 bit registers in register file Q - * 8x 48 bit registers in register file P - */ - void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, ae_int32 *y0, ae_int32 *y1, int shift) { /* This function uses - * 2x 56 bit registers Q, - * 4x 48 bit registers P + * 7x 64 bit AE registers * 3x integers * 2x address pointers, */ - ae_f64 a; - ae_f64 b; ae_valign u; + ae_f64 a = AE_ZERO64(); + ae_f64 b = AE_ZERO64(); ae_f32x2 d0; ae_f32x2 d1; + ae_f32x2 d2; ae_f16x4 coefs; - int i; - ae_f32x2 *dp; + ae_int32x2 *dp; ae_f16x4 *coefp = fir->coef; const int taps_div_4 = fir->taps >> 2; const int inc = 2 * sizeof(int32_t); + int i; /* Bypass samples if taps count is zero. */ if (!taps_div_4) { @@ -198,21 +188,14 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, /* Write samples to delay */ AE_S32_L_XC(x0, fir->rwp, -sizeof(int32_t)); - dp = (ae_f32x2 *)fir->rwp; + dp = (ae_int32x2 *)fir->rwp; AE_S32_L_XC(x1, fir->rwp, -sizeof(int32_t)); - /* Note: If the next function is converted to handle two samples - * per call the data load can be done with single instruction - * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f)); - */ - a = AE_ZERO64(); - b = AE_ZERO64(); - /* Prime the coefficients stream */ u = AE_LA64_PP(coefp); - /* Load two data samples and pack to d0 to data2_h and - * d1 to data2_l. + /* Load two samples, two newest samples and proceed + * to elder input samples in delay line. */ AE_L32X2_XC(d0, dp, inc); for (i = 0; i < taps_div_4; i++) { @@ -222,34 +205,34 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1, */ AE_LA16X4_IP(coefs, u, coefp); - /* Load two data samples. Upper part d1_h is x[n+1] and - * lower part d1_l is x[n]. + /* Load two data samples more. + * d0.H is x[n] the newest sample + * d0.L is x[n-1] + * d1.H is x[n-2] + * d1.L is x[n-3] + * d2.H is x[n-4] */ AE_L32X2_XC(d1, dp, inc); + AE_L32X2_XC(d2, dp, inc); - /* Quad MAC (HH) - * b += d0_h * coefs_3 + d0_l * coefs_2 - * a += d0_l * coefs_3 + d1_h * coefs_2 + /* Calculate four FIR taps for current (x1 -> a) and previous input (x0 -> b) + * b = b + d0.H * c.3 + d0.L * c.2 + d1.H * c.1 + d1.L * c.0 + * a = a + d0.L * c.3 + d1.H * c.2 + d1.L * c.1 + d2.H * c.0 */ - AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs); - d0 = d1; - - /* Repeat the same for next two taps and increase coefp. */ - AE_L32X2_XC(d1, dp, inc); + AE_MULA2Q32X16_FIR_H(b, a, d0, d1, d2, coefs); - /* Quad MAC (HL) - * b += d0_h * coefs_1 + d0_l * coefs_0 - * a += d0_l * coefs_1 + d1_h * coefs_0 - */ - AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs); - d0 = d1; + /* Prepare for next four taps, d2 overlaps to next loop iteration as d0 */ + d0 = d2; } - /* Do scaling shifts and store sample. */ - b = AE_SLAA64S(b, shift); - a = AE_SLAA64S(a, shift); - AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0); - AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0); + /* Shift left by one Q1.31 x Q1.15 -> Q2.46 format for Q2.47 round and + * store output samples. + */ + b = AE_SLAA64S(b, shift + 1); + a = AE_SLAA64S(a, shift + 1); + d0 = AE_ROUND32X2F48SASYM(b, a); + AE_S32_H_I(d0, (ae_int32 *)y1, 0); + AE_S32_L_I(d0, (ae_int32 *)y0, 0); } EXPORT_SYMBOL(fir_32x16_2x);