Skip to content

Commit

Permalink
Math: FIR: Optimize filter core function for HiFi5
Browse files Browse the repository at this point in the history
This patch optimizes the function fir_32x16_2x_hifi5().

- The (4x) quad-MAC with AE_MULAFD32X16X2_FIR_HH() and
  AE_MULAFD32X16X2_FIR_HL() is replaced with an 8x MAC intrinsic
  AE_MULA2Q32X16_FIR_H().
- Since the 8x MAC does not support fractional multiplication, a
  shift left by one is added to adjust the format to Q17.47.
- The single-sample saturation and rounding of the output is replaced
  with an instruction that rounds two 64-bit accumulators.

WIP - Currently the MCPS saving with the FIR EQ and TDFB components
seems much smaller than expected, only 0.2 MCPS.

Signed-off-by: Seppo Ingalsuo <[email protected]>
  • Loading branch information
singalsu committed Feb 14, 2025
1 parent f8f60e9 commit 0bed7c1
Showing 1 changed file with 30 additions and 47 deletions.
77 changes: 30 additions & 47 deletions src/math/fir_hifi5.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,6 @@ void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
}
EXPORT_SYMBOL(fir_get_lrshifts);

/* HiFi EP has the following number of registers that should not be exceeded
* 4x 56 bit registers in register file Q
* 8x 48 bit registers in register file P
*/

void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
{
/* This function uses
Expand Down Expand Up @@ -163,31 +158,26 @@ void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
}
EXPORT_SYMBOL(fir_32x16);

/* HiFi EP has the following number of registers that should not be exceeded
* 4x 56 bit registers in register file Q
* 8x 48 bit registers in register file P
*/

void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
ae_int32 *y0, ae_int32 *y1, int shift)
{
/* This function uses
* 2x 56 bit registers Q,
* 4x 48 bit registers P
* 7x 64 bit AE registers
* 3x integers
* 2x address pointers,
*/
ae_f64 a;
ae_f64 b;
ae_valign u;
ae_f64 a = AE_ZERO64();
ae_f64 b = AE_ZERO64();
ae_f32x2 d0;
ae_f32x2 d1;
ae_f32x2 d2;
ae_f16x4 coefs;
int i;
ae_f32x2 *dp;
ae_int32x2 *dp;
ae_f16x4 *coefp = fir->coef;
const int taps_div_4 = fir->taps >> 2;
const int inc = 2 * sizeof(int32_t);
int i;

/* Bypass samples if taps count is zero. */
if (!taps_div_4) {
Expand All @@ -198,21 +188,14 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,

/* Write samples to delay */
AE_S32_L_XC(x0, fir->rwp, -sizeof(int32_t));
dp = (ae_f32x2 *)fir->rwp;
dp = (ae_int32x2 *)fir->rwp;
AE_S32_L_XC(x1, fir->rwp, -sizeof(int32_t));

/* Note: If the next function is converted to handle two samples
* per call the data load can be done with single instruction
* AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
*/
a = AE_ZERO64();
b = AE_ZERO64();

/* Prime the coefficients stream */
u = AE_LA64_PP(coefp);

/* Load two data samples and pack to d0 to data2_h and
* d1 to data2_l.
/* Load the two newest samples first, then proceed
* to older input samples in the delay line.
*/
AE_L32X2_XC(d0, dp, inc);
for (i = 0; i < taps_div_4; i++) {
Expand All @@ -222,34 +205,34 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
*/
AE_LA16X4_IP(coefs, u, coefp);

/* Load two data samples. Upper part d1_h is x[n+1] and
* lower part d1_l is x[n].
/* Load two data samples more.
* d0.H is x[n] the newest sample
* d0.L is x[n-1]
* d1.H is x[n-2]
* d1.L is x[n-3]
* d2.H is x[n-4]
*/
AE_L32X2_XC(d1, dp, inc);
AE_L32X2_XC(d2, dp, inc);

/* Quad MAC (HH)
* b += d0_h * coefs_3 + d0_l * coefs_2
* a += d0_l * coefs_3 + d1_h * coefs_2
/* Calculate four FIR taps for current (x1 -> a) and previous input (x0 -> b)
* b = b + d0.H * c.3 + d0.L * c.2 + d1.H * c.1 + d1.L * c.0
* a = a + d0.L * c.3 + d1.H * c.2 + d1.L * c.1 + d2.H * c.0
*/
AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
d0 = d1;

/* Repeat the same for next two taps and increase coefp. */
AE_L32X2_XC(d1, dp, inc);
AE_MULA2Q32X16_FIR_H(b, a, d0, d1, d2, coefs);

/* Quad MAC (HL)
* b += d0_h * coefs_1 + d0_l * coefs_0
* a += d0_l * coefs_1 + d1_h * coefs_0
*/
AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
d0 = d1;
/* Prepare for next four taps, d2 overlaps to next loop iteration as d0 */
d0 = d2;
}

/* Do scaling shifts and store sample. */
b = AE_SLAA64S(b, shift);
a = AE_SLAA64S(a, shift);
AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
/* Shift left by one to adjust the Q1.31 x Q1.15 -> Q2.46 products to
* Q17.47 format for the round, then store the output samples.
*/
b = AE_SLAA64S(b, shift + 1);
a = AE_SLAA64S(a, shift + 1);
d0 = AE_ROUND32X2F48SASYM(b, a);
AE_S32_H_I(d0, (ae_int32 *)y1, 0);
AE_S32_L_I(d0, (ae_int32 *)y0, 0);
}
EXPORT_SYMBOL(fir_32x16_2x);

Expand Down

0 comments on commit 0bed7c1

Please sign in to comment.