Skip to content

Commit

Permalink
Added row-pref armv8a ukernel from #698.
Browse files Browse the repository at this point in the history
Details:
- Integrated changes from PR #698 to enable testing in the context of
  the 'stable' branch. These changes add row-preferential sgemm and
  dgemm microkernels for the armv8a kernel set.
- Updated the 'altra' subconfig to easily switch between the previous
  (column-preferential) ukernel and the aforementioned row-pref ukernel.
  • Loading branch information
fgvanzee committed Oct 6, 2023
1 parent 1232831 commit 55598a9
Show file tree
Hide file tree
Showing 3 changed files with 662 additions and 5 deletions.
25 changes: 20 additions & 5 deletions config/altra/bli_cntx_init_altra.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

#include "blis.h"

//#define USE_ROWPREF_UKERNEL

void bli_cntx_init_altra( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
Expand All @@ -48,19 +50,32 @@ void bli_cntx_init_altra( cntx_t* cntx )
bli_cntx_set_l3_nat_ukrs
(
2,
#ifdef USE_ROWPREF_UKERNEL
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_12x8r, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_8x6r, TRUE,
#else
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE,
#endif
cntx
);

// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 ); // Changed d to 480 - LDR
// bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 6144, -1, -1 ); // Doubled NC
#ifdef USE_ROWPREF_UKERNEL
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 12, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 12288, 8196, -1, -1 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 192, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 480, -1, -1 ); // Changed d to 480 - LDR
// bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 6144, -1, -1 ); // Doubled NC
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 12288, 8192, -1, -1 ); // Increased NC slightly more
#endif

// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
Expand Down
16 changes: 16 additions & 0 deletions kernels/armv8a/3/armv8a_asm_d2x2.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,25 @@
" fmla v"#C0".2d, v"#A".2d, v"#B".d[0] \n\t" \
" fmla v"#C1".2d, v"#A".2d, v"#B".d[1] \n\t"

/*
 * DGEMM_2X2_NANOKERNEL_PLAIN: thin alias that forwards its four arguments,
 * unchanged and in the same order, to DGEMM_2X2_NANOKERNEL.
 *
 *   ACC0, ACC1  - passed through as the first and second arguments.
 *   AVEC, BVEC  - passed through as the third and fourth arguments.
 *
 * NOTE(review): presumably the _PLAIN suffix exists so that callers can
 * select between the _PLAIN and _INIT variants by token pasting -- confirm
 * against the microkernels that use it.
 */
#define DGEMM_2X2_NANOKERNEL_PLAIN(ACC0,ACC1,AVEC,BVEC) \
  DGEMM_2X2_NANOKERNEL(ACC0,ACC1,AVEC,BVEC)

/*
 * DGEMM_2X2_NANOKERNEL_INIT: expands to the assembly text of two `fmul`
 * instructions operating on `.2d` (two double-precision lanes) vector
 * registers:
 *
 *   fmul v<ACC0>.2d, v<AVEC>.2d, v<BVEC>.d[0]
 *   fmul v<ACC1>.2d, v<AVEC>.2d, v<BVEC>.d[1]
 *
 * Each argument is stringized with `#` and pasted directly after a "v", so
 * callers must pass bare register numbers (e.g. 0, 17); an argument that is
 * itself a macro is NOT expanded before being stringized.
 *
 *   ACC0, ACC1  - destination register numbers, one per lane of BVEC.
 *   AVEC        - register number of the vector multiplicand.
 *   BVEC        - register number whose lanes d[0] and d[1] are the scalars.
 *
 * Because `fmul` overwrites its destination rather than accumulating into
 * it, the prior contents of ACC0/ACC1 do not contribute to the result.
 */
#define DGEMM_2X2_NANOKERNEL_INIT(ACC0,ACC1,AVEC,BVEC) \
  " fmul v"#ACC0".2d, v"#AVEC".2d, v"#BVEC".d[0] \n\t" \
  " fmul v"#ACC1".2d, v"#AVEC".2d, v"#BVEC".d[1] \n\t"

/*
 * SGEMM_4X4_NANOKERNEL: expands to the assembly text of four `fmla`
 * instructions operating on `.4s` (four single-precision lanes) vector
 * registers:
 *
 *   fmla v<ACC0>.4s, v<AVEC>.4s, v<BVEC>.s[0]
 *   fmla v<ACC1>.4s, v<AVEC>.4s, v<BVEC>.s[1]
 *   fmla v<ACC2>.4s, v<AVEC>.4s, v<BVEC>.s[2]
 *   fmla v<ACC3>.4s, v<AVEC>.4s, v<BVEC>.s[3]
 *
 * Each argument is stringized with `#` and pasted directly after a "v", so
 * callers must pass bare register numbers; an argument that is itself a
 * macro is NOT expanded before being stringized.
 *
 *   ACC0..ACC3  - accumulator register numbers, one per lane of BVEC.
 *   AVEC        - register number of the vector multiplicand.
 *   BVEC        - register number whose lanes s[0]..s[3] are the scalars.
 */
#define SGEMM_4X4_NANOKERNEL(ACC0,ACC1,ACC2,ACC3,AVEC,BVEC) \
  " fmla v"#ACC0".4s, v"#AVEC".4s, v"#BVEC".s[0] \n\t" \
  " fmla v"#ACC1".4s, v"#AVEC".4s, v"#BVEC".s[1] \n\t" \
  " fmla v"#ACC2".4s, v"#AVEC".4s, v"#BVEC".s[2] \n\t" \
  " fmla v"#ACC3".4s, v"#AVEC".4s, v"#BVEC".s[3] \n\t"

/*
 * SGEMM_4X4_NANOKERNEL_PLAIN: thin alias that forwards its six arguments,
 * unchanged and in the same order, to SGEMM_4X4_NANOKERNEL.
 *
 *   ACC0..ACC3  - passed through as the first four arguments.
 *   AVEC, BVEC  - passed through as the fifth and sixth arguments.
 *
 * NOTE(review): presumably the _PLAIN suffix exists so that callers can
 * select between the _PLAIN and _INIT variants by token pasting -- confirm
 * against the microkernels that use it.
 */
#define SGEMM_4X4_NANOKERNEL_PLAIN(ACC0,ACC1,ACC2,ACC3,AVEC,BVEC) \
  SGEMM_4X4_NANOKERNEL(ACC0,ACC1,ACC2,ACC3,AVEC,BVEC)

/*
 * SGEMM_4X4_NANOKERNEL_INIT: expands to the assembly text of four `fmul`
 * instructions operating on `.4s` (four single-precision lanes) vector
 * registers:
 *
 *   fmul v<ACC0>.4s, v<AVEC>.4s, v<BVEC>.s[0]
 *   fmul v<ACC1>.4s, v<AVEC>.4s, v<BVEC>.s[1]
 *   fmul v<ACC2>.4s, v<AVEC>.4s, v<BVEC>.s[2]
 *   fmul v<ACC3>.4s, v<AVEC>.4s, v<BVEC>.s[3]
 *
 * Each argument is stringized with `#` and pasted directly after a "v", so
 * callers must pass bare register numbers; an argument that is itself a
 * macro is NOT expanded before being stringized.
 *
 *   ACC0..ACC3  - destination register numbers, one per lane of BVEC.
 *   AVEC        - register number of the vector multiplicand.
 *   BVEC        - register number whose lanes s[0]..s[3] are the scalars.
 *
 * Because `fmul` overwrites its destination rather than accumulating into
 * it, the prior contents of ACC0..ACC3 do not contribute to the result.
 */
#define SGEMM_4X4_NANOKERNEL_INIT(ACC0,ACC1,ACC2,ACC3,AVEC,BVEC) \
  " fmul v"#ACC0".4s, v"#AVEC".4s, v"#BVEC".s[0] \n\t" \
  " fmul v"#ACC1".4s, v"#AVEC".4s, v"#BVEC".s[1] \n\t" \
  " fmul v"#ACC2".4s, v"#AVEC".4s, v"#BVEC".s[2] \n\t" \
  " fmul v"#ACC3".4s, v"#AVEC".4s, v"#BVEC".s[3] \n\t"

Loading

0 comments on commit 55598a9

Please sign in to comment.