PR 117048: aarch64: Add define_insn_and_split for vector ROTATE
The ultimate goal in this PR is to match the XAR pattern that is
represented as a (ROTATE (XOR X Y) VCST) from the ACLE intrinsics code
in the testcase.
The first blocker for this was the missing recognition of ROTATE in
simplify-rtx, which is fixed in the previous patch.
The next problem is that once combine has matched the ROTATE from the
shifts and orr/xor/plus, it tries to recognise it as an insn on its own
before attempting the follow-up combination of the XOR into it.  But as
we don't have a backend pattern for a vector ROTATE, that recog fails
and combine never tries the XOR+ROTATE combination, which would have
succeeded.
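
Roughly, combine goes through these two shapes (an illustrative
sketch; the register numbers and the const_vector spelling are not
exact):

  ;; Step 1: the synthesised ROTATE, which needs a backend pattern
  ;; to be recognised at all:
  (set (reg:V2DI tmp)
       (rotate:V2DI (reg:V2DI c) (const_vector:V2DI [1 1])))

  ;; Step 2: the follow-up combination into the XAR shape:
  (set (reg:V2DI dst)
       (rotate:V2DI (xor:V2DI (reg:V2DI a) (reg:V2DI b))
		    (const_vector:V2DI [1 1])))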

This patch solves that by introducing a sort of "scaffolding" pattern
for vector ROTATE, which allows it to be combined into the XAR.
If it ends up not combined into anything, the splitter breaks it back
down into the SHL+USRA sequence that would have been emitted otherwise.
Having this splitter also lets us special-case some rotate amounts in
the future to emit more specialised instructions, e.g. from the REV*
family, whenever the ROTATE is not combined into something else.

This optimisation is done in the next patch in the series.
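
For reference, a rotate that is not combined away gets split back to
the base sequence; e.g. a V2DI rotate left by one becomes roughly
(illustrative register allocation):

	shl	v1.2d, v0.2d, #1	// v1 = v0 << 1
	usra	v1.2d, v0.2d, #63	// v1 += v0 >> 63, completing the rotate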

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <[email protected]>

gcc/

	PR target/117048
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	New define_insn_and_split.

gcc/testsuite/

	PR target/117048
	* gcc.target/aarch64/simd/pr117048.c: New test.
ktkachov committed Nov 4, 2024
1 parent 1e5ff11 commit 1411d39
Showing 2 changed files with 102 additions and 0 deletions.
29 changes: 29 additions & 0 deletions gcc/config/aarch64/aarch64-simd.md
@@ -1294,6 +1294,35 @@
[(set_attr "type" "neon_shift_acc<q>")]
)

;; After all the combinations and propagations of ROTATE have been
;; attempted, split any remaining vector rotates into SHL + USRA sequences.
(define_insn_and_split "*aarch64_simd_rotate_imm<mode>"
  [(set (match_operand:VDQ_I 0 "register_operand" "=&w")
	(rotate:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
		      (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))]
  "TARGET_SIMD"
  "#"
  "&& 1"
  [(set (match_dup 3)
	(ashift:VDQ_I (match_dup 1)
		      (match_dup 2)))
   (set (match_dup 0)
	(plus:VDQ_I
	  (lshiftrt:VDQ_I
	    (match_dup 1)
	    (match_dup 4))
	  (match_dup 3)))]
  {
    operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
    rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
    int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
    operands[4]
      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
					   bitwidth - INTVAL (shft_amnt));
  }
  [(set_attr "length" "8")]
)

(define_insn "aarch64_<sra_op>rsra_n<mode>_insn"
[(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w")
(plus:VSDQ_I_DI
73 changes: 73 additions & 0 deletions gcc/testsuite/gcc.target/aarch64/simd/pr117048.c
@@ -0,0 +1,73 @@
/* { dg-do compile } */
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

#include <arm_neon.h>

#pragma GCC target "+sha3"

/*
** func_shl_eor:
**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
**	ret
*/
uint64x2_t
func_shl_eor (uint64x2_t a, uint64x2_t b) {
  uint64x2_t c = veorq_u64 (a, b);
  return veorq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
}

/*
** func_add_eor:
**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
**	ret
*/
uint64x2_t
func_add_eor (uint64x2_t a, uint64x2_t b) {
  uint64x2_t c = veorq_u64 (a, b);
  return veorq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
}

/*
** func_shl_orr:
**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
**	ret
*/
uint64x2_t
func_shl_orr (uint64x2_t a, uint64x2_t b) {
  uint64x2_t c = veorq_u64 (a, b);
  return vorrq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
}

/*
** func_add_orr:
**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
**	ret
*/
uint64x2_t
func_add_orr (uint64x2_t a, uint64x2_t b) {
  uint64x2_t c = veorq_u64 (a, b);
  return vorrq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
}

/*
** func_shl_add:
**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
**	ret
*/
uint64x2_t
func_shl_add (uint64x2_t a, uint64x2_t b) {
  uint64x2_t c = veorq_u64 (a, b);
  return vaddq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
}

/*
** func_add_add:
**	xar	v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
**	ret
*/
uint64x2_t
func_add_add (uint64x2_t a, uint64x2_t b) {
  uint64x2_t c = veorq_u64 (a, b);
  return vaddq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
}
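
As an editorial usage sketch beyond what the testcase covers (the
function name is hypothetical, not part of the commit): a rotate whose
input is not an XOR has no XAR to combine into, so with this patch the
scaffolding pattern is split back into the SHL + USRA pair:

/* Plain rotate left by 1: no XOR feeding it, so no XAR is formed;
   with this patch it is expected to compile to shl + usra.  */
uint64x2_t
func_rotl1 (uint64x2_t a) {
  return vorrq_u64 (vshlq_n_u64 (a, 1), vshrq_n_u64 (a, 63));
}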
