From 19757e1c28de07b45da03117e6ff7ae3e21e5a7a Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov
Date: Wed, 16 Oct 2024 04:10:08 -0700
Subject: [PATCH] aarch64: Optimize vector rotates as vector permutes where
 possible

Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.
In particular, when the rotate amount is half the bitwidth of the element
we can use a REV64, REV32 or REV16 instruction.
More generally, rotates by a whole number of bytes can be implemented
using vector permutes.
This patch adds such a generic routine in expmed.cc called
expand_rotate_as_vec_perm that calculates the required permute indices
and uses the expand_vec_perm_const interface.

On aarch64 this ends up generating the single-instruction sequences above
where possible and can use LDR+TBL sequences too, which are a good choice.

With help from Richard, the routine should be VLA-safe.
However, the only use of expand_rotate_as_vec_perm introduced in this patch
is in aarch64-specific code that for now only handles fixed-width modes.

A runtime aarch64 test is added to ensure the permute indices are computed
correctly.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov

gcc/

	* expmed.h (expand_rotate_as_vec_perm): Declare.
	* expmed.cc (expand_rotate_as_vec_perm): Define.
	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/

	* gcc.target/aarch64/vec-rot-exec.c: New test.
	* gcc.target/aarch64/simd/pr117048_2.c: New test.
---
 gcc/config/aarch64/aarch64-protos.h           |   1 +
 gcc/config/aarch64/aarch64-simd.md            |   3 +
 gcc/config/aarch64/aarch64.cc                 |  16 +++
 gcc/expmed.cc                                 |  44 ++++++++
 gcc/expmed.h                                  |   1 +
 .../gcc.target/aarch64/simd/pr117048_2.c      |  66 ++++++++++++
 .../gcc.target/aarch64/vec-rot-exec.c         | 101 ++++++++++++++++++
 7 files changed, 232 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 05caad5e2fe..e8588e1cb17 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 08b121227ee..a91222b6e3b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1313,6 +1313,9 @@
 	  (match_dup 4))
 	 (match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0fa7927d821..7388f6b8fdf 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  machine_mode mode = GET_MODE (reg);
+  /* Attempt to expand the rotate as a vector permute.
+     For some rotate amounts they can be single instructions and
+     even the general single-vector TBL permute has good throughput.  */
+  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index aa9f1abc8ab..2d5e5243ce8 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
   return target;
 }
 
+/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
+   permute operation.  Emit code to put the result in DST if successful and
+   return it.  Otherwise return NULL.  This is intended to implement vector
+   rotates by byte amounts using vector permutes when the target does not offer
+   native vector rotate operations.  */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+  /* For now handle only rotate by the same integer constant in all lanes.
+     In principle rotates by any constant vector are representable through
+     permutes as long as the individual rotate amounts are multiples of
+     BITS_PER_UNIT.  */
+  if (!CONST_INT_P (amt_unwrap))
+    return NULL_RTX;
+
+  int rotamnt = INTVAL (amt_unwrap);
+  if (rotamnt % BITS_PER_UNIT != 0)
+    return NULL_RTX;
+  machine_mode qimode;
+  if (!qimode_for_vec_perm (mode).exists (&qimode))
+    return NULL_RTX;
+
+  vec_perm_builder builder;
+  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  poly_uint64 total_units = GET_MODE_SIZE (mode);
+  builder.new_vector (total_units, nunits, 3);
+  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
+  for (unsigned j = 0; j < 3 * nunits; j += nunits)
+    for (unsigned i = 0; i < nunits; i++)
+      builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+  rtx perm_src = lowpart_subreg (qimode, x, mode);
+  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+  rtx res
+    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+			     qimode, perm_dst);
+  if (!res)
+    return NULL_RTX;
+  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  return dst;
+}
+
 /* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
    and exclusive ranges in order to create an equivalent comparison.  See
    canonicalize_cmp_for_target for the possible cases.  */
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0a19176b77a..2414c3cb27d 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
 					rtx, int);
 extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
 				       int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
 
 #endif // EXPMED_H
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
new file mode 100644
index 00000000000..7baf3581870
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+/*
+** G1:
+**	rev64	v0\.4s, v0\.4s
+**	ret
+*/
+v2di
+G1 (v2di r)
+{
+  return (r >> 32) | (r << 32);
+}
+
+/*
+** G2:
+**	rev32	v0\.8h, v0\.8h
+**	ret
+*/
+v4si
+G2 (v4si r)
+{
+  return (r >> 16) | (r << 16);
+}
+
+/*
+** G3:
+**	rev16	v0\.16b, v0\.16b
+**	ret
+*/
+v8hi
+G3 (v8hi r)
+{
+  return (r >> 8) | (r << 8);
+}
+
+/*
+** G4:
+**	rev32	v0\.4h, v0\.4h
+**	ret
+*/
+v2si
+G4 (v2si r)
+{
+  return (r >> 16) | (r << 16);
+}
+
+/*
+** G5:
+**	rev16	v0\.8b, v0\.8b
+**	ret
+*/
+v4hi
+G5 (v4hi r)
+{
+  return (r >> 8) | (r << 8);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c b/gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c
new file mode 100644
index 00000000000..130a9b1aa64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c
@@ -0,0 +1,101 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))
+
+static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";
+
+unsigned long long
+__attribute__((noipa,noinline))
+rot_64_one (unsigned long long x, unsigned amt)
+{
+  return (x << amt) | (x >> (64 - amt));
+}
+unsigned int
+__attribute__((noipa,noinline))
+rot_32_one (unsigned int x, unsigned amt)
+{
+  return (x << amt) | (x >> (32 - amt));
+}
+
+unsigned short
+__attribute__((noipa,noinline))
+rot_16_one (unsigned short x, unsigned short amt)
+{
+  return (x << amt) | (x >> (16 - amt));
+}
+
+
+#define ROTFUNC(M,S,A) \
+M \
+__attribute__((noipa,noinline)) \
+rot_##M##_##S##_##A (M x) \
+{ \
+  return (x << A) | (x >> (S - A)); \
+} \
+ \
+void \
+test_rot_##M##_##S##_##A (void) \
+{ \
+  M vec = *(M *)str; \
+  M res = rot_##M##_##S##_##A (vec); \
+  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++) \
+    if (res[i] != rot_##S##_one (vec[i], A)) \
+      __builtin_abort (); \
+}
+
+ROTFUNC (v2di, 64, 56)
+ROTFUNC (v2di, 64, 48)
+ROTFUNC (v2di, 64, 40)
+ROTFUNC (v2di, 64, 32)
+ROTFUNC (v2di, 64, 24)
+ROTFUNC (v2di, 64, 16)
+ROTFUNC (v2di, 64, 8)
+
+ROTFUNC (v4si, 32, 24)
+ROTFUNC (v4si, 32, 16)
+ROTFUNC (v4si, 32, 8)
+
+ROTFUNC (v8hi, 16, 8)
+
+ROTFUNC (v2si, 32, 24)
+ROTFUNC (v2si, 32, 16)
+ROTFUNC (v2si, 32, 8)
+
+ROTFUNC (v4hi, 16, 8)
+
+#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()
+
+int
+main (void)
+{
+  CALL_TEST (v2di, 64, 56);
+  CALL_TEST (v2di, 64, 48);
+  CALL_TEST (v2di, 64, 40);
+  CALL_TEST (v2di, 64, 32);
+  CALL_TEST (v2di, 64, 24);
+  CALL_TEST (v2di, 64, 16);
+  CALL_TEST (v2di, 64, 8);
+
+  CALL_TEST (v4si, 32, 24);
+  CALL_TEST (v4si, 32, 16);
+  CALL_TEST (v4si, 32, 8);
+
+  CALL_TEST (v8hi, 16, 8);
+
+  CALL_TEST (v2si, 32, 24);
+  CALL_TEST (v2si, 32, 16);
+  CALL_TEST (v2si, 32, 8);
+
+  CALL_TEST (v4hi, 16, 8);
+
+  return 0;
+}
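
Note (illustration only, not part of the patch): the sketch below mirrors, in
plain host C, the permute-index calculation that expand_rotate_as_vec_perm
feeds into expand_vec_perm_const, expanded out of the 3-element series form
used by vec_perm_builder and specialized to the little-endian case.  The
helper names build_rot_perm and apply_perm are invented for this sketch and
do not exist in GCC; it assumes a little-endian host and a rotate amount
smaller than the element size.

/* Compute the byte-permute selector that rotates every ELT_BYTES-wide lane
   of a VEC_BYTES-wide vector left by ROT_BYTES bytes (little endian), then
   check that applying it really matches an ordinary bitwise rotate.  */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void
build_rot_perm (unsigned vec_bytes, unsigned elt_bytes, unsigned rot_bytes,
		unsigned char *sel)
{
  /* Little-endian counterpart of rot_to_perm in the patch.  */
  unsigned rot_to_perm = elt_bytes - rot_bytes;
  for (unsigned lane = 0; lane < vec_bytes; lane += elt_bytes)
    for (unsigned i = 0; i < elt_bytes; i++)
      sel[lane + i] = lane + (rot_to_perm + i) % elt_bytes;
}

static void
apply_perm (const unsigned char *src, const unsigned char *sel,
	    unsigned char *dst, unsigned vec_bytes)
{
  /* dst byte I takes src byte sel[I], as a TBL-style permute would.  */
  for (unsigned i = 0; i < vec_bytes; i++)
    dst[i] = src[sel[i]];
}

int
main (void)
{
  /* One V2DI-like vector: rotate each 64-bit element left by 8 bits.  */
  uint64_t in[2] = { 0x0123456789abcdefULL, 0x1122334455667788ULL };
  unsigned char sel[16], out_bytes[16];
  uint64_t out[2];

  build_rot_perm (16, 8, 1, sel);
  apply_perm ((const unsigned char *) in, sel, out_bytes, 16);
  memcpy (out, out_bytes, 16);

  for (int i = 0; i < 2; i++)
    {
      uint64_t expect = (in[i] << 8) | (in[i] >> 56);
      printf ("%016llx %s\n", (unsigned long long) out[i],
	      out[i] == expect ? "ok" : "BAD");
    }
  return 0;
}

Compiling and running this on a little-endian host should print "ok" for both
lanes; the same index formula with rot_to_perm = rot_bytes covers the
big-endian case in the patch.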
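
A second illustration (again not part of the patch, and assuming an aarch64
host with arm_neon.h available): the half-element-width case that pr117048_2.c
checks at the assembly level can also be seen at runtime, since rotating each
32-bit lane by 16 is the same as swapping its 16-bit halves, which is exactly
what REV32 on .8h data does.

#include <arm_neon.h>
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint32_t in[4] = { 0x11223344u, 0x55667788u, 0x99aabbccu, 0xddeeff00u };
  uint32x4_t x = vld1q_u32 (in);

  /* Rotate each 32-bit lane by 16 with the shift/or fallback...  */
  uint32x4_t rot = vorrq_u32 (vshlq_n_u32 (x, 16), vshrq_n_u32 (x, 16));
  /* ...and by swapping the 16-bit halves of each lane (REV32 .8h).  */
  uint32x4_t rev
    = vreinterpretq_u32_u16 (vrev32q_u16 (vreinterpretq_u16_u32 (x)));

  uint32_t a[4], b[4];
  vst1q_u32 (a, rot);
  vst1q_u32 (b, rev);
  for (int i = 0; i < 4; i++)
    printf ("%08x %08x %s\n", a[i], b[i], a[i] == b[i] ? "ok" : "BAD");
  return 0;
}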