aarch64: Optimize vector rotates as vector permutes where possible

Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.
In particular, when the rotate amount is half the bitwidth of the element
we can use a REV64, REV32 or REV16 instruction.
More generally, rotates by a whole-byte amount can be implemented using
vector permutes.
This patch adds a generic routine to expmed.cc, expand_rotate_as_vec_perm,
that calculates the required permute indices and uses the
expand_vec_perm_const interface.
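
As a concrete little-endian illustration of the index calculation (a
standalone sketch of mine, not part of the patch): rotating each 32-bit
lane left by 8 bits is the same as permuting the bytes of the vector with
per-lane byte indices (elt_size - rot_bytes + i) % elt_size, i.e.
{3, 0, 1, 2} for 4-byte elements rotated by one byte:

  /* GNU C sketch; assumes a little-endian target.  */
  #include <stdint.h>
  #include <string.h>
  #include <assert.h>

  typedef uint32_t __attribute__ ((vector_size (16))) v4si;
  typedef uint8_t __attribute__ ((vector_size (16))) v16qu;

  int
  main (void)
  {
    v4si x = { 0x11223344u, 0x55667788u, 0x99aabbccu, 0xddeeff00u };

    /* Rotate each 32-bit element left by 8 bits.  */
    v4si rot = (x << 8) | (x >> 24);

    /* The same rotate expressed as a byte permute of the whole vector:
       per-lane indices {3, 0, 1, 2}, offset by 4 for each further lane.  */
    v16qu bytes, idx = { 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14 };
    memcpy (&bytes, &x, sizeof bytes);
    v16qu perm = __builtin_shuffle (bytes, idx);

    assert (memcmp (&rot, &perm, sizeof rot) == 0);
    return 0;
  }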

On aarch64 this ends up generating the single-instruction REV sequences above
where possible, and LDR+TBL sequences for the other byte-amount rotates, which
are also a good choice.
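
For instance, a half-width rotate of 32-bit lanes such as:

  typedef unsigned int __attribute__ ((vector_size (16))) v4si;

  v4si
  rot16 (v4si x)
  {
    return (x << 16) | (x >> 16);
  }

previously expanded through the SHL+USRA fallback (roughly
"shl v1.4s, v0.4s, 16" followed by "usra v1.4s, v0.4s, 16"; exact register
allocation may differ) and with this patch becomes a single
"rev32 v0.8h, v0.8h", as checked by the pr117048_2.c test below.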

With help from Richard, the routine should be VLA-safe.
However, the only use of expand_rotate_as_vec_perm introduced in this patch
is in aarch64-specific code that for now only handles fixed-width modes.

An aarch64 runtime test is added to ensure the permute indices are computed
correctly.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* expmed.h (expand_rotate_as_vec_perm): Declare.
	* expmed.cc (expand_rotate_as_vec_perm): Define.
	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/

	* gcc.target/aarch64/vec-rot-exec.c: New test.
	* gcc.target/aarch64/simd/pr117048_2.c: New test.

gcc/config/aarch64/aarch64-protos.h

@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);

gcc/config/aarch64/aarch64-simd.md

@@ -1313,6 +1313,9 @@
 	    (match_dup 4))
 	  (match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;

gcc/config/aarch64/aarch64.cc

@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  machine_mode mode = GET_MODE (reg);
+  /* Attempt to expand the rotate as a vector permute.
+     For some rotate amounts they can be single instructions and
+     even the general single-vector TBL permute has good throughput.  */
+  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)

gcc/expmed.cc

@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
   return target;
 }
 
+/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
+   permute operation.  Emit code to put the result in DST if successful and
+   return it.  Otherwise return NULL.  This is intended to implement vector
+   rotates by byte amounts using vector permutes when the target does not
+   offer native vector rotate operations.  */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+  /* For now handle only rotate by the same integer constant in all lanes.
+     In principle rotates by any constant vector are representable through
+     permutes as long as the individual rotate amounts are multiples of
+     BITS_PER_UNIT.  */
+  if (!CONST_INT_P (amt_unwrap))
+    return NULL_RTX;
+
+  int rotamnt = INTVAL (amt_unwrap);
+  if (rotamnt % BITS_PER_UNIT != 0)
+    return NULL_RTX;
+  machine_mode qimode;
+  if (!qimode_for_vec_perm (mode).exists (&qimode))
+    return NULL_RTX;
+
+  vec_perm_builder builder;
+  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  poly_uint64 total_units = GET_MODE_SIZE (mode);
+  builder.new_vector (total_units, nunits, 3);
+  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
+  for (unsigned j = 0; j < 3 * nunits; j += nunits)
+    for (unsigned i = 0; i < nunits; i++)
+      builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+  rtx perm_src = lowpart_subreg (qimode, x, mode);
+  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+  rtx res
+    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+			     qimode, perm_dst);
+  if (!res)
+    return NULL_RTX;
+  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  return dst;
+}
+
 /* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
    and exclusive ranges in order to create an equivalent comparison.  See
    canonicalize_cmp_for_target for the possible cases.  */
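
As a worked example of the index loop above (a standalone sketch of mine,
little-endian case only, fixed 128-bit vectors): for half-width rotates the
resulting byte selectors are exactly the patterns that the aarch64 permute
expander can emit as single REV64/REV32/REV16 instructions (see
pr117048_2.c below).

  #include <stdio.h>

  /* Print the byte selector for rotating ELT_BYTES-sized lanes of a 16-byte
     vector left by ROT_BITS, using the same index formula as
     expand_rotate_as_vec_perm in the !BYTES_BIG_ENDIAN case.  */
  static void
  print_selector (const char *name, unsigned elt_bytes, unsigned rot_bits)
  {
    unsigned rot_bytes = rot_bits / 8;
    unsigned rot_to_perm = elt_bytes - rot_bytes;
    printf ("%s rotate by %2u:", name, rot_bits);
    for (unsigned p = 0; p < 16; p++)
      printf (" %2u", (rot_to_perm + p % elt_bytes) % elt_bytes
		      + (p / elt_bytes) * elt_bytes);
    printf ("\n");
  }

  int
  main (void)
  {
    print_selector ("V2DI", 8, 32);	/* 4 5 6 7 0 1 2 3 ...  -> rev64  */
    print_selector ("V4SI", 4, 16);	/* 2 3 0 1 6 7 4 5 ...  -> rev32  */
    print_selector ("V8HI", 2, 8);	/* 1 0 3 2 5 4 7 6 ...  -> rev16  */
    return 0;
  }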

gcc/expmed.h

@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
 					rtx, int);
 extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
 				       int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
 
 #endif // EXPMED_H

gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c

@@ -0,0 +1,66 @@
/* { dg-do compile } */
/* { dg-options "-O2 -mlittle-endian" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

/*
** G1:
**	rev64	v0\.4s, v0\.4s
**	ret
*/
v2di
G1 (v2di r)
{
  return (r >> 32) | (r << 32);
}

/*
** G2:
**	rev32	v0\.8h, v0\.8h
**	ret
*/
v4si
G2 (v4si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G3:
**	rev16	v0\.16b, v0\.16b
**	ret
*/
v8hi
G3 (v8hi r)
{
  return (r >> 8) | (r << 8);
}

/*
** G4:
**	rev32	v0\.4h, v0\.4h
**	ret
*/
v2si
G4 (v2si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G5:
**	rev16	v0\.8b, v0\.8b
**	ret
*/
v4hi
G5 (v4hi r)
{
  return (r >> 8) | (r << 8);
}

gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c

@@ -0,0 +1,101 @@
/* { dg-do run } */
/* { dg-options "-O2" } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef char __attribute__ ((vector_size (8))) v8qi;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))

static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";

unsigned long long
__attribute__((noipa,noinline))
rot_64_one (unsigned long long x, unsigned amt)
{
  return (x << amt) | (x >> (64 - amt));
}

unsigned int
__attribute__((noipa,noinline))
rot_32_one (unsigned int x, unsigned amt)
{
  return (x << amt) | (x >> (32 - amt));
}

unsigned short
__attribute__((noipa,noinline))
rot_16_one (unsigned short x, unsigned short amt)
{
  return (x << amt) | (x >> (16 - amt));
}

#define ROTFUNC(M,S,A)						\
M								\
__attribute__((noipa,noinline))					\
rot_##M##_##S##_##A (M x)					\
{								\
  return (x << A) | (x >> (S - A));				\
}								\
								\
void								\
test_rot_##M##_##S##_##A (void)					\
{								\
  M vec = *(M *)str;						\
  M res = rot_##M##_##S##_##A (vec);				\
  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++)		\
    if (res[i] != rot_##S##_one (vec[i], A))			\
      __builtin_abort ();					\
}

ROTFUNC (v2di, 64, 56)
ROTFUNC (v2di, 64, 48)
ROTFUNC (v2di, 64, 40)
ROTFUNC (v2di, 64, 32)
ROTFUNC (v2di, 64, 24)
ROTFUNC (v2di, 64, 16)
ROTFUNC (v2di, 64, 8)

ROTFUNC (v4si, 32, 24)
ROTFUNC (v4si, 32, 16)
ROTFUNC (v4si, 32, 8)

ROTFUNC (v8hi, 16, 8)

ROTFUNC (v2si, 32, 24)
ROTFUNC (v2si, 32, 16)
ROTFUNC (v2si, 32, 8)

ROTFUNC (v4hi, 16, 8)

#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()

int
main (void)
{
  CALL_TEST (v2di, 64, 56);
  CALL_TEST (v2di, 64, 48);
  CALL_TEST (v2di, 64, 40);
  CALL_TEST (v2di, 64, 32);
  CALL_TEST (v2di, 64, 24);
  CALL_TEST (v2di, 64, 16);
  CALL_TEST (v2di, 64, 8);

  CALL_TEST (v4si, 32, 24);
  CALL_TEST (v4si, 32, 16);
  CALL_TEST (v4si, 32, 8);

  CALL_TEST (v8hi, 16, 8);

  CALL_TEST (v2si, 32, 24);
  CALL_TEST (v2si, 32, 16);
  CALL_TEST (v2si, 32, 8);

  CALL_TEST (v4hi, 16, 8);

  return 0;
}