aarch64: Optimize vector rotates as vector permutes where possible

Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.
In particular, when the rotate amount is half the bitwidth of the element
we can use a single REV64, REV32 or REV16 instruction.
More generally, rotates by a byte amount can be implemented using vector
permutes.
This patch adds such a generic routine in expmed.cc called
expand_rotate_as_vec_perm that calculates the required permute indices
and uses the expand_vec_perm_const interface.
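To illustrate the index calculation, here is a standalone C sketch (an
illustration only, not part of the patch) that mirrors the little-endian
formula for a V4SI rotate left by 8 bits and checks it against a scalar
rotate; the variable names follow the new routine, the test scaffolding is
made up.

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  int
  main (void)
  {
    const unsigned rotamnt = 8;   /* rotate amount in bits */
    const unsigned nunits = 4;    /* bytes per 32-bit element */
    const unsigned rot_bytes = rotamnt / 8;
    const unsigned rot_to_perm = nunits - rot_bytes;  /* little-endian case */

    uint32_t in[4] = { 0x11223344u, 0x55667788u, 0x99aabbccu, 0xddeeff00u };
    uint8_t src[16], dst[16];
    memcpy (src, in, sizeof src);

    /* Output byte i of lane j takes input byte (rot_to_perm + i) % nunits of
       the same lane; these are the indices the new routine pushes into its
       permute builder.  */
    for (unsigned j = 0; j < 16; j += nunits)
      for (unsigned i = 0; i < nunits; i++)
        dst[j + i] = src[j + (rot_to_perm + i) % nunits];

    uint32_t out[4];
    memcpy (out, dst, sizeof out);
    for (unsigned k = 0; k < 4; k++)
      assert (out[k] == ((in[k] << rotamnt) | (in[k] >> (32 - rotamnt))));
    return 0;
  }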

On aarch64 this ends up generating the single-instruction REV sequences above
where possible, and can use LDR+TBL sequences too, which are also a good choice
since even the general single-vector TBL permute has good throughput.
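To give a concrete idea of the TBL form (an illustration only, not the
compiler's literal output): a V4SI rotate by 8 bits is a single table lookup
whose selector is the byte-index vector from the example above, e.g. with
ACLE intrinsics:

  #include <arm_neon.h>

  /* Rotate each 32-bit lane of X left by 8 bits with a single TBL.  The index
     constant is the little-endian selector from the example above; in
     compiled code it would be loaded from the literal pool with LDR.  */
  uint32x4_t
  rotl8_v4si (uint32x4_t x)
  {
    static const uint8_t idx[16]
      = { 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14 };
    uint8x16_t sel = vld1q_u8 (idx);
    return vreinterpretq_u32_u8 (vqtbl1q_u8 (vreinterpretq_u8_u32 (x), sel));
  }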

With help from Richard, the routine should be VLA-safe.
However, the only use of expand_rotate_as_vec_perm introduced in this patch
is in aarch64-specific code that for now only handles fixed-width modes.
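As a side note on the VLA-safety (my reading of the encoding, not text from
the original submission): the selector is built as nunits patterns of three
elements each, so for the V4SI rotate-by-8 example the pushed indices are

  3, 0, 1, 2,  7, 4, 5, 6,  11, 8, 9, 10

and every pattern keeps stepping by nunits (15, 12, 13, 14, ...), which is how
one stepped description covers however many lanes a variable-length vector has.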

A runtime aarch64 test is added to ensure the permute indices are not messed
up.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* expmed.h (expand_rotate_as_vec_perm): Declare.
	* expmed.cc (expand_rotate_as_vec_perm): Define.
	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/

	* gcc.target/aarch64/vec-rot-exec.c: New test.
	* gcc.target/aarch64/simd/pr117048_2.c: New test.
gcc/config/aarch64/aarch64-protos.h

@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
bool aarch64_constant_address_p (rtx);
bool aarch64_emit_approx_div (rtx, rtx, rtx);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
tree aarch64_vector_load_decl (tree);
rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
void aarch64_expand_call (rtx, rtx, rtx, bool);

gcc/config/aarch64/aarch64-simd.md

@@ -1313,6 +1313,9 @@
(match_dup 4))
(match_dup 3)))]
  {
    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
      DONE;

    operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
    rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
    int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;

gcc/config/aarch64/aarch64.cc

@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
  return true;
}

/* Emit an optimized sequence to perform a vector rotate
   of REG by the vector constant amount AMNT and place the result
   in DST.  Return true iff successful.  */

bool
aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
{
  machine_mode mode = GET_MODE (reg);
  /* Attempt to expand the rotate as a vector permute.
     For some rotate amounts they can be single instructions and
     even the general single-vector TBL permute has good throughput.  */
  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
    return true;
  return false;
}

/* Return the number of instructions that can be issued per cycle.  */

static int
aarch64_sched_issue_rate (void)

gcc/expmed.cc

@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
  return target;
}

/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
   permute operation.  Emit code to put the result in DST if successful and
   return it.  Otherwise return NULL.  This is intended to implement vector
   rotates by byte amounts using vector permutes when the target does not offer
   native vector rotate operations.  */

rtx
expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
{
  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
  /* For now handle only rotate by the same integer constant in all lanes.
     In principle rotates by any constant vector are representable through
     permutes as long as the individual rotate amounts are multiples of
     BITS_PER_UNIT.  */
  if (!CONST_INT_P (amt_unwrap))
    return NULL_RTX;

  int rotamnt = INTVAL (amt_unwrap);
  if (rotamnt % BITS_PER_UNIT != 0)
    return NULL_RTX;
  machine_mode qimode;
  if (!qimode_for_vec_perm (mode).exists (&qimode))
    return NULL_RTX;

  vec_perm_builder builder;
  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
  poly_uint64 total_units = GET_MODE_SIZE (mode);
  builder.new_vector (total_units, nunits, 3);
  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
  for (unsigned j = 0; j < 3 * nunits; j += nunits)
    for (unsigned i = 0; i < nunits; i++)
      builder.quick_push ((rot_to_perm + i) % nunits + j);

  rtx perm_src = lowpart_subreg (qimode, x, mode);
  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
  rtx res
    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
			     qimode, perm_dst);
  if (!res)
    return NULL_RTX;
  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
  return dst;
}

/* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
   and exclusive ranges in order to create an equivalent comparison.  See
   canonicalize_cmp_for_target for the possible cases.  */

gcc/expmed.h

@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
                                        rtx, int);
extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
                                       int, int);
extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
#endif // EXPMED_H

gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c

@@ -0,0 +1,66 @@
/* { dg-do compile } */
/* { dg-options "-O2 -mlittle-endian" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

/*
** G1:
**	rev64	v0\.4s, v0\.4s
**	ret
*/
v2di
G1 (v2di r)
{
  return (r >> 32) | (r << 32);
}

/*
** G2:
**	rev32	v0\.8h, v0\.8h
**	ret
*/
v4si
G2 (v4si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G3:
**	rev16	v0\.16b, v0\.16b
**	ret
*/
v8hi
G3 (v8hi r)
{
  return (r >> 8) | (r << 8);
}

/*
** G4:
**	rev32	v0\.4h, v0\.4h
**	ret
*/
v2si
G4 (v2si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G5:
**	rev16	v0\.8b, v0\.8b
**	ret
*/
v4hi
G5 (v4hi r)
{
  return (r >> 8) | (r << 8);
}

gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c

@@ -0,0 +1,101 @@
/* { dg-do run } */
/* { dg-options "-O2" } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef char __attribute__ ((vector_size (8))) v8qi;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))

static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";

unsigned long long
__attribute__((noipa,noinline))
rot_64_one (unsigned long long x, unsigned amt)
{
  return (x << amt) | (x >> (64 - amt));
}

unsigned int
__attribute__((noipa,noinline))
rot_32_one (unsigned int x, unsigned amt)
{
  return (x << amt) | (x >> (32 - amt));
}

unsigned short
__attribute__((noipa,noinline))
rot_16_one (unsigned short x, unsigned short amt)
{
  return (x << amt) | (x >> (16 - amt));
}

#define ROTFUNC(M,S,A) \
M \
__attribute__((noipa,noinline)) \
rot_##M##_##S##_##A (M x) \
{ \
  return (x << A) | (x >> (S - A)); \
} \
 \
void \
test_rot_##M##_##S##_##A (void) \
{ \
  M vec = *(M *)str; \
  M res = rot_##M##_##S##_##A (vec); \
  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++) \
    if (res[i] != rot_##S##_one (vec[i], A)) \
      __builtin_abort (); \
}

ROTFUNC (v2di, 64, 56)
ROTFUNC (v2di, 64, 48)
ROTFUNC (v2di, 64, 40)
ROTFUNC (v2di, 64, 32)
ROTFUNC (v2di, 64, 24)
ROTFUNC (v2di, 64, 16)
ROTFUNC (v2di, 64, 8)
ROTFUNC (v4si, 32, 24)
ROTFUNC (v4si, 32, 16)
ROTFUNC (v4si, 32, 8)
ROTFUNC (v8hi, 16, 8)
ROTFUNC (v2si, 32, 24)
ROTFUNC (v2si, 32, 16)
ROTFUNC (v2si, 32, 8)
ROTFUNC (v4hi, 16, 8)

#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()

int
main (void)
{
  CALL_TEST (v2di, 64, 56);
  CALL_TEST (v2di, 64, 48);
  CALL_TEST (v2di, 64, 40);
  CALL_TEST (v2di, 64, 32);
  CALL_TEST (v2di, 64, 24);
  CALL_TEST (v2di, 64, 16);
  CALL_TEST (v2di, 64, 8);
  CALL_TEST (v4si, 32, 24);
  CALL_TEST (v4si, 32, 16);
  CALL_TEST (v4si, 32, 8);
  CALL_TEST (v8hi, 16, 8);
  CALL_TEST (v2si, 32, 24);
  CALL_TEST (v2si, 32, 16);
  CALL_TEST (v2si, 32, 8);
  CALL_TEST (v4hi, 16, 8);
  return 0;
}