Mirror of https://github.com/gcc-mirror/gcc.git (synced 2024-11-21 13:40:47 +00:00)
aarch64: Optimize vector rotates as vector permutes where possible
Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.  In particular, when the
rotate amount is half the bitwidth of the element we can use a REV64, REV32
or REV16 instruction.  More generally, rotates by a byte amount can be
implemented using vector permutes.

This patch adds such a generic routine in expmed.cc called
expand_rotate_as_vec_perm that calculates the required permute indices and
uses the expand_vec_perm_const interface.  On aarch64 this ends up generating
the single-instruction sequences above where possible, and can use LDR+TBL
sequences too, which are a good choice.

With help from Richard, the routine should be VLA-safe.  However, the only
use of expand_rotate_as_vec_perm introduced in this patch is in
aarch64-specific code that for now only handles fixed-width modes.

A runtime aarch64 test is added to ensure the permute indices are not messed
up.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/
	* expmed.h (expand_rotate_as_vec_perm): Declare.
	* expmed.cc (expand_rotate_as_vec_perm): Define.
	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/
	* gcc.target/aarch64/vec-rot-exec.c: New test.
	* gcc.target/aarch64/simd/pr117048_2.c: New test.
This commit is contained in: parent 1411d39bc7, commit 19757e1c28
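As a quick illustration of the claim that a rotate by half the element width
is a single REV instruction, here is a small standalone check, not part of
the patch (the helper name and test values are illustrative), comparing a
generic 16-bit rotate of each 32-bit lane against the equivalent REV32-on-
halfwords form written with ACLE intrinsics from arm_neon.h:

/* Illustrative check, not from the patch: rotating each 32-bit lane by 16
   bits moves the same bytes as REV32 operating on 16-bit elements.  */
#include <arm_neon.h>
#include <stdlib.h>

int
main (void)
{
  uint32_t in[4] = { 0x11223344u, 0xaabbccddu, 0x01020304u, 0xdeadbeefu };
  uint32x4_t x = vld1q_u32 (in);

  /* Generic rotate-left by 16 of each 32-bit lane.  */
  uint32x4_t rot = vorrq_u32 (vshlq_n_u32 (x, 16), vshrq_n_u32 (x, 16));

  /* The same result via REV32 on halfwords.  */
  uint32x4_t rev
    = vreinterpretq_u32_u16 (vrev32q_u16 (vreinterpretq_u16_u32 (x)));

  uint32_t a[4], b[4];
  vst1q_u32 (a, rot);
  vst1q_u32 (b, rev);
  for (int i = 0; i < 4; i++)
    if (a[i] != b[i])
      abort ();
  return 0;
}

Compiled for AArch64, both arms compute the same lanes; the patch lets the
compiler pick the REV form automatically for the generic rotate, as the
pr117048_2.c scan-assembly test below verifies.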
gcc/config/aarch64/aarch64-protos.h
@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
gcc/config/aarch64/aarch64-simd.md
@@ -1313,6 +1313,9 @@
           (match_dup 4))
       (match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
gcc/config/aarch64/aarch64.cc
@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  machine_mode mode = GET_MODE (reg);
+  /* Attempt to expand the rotate as a vector permute.
+     For some rotate amounts they can be single instructions and
+     even the general single-vector TBL permute has good throughput.  */
+  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
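For the non-REV byte amounts the commit message mentions, a compile-only
sketch (not from the patch; the function name is illustrative) of the kind of
rotate that can now go through the permute path rather than the SHL+USRA
fallback:

/* Illustrative source, not from the patch: a rotate of each 32-bit lane by
   8 bits, which the commit message says can be expanded as a byte permute
   (e.g. an LDR of the index vector followed by TBL).  */
typedef unsigned int __attribute__ ((vector_size (16))) v4si;

v4si
rotate_each_lane_by_8 (v4si x)
{
  return (x << 8) | (x >> 24);
}

Which exact sequence is emitted (a REV instruction, LDR+TBL, or the SHL+USRA
fallback) depends on the rotate amount and the element width.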
gcc/expmed.cc
@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
   return target;
 }
 
+/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
+   permute operation.  Emit code to put the result in DST if successful and
+   return it.  Otherwise return NULL.  This is intended to implement vector
+   rotates by byte amounts using vector permutes when the target does not
+   offer native vector rotate operations.  */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+  /* For now handle only rotate by the same integer constant in all lanes.
+     In principle rotates by any constant vector are representable through
+     permutes as long as the individual rotate amounts are multiples of
+     BITS_PER_UNIT.  */
+  if (!CONST_INT_P (amt_unwrap))
+    return NULL_RTX;
+
+  int rotamnt = INTVAL (amt_unwrap);
+  if (rotamnt % BITS_PER_UNIT != 0)
+    return NULL_RTX;
+  machine_mode qimode;
+  if (!qimode_for_vec_perm (mode).exists (&qimode))
+    return NULL_RTX;
+
+  vec_perm_builder builder;
+  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  poly_uint64 total_units = GET_MODE_SIZE (mode);
+  builder.new_vector (total_units, nunits, 3);
+  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
+  for (unsigned j = 0; j < 3 * nunits; j += nunits)
+    for (unsigned i = 0; i < nunits; i++)
+      builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+  rtx perm_src = lowpart_subreg (qimode, x, mode);
+  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+  rtx res
+    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+			     qimode, perm_dst);
+  if (!res)
+    return NULL_RTX;
+  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  return dst;
+}
+
 /* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
    and exclusive ranges in order to create an equivalent comparison.  See
    canonicalize_cmp_for_target for the possible cases.  */
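To make the permute-index formula concrete, here is a scalar sketch (not part
of the patch; assumes a little-endian host, and the variable names are
illustrative) that mirrors the (rot_to_perm + i) % nunits computation for a
single 32-bit element rotated left by 8 bits and checks it against the
ordinary shift/or rotate:

/* Illustrative check of the byte-permute index formula used by
   expand_rotate_as_vec_perm, specialized to one little-endian 32-bit
   element (nunits == 4 bytes).  Not part of the patch.  */
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

int
main (void)
{
  const unsigned nunits = 4;          /* Bytes per element.  */
  const unsigned rot_bits = 8;        /* Rotate-left amount in bits.  */
  const unsigned rot_bytes = rot_bits / 8;
  /* Little-endian arm of the formula in expand_rotate_as_vec_perm.  */
  const unsigned rot_to_perm = nunits - rot_bytes;

  uint32_t x = 0xAABBCCDDu;
  uint8_t src[4], perm[4];
  memcpy (src, &x, sizeof src);

  /* Output byte i takes input byte (rot_to_perm + i) % nunits.  */
  for (unsigned i = 0; i < nunits; i++)
    perm[i] = src[(rot_to_perm + i) % nunits];

  uint32_t permuted, expected = (x << rot_bits) | (x >> (32 - rot_bits));
  memcpy (&permuted, perm, sizeof permuted);
  if (permuted != expected)     /* Both are 0xBBCCDDAA on little-endian.  */
    abort ();
  return 0;
}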
gcc/expmed.h
@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
 					rtx, int);
 extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
 				       int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
 
 #endif // EXPMED_H
gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c (new file, 66 lines)
@@ -0,0 +1,66 @@
/* { dg-do compile } */
/* { dg-options "-O2 -mlittle-endian" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

/*
** G1:
**	rev64	v0\.4s, v0\.4s
**	ret
*/
v2di
G1 (v2di r)
{
  return (r >> 32) | (r << 32);
}

/*
** G2:
**	rev32	v0\.8h, v0\.8h
**	ret
*/
v4si
G2 (v4si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G3:
**	rev16	v0\.16b, v0\.16b
**	ret
*/
v8hi
G3 (v8hi r)
{
  return (r >> 8) | (r << 8);
}

/*
** G4:
**	rev32	v0\.4h, v0\.4h
**	ret
*/
v2si
G4 (v2si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G5:
**	rev16	v0\.8b, v0\.8b
**	ret
*/
v4hi
G5 (v4hi r)
{
  return (r >> 8) | (r << 8);
}
gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c (new file, 101 lines)
@@ -0,0 +1,101 @@
/* { dg-do run } */
/* { dg-options "-O2" } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef char __attribute__ ((vector_size (8))) v8qi;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))

static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";

unsigned long long
__attribute__((noipa,noinline))
rot_64_one (unsigned long long x, unsigned amt)
{
  return (x << amt) | (x >> (64 - amt));
}

unsigned int
__attribute__((noipa,noinline))
rot_32_one (unsigned int x, unsigned amt)
{
  return (x << amt) | (x >> (32 - amt));
}

unsigned short
__attribute__((noipa,noinline))
rot_16_one (unsigned short x, unsigned short amt)
{
  return (x << amt) | (x >> (16 - amt));
}

#define ROTFUNC(M,S,A)					\
M							\
__attribute__((noipa,noinline))				\
rot_##M##_##S##_##A (M x)				\
{							\
  return (x << A) | (x >> (S - A));			\
}							\
							\
void							\
test_rot_##M##_##S##_##A (void)				\
{							\
  M vec = *(M *)str;					\
  M res = rot_##M##_##S##_##A (vec);			\
  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++)	\
    if (res[i] != rot_##S##_one (vec[i], A))		\
      __builtin_abort ();				\
}

ROTFUNC (v2di, 64, 56)
ROTFUNC (v2di, 64, 48)
ROTFUNC (v2di, 64, 40)
ROTFUNC (v2di, 64, 32)
ROTFUNC (v2di, 64, 24)
ROTFUNC (v2di, 64, 16)
ROTFUNC (v2di, 64, 8)

ROTFUNC (v4si, 32, 24)
ROTFUNC (v4si, 32, 16)
ROTFUNC (v4si, 32, 8)

ROTFUNC (v8hi, 16, 8)

ROTFUNC (v2si, 32, 24)
ROTFUNC (v2si, 32, 16)
ROTFUNC (v2si, 32, 8)

ROTFUNC (v4hi, 16, 8)

#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()

int
main (void)
{
  CALL_TEST (v2di, 64, 56);
  CALL_TEST (v2di, 64, 48);
  CALL_TEST (v2di, 64, 40);
  CALL_TEST (v2di, 64, 32);
  CALL_TEST (v2di, 64, 24);
  CALL_TEST (v2di, 64, 16);
  CALL_TEST (v2di, 64, 8);

  CALL_TEST (v4si, 32, 24);
  CALL_TEST (v4si, 32, 16);
  CALL_TEST (v4si, 32, 8);

  CALL_TEST (v8hi, 16, 8);

  CALL_TEST (v2si, 32, 24);
  CALL_TEST (v2si, 32, 16);
  CALL_TEST (v2si, 32, 8);

  CALL_TEST (v4hi, 16, 8);

  return 0;
}