aarch64: Optimise calls to ldexp with SVE FSCALE instruction [PR111733]

This patch uses the FSCALE instruction provided by SVE to implement the
standard ldexp family of functions.
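
As a reminder of the semantics (a minimal sketch, independent of this patch),
ldexp (x, i) scales x by 2^i:

#include <math.h>
#include <stdio.h>

int
main (void)
{
	printf ("%f\n", ldexp (3.0, 4));    /* 3.0 * 2^4  = 48.000000 */
	printf ("%f\n", ldexpf (1.5f, -1)); /* 1.5 * 2^-1 = 0.750000  */
	return 0;
}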

Currently, with '-Ofast -mcpu=neoverse-v2', GCC generates libcalls for the
following code:

float
test_ldexpf (float x, int i)
{
	return __builtin_ldexpf (x, i);
}

double
test_ldexp (double x, int i)
{
	return __builtin_ldexp (x, i);
}

GCC Output:

test_ldexpf:
	b ldexpf

test_ldexp:
	b ldexp

Since SVE provides an FSCALE instruction, we can use it to handle scalar
floats by moving them into vector registers and applying FSCALE there,
similar to how LLVM lowers the ldexp builtins.
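
For illustration only (not part of the patch), the same idea written with the
ACLE SVE intrinsics from <arm_sve.h>; the helper name scalar_ldexpf is made up
for this sketch and it needs to be compiled with SVE enabled:

#include <arm_sve.h>

/* Hypothetical sketch: scale one float by 2^i via FSCALE, with only the
   first .s lane active (ptrue p.b, vl4).  */
static inline float
scalar_ldexpf (float x, int i)
{
	svbool_t pg = svptrue_pat_b8 (SV_VL4);
	svfloat32_t vx = svdup_n_f32 (x);
	svint32_t vi = svdup_n_s32 (i);
	svfloat32_t r = svscale_f32_m (pg, vx, vi);
	return svlastb_f32 (svptrue_pat_b32 (SV_VL1), r); /* extract lane 0 */
}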

New Output:

test_ldexpf:
	fmov	s31, w0
	ptrue	p7.b, vl4
	fscale	z0.s, p7/m, z0.s, z31.s
	ret

test_ldexp:
	sxtw	x0, w0
	ptrue	p7.b, vl8
	fmov	d31, x0
	fscale	z0.d, p7/m, z0.d, z31.d
	ret

This is a revision of an earlier patch, and now uses the extended definition of
aarch64_ptrue_reg to generate predicate registers with the appropriate set bits.
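
For context, the predicate only needs to cover a single element, so the
expander requests a .b ptrue whose VL pattern equals the element size in
bytes.  In ACLE terms the resulting predicates are roughly (illustrative
only, not code from the patch):

#include <arm_sve.h>

svbool_t pred_f16 (void) { return svptrue_pat_b8 (SV_VL2); } /* ptrue p.b, vl2 */
svbool_t pred_f32 (void) { return svptrue_pat_b8 (SV_VL4); } /* ptrue p.b, vl4 */
svbool_t pred_f64 (void) { return svptrue_pat_b8 (SV_VL8); } /* ptrue p.b, vl8 */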

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Soumya AR <soumyaa@nvidia.com>

gcc/ChangeLog:

	PR target/111733
	* config/aarch64/aarch64-sve.md
	(ldexp<mode>3): Added a new pattern to match ldexp calls with scalar
	floating modes and expand to the existing pattern for FSCALE.
	* config/aarch64/iterators.md
	(SVE_FULL_F_SCALAR): Added an iterator to match all FP SVE modes as well
	as their scalar equivalents.
	(VPRED): Extended the attribute to handle GPF_HF modes.
	* internal-fn.def (LDEXP): Changed macro to incorporate ldexpf16.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/fscale.c: New test.

--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5088,6 +5088,21 @@
 ;; - FTSSEL
 ;; -------------------------------------------------------------------------
 
+(define_expand "ldexp<mode>3"
+  [(set (match_operand:GPF_HF 0 "register_operand")
+	(unspec:GPF_HF
+	  [(match_dup 3)
+	   (const_int SVE_STRICT_GP)
+	   (match_operand:GPF_HF 1 "register_operand")
+	   (match_operand:<V_INT_EQUIV> 2 "register_operand")]
+	  UNSPEC_COND_FSCALE))]
+  "TARGET_SVE"
+  {
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
+				     GET_MODE_UNIT_SIZE (<MODE>mode));
+  }
+)
+
 ;; Unpredicated floating-point binary operations that take an integer as
 ;; their second operand.
 (define_insn "@aarch64_sve_<optab><mode>"
@@ -5103,17 +5118,17 @@
 ;; Predicated floating-point binary operations that take an integer
 ;; as their second operand.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-	(unspec:SVE_FULL_F
+  [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
+	(unspec:SVE_FULL_F_SCALAR
 	  [(match_operand:<VPRED> 1 "register_operand")
 	   (match_operand:SI 4 "aarch64_sve_gp_strictness")
-	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
 	   (match_operand:<V_INT_EQUIV> 3 "register_operand")]
 	  SVE_COND_FP_BINARY_INT))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
-     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
-     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
+     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
 )


--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -452,6 +452,9 @@
 ;; All fully-packed SVE floating-point vector modes.
 (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
 
+;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
+(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
+
 ;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
 (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
@@ -2354,7 +2357,8 @@
 			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
 			 (V8QI "VNx8BI") (V16QI "VNx16BI")
 			 (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
-			 (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
+			 (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")
+			 (HF "VNx8BI") (SF "VNx4BI") (DF "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")


--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -441,7 +441,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_FMADDSUB, ECF_CONST, vec_fmaddsub, ternary)
 DEF_INTERNAL_OPTAB_FN (VEC_FMSUBADD, ECF_CONST, vec_fmsubadd, ternary)
 
 /* FP scales. */
-DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
+DEF_INTERNAL_FLT_FLOATN_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions. */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)


--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** test_ldexpf16:
+**	...
+**	ptrue	(p[0-7])\.b, vl2
+**	...
+**	fscale	z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h
+**	ret
+*/
+_Float16
+test_ldexpf16 (_Float16 x, int i)
+{
+  return __builtin_ldexpf16 (x, i);
+}
+
+/*
+** test_ldexpf:
+**	...
+**	ptrue	(p[0-7])\.b, vl4
+**	...
+**	fscale	z[0-9]+\.s, \1/m, z[0-9]+\.s, z[0-9]+\.s
+**	ret
+*/
+float
+test_ldexpf (float x, int i)
+{
+  return __builtin_ldexpf (x, i);
+}
+
+/*
+** test_ldexp:
+**	...
+**	ptrue	(p[0-7])\.b, vl8
+**	...
+**	fscale	z[0-9]+\.d, \1/m, z[0-9]+\.d, z[0-9]+\.d
+**	ret
+*/
+double
+test_ldexp (double x, int i)
+{
+  return __builtin_ldexp (x, i);
+}