Support vector float_extend from __bf16 to float.

The extension is implemented as a vector permutation that interleaves the
elements of the source vector with the elements of a zero vector.

gcc/ChangeLog:

	* config/i386/i386-expand.cc
	(ix86_expand_vector_bf2sf_with_vec_perm): New function.
	* config/i386/i386-protos.h
	(ix86_expand_vector_bf2sf_with_vec_perm): New declaration.
	* config/i386/mmx.md (extendv2bfv2sf2): New expander.
	* config/i386/sse.md (extend<sf_cvt_bf16_lower><mode>2):
	Ditto.
	(VF1_AVX512BW): New mode iterator.
	(sf_cvt_bf16): Add V4SF.
	(sf_cvt_bf16_lower): New mode attr.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512bw-extendbf2sf.c: New test.
	* gcc.target/i386/sse2-extendbf2sf.c: New test.
This commit is contained in:
liuhongt 2024-10-23 23:51:20 -07:00
parent a17acf4f25
commit 648bd1fcc6
6 changed files with 144 additions and 1 deletions

View File

@ -26879,5 +26879,44 @@ ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
}
/* Expand a float_extend of the BF vector SRC into the SF vector DEST
   (used by the extendv4bfv4sf2, extendv8bfv8sf2 and extendv16bfv16sf2
   expanders) by means of a constant vector permutation.

   SRC is viewed as the low part of a BF vector with twice as many
   elements.  That vector is permuted together with an all-zero vector
   so that every even element of the result is zero and every odd
   element is the next element of SRC.  The low part of the permuted
   vector, reinterpreted in the mode of DEST, is then moved to DEST.

   SRC must have mode V4BF, V8BF or V16BF; any other mode is a bug in
   the caller.  */
void
ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
{
  const machine_mode in_mode = GET_MODE (src);
  machine_mode wide_mode;

  /* The permutation works on a BF vector twice as wide as SRC.  */
  if (in_mode == V16BFmode)
    wide_mode = V32BFmode;
  else if (in_mode == V8BFmode)
    wide_mode = V16BFmode;
  else if (in_mode == V4BFmode)
    wide_mode = V8BFmode;
  else
    gcc_unreachable ();

  const int count = GET_MODE_NUNITS (wide_mode);

  /* Build the selector { 0, count, 1, count + 1, ... }: even slots walk
     the first (zero) operand, odd slots walk the second (source)
     operand.  COUNT is always even for the modes handled above.  */
  vec_perm_builder pattern (count, count, 1);
  pattern.quick_grow (count);
  for (int pair = 0; pair * 2 < count; pair++)
    {
      pattern[pair * 2] = pair;
      pattern[pair * 2 + 1] = count + pair;
    }
  vec_perm_indices perm (pattern, 2, count);

  /* Allocate the result register before forcing SRC into a register so
     that pseudos are created in the same order as before.  */
  rtx permuted = gen_reg_rtx (wide_mode);
  rtx src_reg = force_reg (in_mode, src);
  rtx wide_src = lowpart_subreg (wide_mode, src_reg, in_mode);
  rtx zeros = CONST0_RTX (wide_mode);

  /* The expanders using this function are expected to be enabled only
     when the target can match this permutation.  */
  bool ok = targetm.vectorize.vec_perm_const (wide_mode, wide_mode,
					      permuted, zeros, wide_src,
					      perm);
  gcc_assert (ok);

  rtx result = lowpart_subreg (GET_MODE (dest), permuted, wide_mode);
  emit_move_insn (dest, result);
}
#include "gt-i386-expand.h"

View File

@ -259,6 +259,8 @@ extern bool ix86_ternlog_operand_p (rtx op);
extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
int idx, rtx target);
extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
/* Expand a BF -> SF vector float_extend with a vector permutation;
   the arguments are (dest, src).  Defined in i386-expand.cc.  */
extern void ix86_expand_vector_bf2sf_with_vec_perm (rtx, rtx);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);

View File

@ -3012,6 +3012,24 @@
DONE;
})
;; Extend a V2BF vector to V2SF.  The conversion is carried out by the
;; four-element expander (extendv4bfv4sf2): the source is viewed as the
;; low part of a V4BF register and only the low V2SF part of the V4SF
;; result is copied to operand 0.
(define_expand "extendv2bfv2sf2"
[(set (match_operand:V2SF 0 "register_operand")
(float_extend:V2SF
(match_operand:V2BF 1 "nonimmediate_operand")))]
"TARGET_SSE2 && TARGET_MMX_WITH_SSE"
{
/* Temporaries in the wider modes handled by extendv4bfv4sf2.  */
rtx op0 = gen_reg_rtx (V4SFmode);
rtx op1 = gen_reg_rtx (V4BFmode);
/* Force the source into a register and view it as the low part of a
   V4BF value.  */
emit_move_insn (op1, lowpart_subreg (V4BFmode,
force_reg (V2BFmode, operands[1]),
V2BFmode));
emit_insn (gen_extendv4bfv4sf2 (op0, op1));
/* Keep only the low two SF elements of the widened result.  */
emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
DONE;
})
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel integral arithmetic

View File

@ -539,6 +539,9 @@
(define_mode_iterator VF1_AVX512VL
[(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
;; SF vector modes for patterns whose 512-bit form needs AVX512BW, such
;; as the BF -> SF extension that is expanded through a word-granular
;; permutation of a V32BF vector.  V8SF requires AVX2; V4SF has no extra
;; condition beyond that of the pattern using the iterator.
(define_mode_iterator VF1_AVX512BW
  [(V16SF "TARGET_AVX512BW && TARGET_EVEX512") (V8SF "TARGET_AVX2") V4SF])
(define_mode_iterator VF1_AVX10_2
[(V16SF "TARGET_AVX10_2_512") V8SF V4SF])
@ -30957,7 +30960,11 @@
[(V32BF "V16SF") (V16BF "V8SF") (V8BF "V4SF")])
;; Converting from SF to BF
(define_mode_attr sf_cvt_bf16
[(V8SF "V8BF") (V16SF "V16BF")])
[(V4SF "V4BF") (V8SF "V8BF") (V16SF "V16BF")])
;; Lower-case name of the BF vector mode that has the same number of
;; elements as the given SF vector mode; used to spell pattern names
;; such as extend<sf_cvt_bf16_lower><mode>2.
(define_mode_attr sf_cvt_bf16_lower
[(V4SF "v4bf") (V8SF "v8bf") (V16SF "v16bf")])
;; Mapping from BF to SF
(define_mode_attr sf_bf16
[(V4SF "V8BF") (V8SF "V16BF") (V16SF "V32BF")])
@ -31116,6 +31123,17 @@
}
})
;; Extend a BF vector to the SF vector mode with the same number of
;; elements (V4BF -> V4SF, V8BF -> V8SF, V16BF -> V16SF).  The whole
;; expansion is delegated to ix86_expand_vector_bf2sf_with_vec_perm,
;; which emits a constant vector permutation.
(define_expand "extend<sf_cvt_bf16_lower><mode>2"
[(set (match_operand:VF1_AVX512BW 0 "register_operand")
(float_extend:VF1_AVX512BW
(match_operand:<sf_cvt_bf16> 1 "nonimmediate_operand")))]
"TARGET_SSE2"
{
ix86_expand_vector_bf2sf_with_vec_perm (operands[0], operands[1]);
DONE;
})
(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
[(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
(float_truncate:<sf_cvt_bf16>

View File

@ -0,0 +1,46 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bw -mavx512vl -O2" } */
/* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|vpunpcklwd)} 6 } } */
/* Check that __bf16 -> float vector conversions of 4, 8 and 16 elements,
   taken from a register argument or loaded from memory, are compiled to
   the instructions matched by the dg-final directive above.  Six matches
   are expected in total; presumably one per function below.  */
typedef float v4sf __attribute__((vector_size(16)));
typedef float v8sf __attribute__((vector_size(32)));
typedef float v16sf __attribute__((vector_size(64)));
typedef __bf16 v4bf __attribute__((vector_size(8)));
typedef __bf16 v8bf __attribute__((vector_size(16)));
typedef __bf16 v16bf __attribute__((vector_size(32)));
/* Register-argument variants.  The first parameter B is unused.
   NOTE(review): presumably it only moves A out of the first vector
   argument register -- confirm.  */
v4sf
foo (v4bf b, v4bf a)
{
return __builtin_convertvector (a, v4sf);
}
v8sf
foo2 (v8bf b, v8bf a)
{
return __builtin_convertvector (a, v8sf);
}
v16sf
foo3 (v16bf b, v16bf a)
{
return __builtin_convertvector (a, v16sf);
}
/* Memory-operand variants: the source vector is loaded through a
   pointer before being converted.  */
v4sf
foo_mem (v4bf* a)
{
return __builtin_convertvector (*a, v4sf);
}
v8sf
foo2_mem (v8bf* a)
{
return __builtin_convertvector (*a, v8sf);
}
v16sf
foo3_mem (v16bf* a)
{
return __builtin_convertvector (*a, v16sf);
}

View File

@ -0,0 +1,20 @@
/* { dg-do compile } */
/* { dg-options "-msse2 -O2" } */
/* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|punpcklwd)} 2 { target { ! ia32 } } } } */
/* Check that a two-element __bf16 -> float vector conversion, from a
   register argument and from memory, is compiled to the instructions
   matched by the dg-final directive above when only SSE2 is enabled.
   Two matches are expected in total on non-ia32 targets; presumably one
   per function below.  */
typedef float v2sf __attribute__((vector_size(8)));
typedef __bf16 v2bf __attribute__((vector_size(4)));
/* Register-argument variant.  The first parameter B is unused.
   NOTE(review): presumably it only moves A out of the first vector
   argument register -- confirm.  */
v2sf
foo (v2bf b, v2bf a)
{
return __builtin_convertvector (a, v2sf);
}
/* Memory-operand variant: the source vector is loaded through a
   pointer before being converted.  */
v2sf
foo_mem (v2bf* a)
{
return __builtin_convertvector (*a, v2sf);
}