mirror of
https://github.com/gcc-mirror/gcc.git
synced 2024-11-21 13:40:47 +00:00
amdgcn: additional gfx1030/gfx1100 support
This is enough to get gfx1030 and gfx1100 working; there are still some test failures to investigate, and probably some tuning to do. gcc/ChangeLog: * config/gcn/gcn-opts.h (TARGET_PACKED_WORK_ITEMS): Add TARGET_RDNA3. * config/gcn/gcn-valu.md (all_convert): New iterator. (<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>): New define_expand, and rename the old one to ... (*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>): ... this. (extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>): Likewise, to ... (extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>): .. this. (*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_shift<exec>): New. * config/gcn/gcn.cc (gcn_global_address_p): Use "offsetbits" correctly. (gcn_hsa_declare_function_name): Update the vgpr counting for gfx1100. * config/gcn/gcn.md (<u>mulhisi3): Disable on RDNA3. (<u>mulqihi3_scalar): Likewise. libgcc/ChangeLog: * config/gcn/amdgcn_veclib.h (CDNA3_PLUS): Handle RDNA3. libgomp/ChangeLog: * config/gcn/time.c (RTC_TICKS): Configure RDNA3. (omp_get_wtime): Add RDNA3-compatible variant. * plugin/plugin-gcn.c (max_isa_vgprs): Tune for gfx1030 and gfx1100. Signed-off-by: Andrew Stubbs <ams@baylibre.com>
This commit is contained in:
parent
a0dde47f84
commit
99890e1552
@ -62,7 +62,7 @@ extern enum gcn_isa {
|
||||
|
||||
|
||||
#define TARGET_M0_LDS_LIMIT (TARGET_GCN3)
|
||||
#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS)
|
||||
#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS || TARGET_RDNA3)
|
||||
|
||||
#define TARGET_XNACK (flag_xnack != HSACO_ATTR_OFF)
|
||||
|
||||
|
@ -3555,30 +3555,63 @@
|
||||
;; }}}
|
||||
;; {{{ Int/int conversions
|
||||
|
||||
(define_code_iterator all_convert [truncate zero_extend sign_extend])
|
||||
(define_code_iterator zero_convert [truncate zero_extend])
|
||||
(define_code_attr convop [
|
||||
(sign_extend "extend")
|
||||
(zero_extend "zero_extend")
|
||||
(truncate "trunc")])
|
||||
|
||||
(define_insn "<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>"
|
||||
(define_expand "<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>"
|
||||
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
|
||||
(all_convert:V_INT_1REG
|
||||
(match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
|
||||
"")
|
||||
|
||||
(define_insn "*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>"
|
||||
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
|
||||
(zero_convert:V_INT_1REG
|
||||
(match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
|
||||
""
|
||||
"!TARGET_RDNA3"
|
||||
"v_mov_b32_sdwa\t%0, %1 dst_sel:<V_INT_1REG:sdwa> dst_unused:UNUSED_PAD src0_sel:<V_INT_1REG_ALT:sdwa>"
|
||||
[(set_attr "type" "vop_sdwa")
|
||||
(set_attr "length" "8")])
|
||||
|
||||
(define_insn "extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>2<exec>"
|
||||
(define_insn "extend<V_INT_1REG_ALT:mode><V_INT_1REG:mode>_sdwa<exec>"
|
||||
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
|
||||
(sign_extend:V_INT_1REG
|
||||
(match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
|
||||
""
|
||||
"!TARGET_RDNA3"
|
||||
"v_mov_b32_sdwa\t%0, sext(%1) src0_sel:<V_INT_1REG_ALT:sdwa>"
|
||||
[(set_attr "type" "vop_sdwa")
|
||||
(set_attr "length" "8")])
|
||||
|
||||
(define_insn "*<convop><V_INT_1REG_ALT:mode><V_INT_1REG:mode>_shift<exec>"
|
||||
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
|
||||
(all_convert:V_INT_1REG
|
||||
(match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))]
|
||||
"TARGET_RDNA3"
|
||||
{
|
||||
enum {extend, zero_extend, trunc};
|
||||
rtx shiftwidth = (<V_INT_1REG_ALT:SCALAR_MODE>mode == QImode
|
||||
|| <V_INT_1REG:SCALAR_MODE>mode == QImode
|
||||
? GEN_INT (24)
|
||||
: <V_INT_1REG_ALT:SCALAR_MODE>mode == HImode
|
||||
|| <V_INT_1REG:SCALAR_MODE>mode == HImode
|
||||
? GEN_INT (16)
|
||||
: NULL);
|
||||
operands[2] = shiftwidth;
|
||||
|
||||
if (!shiftwidth)
|
||||
return "v_mov_b32 %0, %1";
|
||||
else if (<convop> == extend || <convop> == trunc)
|
||||
return "v_lshlrev_b32\t%0, %2, %1\;v_ashrrev_i32\t%0, %2, %0";
|
||||
else
|
||||
return "v_lshlrev_b32\t%0, %2, %1\;v_lshrrev_b32\t%0, %2, %0";
|
||||
}
|
||||
[(set_attr "type" "mult")
|
||||
(set_attr "length" "8")])
|
||||
|
||||
;; GCC can already do these for scalar types, but not for vector types.
|
||||
;; Unfortunately you can't just do SUBREG on a vector to select the low part,
|
||||
;; so there must be a few tricks here.
|
||||
|
@ -1597,8 +1597,8 @@ gcn_global_address_p (rtx addr)
|
||||
rtx offset = XEXP (addr, 1);
|
||||
int offsetbits = (TARGET_RDNA2_PLUS ? 11 : 12);
|
||||
bool immediate_p = (CONST_INT_P (offset)
|
||||
&& INTVAL (offset) >= -(1 << 12)
|
||||
&& INTVAL (offset) < (1 << 12));
|
||||
&& INTVAL (offset) >= -(1 << offsetbits)
|
||||
&& INTVAL (offset) < (1 << offsetbits));
|
||||
|
||||
if ((gcn_address_register_p (base, DImode, false)
|
||||
|| gcn_vec_address_register_p (base, DImode, false))
|
||||
@ -6597,8 +6597,10 @@ gcn_hsa_declare_function_name (FILE *file, const char *name,
|
||||
if (df_regs_ever_live_p (FIRST_AVGPR_REG + avgpr))
|
||||
break;
|
||||
avgpr++;
|
||||
vgpr = (vgpr + 3) & ~3;
|
||||
avgpr = (avgpr + 3) & ~3;
|
||||
|
||||
/* The main function epilogue uses v8, but df doesn't see that. */
|
||||
if (vgpr < 9)
|
||||
vgpr = 9;
|
||||
|
||||
if (!leaf_function_p ())
|
||||
{
|
||||
@ -6611,9 +6613,18 @@ gcn_hsa_declare_function_name (FILE *file, const char *name,
|
||||
avgpr = MAX_NORMAL_AVGPR_COUNT;
|
||||
}
|
||||
|
||||
/* The gfx90a accum_offset field can't represent 0 registers. */
|
||||
if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4)
|
||||
vgpr = 4;
|
||||
/* SIMD32 devices count double in wavefront64 mode. */
|
||||
if (TARGET_RDNA2_PLUS)
|
||||
vgpr *= 2;
|
||||
|
||||
/* Round up to the allocation block size. */
|
||||
int vgpr_block_size = (TARGET_RDNA3 ? 12
|
||||
: TARGET_RDNA2_PLUS || TARGET_CDNA2_PLUS ? 8
|
||||
: 4);
|
||||
if (vgpr % vgpr_block_size)
|
||||
vgpr += vgpr_block_size - (vgpr % vgpr_block_size);
|
||||
if (avgpr % vgpr_block_size)
|
||||
avgpr += vgpr_block_size - (avgpr % vgpr_block_size);
|
||||
|
||||
fputs ("\t.rodata\n"
|
||||
"\t.p2align\t6\n"
|
||||
@ -6714,12 +6725,14 @@ gcn_hsa_declare_function_name (FILE *file, const char *name,
|
||||
" .private_segment_fixed_size: 0\n"
|
||||
" .wavefront_size: 64\n"
|
||||
" .sgpr_count: %i\n"
|
||||
" .vgpr_count: %i\n"
|
||||
" .vgpr_count: %i%s\n"
|
||||
" .max_flat_workgroup_size: 1024\n",
|
||||
cfun->machine->kernarg_segment_byte_size,
|
||||
cfun->machine->kernarg_segment_alignment,
|
||||
LDS_SIZE,
|
||||
sgpr, next_free_vgpr);
|
||||
sgpr, next_free_vgpr,
|
||||
(TARGET_RDNA2_PLUS ? " ; wavefrontsize64 counts double on SIMD32"
|
||||
: ""));
|
||||
if (gcn_arch == PROCESSOR_GFX90a || gcn_arch == PROCESSOR_GFX908)
|
||||
fprintf (file, " .agpr_count: %i\n", avgpr);
|
||||
fputs (" .end_amdgpu_metadata\n", file);
|
||||
|
@ -1618,7 +1618,7 @@
|
||||
(mult:SI
|
||||
(any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
|
||||
(any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
|
||||
""
|
||||
"!TARGET_RDNA3"
|
||||
"v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
|
||||
[(set_attr "type" "vop_sdwa")
|
||||
(set_attr "length" "8")])
|
||||
@ -1628,7 +1628,7 @@
|
||||
(mult:HI
|
||||
(any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
|
||||
(any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
|
||||
""
|
||||
"!TARGET_RDNA3"
|
||||
"v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
|
||||
[(set_attr "type" "vop_sdwa")
|
||||
(set_attr "length" "8")])
|
||||
|
@ -230,7 +230,7 @@ do { \
|
||||
|
||||
#if defined (__GCN3__) || defined (__GCN5__) \
|
||||
|| defined (__CDNA1__) || defined (__CDNA2__) \
|
||||
|| defined (__RDNA2__)
|
||||
|| defined (__RDNA2__) || defined (__RDNA3__)
|
||||
#define CDNA3_PLUS 0
|
||||
#else
|
||||
#define CDNA3_PLUS 1
|
||||
|
@ -30,15 +30,25 @@
|
||||
/* According to AMD:
|
||||
dGPU RTC is 27MHz
|
||||
AGPU RTC is 100MHz
|
||||
RDNA3 ISA manual states "typically 100MHz"
|
||||
FIXME: DTRT on an APU. */
|
||||
#ifdef __RDNA3__
|
||||
#define RTC_TICKS (1.0 / 100000000.0) /* 100MHz */
|
||||
#else
|
||||
#define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */
|
||||
#endif
|
||||
|
||||
double
|
||||
omp_get_wtime (void)
|
||||
{
|
||||
uint64_t clock;
|
||||
#ifdef __RDNA3__
|
||||
asm ("s_sendmsg_rtn_b64 %0 0x83 ;Get REALTIME\n\t"
|
||||
"s_waitcnt 0" : "=r" (clock));
|
||||
#else
|
||||
asm ("s_memrealtime %0\n\t"
|
||||
"s_waitcnt 0" : "=r" (clock));
|
||||
#endif
|
||||
return clock * RTC_TICKS;
|
||||
}
|
||||
|
||||
|
@ -1741,11 +1741,13 @@ max_isa_vgprs (int isa)
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX900:
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX906:
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX908:
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX1030:
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX1100:
|
||||
return 256;
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX90a:
|
||||
return 512;
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX1030:
|
||||
return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
|
||||
case EF_AMDGPU_MACH_AMDGCN_GFX1100:
|
||||
return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
|
||||
}
|
||||
GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user