diff --git a/gcc/testsuite/gcc.target/aarch64/pr116139.c b/gcc/testsuite/gcc.target/aarch64/pr116139.c
new file mode 100644
index 00000000000..78a21323030
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr116139.c
@@ -0,0 +1,35 @@
+/* PR tree-optimization/116139 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param fully-pipelined-fma=1 -mcpu=neoverse-n3" } */
+/* NOTE(review): presumably exercises get_reassociation_width; confirm.  */
+#define LOOP_COUNT 800000000  /* Iteration bound of the loop in foo.  */
+typedef double data_e;  /* Scalar type of every value in foo.  */
+/* Return RESULT + RESULT2 after LOOP_COUNT updates of terms seeded by IN.  */
+data_e
+foo (data_e in)
+{
+  data_e a1, a2, a3, a4;
+  data_e tmp, result = 0;
+  a1 = in + 0.1;  /* a1..a4 start as fixed offsets/scalings of IN.  */
+  a2 = in * 0.1;
+  a3 = in + 0.01;
+  a4 = in * 0.59;
+
+  data_e result2 = 0;
+  /* Each pass folds TMP (a1 plus three squares) into both accumulators.  */
+  for (int ic = 0; ic < LOOP_COUNT; ic++)
+    {
+      tmp = a1 + a2 * a2 + a3 * a3 + a4 * a4 ;
+      result += tmp - ic;
+      result2 = result2 / 2 - tmp;
+      /* Step each term by its own constant before the next pass.  */
+      a1 += 0.91;
+      a2 += 0.1;
+      a3 -= 0.01;
+      a4 -= 0.89;
+
+    }
+
+  return result + result2;
+}
+
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc index d74352268b5..70c810c5198 100644 --- a/gcc/tree-ssa-reassoc.cc +++ b/gcc/tree-ssa-reassoc.cc @@ -5509,16 +5509,15 @@ get_reassociation_width (vec *ops, int mult_num, tree lhs, , it is latency(MULT)*2 + latency(ADD)*2. Assuming latency(MULT) >= latency(ADD), the first variant is preferred. - Find out if we can get a smaller width considering FMA. */ - if (width > 1 && mult_num && param_fully_pipelined_fma) - { - /* When param_fully_pipelined_fma is set, assume FMUL and FMA use the - same units that can also do FADD. For other scenarios, such as when - FMUL and FADD are using separated units, the following code may not - appy. */ - int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode); - gcc_checking_assert (width_mult <= width); + Find out if we can get a smaller width considering FMA. + Assume FMUL and FMA use the same units that can also do FADD. + For other scenarios, such as when FMUL and FADD are using separated units, + the following code may not apply. 
*/ + int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode); + if (width > 1 && mult_num && param_fully_pipelined_fma + && width_mult <= width) + { /* Latency of MULT_EXPRs. */ int lat_mul = get_mult_latency_consider_fma (ops_num, mult_num, width_mult);