Fall back to single-lane SLP before falling back to no SLP

The following changes what happens when any of the discovered SLP
instances fails vectorization checking.  Instead of immediately
falling back to disabling SLP, we first fall back to a mode that
emulates with SLP what non-SLP vectorization would do - forcing
single-lane discovery for all instances.

The patch does not remove the final fallback to disable SLP but it
reduces the fallout from failing vectorization when any non-SLP
stmt survives analysis.

	* tree-vectorizer.h (vect_analyze_slp): Add force_single_lane
	parameter.
	* tree-vect-slp.cc (vect_analyze_slp_instance): Remove
	defaulting of force_single_lane.
	(vect_build_slp_instance): Likewise.  Pass down appropriate
	force_single_lane.
	(vect_analyze_slp): Add force_single_lane parameter and pass
	it down appropriately.
	(vect_slp_analyze_bb_1): Always do multi-lane SLP.
	* tree-vect-loop.cc (vect_analyze_loop_2): Track two SLP
	modes and adjust accordingly.
	(vect_analyze_loop_1): Save the SLP mode when unrolling.

	* gcc.dg/vect/vect-outer-slp-1.c: Adjust.
This commit is contained in:
Richard Biener 2024-09-18 12:41:25 +02:00 committed by Richard Biener
parent d3a7302ec5
commit 77bd23a3e2
4 changed files with 54 additions and 42 deletions

View File

@ -29,4 +29,4 @@ void foo (void)
/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" } } */
/* We don't yet support SLP inductions for variable length vectors. */
/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "vect" } } */
/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "vect" } } */

View File

@ -2718,7 +2718,7 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
static opt_result
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
unsigned *suggested_unroll_factor,
bool& slp_done_for_suggested_uf)
unsigned& slp_done_for_suggested_uf)
{
opt_result ok = opt_result::success ();
int res;
@ -2787,11 +2787,11 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
/* If the slp decision is false when suggested unroll factor is worked
out, and we are applying suggested unroll factor, we can simply skip
all slp related analyses this time. */
bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;
/* Classify all cross-iteration scalar data-flow cycles.
Cross-iteration cycles caused by virtual phis are analyzed separately. */
vect_analyze_scalar_cycles (loop_vinfo, slp);
vect_analyze_scalar_cycles (loop_vinfo, slp == 2);
vect_pattern_recog (loop_vinfo);
@ -2854,18 +2854,23 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
vect_compute_single_scalar_iteration_cost (loop_vinfo);
poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
bool saved_can_use_partial_vectors_p
= LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
/* This is the point where we can re-start analysis with SLP forced off. */
start_over:
if (slp)
{
/* Check the SLP opportunities in the loop, analyze and build
SLP trees. */
ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo),
slp == 1);
if (!ok)
return ok;
/* If there are any SLP instances mark them as pure_slp. */
slp = vect_make_slp_decision (loop_vinfo);
if (slp)
if (vect_make_slp_decision (loop_vinfo))
{
/* Find stmts that need to be both vectorized and SLPed. */
vect_detect_hybrid_slp (loop_vinfo);
@ -2881,16 +2886,10 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
}
}
bool saved_can_use_partial_vectors_p
= LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
/* We don't expect to have to roll back to anything other than an empty
set of rgroups. */
gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
/* This is the point where we can re-start analysis with SLP forced off. */
start_over:
/* When we arrive here with SLP disabled and we are supposed
to use SLP for everything fail vectorization. */
if (!slp && param_vect_force_slp)
@ -3218,15 +3217,14 @@ again:
/* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
gcc_assert (!ok);
/* Try again with SLP forced off but if we didn't do any SLP there is
/* Try again with SLP degraded but if we didn't do any SLP there is
no point in re-trying. */
if (!slp)
return ok;
/* If the slp decision is true when suggested unroll factor is worked
out, and we are applying suggested unroll factor, we don't need to
re-try any more. */
if (applying_suggested_uf && slp_done_for_suggested_uf)
/* If we are applying suggested unroll factor, we don't need to
re-try any more as we want to keep the SLP mode fixed. */
if (applying_suggested_uf)
return ok;
/* If there are reduction chains re-trying will fail anyway. */
@ -3271,11 +3269,18 @@ again:
}
if (dump_enabled_p ())
{
if (slp)
dump_printf_loc (MSG_NOTE, vect_location,
"re-trying with single-lane SLP\n");
else
dump_printf_loc (MSG_NOTE, vect_location,
"re-trying with SLP disabled\n");
}
/* Roll back state appropriately. No SLP this time. */
slp = false;
/* Roll back state appropriately. Degrade SLP this time. From multi-
to single-lane to disabled. */
--slp;
/* Restore vectorization factor as it were without SLP. */
LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
/* Free the SLP instances. */
@ -3420,7 +3425,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
machine_mode vector_mode = vector_modes[mode_i];
loop_vinfo->vector_mode = vector_mode;
unsigned int suggested_unroll_factor = 1;
bool slp_done_for_suggested_uf = false;
unsigned slp_done_for_suggested_uf = 0;
/* Run the main analysis. */
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,

View File

@ -3488,7 +3488,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info, slp_instance_kind kind,
unsigned max_tree_size, unsigned *limit,
bool force_single_lane = false);
bool force_single_lane);
/* Build an interleaving scheme for the store sources RHS_NODES from
SCALAR_STMTS. */
@ -3684,7 +3684,7 @@ vect_build_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
/* ??? We need stmt_info for group splitting. */
stmt_vec_info stmt_info_,
bool force_single_lane = false)
bool force_single_lane)
{
/* If there's no budget left bail out early. */
if (*limit == 0)
@ -3891,7 +3891,7 @@ vect_build_slp_instance (vec_info *vinfo,
group1_size);
bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
kind, max_tree_size,
limit);
limit, false);
/* Split the rest at the failure point and possibly
re-analyze the remaining matching part if it has
at least two lanes. */
@ -3904,14 +3904,14 @@ vect_build_slp_instance (vec_info *vinfo,
if (i - group1_size > 1)
res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
kind, max_tree_size,
limit);
limit, false);
}
/* Re-analyze the non-matching tail if it has at least
two lanes. */
if (i + 1 < group_size)
res |= vect_analyze_slp_instance (vinfo, bst_map,
rest, kind, max_tree_size,
limit);
limit, false);
return res;
}
}
@ -4544,7 +4544,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
trees of packed scalar stmts if SLP is possible. */
opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
bool force_single_lane)
{
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
unsigned int i;
@ -4561,7 +4562,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
/* Find SLP sequences starting from groups of grouped stores. */
FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
vect_analyze_slp_instance (vinfo, bst_map, first_element,
slp_inst_kind_store, max_tree_size, &limit);
slp_inst_kind_store, max_tree_size, &limit,
force_single_lane);
/* For loops also start SLP discovery from non-grouped stores. */
if (loop_vinfo)
@ -4581,7 +4583,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
stmts.quick_push (stmt_info);
vect_build_slp_instance (vinfo, slp_inst_kind_store,
stmts, roots, remain, max_tree_size,
&limit, bst_map, NULL);
&limit, bst_map, NULL, force_single_lane);
}
}
@ -4598,7 +4600,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
bb_vinfo->roots[i].stmts,
bb_vinfo->roots[i].roots,
bb_vinfo->roots[i].remain,
max_tree_size, &limit, bst_map, NULL))
max_tree_size, &limit, bst_map, NULL,
false))
{
bb_vinfo->roots[i].stmts = vNULL;
bb_vinfo->roots[i].roots = vNULL;
@ -4614,9 +4617,11 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
if (! STMT_VINFO_RELEVANT_P (first_element)
&& ! STMT_VINFO_LIVE_P (first_element))
;
else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
else if (force_single_lane
|| ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
slp_inst_kind_reduc_chain,
max_tree_size, &limit))
max_tree_size, &limit,
force_single_lane))
{
/* Dissolve reduction chain group. */
stmt_vec_info vinfo = first_element;
@ -4656,7 +4661,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
/* Do not discover SLP reductions combining lane-reducing
ops, that will fail later. */
if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
if (!force_single_lane
&& !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
scalar_stmts.quick_push (next_info);
else
{
@ -4670,7 +4676,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
slp_inst_kind_reduc_group,
stmts, roots, remain,
max_tree_size, &limit,
bst_map, NULL);
bst_map, NULL,
force_single_lane);
}
}
}
@ -4683,7 +4690,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
slp_inst_kind_reduc_group,
scalar_stmts, roots, remain,
max_tree_size, &limit, bst_map,
NULL))
NULL, force_single_lane))
{
if (scalar_stmts.length () <= 1)
scalar_stmts.release ();
@ -4699,7 +4706,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
slp_inst_kind_reduc_group,
stmts, roots, remain,
max_tree_size, &limit,
bst_map, NULL);
bst_map, NULL, force_single_lane);
}
saved_stmts.release ();
}
@ -4731,7 +4738,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
slp_inst_kind_reduc_group,
stmts, roots, remain,
max_tree_size, &limit,
bst_map, NULL);
bst_map, NULL, force_single_lane);
}
}
}
@ -8934,7 +8941,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
/* Check the SLP opportunities in the basic block, analyze and build SLP
trees. */
if (!vect_analyze_slp (bb_vinfo, n_stmts))
if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
{
if (dump_enabled_p ())
{

View File

@ -2538,7 +2538,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec<tree>
unsigned * = nullptr, bool = false);
extern bool vect_slp_analyze_operations (vec_info *);
extern void vect_schedule_slp (vec_info *, const vec<slp_instance> &);
extern opt_result vect_analyze_slp (vec_info *, unsigned);
extern opt_result vect_analyze_slp (vec_info *, unsigned, bool);
extern bool vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_optimize_slp (vec_info *);