tree-optimization/116974 - Handle single-lane SLP for OMP scan store

The following massages the GIMPLE matching way of handling scan
stores to work with single-lane SLP.  I do not fully understand all
the cases that can happen and the stmt matching at vectorizable_store
time is less than ideal - but the following gets me all the testcases
to pass with and without forced SLP.
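
For reference, a minimal sketch of the kind of loop this path covers
(an illustrative example, not necessarily one of the PR's testcases)
is an OpenMP simd inscan reduction whose scan store now also needs to
be handled when it is discovered as a single-lane SLP instance:

  void
  foo (int *a, int *b, int n)
  {
    int r = 0;
    #pragma omp simd reduction (inscan, +:r)
    for (int i = 0; i < n; i++)
      {
	r += a[i];
	#pragma omp scan inclusive (r)
	b[i] = r;
      }
  }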

Long term we want to perform the matching at SLP discovery time,
properly chaining the various SLP instances the current state ends
up with.

	PR tree-optimization/116974
	* tree-vect-stmts.cc (check_scan_store): Pass in the SLP node
	instead of just a flag.  Allow single-lane scan stores.
	(vectorizable_store): Adjust.
	* tree-vect-loop.cc (vect_analyze_loop_2): Empty scan_map
	before re-trying.
Author: Richard Biener
Date:   2024-03-13 14:59:27 +01:00 (committed by Richard Biener)
Commit: 9df0772d50, parent dc90578f0b
2 changed files with 60 additions and 26 deletions

gcc/tree-vect-loop.cc

@@ -3369,6 +3369,8 @@ again:
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
= saved_can_use_partial_vectors_p;
LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
if (loop_vinfo->scan_map)
loop_vinfo->scan_map->empty ();
goto start_over;
}

gcc/tree-vect-stmts.cc

@@ -7445,7 +7445,7 @@ scan_store_can_perm_p (tree vectype, tree init,
static bool
check_scan_store (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
enum vect_def_type rhs_dt, bool slp, tree mask,
enum vect_def_type rhs_dt, slp_tree slp_node, tree mask,
vect_memory_access_type memory_access_type)
{
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
@@ -7453,7 +7453,7 @@ check_scan_store (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
tree ref_type;
gcc_assert (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1);
if (slp
if ((slp_node && SLP_TREE_LANES (slp_node) > 1)
|| mask
|| memory_access_type != VMAT_CONTIGUOUS
|| TREE_CODE (DR_BASE_ADDRESS (dr_info->dr)) != ADDR_EXPR
@@ -7848,8 +7848,8 @@ check_scan_store (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
Handle only the transformation, checking is done in check_scan_store. */
static bool
vectorizable_scan_store (vec_info *vinfo,
stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
vectorizable_scan_store (vec_info *vinfo, stmt_vec_info stmt_info,
slp_tree slp_node, gimple_stmt_iterator *gsi,
gimple **vec_stmt, int ncopies)
{
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
@@ -7961,16 +7961,34 @@ vectorizable_scan_store (vec_info *vinfo,
tree orig = NULL_TREE;
if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr);
auto_vec<tree> vec_oprnds1;
/* The initialization is invariant. */
vec_oprnd1 = vect_init_vector (vinfo, stmt_info, *init, vectype, NULL);
auto_vec<tree> vec_oprnds2;
auto_vec<tree> vec_oprnds3;
vect_get_vec_defs (vinfo, stmt_info, NULL, ncopies,
*init, &vec_oprnds1,
ldataref_ptr == NULL ? rhs1 : NULL, &vec_oprnds2,
rhs2, &vec_oprnds3);
for (int j = 0; j < ncopies; j++)
if (ldataref_ptr == NULL)
{
/* We want to lookup the vector operands of the reduction, not those
of the store - for SLP we have to use the proper SLP node for the
lookup, which should be the single child of the scan store. */
vect_get_vec_defs (vinfo, stmt_info, SLP_TREE_CHILDREN (slp_node)[0],
ncopies, rhs1, &vec_oprnds2, rhs2, &vec_oprnds3);
/* ??? For SLP we do not key the def on 'rhs1' or 'rhs2' but get
them in SLP child order. So we have to swap here with logic
similar to above. */
stmt_vec_info load
= SLP_TREE_SCALAR_STMTS (SLP_TREE_CHILDREN
(SLP_TREE_CHILDREN (slp_node)[0])[0])[0];
dr_vec_info *dr_info = STMT_VINFO_DR_INFO (load);
tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)))
for (unsigned i = 0; i < vec_oprnds2.length (); ++i)
std::swap (vec_oprnds2[i], vec_oprnds3[i]);
}
else
vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
rhs2, &vec_oprnds3);
for (unsigned j = 0; j < vec_oprnds3.length (); j++)
{
vec_oprnd1 = vec_oprnds1[j];
if (ldataref_ptr == NULL)
vec_oprnd2 = vec_oprnds2[j];
vec_oprnd3 = vec_oprnds3[j];
@@ -7988,8 +8006,11 @@ vectorizable_scan_store (vec_info *vinfo,
vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr));
gimple *g = gimple_build_assign (vec_oprnd2, data_ref);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
if (! slp_node)
{
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
}
tree v = vec_oprnd2;
@@ -8003,8 +8024,11 @@ vectorizable_scan_store (vec_info *vinfo,
? zero_vec : vec_oprnd1, v,
perms[i]);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
if (! slp_node)
{
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
if (zero_vec && use_whole_vector[i] == scan_store_kind_lshift_cond)
{
@@ -8021,7 +8045,8 @@ vectorizable_scan_store (vec_info *vinfo,
new_temp, vec_oprnd1);
vect_finish_stmt_generation (vinfo, stmt_info,
g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
new_temp = new_temp2;
}
@@ -8039,7 +8064,8 @@ vectorizable_scan_store (vec_info *vinfo,
tree new_temp2 = make_ssa_name (vectype);
g = gimple_build_assign (new_temp2, code, v, new_temp);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
v = new_temp2;
}
@@ -8047,7 +8073,8 @@ vectorizable_scan_store (vec_info *vinfo,
tree new_temp = make_ssa_name (vectype);
gimple *g = gimple_build_assign (new_temp, code, orig, v);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
tree last_perm_arg = new_temp;
/* For exclusive scan, new_temp computed above is the exclusive scan
@@ -8058,14 +8085,16 @@ vectorizable_scan_store (vec_info *vinfo,
last_perm_arg = make_ssa_name (vectype);
g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
}
orig = make_ssa_name (vectype);
g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg,
last_perm_arg, perms[units_log2]);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (!inscan_var_store)
{
@@ -8075,12 +8104,13 @@ vectorizable_scan_store (vec_info *vinfo,
vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
g = gimple_build_assign (data_ref, new_temp);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
}
}
if (inscan_var_store)
for (int j = 0; j < ncopies; j++)
for (unsigned j = 0; j < vec_oprnds3.length (); j++)
{
if (j != 0)
dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
@@ -8091,7 +8121,8 @@ vectorizable_scan_store (vec_info *vinfo,
vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
gimple *g = gimple_build_assign (data_ref, orig);
vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
if (! slp_node)
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
}
return true;
}
@@ -8308,7 +8339,7 @@ vectorizable_store (vec_info *vinfo,
if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1 && !vec_stmt)
{
if (!check_scan_store (vinfo, stmt_info, vectype, rhs_dt, slp, mask,
if (!check_scan_store (vinfo, stmt_info, vectype, rhs_dt, slp_node, mask,
memory_access_type))
return false;
}
@@ -8366,7 +8397,7 @@ vectorizable_store (vec_info *vinfo,
if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
{
gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
gcc_assert (!slp);
gcc_assert (!slp || SLP_TREE_LANES (slp_node) == 1);
if (costing_p)
{
unsigned int inside_cost = 0, prologue_cost = 0;
@@ -8385,7 +8416,8 @@ vectorizable_store (vec_info *vinfo,
return true;
}
return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, ncopies);
return vectorizable_scan_store (vinfo, stmt_info, slp_node,
gsi, vec_stmt, ncopies);
}
if (grouped_store || slp)