openmp: Tune omp_max_vf for offload targets

If requested, return the vectorization factor appropriate for the offload
device, if any.

This change gives a significant speedup in the BabelStream "dot" benchmark on
amdgcn.

The omp_adjust_chunk_size use case is set "false", for now, but I intend to
change that in a follow-up patch.

Note that NVPTX SIMT offload does not use this code-path.

gcc/ChangeLog:

	* gimple-loop-versioning.cc (loop_versioning::loop_versioning): Set
	omp_max_vf to offload == false.
	* omp-expand.cc (omp_adjust_chunk_size): Likewise.
	* omp-general.cc (omp_max_vf): Add "offload" parameter, and detect
	amdgcn offload devices.
	* omp-general.h (omp_max_vf): Likewise.
	* omp-low.cc (lower_rec_simd_input_clauses): Pass offload state to
	omp_max_vf.
2024-11-21 13:40:47 +00:00 · 2024-10-21 12:29:54 +00:00 · 2024-10-21 12:29:54 +00:00 · 5c9de3df85
commit 5c9de3df85
parent 137b26412f
5 changed files with 20 additions and 6 deletions
--- a/gcc/gimple-loop-versioning.cc
+++ b/gcc/gimple-loop-versioning.cc
@@ -554,7 +554,7 @@ loop_versioning::loop_versioning (function *fn)
     handled efficiently by scalar code.  omp_max_vf calculates the
     maximum number of bytes in a vector, when such a value is relevant
     to loop optimization.  */
-  m_maximum_scale = estimated_poly_value (omp_max_vf ());
+  m_maximum_scale = estimated_poly_value (omp_max_vf (false));
  m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
 }

--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -212,7 +212,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
  if (!simd_schedule || integer_zerop (chunk_size))
    return chunk_size;

-  poly_uint64 vf = omp_max_vf ();
+  poly_uint64 vf = omp_max_vf (false);
  if (known_eq (vf, 1U))
    return chunk_size;

--- a/gcc/omp-general.cc
+++ b/gcc/omp-general.cc
@@ -987,10 +987,11 @@ find_combined_omp_for (tree *tp, int *walk_subtrees, void *data)
  return NULL_TREE;
 }

-/* Return maximum possible vectorization factor for the target.  */
+/* Return maximum possible vectorization factor for the target, or for
+   the OpenMP offload target if one exists.  */

 poly_uint64
-omp_max_vf (void)
+omp_max_vf (bool offload)
 {
  if (!optimize
      || optimize_debug
@@ -999,6 +1000,18 @@ omp_max_vf (void)
 	  && OPTION_SET_P (flag_tree_loop_vectorize)))
    return 1;

+  if (ENABLE_OFFLOADING && offload)
+    {
+      for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;)
+	{
+	  if (startswith (c, "amdgcn"))
+	    return ordered_max (64, omp_max_vf (false));
+	  else if ((c = strchr (c, ':')))
+	    c++;
+	}
+      /* Otherwise, fall through to host VF.  */
+    }
+
  auto_vector_modes modes;
  targetm.vectorize.autovectorize_vector_modes (&modes, true);
  if (!modes.is_empty ())
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -162,7 +162,7 @@ extern void omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 				  struct omp_for_data_loop *loops);
 extern gimple *omp_build_barrier (tree lhs);
 extern tree find_combined_omp_for (tree *, int *, void *);
-extern poly_uint64 omp_max_vf (void);
+extern poly_uint64 omp_max_vf (bool);
 extern int omp_max_simt_vf (void);
 extern const char *omp_context_name_list_prop (tree);
 extern void omp_construct_traits_to_codes (tree, int, enum tree_code *);
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -4589,7 +4589,8 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
 {
  if (known_eq (sctx->max_vf, 0U))
    {
-      sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf ();
+      sctx->max_vf = (sctx->is_simt ? omp_max_simt_vf ()
+		      : omp_max_vf (omp_maybe_offloaded_ctx (ctx)));
      if (maybe_gt (sctx->max_vf, 1U))
 	{
 	  tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),