From 5c9de3df8547682bfb3d484d7d28a27776bf979c Mon Sep 17 00:00:00 2001 From: Andrew Stubbs Date: Mon, 21 Oct 2024 12:29:54 +0000 Subject: [PATCH] openmp: Tune omp_max_vf for offload targets If requested, return the vectorization factor appropriate for the offload device, if any. This change gives a significant speedup in the BabelStream "dot" benchmark on amdgcn. The omp_adjust_chunk_size usecase is set "false", for now, but I intend to change that in a follow-up patch. Note that NVPTX SIMT offload does not use this code-path. gcc/ChangeLog: * gimple-loop-versioning.cc (loop_versioning::loop_versioning): Set omp_max_vf to offload == false. * omp-expand.cc (omp_adjust_chunk_size): Likewise. * omp-general.cc (omp_max_vf): Add "offload" parameter, and detect amdgcn offload devices. * omp-general.h (omp_max_vf): Likewise. * omp-low.cc (lower_rec_simd_input_clauses): Pass offload state to omp_max_vf. --- gcc/gimple-loop-versioning.cc | 2 +- gcc/omp-expand.cc | 2 +- gcc/omp-general.cc | 17 +++++++++++++++-- gcc/omp-general.h | 2 +- gcc/omp-low.cc | 3 ++- 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc index 107b0020024..2968c929d04 100644 --- a/gcc/gimple-loop-versioning.cc +++ b/gcc/gimple-loop-versioning.cc @@ -554,7 +554,7 @@ loop_versioning::loop_versioning (function *fn) handled efficiently by scalar code. omp_max_vf calculates the maximum number of bytes in a vector, when such a value is relevant to loop optimization. */ - m_maximum_scale = estimated_poly_value (omp_max_vf ()); + m_maximum_scale = estimated_poly_value (omp_max_vf (false)); m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE); } diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc index b0b4ddf5dbc..907fd46a5b2 100644 --- a/gcc/omp-expand.cc +++ b/gcc/omp-expand.cc @@ -212,7 +212,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule) if (!simd_schedule || integer_zerop (chunk_size)) return chunk_size; - poly_uint64 vf = omp_max_vf (); + poly_uint64 vf = omp_max_vf (false); if (known_eq (vf, 1U)) return chunk_size; diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc index f74b9bf5e96..1ae575ee181 100644 --- a/gcc/omp-general.cc +++ b/gcc/omp-general.cc @@ -987,10 +987,11 @@ find_combined_omp_for (tree *tp, int *walk_subtrees, void *data) return NULL_TREE; } -/* Return maximum possible vectorization factor for the target. */ +/* Return maximum possible vectorization factor for the target, or for + the OpenMP offload target if one exists. */ poly_uint64 -omp_max_vf (void) +omp_max_vf (bool offload) { if (!optimize || optimize_debug @@ -999,6 +1000,18 @@ omp_max_vf (void) && OPTION_SET_P (flag_tree_loop_vectorize))) return 1; + if (ENABLE_OFFLOADING && offload) + { + for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;) + { + if (startswith (c, "amdgcn")) + return ordered_max (64, omp_max_vf (false)); + else if ((c = strchr (c, ':'))) + c++; + } + /* Otherwise, fall through to host VF. */ + } + auto_vector_modes modes; targetm.vectorize.autovectorize_vector_modes (&modes, true); if (!modes.is_empty ()) diff --git a/gcc/omp-general.h b/gcc/omp-general.h index f3778131626..70f78d2055b 100644 --- a/gcc/omp-general.h +++ b/gcc/omp-general.h @@ -162,7 +162,7 @@ extern void omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, struct omp_for_data_loop *loops); extern gimple *omp_build_barrier (tree lhs); extern tree find_combined_omp_for (tree *, int *, void *); -extern poly_uint64 omp_max_vf (void); +extern poly_uint64 omp_max_vf (bool); extern int omp_max_simt_vf (void); extern const char *omp_context_name_list_prop (tree); extern void omp_construct_traits_to_codes (tree, int, enum tree_code *); diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc index 44c4310075b..70a2c108fbc 100644 --- a/gcc/omp-low.cc +++ b/gcc/omp-low.cc @@ -4589,7 +4589,8 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx, { if (known_eq (sctx->max_vf, 0U)) { - sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf (); + sctx->max_vf = (sctx->is_simt ? omp_max_simt_vf () + : omp_max_vf (omp_maybe_offloaded_ctx (ctx))); if (maybe_gt (sctx->max_vf, 1U)) { tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),