From bcc7e346bf9b5dc77797ea949d6adc740deb30ca Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Sat, 16 Nov 2019 10:40:23 +0000 Subject: [PATCH] Optionally pick the cheapest loop_vec_info This patch adds a mode in which the vectoriser tries each available base vector mode and picks the one with the lowest cost. The new behaviour is selected by autovectorize_vector_modes. The patch keeps the current behaviour of preferring a VF of loop->simdlen over any larger or smaller VF, regardless of costs or target preferences. 2019-11-16 Richard Sandiford gcc/ * target.h (VECT_COMPARE_COSTS): New constant. * target.def (autovectorize_vector_modes): Return a bitmask of flags. * doc/tm.texi: Regenerate. * targhooks.h (default_autovectorize_vector_modes): Update accordingly. * targhooks.c (default_autovectorize_vector_modes): Likewise. * config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes): Likewise. * config/arc/arc.c (arc_autovectorize_vector_modes): Likewise. * config/arm/arm.c (arm_autovectorize_vector_modes): Likewise. * config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise. * config/mips/mips.c (mips_autovectorize_vector_modes): Likewise. * tree-vectorizer.h (_loop_vec_info::vec_outside_cost) (_loop_vec_info::vec_inside_cost): New member variables. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them. (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions. (vect_analyze_loop): When autovectorize_vector_modes returns VECT_COMPARE_COSTS, try vectorizing the loop with each available vector mode and picking the one with the lowest cost. (vect_estimate_min_profitable_iters): Record the computed costs in the loop_vec_info. From-SVN: r278336 --- gcc/ChangeLog | 23 ++++++ gcc/config/aarch64/aarch64.c | 4 +- gcc/config/arc/arc.c | 3 +- gcc/config/arm/arm.c | 5 +- gcc/config/i386/i386.c | 4 +- gcc/config/mips/mips.c | 3 +- gcc/doc/tm.texi | 14 +++- gcc/target.def | 14 +++- gcc/target.h | 8 ++ gcc/targhooks.c | 3 +- gcc/targhooks.h | 2 +- gcc/tree-vect-loop.c | 148 +++++++++++++++++++++++++++++++++-- gcc/tree-vectorizer.h | 7 ++ 13 files changed, 219 insertions(+), 19 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7d48dbc69a6..f809def56c4 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,26 @@ +2019-11-16 Richard Sandiford + + * target.h (VECT_COMPARE_COSTS): New constant. + * target.def (autovectorize_vector_modes): Return a bitmask of flags. + * doc/tm.texi: Regenerate. + * targhooks.h (default_autovectorize_vector_modes): Update accordingly. + * targhooks.c (default_autovectorize_vector_modes): Likewise. + * config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes): + Likewise. + * config/arc/arc.c (arc_autovectorize_vector_modes): Likewise. + * config/arm/arm.c (arm_autovectorize_vector_modes): Likewise. + * config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise. + * config/mips/mips.c (mips_autovectorize_vector_modes): Likewise. + * tree-vectorizer.h (_loop_vec_info::vec_outside_cost) + (_loop_vec_info::vec_inside_cost): New member variables. + * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them. + (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions. + (vect_analyze_loop): When autovectorize_vector_modes returns + VECT_COMPARE_COSTS, try vectorizing the loop with each available + vector mode and picking the one with the lowest cost. + (vect_estimate_min_profitable_iters): Record the computed costs + in the loop_vec_info. + 2019-11-16 Richard Sandiford * tree-vectorizer.h (can_duplicate_and_interleave_p): Take an diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index ec0643daaa5..e2251a2ef32 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15935,7 +15935,7 @@ aarch64_preferred_simd_mode (scalar_mode mode) /* Return a list of possible vector sizes for the vectorizer to iterate over. */ -static void +static unsigned int aarch64_autovectorize_vector_modes (vector_modes *modes, bool) { if (TARGET_SVE) @@ -15961,6 +15961,8 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool) TODO: We could similarly support limited forms of V2QImode and V2HImode for this case. */ modes->safe_push (V2SImode); + + return 0; } /* Implement TARGET_MANGLE_TYPE. */ diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index f48f102f90d..115500e56da 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -609,7 +609,7 @@ arc_preferred_simd_mode (scalar_mode mode) /* Implements target hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */ -static void +static unsigned int arc_autovectorize_vector_modes (vector_modes *modes, bool) { if (TARGET_PLUS_QMACW) @@ -617,6 +617,7 @@ arc_autovectorize_vector_modes (vector_modes *modes, bool) modes->quick_push (V4HImode); modes->quick_push (V2HImode); } + return 0; } diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 70a20f7646c..1fd30c238cd 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_misalignment (machine_mode mode, static void arm_conditional_register_usage (void); static enum flt_eval_method arm_excess_precision (enum excess_precision_type); static reg_class_t arm_preferred_rename_class (reg_class_t rclass); -static void arm_autovectorize_vector_modes (vector_modes *, bool); +static unsigned int arm_autovectorize_vector_modes (vector_modes *, bool); static int arm_default_branch_cost (bool, bool); static int arm_cortex_a5_branch_cost (bool, bool); static int arm_cortex_m_branch_cost (bool, bool); @@ -29015,7 +29015,7 @@ arm_vector_alignment (const_tree type) return align; } -static void +static unsigned int arm_autovectorize_vector_modes (vector_modes *modes, bool) { if (!TARGET_NEON_VECTORIZE_DOUBLE) @@ -29023,6 +29023,7 @@ arm_autovectorize_vector_modes (vector_modes *modes, bool) modes->safe_push (V16QImode); modes->safe_push (V8QImode); } + return 0; } static bool diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index c406be35239..bb5dc144175 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21384,7 +21384,7 @@ ix86_preferred_simd_mode (scalar_mode mode) vectors. If AVX512F is enabled then try vectorizing with 512bit, 256bit and 128bit vectors. */ -static void +static unsigned int ix86_autovectorize_vector_modes (vector_modes *modes, bool all) { if (TARGET_AVX512F && !TARGET_PREFER_AVX256) @@ -21414,6 +21414,8 @@ ix86_autovectorize_vector_modes (vector_modes *modes, bool all) if (TARGET_MMX_WITH_SSE) modes->safe_push (V8QImode); + + return 0; } /* Implemenation of targetm.vectorize.get_mask_mode. */ diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 30017e37977..6341216d1bc 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -13455,11 +13455,12 @@ mips_preferred_simd_mode (scalar_mode mode) /* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */ -static void +static unsigned int mips_autovectorize_vector_modes (vector_modes *modes, bool) { if (ISA_HAS_MSA) modes->safe_push (V16QImode); + return 0; } /* Implement TARGET_INIT_LIBFUNCS. */ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 037039afdcf..a3eba282464 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6008,7 +6008,7 @@ against lower halves of vectors recursively until the specified mode is reached. The default is @var{mode} which means no splitting. @end deftypefn -@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all}) +@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all}) If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not the only approach worth considering, this hook should add one mode to @var{modes} for each useful alternative approach. These modes are then @@ -6024,9 +6024,19 @@ element mode. If @var{all} is true, add suitable vector modes even when they are generally not expected to be worthwhile. +The hook returns a bitmask of flags that control how the modes in +@var{modes} are used. The flags are: +@table @code +@item VECT_COMPARE_COSTS +Tells the loop vectorizer to try all the provided modes and pick the one +with the lowest cost. By default the vectorizer will choose the first +mode that works. +@end table + The hook does not need to do anything if the vector returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant -for autovectorization. The default implementation does nothing. +for autovectorization. The default implementation adds no modes and +returns 0. @end deftypefn @deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits}) diff --git a/gcc/target.def b/gcc/target.def index d220f8fbf12..e705c5d14d9 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1925,10 +1925,20 @@ element mode.\n\ If @var{all} is true, add suitable vector modes even when they are generally\n\ not expected to be worthwhile.\n\ \n\ +The hook returns a bitmask of flags that control how the modes in\n\ +@var{modes} are used. The flags are:\n\ +@table @code\n\ +@item VECT_COMPARE_COSTS\n\ +Tells the loop vectorizer to try all the provided modes and pick the one\n\ +with the lowest cost. By default the vectorizer will choose the first\n\ +mode that works.\n\ +@end table\n\ +\n\ The hook does not need to do anything if the vector returned by\n\ @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\ -for autovectorization. The default implementation does nothing.", - void, +for autovectorization. The default implementation adds no modes and\n\ +returns 0.", + unsigned int, (vector_modes *modes, bool all), default_autovectorize_vector_modes) diff --git a/gcc/target.h b/gcc/target.h index 60757efe5dc..2c5b59be851 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -218,6 +218,14 @@ enum omp_device_kind_arch_isa { omp_device_isa }; +/* Flags returned by TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES: + + VECT_COMPARE_COSTS + Tells the loop vectorizer to try all the provided modes and + pick the one with the lowest cost. By default the vectorizer + will choose the first mode that works. */ +const unsigned int VECT_COMPARE_COSTS = 1U << 0; + /* The target structure. This holds all the backend hooks. */ #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME; #define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS; diff --git a/gcc/targhooks.c b/gcc/targhooks.c index b0362f969aa..fd8e43565c3 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -1300,9 +1300,10 @@ default_split_reduction (machine_mode mode) /* By default only the preferred vector mode is tried. */ -void +unsigned int default_autovectorize_vector_modes (vector_modes *, bool) { + return 0; } /* The default implementation of TARGET_VECTORIZE_RELATED_MODE. */ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 7f96a7c0058..e041291347b 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -113,7 +113,7 @@ default_builtin_support_vector_misalignment (machine_mode mode, int, bool); extern machine_mode default_preferred_simd_mode (scalar_mode mode); extern machine_mode default_split_reduction (machine_mode); -extern void default_autovectorize_vector_modes (vector_modes *, bool); +extern unsigned int default_autovectorize_vector_modes (vector_modes *, bool); extern opt_machine_mode default_vectorize_related_mode (machine_mode, scalar_mode, poly_uint64); diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 37290fa475d..7a58dfc323a 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -829,6 +829,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) scan_map (NULL), slp_unrolling_factor (1), single_scalar_iteration_cost (0), + vec_outside_cost (0), + vec_inside_cost (0), vectorizable (false), can_fully_mask_p (true), fully_masked_p (false), @@ -2374,6 +2376,80 @@ again: goto start_over; } +/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears + to be better than vectorizing it using OLD_LOOP_VINFO. Assume that + OLD_LOOP_VINFO is better unless something specifically indicates + otherwise. + + Note that this deliberately isn't a partial order. */ + +static bool +vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, + loop_vec_info old_loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); + gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); + + poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); + poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); + + /* Always prefer a VF of loop->simdlen over any other VF. */ + if (loop->simdlen) + { + bool new_simdlen_p = known_eq (new_vf, loop->simdlen); + bool old_simdlen_p = known_eq (old_vf, loop->simdlen); + if (new_simdlen_p != old_simdlen_p) + return new_simdlen_p; + } + + /* Limit the VFs to what is likely to be the maximum number of iterations, + to handle cases in which at least one loop_vinfo is fully-masked. */ + HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + if (estimated_max_niter != -1) + { + if (known_le (estimated_max_niter, new_vf)) + new_vf = estimated_max_niter; + if (known_le (estimated_max_niter, old_vf)) + old_vf = estimated_max_niter; + } + + /* Check whether the (fractional) cost per scalar iteration is lower + or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ + poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost + * poly_widest_int (old_vf)); + poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost + * poly_widest_int (new_vf)); + if (maybe_lt (rel_old, rel_new)) + return false; + if (known_lt (rel_new, rel_old)) + return true; + + /* If there's nothing to choose between the loop bodies, see whether + there's a difference in the prologue and epilogue costs. */ + if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost) + return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost; + + return false; +} + +/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return + true if we should. */ + +static bool +vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, + loop_vec_info old_loop_vinfo) +{ + if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) + return false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Preferring vector mode %s to vector mode %s\n", + GET_MODE_NAME (new_loop_vinfo->vector_mode), + GET_MODE_NAME (old_loop_vinfo->vector_mode)); + return true; +} + /* Function vect_analyze_loop. Apply a set of analyses on LOOP, and create a loop_vec_info struct @@ -2385,8 +2461,9 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) auto_vector_modes vector_modes; /* Autodetect first vector size we try. */ - targetm.vectorize.autovectorize_vector_modes (&vector_modes, - loop->simdlen != 0); + unsigned int autovec_flags + = targetm.vectorize.autovectorize_vector_modes (&vector_modes, + loop->simdlen != 0); unsigned int mode_i = 0; DUMP_VECT_SCOPE ("analyze_loop_nest"); @@ -2409,6 +2486,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) machine_mode next_vector_mode = VOIDmode; poly_uint64 lowest_th = 0; unsigned vectorized_loops = 0; + bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS) + && !unlimited_cost_model (loop)); bool vect_epilogues = false; opt_result res = opt_result::success (); @@ -2429,6 +2508,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) bool fatal = false; + /* When pick_lowest_cost_p is true, we should in principle iterate + over all the loop_vec_infos that LOOP_VINFO could replace and + try to vectorize LOOP_VINFO under the same conditions. + E.g. when trying to replace an epilogue loop, we should vectorize + LOOP_VINFO as an epilogue loop with the same VF limit. When trying + to replace the main loop, we should vectorize LOOP_VINFO as a main + loop too. + + However, autovectorize_vector_modes is usually sorted as follows: + + - Modes that naturally produce lower VFs usually follow modes that + naturally produce higher VFs. + + - When modes naturally produce the same VF, maskable modes + usually follow unmaskable ones, so that the maskable mode + can be used to vectorize the epilogue of the unmaskable mode. + + This order is preferred because it leads to the maximum + epilogue vectorization opportunities. Targets should only use + a different order if they want to make wide modes available while + disparaging them relative to earlier, smaller modes. The assumption + in that case is that the wider modes are more expensive in some + way that isn't reflected directly in the costs. + + There should therefore be few interesting cases in which + LOOP_VINFO fails when treated as an epilogue loop, succeeds when + treated as a standalone loop, and ends up being genuinely cheaper + than FIRST_LOOP_VINFO. */ if (vect_epilogues) LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; @@ -2476,13 +2583,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; simdlen = 0; } + else if (pick_lowest_cost_p && first_loop_vinfo) + { + /* Keep trying to roll back vectorization attempts while the + loop_vec_infos they produced were worse than this one. */ + vec &vinfos = first_loop_vinfo->epilogue_vinfos; + while (!vinfos.is_empty () + && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) + { + gcc_assert (vect_epilogues); + delete vinfos.pop (); + } + if (vinfos.is_empty () + && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) + { + delete first_loop_vinfo; + first_loop_vinfo = opt_loop_vec_info::success (NULL); + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; + } + } if (first_loop_vinfo == NULL) { first_loop_vinfo = loop_vinfo; lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); } - else if (vect_epilogues) + else if (vect_epilogues + /* For now only allow one epilogue loop. */ + && first_loop_vinfo->epilogue_vinfos.is_empty ()) { first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); @@ -2506,12 +2634,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) && param_vect_epilogues_nomask && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) && !loop->simduid - /* For now only allow one epilogue loop. */ - && first_loop_vinfo->epilogue_vinfos.is_empty ()); + /* For now only allow one epilogue loop, but allow + pick_lowest_cost_p to replace it. */ + && (first_loop_vinfo->epilogue_vinfos.is_empty () + || pick_lowest_cost_p)); /* Commit to first_loop_vinfo if we have no reason to try alternatives. */ - if (!simdlen && !vect_epilogues) + if (!simdlen && !vect_epilogues && !pick_lowest_cost_p) break; } else @@ -3509,7 +3639,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, &vec_inside_cost, &vec_epilogue_cost); vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); - + + /* Stash the costs so that we can compare two loop_vec_infos. */ + loop_vinfo->vec_inside_cost = vec_inside_cost; + loop_vinfo->vec_outside_cost = vec_outside_cost; + if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index fd88084554d..0eac5bdef88 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -601,6 +601,13 @@ public: /* Cost of a single scalar iteration. */ int single_scalar_iteration_cost; + /* The cost of the vector prologue and epilogue, including peeled + iterations and set-up code. */ + int vec_outside_cost; + + /* The cost of the vector loop body. */ + int vec_inside_cost; + /* Is the loop vectorizable? */ bool vectorizable; -- 2.30.2