From 64432b680eab0bddbe9a4ad4798457cf6a14ad60 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov
Date: Thu, 17 Dec 2020 18:02:37 +0000
Subject: [PATCH] vect, aarch64: Extend SVE vs Advanced SIMD costing decisions
 in vect_better_loop_vinfo_p

While experimenting with some backend costs for Advanced SIMD and SVE I hit
many cases where GCC would pick SVE for VLA auto-vectorisation even when the
backend very clearly presented cheaper costs for Advanced SIMD.
For a simple float addition loop the SVE costs were:

vec.c:9:21: note: Cost model analysis:
  Vector inside of loop cost: 28
  Vector prologue cost: 2
  Vector epilogue cost: 0
  Scalar iteration cost: 10
  Scalar outside cost: 0
  Vector outside cost: 2
  prologue iterations: 0
  epilogue iterations: 0
  Minimum number of vector iterations: 1
  Calculated minimum iters for profitability: 4

and for Advanced SIMD (Neon) they're:

vec.c:9:21: note: Cost model analysis:
  Vector inside of loop cost: 11
  Vector prologue cost: 0
  Vector epilogue cost: 0
  Scalar iteration cost: 10
  Scalar outside cost: 0
  Vector outside cost: 0
  prologue iterations: 0
  epilogue iterations: 0
  Calculated minimum iters for profitability: 0
vec.c:9:21: note: Runtime profitability threshold = 4

yet the SVE one was always picked.  With guidance from Richard this seems to
be due to the vinfo comparisons in vect_better_loop_vinfo_p, in particular
the part with the big comment explaining the
estimated_rel_new * 2 <= estimated_rel_old heuristic.

This patch extends the comparisons by introducing a three-way estimate kind
for poly_int values that the backend can distinguish.  This allows
vect_better_loop_vinfo_p to ask for minimum, maximum and likely estimates
and pick Advanced SIMD over SVE when it is clearly cheaper.

gcc/
	* target.h (enum poly_value_estimate_kind): Define.
	(estimated_poly_value): Take an estimate kind argument.
	* target.def (estimated_poly_value): Update definition for the above.
	* doc/tm.texi: Regenerate.
	* targhooks.c (default_estimated_poly_value): Update prototype.
	* tree-vect-loop.c (vect_better_loop_vinfo_p): Use min, max and
	likely estimates of VF to pick between vinfos.
	* config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Use
	estimated_poly_value instead of aarch64_estimated_poly_value.
	(aarch64_estimated_poly_value): Take a kind argument and handle it.
---
 gcc/config/aarch64/aarch64.c | 35 +++++++++++----
 gcc/doc/tm.texi              |  7 ++-
 gcc/target.def               |  7 ++-
 gcc/target.h                 | 12 ++++-
 gcc/targhooks.c              |  2 +-
 gcc/tree-vect-loop.c         | 85 +++++++++++++++++++++---------------
 6 files changed, 96 insertions(+), 52 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b79630194c7..67b4c91dbe4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -17367,8 +17367,6 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
   return word_mode;
 }
 
-static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
-
 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
    and return whether the SVE mode should be preferred over the
    Advanced SIMD one in aarch64_autovectorize_vector_modes.  */
@@ -17401,8 +17399,8 @@ aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
     return maybe_gt (nunits_sve, nunits_asimd);
 
   /* Otherwise estimate the runtime width of the modes involved.  */
-  HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve);
-  HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd);
+  HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
+  HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
 
   /* Preferring SVE means picking it first unless the Advanced SIMD mode is
      clearly wider.  */
@@ -23195,19 +23193,38 @@ aarch64_speculation_safe_value (machine_mode mode,
 
 /* Implement TARGET_ESTIMATED_POLY_VALUE.
    Look into the tuning structure for an estimate.
-   VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
-   Advanced SIMD 128 bits.  */
+   KIND specifies the type of requested estimate: min, max or likely.
+   For cores with a known SVE width all three estimates are the same.
+   For generic SVE tuning we want to distinguish the maximum estimate from
+   the minimum and likely ones.
+   The likely estimate is the same as the minimum in that case to give a
+   conservative behavior of auto-vectorizing with SVE when it is a win
+   even for 128-bit SVE.
+   When SVE width information is available VAL.coeffs[1] is multiplied by
+   the number of VQ chunks over the initial Advanced SIMD 128 bits.  */
 
 static HOST_WIDE_INT
-aarch64_estimated_poly_value (poly_int64 val)
+aarch64_estimated_poly_value (poly_int64 val,
+                              poly_value_estimate_kind kind
+                              = POLY_VALUE_LIKELY)
 {
   enum aarch64_sve_vector_bits_enum width_source
     = aarch64_tune_params.sve_width;
 
-  /* If we still don't have an estimate, use the default.  */
+  /* If there is no core-specific information then the minimum and likely
+     values are based on 128-bit vectors and the maximum is based on
+     the architectural maximum of 2048 bits.  */
   if (width_source == SVE_SCALABLE)
-    return default_estimated_poly_value (val);
+    switch (kind)
+      {
+      case POLY_VALUE_MIN:
+      case POLY_VALUE_LIKELY:
+        return val.coeffs[0];
+      case POLY_VALUE_MAX:
+        return val.coeffs[0] + val.coeffs[1] * 15;
+      }
 
+  /* If the core provides width information, use that.  */
   HOST_WIDE_INT over_128 = width_source - 128;
   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
 }
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index d9b855c13ac..900d584d178 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7005,9 +7005,12 @@ delay slot branches filled using the basic filler is often still
 desirable as the delay slot can hide a pipeline bubble.
 @end deftypefn
 
-@deftypefn {Target Hook} HOST_WIDE_INT TARGET_ESTIMATED_POLY_VALUE (poly_int64 @var{val})
+@deftypefn {Target Hook} HOST_WIDE_INT TARGET_ESTIMATED_POLY_VALUE (poly_int64 @var{val}, poly_value_estimate_kind @var{kind})
 Return an estimate of the runtime value of @var{val}, for use in
-things like cost calculations or profiling frequencies.  The default
+things like cost calculations or profiling frequencies.  @var{kind} is used
+to ask for the minimum, maximum, and likely estimates of the value through
+the @code{POLY_VALUE_MIN}, @code{POLY_VALUE_MAX} and
+@code{POLY_VALUE_LIKELY} values.  The default
 implementation returns the lowest possible value of @var{val}.
 @end deftypefn
 
diff --git a/gcc/target.def b/gcc/target.def
index acdc694d44c..d5ed0dfda4f 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3851,9 +3851,12 @@ default_new_address_profitable_p)
 DEFHOOK
 (estimated_poly_value,
  "Return an estimate of the runtime value of @var{val}, for use in\n\
-things like cost calculations or profiling frequencies.  The default\n\
+things like cost calculations or profiling frequencies.  @var{kind} is used\n\
+to ask for the minimum, maximum, and likely estimates of the value through\n\
+the @code{POLY_VALUE_MIN}, @code{POLY_VALUE_MAX} and\n\
+@code{POLY_VALUE_LIKELY} values.  The default\n\
 implementation returns the lowest possible value of @var{val}.",
- HOST_WIDE_INT, (poly_int64 val),
+ HOST_WIDE_INT, (poly_int64 val, poly_value_estimate_kind kind),
 default_estimated_poly_value)
 
 /* Permit speculative instructions in delay slots during delayed-branch
diff --git a/gcc/target.h b/gcc/target.h
index 960188023f1..68ef5194b6c 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -252,6 +252,13 @@ enum type_context_kind {
   TCTX_CAPTURE_BY_COPY
 };
 
+enum poly_value_estimate_kind
+{
+  POLY_VALUE_MIN,
+  POLY_VALUE_MAX,
+  POLY_VALUE_LIKELY
+};
+
 extern bool verify_type_context (location_t, type_context_kind, const_tree,
                                  bool = false);
 
@@ -272,12 +279,13 @@ extern struct gcc_target targetm;
    provides a rough guess.  */
 
 static inline HOST_WIDE_INT
-estimated_poly_value (poly_int64 x)
+estimated_poly_value (poly_int64 x,
+                      poly_value_estimate_kind kind = POLY_VALUE_LIKELY)
 {
   if (NUM_POLY_INT_COEFFS == 1)
     return x.coeffs[0];
   else
-    return targetm.estimated_poly_value (x);
+    return targetm.estimated_poly_value (x, kind);
 }
 
 #ifdef GCC_TM_H
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 6e12e13d68e..d616fb73bdf 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1757,7 +1757,7 @@ default_slow_unaligned_access (machine_mode, unsigned int)
 /* The default implementation of TARGET_ESTIMATED_POLY_VALUE.  */
 
 HOST_WIDE_INT
-default_estimated_poly_value (poly_int64 x)
+default_estimated_poly_value (poly_int64 x, poly_value_estimate_kind)
 {
   return x.coeffs[0];
 }
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 52757add0e3..688538a4521 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2773,43 +2773,56 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
 
   /* Check whether the (fractional) cost per scalar iteration is lower
      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
-  poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
-                             * poly_widest_int (old_vf));
-  poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
-                             * poly_widest_int (new_vf));
-  if (maybe_lt (rel_old, rel_new))
-    {
-      /* When old_loop_vinfo uses a variable vectorization factor,
-         we know that it has a lower cost for at least one runtime VF.
-         However, we don't know how likely that VF is.
-
-         One option would be to compare the costs for the estimated VFs.
-         The problem is that that can put too much pressure on the cost
-         model.  E.g. if the estimated VF is also the lowest possible VF,
-         and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
-         for the estimated VF, we'd then choose new_loop_vinfo even
-         though (a) new_loop_vinfo might not actually be better than
-         old_loop_vinfo for that VF and (b) it would be significantly
-         worse at larger VFs.
-
-         Here we go for a hacky compromise: pick new_loop_vinfo if it is
-         no more expensive than old_loop_vinfo even after doubling the
-         estimated old_loop_vinfo VF.  For all but trivial loops, this
-         ensures that we only pick new_loop_vinfo if it is significantly
-         better than old_loop_vinfo at the estimated VF.  */
-      if (rel_new.is_constant ())
-        return false;
-
-      HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
-      HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
-      widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
-                                      * widest_int (old_estimated_vf));
-      widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
-                                      * widest_int (new_estimated_vf));
-      return estimated_rel_new * 2 <= estimated_rel_old;
-    }
-  if (known_lt (rel_new, rel_old))
+  poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
+  poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
+
+  HOST_WIDE_INT est_rel_new_min
+    = estimated_poly_value (rel_new, POLY_VALUE_MIN);
+  HOST_WIDE_INT est_rel_new_max
+    = estimated_poly_value (rel_new, POLY_VALUE_MAX);
+
+  HOST_WIDE_INT est_rel_old_min
+    = estimated_poly_value (rel_old, POLY_VALUE_MIN);
+  HOST_WIDE_INT est_rel_old_max
+    = estimated_poly_value (rel_old, POLY_VALUE_MAX);
+
+  /* Check first if we can make out an unambiguous total order from the
+     minimum and maximum estimates.  */
+  if (est_rel_new_min < est_rel_old_min
+      && est_rel_new_max < est_rel_old_max)
     return true;
+  else if (est_rel_old_min < est_rel_new_min
+           && est_rel_old_max < est_rel_new_max)
+    return false;
+  /* When old_loop_vinfo uses a variable vectorization factor,
+     we know that it has a lower cost for at least one runtime VF.
+     However, we don't know how likely that VF is.
+
+     One option would be to compare the costs for the estimated VFs.
+     The problem is that that can put too much pressure on the cost
+     model.  E.g. if the estimated VF is also the lowest possible VF,
+     and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
+     for the estimated VF, we'd then choose new_loop_vinfo even
+     though (a) new_loop_vinfo might not actually be better than
+     old_loop_vinfo for that VF and (b) it would be significantly
+     worse at larger VFs.
+
+     Here we go for a hacky compromise: pick new_loop_vinfo if it is
+     no more expensive than old_loop_vinfo even after doubling the
+     estimated old_loop_vinfo VF.  For all but trivial loops, this
+     ensures that we only pick new_loop_vinfo if it is significantly
+     better than old_loop_vinfo at the estimated VF.  */
+
+  if (est_rel_old_min != est_rel_new_min
+      || est_rel_old_max != est_rel_new_max)
+    {
+      HOST_WIDE_INT est_rel_new_likely
+        = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
+      HOST_WIDE_INT est_rel_old_likely
+        = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
+
+      return est_rel_new_likely * 2 <= est_rel_old_likely;
+    }
 
   /* If there's nothing to choose between the loop bodies, see whether
      there's a difference in the prologue and epilogue costs.  */
-- 
2.30.2
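
To make the effect of the three-way estimates concrete, the following is a
small standalone sketch (not part of the patch) that plugs the costs from the
dumps above into the new selection logic of vect_better_loop_vinfo_p.  It
assumes the Advanced SIMD candidate has inside cost 11 with a constant VF of 4
and the SVE candidate has inside cost 28 with a VF of 4 + 4x (x = 128-bit
blocks beyond the first), under generic SVE tuning where the likely estimate
equals the minimum and the maximum corresponds to 2048-bit vectors; the
equal-estimate fall-through to the prologue/epilogue comparison is omitted.

#include <stdio.h>
#include <stdbool.h>

int
main (void)
{
  /* rel_new = asimd_cost * sve_vf   = 11 * (4 + 4x) = 44 + 44x
     rel_old = sve_cost   * asimd_vf = 28 * 4        = 112 (constant).  */
  long est_rel_new_min = 44;            /* x = 0: 128-bit SVE.  */
  long est_rel_new_max = 44 + 44 * 15;  /* x = 15: 2048-bit SVE.  */
  long est_rel_new_likely = 44;         /* Generic tuning: likely == min.  */
  long est_rel_old_min = 112, est_rel_old_max = 112, est_rel_old_likely = 112;

  bool prefer_new;
  if (est_rel_new_min < est_rel_old_min && est_rel_new_max < est_rel_old_max)
    /* Unambiguously cheaper at every possible vector length.  */
    prefer_new = true;
  else if (est_rel_old_min < est_rel_new_min
           && est_rel_old_max < est_rel_new_max)
    /* Unambiguously more expensive.  */
    prefer_new = false;
  else
    /* Ambiguous order: fall back to the doubled likely-estimate heuristic.  */
    prefer_new = est_rel_new_likely * 2 <= est_rel_old_likely;

  /* Here 44 < 112 but 704 > 112, so the order is ambiguous; the heuristic
     then compares 44 * 2 = 88 <= 112 and the Advanced SIMD candidate wins,
     which is the behaviour the patch is after.  */
  printf ("prefer Advanced SIMD: %s\n", prefer_new ? "yes" : "no");
  return 0;
}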