From 64432b680eab0bddbe9a4ad4798457cf6a14ad60 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov
Date: Thu, 17 Dec 2020 18:02:37 +0000
Subject: [PATCH] vect, aarch64: Extend SVE vs Advanced SIMD costing decisions
 in vect_better_loop_vinfo_p

While experimenting with some backend costs for Advanced SIMD and SVE I hit
many cases where GCC would pick SVE for VLA auto-vectorisation even when the
backend very clearly presented cheaper costs for Advanced SIMD.
For a simple float addition loop the SVE costs were:

vec.c:9:21: note: Cost model analysis:
  Vector inside of loop cost: 28
  Vector prologue cost: 2
  Vector epilogue cost: 0
  Scalar iteration cost: 10
  Scalar outside cost: 0
  Vector outside cost: 2
  prologue iterations: 0
  epilogue iterations: 0
  Minimum number of vector iterations: 1
  Calculated minimum iters for profitability: 4

and for Advanced SIMD (Neon) they're:

vec.c:9:21: note: Cost model analysis:
  Vector inside of loop cost: 11
  Vector prologue cost: 0
  Vector epilogue cost: 0
  Scalar iteration cost: 10
  Scalar outside cost: 0
  Vector outside cost: 0
  prologue iterations: 0
  epilogue iterations: 0
  Calculated minimum iters for profitability: 0
vec.c:9:21: note: Runtime profitability threshold = 4

yet the SVE one was always picked.  With guidance from Richard this seems to
be due to the vinfo comparisons in vect_better_loop_vinfo_p, in particular
the part with the big comment explaining the
estimated_rel_new * 2 <= estimated_rel_old heuristic.

This patch extends the comparisons by introducing a three-way estimate kind
for poly_int values that the backend can distinguish.  This allows
vect_better_loop_vinfo_p to ask for minimum, maximum and likely estimates
and pick Advanced SIMD over SVE when it is clearly cheaper.

gcc/
	* target.h (enum poly_value_estimate_kind): Define.
	(estimated_poly_value): Take an estimate kind argument.
	* target.def (estimated_poly_value): Update definition for the above.
	* doc/tm.texi: Regenerate.
	* targhooks.c (default_estimated_poly_value): Update prototype.
	* tree-vect-loop.c (vect_better_loop_vinfo_p): Use min, max and
	likely estimates of VF to pick between vinfos.
	* config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Use
	estimated_poly_value instead of aarch64_estimated_poly_value.
	(aarch64_estimated_poly_value): Take a kind argument and handle it.
---
 gcc/config/aarch64/aarch64.c | 35 +++++++++++----
 gcc/doc/tm.texi              |  7 ++-
 gcc/target.def               |  7 ++-
 gcc/target.h                 | 12 ++++-
 gcc/targhooks.c              |  2 +-
 gcc/tree-vect-loop.c         | 85 +++++++++++++++++++++---------------
 6 files changed, 96 insertions(+), 52 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b79630194c7..67b4c91dbe4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -17367,8 +17367,6 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
   return word_mode;
 }
 
-static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
-
 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
    and return whether the SVE mode should be preferred over the
    Advanced SIMD one in aarch64_autovectorize_vector_modes.  */
@@ -17401,8 +17399,8 @@ aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
     return maybe_gt (nunits_sve, nunits_asimd);
 
   /* Otherwise estimate the runtime width of the modes involved.  */
-  HOST_WIDE_INT est_sve = aarch64_estimated_poly_value (nunits_sve);
-  HOST_WIDE_INT est_asimd = aarch64_estimated_poly_value (nunits_asimd);
+  HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
+  HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
 
   /* Preferring SVE means picking it first unless the Advanced SIMD mode is
      clearly wider.  */
@@ -23195,19 +23193,38 @@ aarch64_speculation_safe_value (machine_mode mode,
 
 /* Implement TARGET_ESTIMATED_POLY_VALUE.
    Look into the tuning structure for an estimate.
-   VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
-   Advanced SIMD 128 bits.  */
+   KIND specifies the type of requested estimate: min, max or likely.
+   For cores with a known SVE width all three estimates are the same.
+   For generic SVE tuning we want to distinguish the maximum estimate from
+   the minimum and likely ones.
+   The likely estimate is the same as the minimum in that case to give a
+   conservative behavior of auto-vectorizing with SVE when it is a win
+   even for 128-bit SVE.
+   When SVE width information is available VAL.coeffs[1] is multiplied by
+   the number of VQ chunks over the initial Advanced SIMD 128 bits.  */
 
 static HOST_WIDE_INT
-aarch64_estimated_poly_value (poly_int64 val)
+aarch64_estimated_poly_value (poly_int64 val,
+                              poly_value_estimate_kind kind
+                              = POLY_VALUE_LIKELY)
 {
   enum aarch64_sve_vector_bits_enum width_source
     = aarch64_tune_params.sve_width;
 
-  /* If we still don't have an estimate, use the default.  */
+  /* If there is no core-specific information then the minimum and likely
+     values are based on 128-bit vectors and the maximum is based on
+     the architectural maximum of 2048 bits.  */
   if (width_source == SVE_SCALABLE)
-    return default_estimated_poly_value (val);
+    switch (kind)
+      {
+      case POLY_VALUE_MIN:
+      case POLY_VALUE_LIKELY:
+        return val.coeffs[0];
+      case POLY_VALUE_MAX:
+        return val.coeffs[0] + val.coeffs[1] * 15;
+      }
 
+  /* If the core provides width information, use that.  */
   HOST_WIDE_INT over_128 = width_source - 128;
   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
 }
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index d9b855c13ac..900d584d178 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7005,9 +7005,12 @@ delay slot branches filled using the basic filler is often still
 desirable as the delay slot can hide a pipeline bubble.
 @end deftypefn
 
-@deftypefn {Target Hook} HOST_WIDE_INT TARGET_ESTIMATED_POLY_VALUE (poly_int64 @var{val})
+@deftypefn {Target Hook} HOST_WIDE_INT TARGET_ESTIMATED_POLY_VALUE (poly_int64 @var{val}, poly_value_estimate_kind @var{kind})
 Return an estimate of the runtime value of @var{val}, for use in
-things like cost calculations or profiling frequencies.  The default
+things like cost calculations or profiling frequencies.  @var{kind} is used
+to ask for the minimum, maximum, and likely estimates of the value through
+the @code{POLY_VALUE_MIN}, @code{POLY_VALUE_MAX} and
+@code{POLY_VALUE_LIKELY} values.  The default
 implementation returns the lowest possible value of @var{val}.
 @end deftypefn
 
diff --git a/gcc/target.def b/gcc/target.def
index acdc694d44c..d5ed0dfda4f 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3851,9 +3851,12 @@ default_new_address_profitable_p)
 DEFHOOK
 (estimated_poly_value,
  "Return an estimate of the runtime value of @var{val}, for use in\n\
-things like cost calculations or profiling frequencies.  The default\n\
+things like cost calculations or profiling frequencies.  @var{kind} is used\n\
+to ask for the minimum, maximum, and likely estimates of the value through\n\
+the @code{POLY_VALUE_MIN}, @code{POLY_VALUE_MAX} and\n\
+@code{POLY_VALUE_LIKELY} values.  The default\n\
 implementation returns the lowest possible value of @var{val}.",
- HOST_WIDE_INT, (poly_int64 val),
+ HOST_WIDE_INT, (poly_int64 val, poly_value_estimate_kind kind),
 default_estimated_poly_value)
 
 /* Permit speculative instructions in delay slots during delayed-branch
diff --git a/gcc/target.h b/gcc/target.h
index 960188023f1..68ef5194b6c 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -252,6 +252,13 @@ enum type_context_kind {
   TCTX_CAPTURE_BY_COPY
 };
 
+enum poly_value_estimate_kind
+{
+  POLY_VALUE_MIN,
+  POLY_VALUE_MAX,
+  POLY_VALUE_LIKELY
+};
+
 extern bool verify_type_context (location_t, type_context_kind, const_tree,
                                  bool = false);
 
@@ -272,12 +279,13 @@ extern struct gcc_target targetm;
    provides a rough guess.  */
 
 static inline HOST_WIDE_INT
-estimated_poly_value (poly_int64 x)
+estimated_poly_value (poly_int64 x,
+                      poly_value_estimate_kind kind = POLY_VALUE_LIKELY)
 {
   if (NUM_POLY_INT_COEFFS == 1)
     return x.coeffs[0];
   else
-    return targetm.estimated_poly_value (x);
+    return targetm.estimated_poly_value (x, kind);
 }
 
 #ifdef GCC_TM_H
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 6e12e13d68e..d616fb73bdf 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1757,7 +1757,7 @@ default_slow_unaligned_access (machine_mode, unsigned int)
 /* The default implementation of TARGET_ESTIMATED_POLY_VALUE.  */
 
 HOST_WIDE_INT
-default_estimated_poly_value (poly_int64 x)
+default_estimated_poly_value (poly_int64 x, poly_value_estimate_kind)
 {
   return x.coeffs[0];
 }
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 52757add0e3..688538a4521 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2773,43 +2773,56 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
 
   /* Check whether the (fractional) cost per scalar iteration is lower
      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
-  poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
-                             * poly_widest_int (old_vf));
-  poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
-                             * poly_widest_int (new_vf));
-  if (maybe_lt (rel_old, rel_new))
-    {
-      /* When old_loop_vinfo uses a variable vectorization factor,
-         we know that it has a lower cost for at least one runtime VF.
-         However, we don't know how likely that VF is.
-
-         One option would be to compare the costs for the estimated VFs.
-         The problem is that that can put too much pressure on the cost
-         model.  E.g. if the estimated VF is also the lowest possible VF,
-         and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
-         for the estimated VF, we'd then choose new_loop_vinfo even
-         though (a) new_loop_vinfo might not actually be better than
-         old_loop_vinfo for that VF and (b) it would be significantly
-         worse at larger VFs.
-
-         Here we go for a hacky compromise: pick new_loop_vinfo if it is
-         no more expensive than old_loop_vinfo even after doubling the
-         estimated old_loop_vinfo VF.  For all but trivial loops, this
-         ensures that we only pick new_loop_vinfo if it is significantly
-         better than old_loop_vinfo at the estimated VF.  */
-      if (rel_new.is_constant ())
-        return false;
-
-      HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
-      HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
-      widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
-                                      * widest_int (old_estimated_vf));
-      widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
-                                      * widest_int (new_estimated_vf));
-      return estimated_rel_new * 2 <= estimated_rel_old;
-    }
-  if (known_lt (rel_new, rel_old))
+  poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
+  poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
+
+  HOST_WIDE_INT est_rel_new_min
+    = estimated_poly_value (rel_new, POLY_VALUE_MIN);
+  HOST_WIDE_INT est_rel_new_max
+    = estimated_poly_value (rel_new, POLY_VALUE_MAX);
+
+  HOST_WIDE_INT est_rel_old_min
+    = estimated_poly_value (rel_old, POLY_VALUE_MIN);
+  HOST_WIDE_INT est_rel_old_max
+    = estimated_poly_value (rel_old, POLY_VALUE_MAX);
+
+  /* Check first if we can make out an unambiguous total order from the
+     minimum and maximum estimates.  */
+  if (est_rel_new_min < est_rel_old_min
+      && est_rel_new_max < est_rel_old_max)
     return true;
+  else if (est_rel_old_min < est_rel_new_min
+           && est_rel_old_max < est_rel_new_max)
+    return false;
+  /* When old_loop_vinfo uses a variable vectorization factor,
+     we know that it has a lower cost for at least one runtime VF.
+     However, we don't know how likely that VF is.
+
+     One option would be to compare the costs for the estimated VFs.
+     The problem is that that can put too much pressure on the cost
+     model.  E.g. if the estimated VF is also the lowest possible VF,
+     and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
+     for the estimated VF, we'd then choose new_loop_vinfo even
+     though (a) new_loop_vinfo might not actually be better than
+     old_loop_vinfo for that VF and (b) it would be significantly
+     worse at larger VFs.
+
+     Here we go for a hacky compromise: pick new_loop_vinfo if it is
+     no more expensive than old_loop_vinfo even after doubling the
+     estimated old_loop_vinfo VF.  For all but trivial loops, this
+     ensures that we only pick new_loop_vinfo if it is significantly
+     better than old_loop_vinfo at the estimated VF.  */
+
+  if (est_rel_old_min != est_rel_new_min
+      || est_rel_old_max != est_rel_new_max)
+    {
+      HOST_WIDE_INT est_rel_new_likely
+        = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
+      HOST_WIDE_INT est_rel_old_likely
+        = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
+
+      return est_rel_new_likely * 2 <= est_rel_old_likely;
+    }
 
   /* If there's nothing to choose between the loop bodies, see whether
      there's a difference in the prologue and epilogue costs.  */
-- 
2.30.2
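
To make the effect of the three-way estimates concrete, the following is a
small standalone sketch (not part of the patch) that plugs the costs from the
dumps above into the new selection logic of vect_better_loop_vinfo_p.  It
assumes the Advanced SIMD candidate has inside cost 11 with a constant VF of 4
and the SVE candidate has inside cost 28 with a VF of 4 + 4x (x = 128-bit
blocks beyond the first), under generic SVE tuning where the likely estimate
equals the minimum and the maximum corresponds to 2048-bit vectors; the
equal-estimate fall-through to the prologue/epilogue comparison is omitted.

#include <stdio.h>
#include <stdbool.h>

int
main (void)
{
  /* rel_new = asimd_cost * sve_vf   = 11 * (4 + 4x) = 44 + 44x
     rel_old = sve_cost   * asimd_vf = 28 * 4        = 112 (constant).  */
  long est_rel_new_min = 44;            /* x = 0: 128-bit SVE.  */
  long est_rel_new_max = 44 + 44 * 15;  /* x = 15: 2048-bit SVE.  */
  long est_rel_new_likely = 44;         /* Generic tuning: likely == min.  */
  long est_rel_old_min = 112, est_rel_old_max = 112, est_rel_old_likely = 112;

  bool prefer_new;
  if (est_rel_new_min < est_rel_old_min && est_rel_new_max < est_rel_old_max)
    /* Unambiguously cheaper at every possible vector length.  */
    prefer_new = true;
  else if (est_rel_old_min < est_rel_new_min
           && est_rel_old_max < est_rel_new_max)
    /* Unambiguously more expensive.  */
    prefer_new = false;
  else
    /* Ambiguous order: fall back to the doubled likely-estimate heuristic.  */
    prefer_new = est_rel_new_likely * 2 <= est_rel_old_likely;

  /* Here 44 < 112 but 704 > 112, so the order is ambiguous; the heuristic
     then compares 44 * 2 = 88 <= 112 and the Advanced SIMD candidate wins,
     which is the behaviour the patch is after.  */
  printf ("prefer Advanced SIMD: %s\n", prefer_new ? "yes" : "no");
  return 0;
}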