From 0b0061f4d83cd8eb181f7114a077237b95a0c144 Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Thu, 19 Nov 2020 16:49:37 +0000
Subject: [PATCH] vect: Add a “very cheap” cost model
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Currently we have three vector cost models: cheap, dynamic and
unlimited.  -O2 -ftree-vectorize uses “cheap” by default, but that's
still relatively aggressive about peeling and aliasing checks, and
can lead to significant code size growth.

This patch adds an even more conservative choice, which for lack of
imagination I've called “very cheap”.  It only allows vectorisation
if the vector code entirely replaces the scalar code.  It also
requires one iteration of the vector loop to pay for itself,
regardless of how often the loop iterates.  (If the vector loop
needs multiple iterations to be beneficial then things are probably
too close to call, and the conservative thing would be to stick
with the scalar code.)

The idea is that this should be suitable for -O2, although the patch
doesn't change any defaults itself.

I tested this by building and running a bunch of workloads for SVE,
with three options:

  (1) -O2
  (2) -O2 -ftree-vectorize -fvect-cost-model=very-cheap
  (3) -O2 -ftree-vectorize [-fvect-cost-model=cheap]

All three builds used the default -msve-vector-bits=scalable and
ran with the minimum vector length of 128 bits, which should give
a worst-case bound for the performance impact.

The workloads included a mixture of microbenchmarks and full
applications.  Because it's quite an eclectic mix, there's not much
point giving exact figures.  The aim was more to get a general
impression.

Code size growth with (2) was much lower than with (3).  Only a
handful of tests increased by more than 5%, and all of them were
microbenchmarks.

In terms of performance, (2) was significantly faster than (1) on
microbenchmarks (as expected) but also on some full apps.  Again,
performance only regressed on a handful of tests.

As expected, the performance of (3) vs. (1) and (3) vs. (2) is more
of a mixed bag.  There are several significant improvements with (3)
over (2), but also some (smaller) regressions.  That seems to be in
line with -O2 -ftree-vectorize being a kind of -O2.5.

The patch reorders vect_cost_model so that values are in order of
increasing aggressiveness, which makes it possible to use range
checks.  The value 0 still represents “unlimited”, so
“if (flag_vect_cost_model)” is still a meaningful check.

gcc/
	* doc/invoke.texi (-fvect-cost-model): Add a very-cheap model.
	* common.opt (fvect-cost-model=): Add very-cheap as a possible
	option.
	(fsimd-cost-model=): Likewise.
	(vect_cost_model): Add very-cheap.
	* flag-types.h (vect_cost_model): Add VECT_COST_MODEL_VERY_CHEAP.
	Put the values in order of increasing aggressiveness.
	* tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Use
	range checks when comparing against VECT_COST_MODEL_CHEAP.
	(vect_prune_runtime_alias_test_list): Do not allow any alias
	checks for the very-cheap cost model.
	* tree-vect-loop.c (vect_analyze_loop_costing): Do not allow
	any peeling for the very-cheap cost model.  Also require one
	iteration of the vector loop to pay for itself.

gcc/testsuite/
	* gcc.dg/vect/vect-cost-model-1.c: New test.
	* gcc.dg/vect/vect-cost-model-2.c: Likewise.
	* gcc.dg/vect/vect-cost-model-3.c: Likewise.
	* gcc.dg/vect/vect-cost-model-4.c: Likewise.
	* gcc.dg/vect/vect-cost-model-5.c: Likewise.
	* gcc.dg/vect/vect-cost-model-6.c: Likewise.
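As a concrete illustration of the intent (this sketch simply mirrors
the new tests and assumes a target with vector int support and four
ints per vector; it is not part of the patch itself): the very-cheap
model is meant to accept loops whose scalar iteration count is a
known multiple of the vectorization factor and which need no runtime
checks, and to reject loops that would leave a scalar tail behind.

  /* Trip count 1024 and no possible aliasing: with four ints per
     vector the vector loop can replace the scalar loop outright,
     so very-cheap is expected to allow vectorization.  */
  int x[1024], y[1024];
  void
  f (void)
  {
    for (unsigned int i = 0; i < 1024; ++i)
      x[i] += y[i];
  }

  /* Trip count 1023: a scalar epilogue (or peeling) would normally
     be needed, so very-cheap is expected to reject the loop unless
     the target can handle it with partial vectors, whereas cheap
     may still vectorize it.  */
  void
  g (int *restrict x, int *restrict y)
  {
    for (unsigned int i = 0; i < 1023; ++i)
      x[i] += y[i];
  }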
---
 gcc/common.opt                                |  7 +++--
 gcc/doc/invoke.texi                           | 12 +++++++--
 gcc/flag-types.h                              | 10 ++++---
 gcc/testsuite/gcc.dg/vect/vect-cost-model-1.c | 11 ++++++++
 gcc/testsuite/gcc.dg/vect/vect-cost-model-2.c | 11 ++++++++
 gcc/testsuite/gcc.dg/vect/vect-cost-model-3.c | 11 ++++++++
 gcc/testsuite/gcc.dg/vect/vect-cost-model-4.c | 13 +++++++++
 gcc/testsuite/gcc.dg/vect/vect-cost-model-5.c | 11 ++++++++
 gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c | 12 +++++++++
 gcc/tree-vect-data-refs.c                     |  8 ++++--
 gcc/tree-vect-loop.c                          | 27 +++++++++++++++++++
 11 files changed, 123 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-5.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c

diff --git a/gcc/common.opt b/gcc/common.opt
index fe39b3dee9f..ca8a2690799 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3020,11 +3020,11 @@ Enable basic block vectorization (SLP) on trees.
 
 fvect-cost-model=
 Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
--fvect-cost-model=[unlimited|dynamic|cheap] Specifies the cost model for vectorization.
+-fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization.
 
 fsimd-cost-model=
 Common Joined RejectNegative Enum(vect_cost_model) Var(flag_simd_cost_model) Init(VECT_COST_MODEL_UNLIMITED) Optimization
--fsimd-cost-model=[unlimited|dynamic|cheap] Specifies the vectorization cost model for code marked with a simd directive.
+-fsimd-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the vectorization cost model for code marked with a simd directive.
 
 Enum
 Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown vectorizer cost model %qs)
@@ -3038,6 +3038,9 @@ Enum(vect_cost_model) String(dynamic) Value(VECT_COST_MODEL_DYNAMIC)
 EnumValue
 Enum(vect_cost_model) String(cheap) Value(VECT_COST_MODEL_CHEAP)
 
+EnumValue
+Enum(vect_cost_model) String(very-cheap) Value(VECT_COST_MODEL_VERY_CHEAP)
+
 fvect-cost-model
 Common Alias(fvect-cost-model=,dynamic,unlimited)
 Enables the dynamic vectorizer cost model.  Preserved for backward compatibility.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 3510a54c6c4..07232c6b33d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -11440,7 +11440,8 @@ and @option{-fauto-profile}.
 @item -fvect-cost-model=@var{model}
 @opindex fvect-cost-model
 Alter the cost model used for vectorization.  The @var{model} argument
-should be one of @samp{unlimited}, @samp{dynamic} or @samp{cheap}.
+should be one of @samp{unlimited}, @samp{dynamic}, @samp{cheap} or
+@samp{very-cheap}.
 With the @samp{unlimited} model the vectorized code-path is assumed
 to be profitable while with the @samp{dynamic} model a runtime check
 guards the vectorized code-path to enable it only for iteration
@@ -11448,7 +11449,14 @@ counts that will likely execute faster than when executing the original
 scalar loop.  The @samp{cheap} model disables vectorization of
 loops where doing so would be cost prohibitive for example due to
 required runtime checks for data dependence or alignment but otherwise
-is equal to the @samp{dynamic} model.
+is equal to the @samp{dynamic} model.  The @samp{very-cheap} model only
+allows vectorization if the vector code would entirely replace the
+scalar code that is being vectorized.  For example, if each iteration
+of a vectorized loop would only be able to handle exactly four iterations
+of the scalar loop, the @samp{very-cheap} model would only allow
+vectorization if the scalar iteration count is known to be a multiple
+of four.
+
 
 The default cost model depends on other optimization flags and is
 either @samp{dynamic} or @samp{cheap}.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 648ed096e30..0dbab19943c 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -232,12 +232,14 @@ enum scalar_storage_order_kind {
   SSO_LITTLE_ENDIAN
 };
 
-/* Vectorizer cost-model.  */
+/* Vectorizer cost-model.  Except for DEFAULT, the values are ordered from
+   the most conservative to the least conservative.  */
 enum vect_cost_model {
+  VECT_COST_MODEL_VERY_CHEAP = -3,
+  VECT_COST_MODEL_CHEAP = -2,
+  VECT_COST_MODEL_DYNAMIC = -1,
   VECT_COST_MODEL_UNLIMITED = 0,
-  VECT_COST_MODEL_CHEAP = 1,
-  VECT_COST_MODEL_DYNAMIC = 2,
-  VECT_COST_MODEL_DEFAULT = 3
+  VECT_COST_MODEL_DEFAULT = 1
 };
 
 /* Different instrumentation modes.  */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cost-model-1.c b/gcc/testsuite/gcc.dg/vect/vect-cost-model-1.c
new file mode 100644
index 00000000000..0737da5d671
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cost-model-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fvect-cost-model=cheap" } */
+
+void
+f (int *x, int *y)
+{
+  for (unsigned int i = 0; i < 1024; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-tree-dump {LOOP VECTORIZED} vect { target vect_int } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cost-model-2.c b/gcc/testsuite/gcc.dg/vect/vect-cost-model-2.c
new file mode 100644
index 00000000000..fa9bdb607b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cost-model-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fvect-cost-model=very-cheap" } */
+
+void
+f (int *x, int *y)
+{
+  for (unsigned int i = 0; i < 1024; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-tree-dump-not {LOOP VECTORIZED} vect { target vect_int } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cost-model-3.c b/gcc/testsuite/gcc.dg/vect/vect-cost-model-3.c
new file mode 100644
index 00000000000..d7c6cfd2049
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cost-model-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fvect-cost-model=cheap" } */
+
+void
+f (int *restrict x, int *restrict y)
+{
+  for (unsigned int i = 0; i < 1024; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-tree-dump {LOOP VECTORIZED} vect { target vect_int } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cost-model-4.c b/gcc/testsuite/gcc.dg/vect/vect-cost-model-4.c
new file mode 100644
index 00000000000..bb018ad99fe
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cost-model-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fvect-cost-model=very-cheap" } */
+
+int x[1024], y[1024];
+
+void
+f (void)
+{
+  for (unsigned int i = 0; i < 1024; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-tree-dump {LOOP VECTORIZED} vect { target vect_int } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cost-model-5.c b/gcc/testsuite/gcc.dg/vect/vect-cost-model-5.c
new file mode 100644
index 00000000000..536ec0a3cda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cost-model-5.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fvect-cost-model=cheap" } */
+
+void
+f (int *restrict x, int *restrict y)
+{
+  for (unsigned int i = 0; i < 1023; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-tree-dump {LOOP VECTORIZED} vect { target vect_int } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c b/gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c
new file mode 100644
index 00000000000..552febb5fee
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fvect-cost-model=very-cheap" } */
+
+void
+f (int *restrict x, int *restrict y)
+{
+  for (unsigned int i = 0; i < 1023; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-tree-dump {LOOP VECTORIZED} vect { target { vect_int && vect_partial_vectors_usage_2 } } } } */
+/* { dg-final { scan-tree-dump-not {LOOP VECTORIZED} vect { target { vect_int && { ! vect_partial_vectors_usage_2 } } } } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 0efab495407..18e36c89d14 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -2161,7 +2161,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 	{
 	  unsigned max_allowed_peel
 	    = param_vect_max_peeling_for_alignment;
-	  if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
+	  if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
 	    max_allowed_peel = 0;
 	  if (max_allowed_peel != (unsigned)-1)
 	    {
@@ -2259,7 +2259,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   do_versioning
     = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
-      && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
+      && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
 
   if (do_versioning)
     {
@@ -3682,6 +3682,10 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   unsigned int count = (comp_alias_ddrs.length ()
			+ check_unequal_addrs.length ());
 
+  if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
+    return opt_result::failure_at
+      (vect_location, "would need a runtime alias check\n");
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n",
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 856bbfebf7c..48dfb4df00e 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1827,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
	}
     }
 
+  /* If using the "very cheap" model, reject cases in which we'd keep
+     a copy of the scalar code (even if we might be able to vectorize it).  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "some scalar iterations would need to be peeled\n");
+      return 0;
+    }
+
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate);
@@ -1885,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       min_profitable_estimate = min_profitable_iters;
     }
 
+  /* If the vector loop needs multiple iterations to be beneficial then
+     things are probably too close to call, and the conservative thing
+     would be to stick with the scalar code.  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "one iteration of the vector loop would be"
+			 " more expensive than the equivalent number of"
+			 " iterations of the scalar loop\n");
+      return 0;
+    }
+
   HOST_WIDE_INT estimated_niter;
 
   /* If we are vectorizing an epilogue then we know the maximum number of
-- 
2.30.2