re PR target/88838 ([SVE] Use 32-bit WHILELO in LP64 mode)

author Kugan Vivekanandarajah <kuganv@linaro.org>

Thu, 13 Jun 2019 03:34:28 +0000 (03:34 +0000)

committer Kugan Vivekanandarajah <kugan@gcc.gnu.org>

Thu, 13 Jun 2019 03:34:28 +0000 (03:34 +0000)
author Kugan Vivekanandarajah <kuganv@linaro.org>
Thu, 13 Jun 2019 03:34:28 +0000 (03:34 +0000)
committer Kugan Vivekanandarajah <kugan@gcc.gnu.org>
Thu, 13 Jun 2019 03:34:28 +0000 (03:34 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 58fb6fcd38b056c0c11f1ba28677f52566ad0ba1..079379e45dc3dab842ce98a5fa8623579611c9a4 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+       PR target/88838
+       * tree-vect-loop-manip.c (vect_set_loop_masks_directly): If
+       compare_type is not Pmode-sized, create a Pmode-sized IV and
+       truncate it at its use (i.e. convert it to compare_type).
+       * tree-vect-loop.c (vect_verify_full_masking): Find IV type.
+       (vect_iv_limit_for_full_masking): New. Factored out of
+       vect_set_loop_condition_masked.
+       * tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
+       (vect_iv_limit_for_full_masking): Declare.
+
  2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
  
         PR target/88834
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 2f222a9d3e6baaa00847df57441d65847c30953b..7212c6be52577e3f916e5e9f0d8f3634d61798c3 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+       PR target/88838
+       * gcc.target/aarch64/pr88838.c: New test.
+       * gcc.target/aarch64/sve/while_1.c: Adjust.
+
  2019-06-13  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
  
         PR target/88834
diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c

new file mode 100644 (file)

index 0000000..d7db847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */
+/* PR target/88838: the loop over int I should compile without any sxtw.  */
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+    for (int i = 0; i < n; i += 1)
+          x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not "sxtw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c

index a93a04baa3beb12c2822a0211d7bbe4cc9ccdb3b..05a486012210f1c3a9be8e254fe9f09938d03b34 100644 (file)
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c
@@ -26,14 +26,14 @@
  TEST_ALL (ADD_LOOP)
  
  /* { dg-final { scan-assembler-not {\tuqdec} } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */
  /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */
  /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */
  /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c

index b3fae5ba4dacdd1bed8ec082d1cf2e002fc5bda7..a0a1bee9408e4f4483ecea6d4a17c4677c5042e5 100644 (file)
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -415,6 +415,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                               bool might_wrap_p)
  {
    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
    tree mask_type = rgm->mask_type;
    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
@@ -445,11 +446,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
    tree index_before_incr, index_after_incr;
    gimple_stmt_iterator incr_gsi;
    bool insert_after;
-  tree zero_index = build_int_cst (compare_type, 0);
    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+
+  tree zero_index = build_int_cst (iv_type, 0);
+  tree step = build_int_cst (iv_type,
+                            LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  /* Create IV of iv_type.  */
+  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
              insert_after, &index_before_incr, &index_after_incr);
  
+  zero_index = build_int_cst (compare_type, 0);
    tree test_index, test_limit, first_limit;
    gimple_stmt_iterator *test_gsi;
    if (might_wrap_p)
@@ -529,6 +535,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
    tree next_mask = NULL_TREE;
    tree mask;
    unsigned int i;
+  gimple_seq test_seq = NULL;
+  test_index = gimple_convert (&test_seq, compare_type, test_index);
+  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+
    FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
      {
        /* Previous masks will cover BIAS scalars.  This mask covers the
@@ -637,12 +647,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
  
    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
    unsigned int compare_precision = TYPE_PRECISION (compare_type);
-  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
    tree orig_niters = niters;
  
    /* Type of the initial value of NITERS.  */
    tree ni_actual_type = TREE_TYPE (niters);
    unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  
    /* Convert NITERS to the same size as the compare.  */
    if (compare_precision > ni_actual_precision
@@ -661,33 +671,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
    else
      niters = gimple_convert (&preheader_seq, compare_type, niters);
  
-  /* Convert skip_niters to the right type.  */
-  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-
-  /* Now calculate the value that the induction variable must be able
-     to hit in order to ensure that we end the loop with an all-false mask.
-     This involves adding the maximum number of inactive trailing scalar
-     iterations.  */
-  widest_int iv_limit;
-  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
-  if (known_max_iters)
-    {
-      if (niters_skip)
-       {
-         /* Add the maximum number of skipped iterations to the
-            maximum iteration count.  */
-         if (TREE_CODE (niters_skip) == INTEGER_CST)
-           iv_limit += wi::to_widest (niters_skip);
-         else
-           iv_limit += max_vf - 1;
-       }
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-        the maximum in-range IV value.  Round this value down to the previous
-        vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
-    }
-
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
    /* Get the vectorization factor in tree form.  */
    tree vf = build_int_cst (compare_type,
                            LOOP_VINFO_VECT_FACTOR (loop_vinfo));
@@ -717,7 +701,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
         /* See whether zero-based IV would ever generate all-false masks
            before wrapping around.  */
         bool might_wrap_p
-         = (!known_max_iters
+         = (iv_limit == -1
              || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                     UNSIGNED)
                  > compare_precision));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 4942c6937e00859be9b065dee89f302d3373c387..671ef2f7658e9c20b990306c943745c8a98ee218 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1030,6 +1030,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
  {
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
  
    /* Use a normal loop if there are no statements that need masking.
       This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1050,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
      max_ni = wi::smin (max_ni, max_back_edges + 1);
  
    /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
  
    /* Work out how many bits we need to represent the limit.  */
    min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1058,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
    /* Find a scalar mode for which WHILE_ULT is supported.  */
    opt_scalar_int_mode cmp_mode_iter;
    tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+                                     UNSIGNED);
+
    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
      {
        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1067,10 +1077,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
             {
               /* Although we could stop as soon as we find a valid mode,
-                it's often better to continue until we hit Pmode, since the
-                operands to the WHILE are more likely to be reusable in
-                address calculations.  */
-             cmp_type = this_type;
+                there are at least two reasons why that's not always the
+                best choice:
+
+                - An IV that's Pmode or wider is more likely to be reusable
+                in address calculations than an IV that's narrower than
+                Pmode.
+
+                - Doing the comparison in IV_PRECISION or wider allows
+                a natural 0-based IV, whereas using a narrower comparison
+                type requires mitigations against wrap-around.
+
+                Conversely, if the IV limit is variable, doing the comparison
+                in a wider type than the original type can introduce
+                unnecessary extensions, so picking the widest valid mode
+                is not always a good choice either.
+
+                Here we prefer the first IV type that's Pmode or wider,
+                and the first comparison type that's IV_PRECISION or wider.
+                (The comparison type must be no wider than the IV type,
+                to avoid extensions in the vector loop.)
+
+                ??? We might want to try continuing beyond Pmode for ILP32
+                targets if CMP_BITS < IV_PRECISION.  */
+             iv_type = this_type;
+             if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+               cmp_type = this_type;
               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                 break;
             }
@@ -1081,6 +1113,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
      return false;
  
    LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
    return true;
  }
  
@@ -9014,3 +9047,45 @@ optimize_mask_stores (struct loop *loop)
        add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
      }
  }
+
+/* Return the largest value that a zero-based induction variable must be
+   able to hold for fully-masked LOOP_VINFO to end with an all-false mask:
+   the maximum latch count plus any skipped iterations, rounded down to the
+   known alignment of the VF, plus MAX_VF.  Return -1 if the maximum
+   number of iterations of the loop is not known.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+       {
+         /* Add the maximum number of skipped iterations to the
+            maximum iteration count.  */
+         if (TREE_CODE (niters_skip) == INTEGER_CST)
+           iv_limit += wi::to_widest (niters_skip);
+         else
+           iv_limit += max_vf - 1;
+       }
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+       /* Conservatively assume MAX_VF - 1 skipped iterations.  */
+       iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+        the maximum in-range IV value.  Round this value down to the previous
+        vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h

index 4db30ccc22bba1dd2013570fe4a9d6bfae3f90c4..eb0f21f84fb4d0dca777e6724e8795227c2633f5 100644 (file)
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -435,6 +435,10 @@ typedef struct _loop_vec_info : public vec_info {
       is false and vectorized loop otherwise.  */
    tree simd_if_cond;
  
+  /* Type of the IV to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree iv_type;
+
    /* Unknown DRs according to which loop was peeled.  */
    struct dr_vec_info *unaligned_dr;
  
@@ -570,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info {
  #define LOOP_VINFO_MASKS(L)                (L)->masks
  #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
  #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
+#define LOOP_VINFO_MASK_IV_TYPE(L)         (L)->iv_type
  #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
  #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
  #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
@@ -1582,6 +1587,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
  /* FORNOW: Used in tree-parloops.c.  */
  extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
                                                   bool *, bool);
+extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo);
  /* Used in gimple-loop-interchange.c.  */
  extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
                                   enum tree_code);
author	Kugan Vivekanandarajah <kuganv@linaro.org>
	Thu, 13 Jun 2019 03:34:28 +0000 (03:34 +0000)
committer	Kugan Vivekanandarajah <kugan@gcc.gnu.org>
	Thu, 13 Jun 2019 03:34:28 +0000 (03:34 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/pr88838.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/while_1.c		patch \| blob \| history
gcc/tree-vect-loop-manip.c		patch \| blob \| history
gcc/tree-vect-loop.c		patch \| blob \| history
gcc/tree-vectorizer.h		patch \| blob \| history