aarch64: Tweak SVE load/store costs

author Richard Sandiford <richard.sandiford@arm.com>

Tue, 14 Apr 2020 20:04:03 +0000 (21:04 +0100)

committer Richard Sandiford <richard.sandiford@arm.com>

Fri, 17 Apr 2020 15:09:38 +0000 (16:09 +0100)
author Richard Sandiford <richard.sandiford@arm.com>
Tue, 14 Apr 2020 20:04:03 +0000 (21:04 +0100)
committer Richard Sandiford <richard.sandiford@arm.com>
Fri, 17 Apr 2020 15:09:38 +0000 (16:09 +0100)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 1c010560c19f49a5f28ba599bfbb9dba914fb1d2..75540e89b69e8e4538ef5ee8fc932816e734dc34 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2020-04-17  Richard Sandiford  <richard.sandiford@arm.com>
+
+       * config/aarch64/aarch64.c (aarch64_advsimd_ldp_stp_p): New function.
+       (aarch64_sve_adjust_stmt_cost): Add a vectype parameter.  Double the
+       cost of load and store insns if one loop iteration has enough scalar
+       elements to use an Advanced SIMD LDP or STP.
+       (aarch64_add_stmt_cost): Update call accordingly.
+
  2020-04-17  Jakub Jelinek  <jakub@redhat.com>
             Jeff Law  <law@redhat.com>
  
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index d0a41c286cdfde9759fcbad1b2e2367037f0a832..24c055df0dcb25a30cbc79b32c24bd73351d9179 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13518,6 +13518,32 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
      }
  }
  
+/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
+   vectors would produce a series of LDP or STP operations.  KIND is the
+   cost kind of STMT_INFO; only gimple-assign loads and stores qualify.  */
+static bool
+aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
+                          stmt_vec_info stmt_info)
+{
+  switch (kind)
+    {
+    case vector_load:
+    case vector_store:
+    case unaligned_load:
+    case unaligned_store:
+      break;  /* A load or store; continue with the checks below.  */
+
+    default:
+      return false;  /* Not a load or store kind.  */
+    }
+
+  if (aarch64_tune_params.extra_tuning_flags
+      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+    return false;  /* Tuning asks to avoid LDP/STP of Q registers.  */
+
+  return is_gimple_assign (stmt_info->stmt);
+}
+
  /* Return true if STMT_INFO extends the result of a load.  */
  static bool
  aarch64_extending_load_p (stmt_vec_info stmt_info)
@@ -13556,10 +13582,12 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
  }
  
  /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
-   for STMT_INFO, which has cost kind KIND.  Adjust the cost as necessary
-   for SVE targets.  */
+   for STMT_INFO, which has cost kind KIND and which when vectorized would
+   operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
+   targets.  */
  static unsigned int
-aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
+aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind,
+                             stmt_vec_info stmt_info, tree vectype,
                               unsigned int stmt_cost)
  {
    /* Unlike vec_promote_demote, vector_stmt conversions do not change the
@@ -13578,6 +13606,46 @@ aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
    if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
      stmt_cost = 0;
  
+  /* Advanced SIMD can load and store pairs of registers using LDP and STP,
+     but there are no equivalent instructions for SVE.  This means that
+     (all other things being equal) 128-bit SVE needs twice as many load
+     and store instructions as Advanced SIMD in order to process vector pairs.
+
+     Also, scalar code can often use LDP and STP to access pairs of values,
+     so it is too simplistic to say that one SVE load or store replaces
+     VF scalar loads and stores.
+
+     Ideally we would account for this in the scalar and Advanced SIMD
+     costs by making suitable load/store pairs as cheap as a single
+     load/store.  However, that would be a very invasive change and in
+     practice it tends to stress other parts of the cost model too much.
+     E.g. stores of scalar constants currently count just a store,
+     whereas stores of vector constants count a store and a vec_init.
+     This is an artificial distinction for AArch64, where stores of
+     nonzero scalar constants need the same kind of register invariant
+     as vector stores.
+
+     An alternative would be to double the cost of any SVE loads and stores
+     that could be paired in Advanced SIMD (and possibly also paired in
+     scalar code).  But this tends to stress other parts of the cost model
+     in the same way.  It also means that we can fall back to Advanced SIMD
+     even if full-loop predication would have been useful.
+
+     Here we go for a more conservative version: double the costs of SVE
+     loads and stores if one iteration of the scalar loop processes enough
+     elements for it to use a whole number of Advanced SIMD LDP or STP
+     instructions.  This makes it very likely that the VF would be 1 for
+     Advanced SIMD, and so no epilogue should be needed.  */
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
+      unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
+      if (multiple_p (count * elt_bits, 256)
+         && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
+       stmt_cost *= 2;
+    }
+
    return stmt_cost;
  }
  
@@ -13597,7 +13665,8 @@ aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
  
        if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
-       stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
+       stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, vectype,
+                                                 stmt_cost);
  
        /* Statements in an inner loop relative to the loop being
          vectorized are weighted more heavily.  The value here is
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 830ee92357e5bac7429b3b5026eb1378550ee30f..6c96253c1d190a76f63524fa0cb45c985e559422 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,12 @@
+2020-04-17  Richard Sandiford  <richard.sandiford@arm.com>
+
+       * gcc.target/aarch64/sve/cost_model_2.c: New test.
+       * gcc.target/aarch64/sve/cost_model_3.c: Likewise.
+       * gcc.target/aarch64/sve/cost_model_4.c: Likewise.
+       * gcc.target/aarch64/sve/cost_model_5.c: Likewise.
+       * gcc.target/aarch64/sve/cost_model_6.c: Likewise.
+       * gcc.target/aarch64/sve/cost_model_7.c: Likewise.
+
  2020-04-17  Jakub Jelinek  <jakub@redhat.com>
             Jeff Law  <law@redhat.com>
  
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_2.c

new file mode 100644 (file)

index 0000000..d9d7078
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_2.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O3 -msve-vector-bits=128" } */
+
+void
+vset (int *restrict dst, int *restrict src, int count)  /* SRC is unused.  */
+{
+  for (int i = 0; i < count; ++i)
+#pragma GCC unroll 4
+    for (int j = 0; j < 4; ++j)
+      *dst++ = 1;  /* 4 contiguous int stores per outer iteration.  */
+}
+
+/* { dg-final { scan-assembler-times {\tst1w\tz} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_3.c

new file mode 100644 (file)

index 0000000..dd7d1cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_3.c
@@ -0,0 +1,13 @@
+/* { dg-options "-O3 -msve-vector-bits=128" } */
+
+void
+vset (int *restrict dst, int *restrict src, int count)  /* SRC is unused.  */
+{
+  for (int i = 0; i < count; ++i)
+#pragma GCC unroll 8
+    for (int j = 0; j < 8; ++j)
+      *dst++ = 1;  /* 8 contiguous int stores per outer iteration.  */
+}
+
+/* { dg-final { scan-assembler-not {\tst1w\tz} } } */
+/* { dg-final { scan-assembler-times {\tstp\tq} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_4.c

new file mode 100644 (file)

index 0000000..a7ecfe3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_4.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O3 -msve-vector-bits=256" } */
+
+void
+vset (int *restrict dst, int *restrict src, int count)  /* SRC is unused.  */
+{
+  for (int i = 0; i < count; ++i)
+#pragma GCC unroll 8
+    for (int j = 0; j < 8; ++j)
+      *dst++ = 1;  /* 8 contiguous int stores per outer iteration.  */
+}
+
+/* { dg-final { scan-assembler-times {\tst1w\tz} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c

new file mode 100644 (file)

index 0000000..250ca83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
@@ -0,0 +1,13 @@
+/* { dg-options "-O3 -msve-vector-bits=256" } */
+
+void
+vset (int *restrict dst, int *restrict src, int count)  /* SRC is unused.  */
+{
+  for (int i = 0; i < count; ++i)
+#pragma GCC unroll 16
+    for (int j = 0; j < 16; ++j)
+      *dst++ = 1;  /* 16 contiguous int stores per outer iteration.  */
+}
+
+/* { dg-final { scan-assembler-not {\tst1w\tz} } } */
+/* { dg-final { scan-assembler-times {\tstp\tq} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_6.c

new file mode 100644 (file)

index 0000000..565e1e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_6.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O3 -msve-vector-bits=512" } */
+
+void
+vset (int *restrict dst, int *restrict src, int count)  /* SRC is unused.  */
+{
+  for (int i = 0; i < count; ++i)
+#pragma GCC unroll 16
+    for (int j = 0; j < 16; ++j)
+      *dst++ = 1;  /* 16 contiguous int stores per outer iteration.  */
+}
+
+/* { dg-final { scan-assembler-times {\tst1w\tz} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_7.c

new file mode 100644 (file)

index 0000000..31057c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_7.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O3 -msve-vector-bits=512" } */
+
+void
+vset (int *restrict dst, int *restrict src, int count)  /* SRC is unused.  */
+{
+  for (int i = 0; i < count; ++i)
+#pragma GCC unroll 32
+    for (int j = 0; j < 32; ++j)
+      *dst++ = 1;  /* 32 contiguous int stores per outer iteration.  */
+}
+
+/* { dg-final { scan-assembler-times {\tst1w\tz} 2 } } */
author	Richard Sandiford <richard.sandiford@arm.com>
	Tue, 14 Apr 2020 20:04:03 +0000 (21:04 +0100)
committer	Richard Sandiford <richard.sandiford@arm.com>
	Fri, 17 Apr 2020 15:09:38 +0000 (16:09 +0100)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/sve/cost_model_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/cost_model_3.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/cost_model_4.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/cost_model_6.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/cost_model_7.c	[new file with mode: 0644]	patch \| blob