op1 = XEXP (x, 1);
if (VECTOR_MODE_P (mode))
- mode = GET_MODE_INNER (mode);
+ {
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ mode = GET_MODE_INNER (mode);
+ if (vec_flags & VEC_ADVSIMD)
+ {
+ /* The by-element versions of the instruction have the same costs as
+ the normal 3-vector version. So don't add the costs of the
+ duplicate into the costs of the multiply. We make an assumption
+ that the input to the VEC_DUPLICATE is already on the FP & SIMD
+ side. This means costing of a MUL by element pre RA is a bit
+ optimistic. */
+ if (GET_CODE (op0) == VEC_DUPLICATE)
+ op0 = XEXP (op0, 0);
+ else if (GET_CODE (op1) == VEC_DUPLICATE)
+ op1 = XEXP (op1, 0);
+ }
+ }
/* Integer multiply/fma. */
if (GET_MODE_CLASS (mode) == MODE_INT)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-Ofast" } */
+
+#include <arm_neon.h>
+
+/* Integer case: multiply 16 lanes by a loop-invariant scalar.  With
+   -Ofast the vectorizer should emit MUL (by element), i.e.
+   mul v.4s, v.4s, v.s[0] — see the scan-assembler-times below.  */
+void s_mult_i (int32_t* restrict res, int32_t* restrict a, int32_t b)
+{
+  for (int x = 0; x < 16; x++)
+    res[x] = a[x] * b;
+}
+
+/* Floating-point case: same shape as s_mult_i; with -Ofast this should
+   vectorize to FMUL (by element), i.e. fmul v.4s, v.4s, v.s[0] — see
+   the scan-assembler-times below.  */
+void s_mult_f (float32_t* restrict res, float32_t* restrict a, float32_t b)
+{
+  for (int x = 0; x < 16; x++)
+    res[x] = a[x] * b;
+}
+
+/* { dg-final { scan-assembler-times {\s+mul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */
+/* { dg-final { scan-assembler-times {\s+fmul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */