aarch64: Add support for unpacked SVE ASRD
author Richard Sandiford <richard.sandiford@arm.com>
Mon, 11 Jan 2021 18:03:26 +0000 (18:03 +0000)
committer Richard Sandiford <richard.sandiford@arm.com>
Mon, 11 Jan 2021 18:03:26 +0000 (18:03 +0000)
This patch adds support for both conditional and unconditional unpacked
ASRD.  This meant adding a new define_insn for the unconditional form,
instead of reusing the conditional instructions.  It also meant
extending the current conditional patterns to support merging with
any independent value, not just zero.

gcc/
* config/aarch64/aarch64-sve.md (sdiv_pow2<mode>3): Extend from
SVE_FULL_I to SVE_I.  Generate an UNSPEC_PRED_X.
(*sdiv_pow2<mode>3): New pattern.
(@cond_<sve_int_op><mode>): Extend from SVE_FULL_I to SVE_I.
Wrap the ASRD in an UNSPEC_PRED_X.
(*cond_<sve_int_op><mode>_2): Likewise.  Replace the UNSPEC_PRED_X
predicate with a constant PTRUE, if it isn't already.
(*cond_<sve_int_op><mode>_z): Replace with...
(*cond_<sve_int_op><mode>_any): ...this new pattern.

gcc/testsuite/
* gcc.target/aarch64/sve/asrdiv_4.c: New test.
* gcc.target/aarch64/sve/cond_asrd_1.c: Likewise.
* gcc.target/aarch64/sve/cond_asrd_1_run.c: Likewise.
* gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
* gcc.target/aarch64/sve/cond_asrd_2_run.c: Likewise.
* gcc.target/aarch64/sve/cond_asrd_3.c: Likewise.
* gcc.target/aarch64/sve/cond_asrd_3_run.c: Likewise.

gcc/config/aarch64/aarch64-sve.md
gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_3_run.c [new file with mode: 0644]

index b8259f24b3ddedafb1099567cfc41d7e12cfee90..a6f8450f9518c3e24bb2116369ce5239dd7e0c0e 100644 (file)
 ;; - URSHR (SVE2)
 ;; -------------------------------------------------------------------------
 
-;; Unpredicated <SVE_INT_OP>.
+;; Unpredicated ASRD.
 (define_expand "sdiv_pow2<mode>3"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand")
-       (unspec:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (unspec:SVE_I
          [(match_dup 3)
-          (unspec:SVE_FULL_I
-            [(match_operand:SVE_FULL_I 1 "register_operand")
+          (unspec:SVE_I
+            [(match_operand:SVE_I 1 "register_operand")
              (match_operand 2 "aarch64_simd_rshift_imm")]
-            UNSPEC_ASRD)
-          (match_dup 1)]
-        UNSPEC_SEL))]
+            UNSPEC_ASRD)]
+        UNSPEC_PRED_X))]
   "TARGET_SVE"
   {
     operands[3] = aarch64_ptrue_reg (<VPRED>mode);
   }
 )
 
-;; Predicated right shift with merging.
+;; Predicated ASRD.
+(define_insn "*sdiv_pow2<mode>3"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (unspec:SVE_I
+            [(match_operand:SVE_I 2 "register_operand" "0, w")
+             (match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
+            UNSPEC_ASRD)]
+         UNSPEC_PRED_X))]
+  "TARGET_SVE"
+  "@
+   asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
+   movprfx\t%0, %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3"
+  [(set_attr "movprfx" "*,yes")])
+
+;; Predicated shift with merging.
 (define_expand "@cond_<sve_int_op><mode>"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand")
-       (unspec:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand")
-          (unspec:SVE_FULL_I
-            [(match_operand:SVE_FULL_I 2 "register_operand")
-             (match_operand:SVE_FULL_I 3 "aarch64_simd_<lr>shift_imm")]
-            SVE_INT_SHIFT_IMM)
-          (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")]
+          (unspec:SVE_I
+            [(match_dup 5)
+             (unspec:SVE_I
+               [(match_operand:SVE_I 2 "register_operand")
+                (match_operand:SVE_I 3 "aarch64_simd_<lr>shift_imm")]
+               SVE_INT_SHIFT_IMM)]
+            UNSPEC_PRED_X)
+          (match_operand:SVE_I 4 "aarch64_simd_reg_or_zero")]
          UNSPEC_SEL))]
   "TARGET_SVE"
+  {
+    operands[5] = aarch64_ptrue_reg (<VPRED>mode);
+  }
 )
 
-;; Predicated right shift, merging with the first input.
-(define_insn "*cond_<sve_int_op><mode>_2"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w")
-       (unspec:SVE_FULL_I
+;; Predicated shift, merging with the first input.
+(define_insn_and_rewrite "*cond_<sve_int_op><mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
-          (unspec:SVE_FULL_I
-            [(match_operand:SVE_FULL_I 2 "register_operand" "0, w")
-             (match_operand:SVE_FULL_I 3 "aarch64_simd_<lr>shift_imm")]
-            SVE_INT_SHIFT_IMM)
+          (unspec:SVE_I
+            [(match_operand 4)
+             (unspec:SVE_I
+               [(match_operand:SVE_I 2 "register_operand" "0, w")
+                (match_operand:SVE_I 3 "aarch64_simd_<lr>shift_imm")]
+               SVE_INT_SHIFT_IMM)]
+            UNSPEC_PRED_X)
           (match_dup 2)]
          UNSPEC_SEL))]
   "TARGET_SVE"
   "@
    <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
    movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3"
+  "&& !CONSTANT_P (operands[4])"
+  {
+    operands[4] = CONSTM1_RTX (<VPRED>mode);
+  }
   [(set_attr "movprfx" "*,yes")])
 
-;; Predicated right shift, merging with zero.
-(define_insn "*cond_<sve_int_op><mode>_z"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w")
-       (unspec:SVE_FULL_I
-         [(match_operand:<VPRED> 1 "register_operand" "Upl")
-          (unspec:SVE_FULL_I
-            [(match_operand:SVE_FULL_I 2 "register_operand" "w")
-             (match_operand:SVE_FULL_I 3 "aarch64_simd_<lr>shift_imm")]
-            SVE_INT_SHIFT_IMM)
-          (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_zero")]
-         UNSPEC_SEL))]
-  "TARGET_SVE"
-  "movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3"
-  [(set_attr "movprfx" "yes")])
+;; Predicated shift, merging with an independent value.
+(define_insn_and_rewrite "*cond_<sve_int_op><mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, &w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
+          (unspec:SVE_I
+            [(match_operand 5)
+             (unspec:SVE_I
+               [(match_operand:SVE_I 2 "register_operand" "w, w, w")
+                (match_operand:SVE_I 3 "aarch64_simd_<lr>shift_imm")]
+               SVE_INT_SHIFT_IMM)]
+            UNSPEC_PRED_X)
+          (match_operand:SVE_I 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")]
+        UNSPEC_SEL))]
+  "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
+   movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
+   #"
+  "&& reload_completed
+   && register_operand (operands[4], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[4])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[2],
+                                            operands[4], operands[1]));
+    operands[4] = operands[2] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
 
 ;; -------------------------------------------------------------------------
 ;; ---- [FP<-INT] General binary arithmetic corresponding to unspecs
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
new file mode 100644 (file)
index 0000000..6684fe1
--- /dev/null
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+/* Define test_<TYPE1>_<TYPE2>, which stores a[i] / 16 to r[i] for each
+   of the first COUNT elements whose pred[i] is nonzero.  */
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *restrict r,                   \
+                         TYPE1 *restrict pred,                 \
+                         TYPE2 *restrict a)                    \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      if (pred[i])                                             \
+       r[i] = a[i] / 16;                                       \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, #4\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #4\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #4\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c
new file mode 100644 (file)
index 0000000..478b52a
--- /dev/null
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r,                 \
+                         TYPE1 *__restrict a,                  \
+                         TYPE2 *__restrict b)                  \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      r[i] = a[i] == 0 ? b[i] / 16 : b[i];                     \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, #4\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #4\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #4\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1_run.c
new file mode 100644 (file)
index 0000000..25c88ea
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_asrd_1.c"
+
+#define TEST_LOOP(TYPE1, TYPE2, N)                             \
+  {                                                            \
+    TYPE1 a[N];                                                        \
+    TYPE2 r[N], b[N];                                          \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = i % 3 < 2 ? 0 : i * 13;                          \
+       b[i] = (i & 1 ? i : -i) * 17;                           \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE1##_##TYPE2 (r, a, b);                          \
+    for (int i = 0; i < N; ++i)                                        \
+      if (r[i] != (TYPE2) (a[i] == 0 ? b[i] / 16 : b[i]))      \
+       __builtin_abort ();                                     \
+  }
+
+int main ()
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
new file mode 100644 (file)
index 0000000..e4040ee
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r,                 \
+                         TYPE1 *__restrict a,                  \
+                         TYPE2 *__restrict b)                  \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      r[i] = a[i] == 0 ? b[i] / 16 : a[i];                     \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, #4\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #4\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #4\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2_run.c
new file mode 100644 (file)
index 0000000..ba1b722
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_asrd_2.c"
+
+#define TEST_LOOP(TYPE1, TYPE2, N)                             \
+  {                                                            \
+    TYPE1 a[N];                                                        \
+    TYPE2 r[N], b[N];                                          \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = i % 3 < 2 ? 0 : i * 13;                          \
+       b[i] = (i & 1 ? i : -i) * 17;                           \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE1##_##TYPE2 (r, a, b);                          \
+    for (int i = 0; i < N; ++i)                                        \
+      if (r[i] != (TYPE2) (a[i] == 0 ? b[i] / 16 : a[i]))      \
+       __builtin_abort ();                                     \
+  }
+
+int main ()
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_3.c
new file mode 100644 (file)
index 0000000..0d620a3
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r,                 \
+                         TYPE1 *__restrict a,                  \
+                         TYPE2 *__restrict b)                  \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      r[i] = a[i] == 0 ? b[i] / 16 : 0;                                \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, #4\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #4\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #4\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 1 { xfail *-*-* } } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_3_run.c
new file mode 100644 (file)
index 0000000..42ab6fe
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_asrd_3.c"
+
+#define TEST_LOOP(TYPE1, TYPE2, N)                             \
+  {                                                            \
+    TYPE1 a[N];                                                        \
+    TYPE2 r[N], b[N];                                          \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = i % 3 < 2 ? 0 : i * 13;                          \
+       b[i] = (i & 1 ? i : -i) * 17;                           \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE1##_##TYPE2 (r, a, b);                          \
+    for (int i = 0; i < N; ++i)                                        \
+      if (r[i] != (TYPE2) (a[i] == 0 ? b[i] / 16 : 0))         \
+       __builtin_abort ();                                     \
+  }
+
+int main ()
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}