aarch64: Support unpacked CNOT on SVE
author Richard Sandiford <richard.sandiford@arm.com>
Fri, 8 Jan 2021 10:49:38 +0000 (10:49 +0000)
committer Richard Sandiford <richard.sandiford@arm.com>
Fri, 8 Jan 2021 10:49:38 +0000 (10:49 +0000)
This patch adds unpacked support for unconditional and
conditional CNOT.  The type suffix has to be taken from
the element size rather than the container size.

gcc/
* config/aarch64/aarch64-sve.md (*cnot<mode>): Extend from
SVE_FULL_I to SVE_I.
(*cond_cnot<mode>_2, *cond_cnot<mode>_any): Likewise.

gcc/testsuite/
* gcc.target/aarch64/sve/cnot_2.c: New test.
* gcc.target/aarch64/sve/cond_cnot_4.c: Likewise.
* gcc.target/aarch64/sve/cond_cnot_4_run.c: Likewise.
* gcc.target/aarch64/sve/cond_cnot_5.c: Likewise.
* gcc.target/aarch64/sve/cond_cnot_5_run.c: Likewise.
* gcc.target/aarch64/sve/cond_cnot_6.c: Likewise.
* gcc.target/aarch64/sve/cond_cnot_6_run.c: Likewise.

gcc/config/aarch64/aarch64-sve.md
gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c [new file with mode: 0644]

index b83f9912cb6729486074fce41833ffc77345b024..2f5a5e3c9148cd7480022f61a4110b6cbd788243 100644 (file)
 )
 
 (define_insn "*cnot<mode>"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w")
-       (unspec:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
          [(unspec:<VPRED>
             [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
              (match_operand:SI 5 "aarch64_sve_ptrue_flag")
              (eq:<VPRED>
-               (match_operand:SVE_FULL_I 2 "register_operand" "0, w")
-               (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))]
+               (match_operand:SVE_I 2 "register_operand" "0, w")
+               (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))]
             UNSPEC_PRED_Z)
-          (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one")
+          (match_operand:SVE_I 4 "aarch64_simd_imm_one")
           (match_dup 3)]
          UNSPEC_SEL))]
   "TARGET_SVE"
 
 ;; Predicated logical inverse, merging with the first input.
 (define_insn_and_rewrite "*cond_cnot<mode>_2"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w")
-       (unspec:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
           ;; Logical inverse of operand 2 (as above).
-          (unspec:SVE_FULL_I
+          (unspec:SVE_I
             [(unspec:<VPRED>
                [(match_operand 5)
                 (const_int SVE_KNOWN_PTRUE)
                 (eq:<VPRED>
-                  (match_operand:SVE_FULL_I 2 "register_operand" "0, w")
-                  (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))]
+                  (match_operand:SVE_I 2 "register_operand" "0, w")
+                  (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))]
                UNSPEC_PRED_Z)
-             (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one")
+             (match_operand:SVE_I 4 "aarch64_simd_imm_one")
              (match_dup 3)]
             UNSPEC_SEL)
           (match_dup 2)]
 ;; as earlyclobber helps to make the instruction more regular to the
 ;; register allocator.
 (define_insn_and_rewrite "*cond_cnot<mode>_any"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w")
-       (unspec:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, ?&w, ?&w")
+       (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
           ;; Logical inverse of operand 2 (as above).
-          (unspec:SVE_FULL_I
+          (unspec:SVE_I
             [(unspec:<VPRED>
                [(match_operand 5)
                 (const_int SVE_KNOWN_PTRUE)
                 (eq:<VPRED>
-                  (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w")
-                  (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))]
+                  (match_operand:SVE_I 2 "register_operand" "w, w, w")
+                  (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))]
                UNSPEC_PRED_Z)
-             (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one")
+             (match_operand:SVE_I 4 "aarch64_simd_imm_one")
              (match_dup 3)]
             UNSPEC_SEL)
-          (match_operand:SVE_FULL_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")]
+          (match_operand:SVE_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")]
          UNSPEC_SEL))]
   "TARGET_SVE && !rtx_equal_p (operands[2], operands[6])"
   "@
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c
new file mode 100644 (file)
index 0000000..fe77823
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2##_##TYPE3 (TYPE2 *restrict r,         \
+                                   TYPE1 *restrict pred,       \
+                                   TYPE2 *restrict a)          \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      if (pred[i])                                             \
+       r[i] = !a[i];                                           \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c
new file mode 100644 (file)
index 0000000..729d3f4
--- /dev/null
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r,                 \
+                         TYPE2 *__restrict a,                  \
+                         TYPE1 *__restrict pred)               \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      r[i] = pred[i] ? !a[i] : a[i];                           \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c
new file mode 100644 (file)
index 0000000..de9c0a5
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_cnot_4.c"
+
+#define TEST_LOOP(TYPE1, TYPE2, N)                             \
+  {                                                            \
+    TYPE1 pred[N];                                             \
+    TYPE2 r[N], a[N];                                          \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = i & 1 ? 0 : 3 * (i + 1);                         \
+       pred[i] = (i % 3 < 2);                                  \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE1##_##TYPE2 (r, a, pred);                       \
+    for (int i = 0; i < N; ++i)                                        \
+      if (r[i] != (TYPE2) (pred[i] ? !a[i] : a[i]))            \
+       __builtin_abort ();                                     \
+  }
+
+int main ()
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c
new file mode 100644 (file)
index 0000000..7318e10
--- /dev/null
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r,                 \
+                         TYPE1 *__restrict a,                  \
+                         TYPE2 *__restrict b)                  \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      r[i] = a[i] == 0 ? !b[i] : a[i];                         \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c
new file mode 100644 (file)
index 0000000..f8f277c
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_cnot_5.c"
+
+#define TEST_LOOP(TYPE1, TYPE2, N)                             \
+  {                                                            \
+    TYPE1 a[N];                                                        \
+    TYPE2 r[N], b[N];                                          \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = i % 3 < 2 ? 0 : i * 42;                          \
+       b[i] = i & 1 ? 0 : 3 * (i + 1);                         \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE1##_##TYPE2 (r, a, b);                          \
+    for (int i = 0; i < N; ++i)                                        \
+      if (r[i] != (TYPE2) (a[i] == 0 ? !b[i] : a[i]))          \
+       __builtin_abort ();                                     \
+  }
+
+int main ()
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c
new file mode 100644 (file)
index 0000000..d44e357
--- /dev/null
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE1, TYPE2, COUNT)                          \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r,                 \
+                         TYPE1 *__restrict a,                  \
+                         TYPE2 *__restrict b)                  \
+  {                                                            \
+    for (int i = 0; i < COUNT; ++i)                            \
+      r[i] = a[i] == 0 ? !b[i] : 127;                          \
+  }
+
+#define TEST_ALL(T) \
+  T (int16_t, int8_t, 7) \
+  T (int32_t, int8_t, 3) \
+  T (int32_t, int16_t, 3) \
+  T (int64_t, int8_t, 5) \
+  T (int64_t, int16_t, 5) \
+  T (int64_t, int32_t, 5)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c
new file mode 100644 (file)
index 0000000..9e33616
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_cnot_6.c"
+
+#define TEST_LOOP(TYPE1, TYPE2, N)                             \
+  {                                                            \
+    TYPE1 a[N];                                                        \
+    TYPE2 r[N], b[N];                                          \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = i % 3 < 2 ? 0 : i * 42;                          \
+       b[i] = i & 1 ? 0 : 3 * (i + 1);                         \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE1##_##TYPE2 (r, a, b);                          \
+    for (int i = 0; i < N; ++i)                                        \
+      if (r[i] != (TYPE2) (a[i] == 0 ? !b[i] : 127))           \
+       __builtin_abort ();                                     \
+  }
+
+int main ()
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}