[AArch64] Emit TARGET_DOTPROD-specific sequence for <us>sadv16qi
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Mon, 3 Jun 2019 11:20:58 +0000 (11:20 +0000)
committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>
Mon, 3 Jun 2019 11:20:58 +0000 (11:20 +0000)
Wilco pointed out that when the Dot Product instructions are available we can use them
to generate an even more efficient expansion for the [us]sadv16qi optab.
Instead of the current:
        uabdl2  v0.8h, v1.16b, v2.16b
        uabal   v0.8h, v1.8b, v2.8b
        uadalp  v3.4s, v0.8h

we can generate:
      (1)  mov    v4.16b, 1
      (2)  uabd    v0.16b, v1.16b, v2.16b
      (3)  udot    v3.4s, v0.16b, v4.16b

Instruction (1) can be CSE'd across multiple such expansions and even hoisted out of loops,
so when this sequence appears frequently back-to-back (as in x264_r) we essentially only need 2 instructions
per sum. Also, the UDOT instruction does the byte-to-word accumulation in one step, which allows us to use
the much simpler UABD instruction before it.

This makes it a shorter and lower-latency sequence overall for targets that support it.

* config/aarch64/iterators.md (MAX_OPP): New code attr.
* config/aarch64/aarch64-simd.md (*aarch64_<su>abd<mode>_3): Rename to...
(aarch64_<su>abd<mode>_3): ... This.
(<sur>sadv16qi): Add TARGET_DOTPROD expansion.

* gcc.target/aarch64/ssadv16qi.c: Add +nodotprod to pragma.
* gcc.target/aarch64/usadv16qi.c: Likewise.
* gcc.target/aarch64/ssadv16qi-dotprod.c: New test.
* gcc.target/aarch64/usadv16qi-dotprod.c: Likewise.

From-SVN: r271863

gcc/ChangeLog
gcc/config/aarch64/aarch64-simd.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/usadv16qi.c

index 372e880987c22894540a7063c7ae0f7b752cb1c2..06184edd27f20cbf86ec79f92b592964666ce688 100644 (file)
@@ -1,3 +1,10 @@
+2019-06-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * config/aarch64/iterators.md (MAX_OPP): New code attr.
+       * config/aarch64/aarch64-simd.md (*aarch64_<su>abd<mode>_3): Rename to...
+       (aarch64_<su>abd<mode>_3): ... This.
+       (<sur>sadv16qi): Add TARGET_DOTPROD expansion.
+
 2019-06-03  Richard Biener  <rguenther@suse.de>
 
        * tree-ssa-sccvn.c (ao_ref_init_from_vn_reference): Get original
index d4c48d2aa613d6b33fa9b012a98cfd89c96fec9b..b648e9e791658c45bd82c5a08c3d9f5809951b2c 100644 (file)
 ;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64.
 ;; Whereas SABD would return 192 (-64 signed) on the above example.
 ;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
-(define_insn "*aarch64_<su>abd<mode>_3"
+(define_insn "aarch64_<su>abd<mode>_3"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
        (minus:VDQ_BHSI
          (USMAX:VDQ_BHSI
 ;; UABAL       tmp.8h, op1.16b, op2.16b
 ;; UADALP      op3.4s, tmp.8h
 ;; MOV         op0, op3 // should be eliminated in later passes.
-;; The signed version just uses the signed variants of the above instructions.
+;;
+;; For TARGET_DOTPROD we do:
+;; MOV tmp1.16b, #1 // Can be CSE'd and hoisted out of loops.
+;; UABD        tmp2.16b, op1.16b, op2.16b
+;; UDOT        op3.4s, tmp2.16b, tmp1.16b
+;; MOV op0, op3 // RA will tie the operands of UDOT appropriately.
+;;
+;; The signed version just uses the signed variants of the above instructions
+;; but for TARGET_DOTPROD still emits a UDOT as the absolute difference is
+;; unsigned.
 
 (define_expand "<sur>sadv16qi"
   [(use (match_operand:V4SI 0 "register_operand"))
    (use (match_operand:V4SI 3 "register_operand"))]
   "TARGET_SIMD"
   {
+    if (TARGET_DOTPROD)
+      {
+       rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
+       rtx abd = gen_reg_rtx (V16QImode);
+       emit_insn (gen_aarch64_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
+       emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
+                                         abd, ones));
+       DONE;
+      }
     rtx reduc = gen_reg_rtx (V8HImode);
     emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
                                               operands[2]));
index 999bdc25db892a1e62b42f34a12d86737058c8ca..112cf11f58ed48737696ec09836181462af4d848 100644 (file)
@@ -1,3 +1,10 @@
+2019-06-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * gcc.target/aarch64/ssadv16qi.c: Add +nodotprod to pragma.
+       * gcc.target/aarch64/usadv16qi.c: Likewise.
+       * gcc.target/aarch64/ssadv16qi-dotprod.c: New test.
+       * gcc.target/aarch64/usadv16qi-dotprod.c: Likewise.
+
 2019-06-03  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
 
        * lib/target-supports.exp (add_options_for_aarch64_sve): New procedure.
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
new file mode 100644 (file)
index 0000000..08b6831
--- /dev/null
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+signed char pix1[N], pix2[N];
+
+int foo (void)
+{
+  int i_sum = 0;
+  int i;
+
+  for (i = 0; i < N; i++)
+    i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tsshll\t} } } */
+/* { dg-final { scan-assembler-not {\tsshll2\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tsabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
+
index 40b28843616e84df137210b45ec16abed2a37c75..85a867a113013f560bfd0a3142805b9c95ad8c5a 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3" } */
 
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
 
 #define N 1024
 
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
new file mode 100644 (file)
index 0000000..ea8de4d
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+unsigned char pix1[N], pix2[N];
+
+int foo (void)
+{
+  int i_sum = 0;
+  int i;
+
+  for (i = 0; i < N; i++)
+    i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tushll\t} } } */
+/* { dg-final { scan-assembler-not {\tushll2\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tuabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
index 69ceaf4259ea43e95078ce900d2498c3a2291369..a66e1209662cefaa95c90d8d2694f9c7c0de4152 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3" } */
 
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
 
 #define N 1024