[arm] Implement usadv16qi and ssadv16qi standard names
author: Przemyslaw Wirkus <przemyslaw.wirkus@arm.com>
Wed, 12 Jun 2019 08:27:59 +0000 (08:27 +0000)
committer: Kyrylo Tkachov <ktkachov@gcc.gnu.org>
Wed, 12 Jun 2019 08:27:59 +0000 (08:27 +0000)
This patch implements the usadv16qi and ssadv16qi standard names for arm.

The V16QImode variant is important as it is the most commonly used pattern:
reducing vectors of bytes into an int.
The midend expects the optab to compute the absolute differences of operands 1
and 2 and reduce them, widening along the way up to SImode, into the
accumulator passed as operand 3. So the inputs are V16QImode and the
accumulator and output are V4SImode.

I've based my solution on the current AArch64 implementation of the usadv16qi
and ssadv16qi standard names (r260437). This solution emits the following
sequence of instructions:

        VABDL.u8        tmp, op1, op2   # op1, op2 lowpart
        VABAL.u8        tmp, op1, op2   # op1, op2 highpart
        VPADAL.u16      op3, tmp

So, for the code:

$ arm-none-linux-gnueabihf-gcc -S -O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard usadv16qi.c -dp

#define N 1024
unsigned char pix1[N];
unsigned char pix2[N];

int
foo (void)
{
  int i_sum = 0;
  int i;
  for (i = 0; i < N; i++)
    i_sum += __builtin_abs (pix1[i] - pix2[i]);
  return i_sum;
}

we now generate on arm:
foo:
        movw    r3, #:lower16:pix2      @ 57    [c=4 l=4]  *arm_movsi_vfp/3
        movt    r3, #:upper16:pix2      @ 58    [c=4 l=4]  *arm_movt/0
        vmov.i32        q9, #0  @ v4si  @ 3     [c=4 l=4]  *neon_movv4si/2
        movw    r2, #:lower16:pix1      @ 59    [c=4 l=4]  *arm_movsi_vfp/3
        movt    r2, #:upper16:pix1      @ 60    [c=4 l=4]  *arm_movt/0
        add     r1, r3, #1024   @ 8     [c=4 l=4]  *arm_addsi3/4
.L2:
        vld1.8  {q11}, [r3]!    @ 11    [c=8 l=4]  *movmisalignv16qi_neon_load
        vld1.8  {q10}, [r2]!    @ 10    [c=8 l=4]  *movmisalignv16qi_neon_load
        cmp     r1, r3  @ 21    [c=4 l=4]  *arm_cmpsi_insn/2
        vabdl.u8        q8, d20, d22    @ 12    [c=8 l=4]  neon_vabdluv8qi
        vabal.u8        q8, d21, d23    @ 15    [c=88 l=4]  neon_vabaluv8qi
        vpadal.u16      q9, q8  @ 16    [c=8 l=4]  neon_vpadaluv8hi
        bne     .L2             @ 22    [c=16 l=4]  arm_cond_branch
        vadd.i32        d18, d18, d19   @ 24    [c=120 l=4]  quad_halves_plusv4si
        vpadd.i32       d18, d18, d18   @ 25    [c=8 l=4]  neon_vpadd_internalv2si
        vmov.32 r0, d18[0]      @ 30    [c=12 l=4]  vec_extractv2sisi/1

instead of:
foo:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        movw    r3, #:lower16:pix1
        movt    r3, #:upper16:pix1
        vmov.i32        q9, #0  @ v4si
        movw    r2, #:lower16:pix2
        movt    r2, #:upper16:pix2
        add     r1, r3, #1024
.L2:
        vld1.8  {q8}, [r3]!
        vld1.8  {q11}, [r2]!
        vmovl.u8 q10, d16
        cmp     r1, r3
        vmovl.u8 q8, d17
        vmovl.u8 q12, d22
        vmovl.u8 q11, d23
        vsub.i16        q10, q10, q12
        vsub.i16        q8, q8, q11
        vabs.s16        q10, q10
        vabs.s16        q8, q8
        vaddw.s16       q9, q9, d20
        vaddw.s16       q9, q9, d21
        vaddw.s16       q9, q9, d16
        vaddw.s16       q9, q9, d17
        bne     .L2
        vadd.i32        d18, d18, d19
        vpadd.i32       d18, d18, d18
        vmov.32 r0, d18[0]

2019-06-12  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>

        * config/arm/iterators.md (VABAL): New int iterator.
        * config/arm/neon.md (<sup>sadv16qi): New define_expand.
        * config/arm/unspecs.md ("unspec"): Define UNSPEC_VABAL_S, UNSPEC_VABAL_U
        values.

        * gcc.target/arm/ssadv16qi.c: New test.
        * gcc.target/arm/usadv16qi.c: Likewise.

From-SVN: r272180

gcc/ChangeLog
gcc/config/arm/iterators.md
gcc/config/arm/neon.md
gcc/config/arm/unspecs.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/arm/ssadv16qi.c [new file with mode: 0644]
gcc/testsuite/gcc.target/arm/usadv16qi.c [new file with mode: 0644]

index c1d58a13aa3a3de6f209d0f8ac8a9fb8cb2940d4..eb297484547add0bcbff427972dfbc95af775970 100644 (file)
@@ -1,3 +1,10 @@
+2019-06-12  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>
+
+        * config/arm/iterators.md (VABAL): New int iterator.
+        * config/arm/neon.md (<sup>sadv16qi): New define_expand.
+        * config/arm/unspecs.md ("unspec"): Define UNSPEC_VABAL_S, UNSPEC_VABAL_U
+        values.
+
 2019-06-12  Martin Liska  <mliska@suse.cz>
 
        * value-prof.c (stream_out_histogram_value): Only first value
index eb07c5b90c1b1905d35d7b480bdbe7d7a45ab7ba..2462b8c87ea7dbe60ba50d22b1e494bb4fe905c2 100644 (file)
 
 (define_int_iterator VSUBHN [UNSPEC_VSUBHN UNSPEC_VRSUBHN])
 
+(define_int_iterator VABAL [UNSPEC_VABAL_S UNSPEC_VABAL_U])
+
 (define_int_iterator VABD [UNSPEC_VABD_S UNSPEC_VABD_U])
 
 (define_int_iterator VABDL [UNSPEC_VABDL_S UNSPEC_VABDL_U])
   (UNSPEC_VSUBW_S "s") (UNSPEC_VSUBW_U "u")
   (UNSPEC_VHSUB_S "s") (UNSPEC_VHSUB_U "u")
   (UNSPEC_VQSUB_S "s") (UNSPEC_VQSUB_U "u")
+  (UNSPEC_VABAL_S "s") (UNSPEC_VABAL_U "u")
   (UNSPEC_VABD_S "s") (UNSPEC_VABD_U "u")
   (UNSPEC_VABDL_S "s") (UNSPEC_VABDL_U "u")
   (UNSPEC_VMAX "s") (UNSPEC_VMAX_U "u")
index 726b7281a11be92d0b7a91fa7b8ba9efd1b68ac9..bcf838f23e35b15a5417b68ed16fa902df507d63 100644 (file)
   [(set_attr "type" "neon_arith_acc<q>")]
 )
 
+(define_expand "<sup>sadv16qi"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (unspec:V16QI [(use (match_operand:V16QI 1 "register_operand"))
+                  (use (match_operand:V16QI 2 "register_operand"))] VABAL)
+   (use (match_operand:V4SI 3 "register_operand"))]
+  "TARGET_NEON"
+  {
+    rtx reduc = gen_reg_rtx (V8HImode);
+    rtx op1_highpart = gen_reg_rtx (V8QImode);
+    rtx op2_highpart = gen_reg_rtx (V8QImode);
+
+    emit_insn (gen_neon_vabdl<sup>v8qi (reduc,
+                                        gen_lowpart (V8QImode, operands[1]),
+                                        gen_lowpart (V8QImode, operands[2])));
+
+    emit_insn (gen_neon_vget_highv16qi (op1_highpart, operands[1]));
+    emit_insn (gen_neon_vget_highv16qi (op2_highpart, operands[2]));
+    emit_insn (gen_neon_vabal<sup>v8qi (reduc, reduc,
+                                        op1_highpart, op2_highpart));
+    emit_insn (gen_neon_vpadal<sup>v8hi (operands[3], operands[3], reduc));
+
+    emit_move_insn (operands[0], operands[3]);
+    DONE;
+  }
+)
+
 (define_insn "neon_v<maxmin><sup><mode>"
   [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
         (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
index 174bcc5e3d5e1123cb1c1a595f5003884840aea8..41068bac90aa0ce6fef531789a38e5f7b3b27dff 100644 (file)
   UNSPEC_SHA256SU1
   UNSPEC_VMULLP64
   UNSPEC_LOAD_COUNT
+  UNSPEC_VABAL_S
+  UNSPEC_VABAL_U
   UNSPEC_VABD_F
   UNSPEC_VABD_S
   UNSPEC_VABD_U
index 27535710633f64cae1f9840cd0b473b74223939e..b6cba15451430fc6eb6a2a0b80417bb12fb023b7 100644 (file)
@@ -1,3 +1,8 @@
+2019-06-12  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>
+
+        * gcc.target/arm/ssadv16qi.c: New test.
+        * gcc.target/arm/usadv16qi.c: Likewise.
+
 2019-06-12  Jakub Jelinek  <jakub@redhat.com>
 
        PR c/90760
diff --git a/gcc/testsuite/gcc.target/arm/ssadv16qi.c b/gcc/testsuite/gcc.target/arm/ssadv16qi.c
new file mode 100644 (file)
index 0000000..dba5ef4
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+
+#define N 1024
+
+signed char pix1[N], pix2[N];
+
+int
+foo (void)
+{
+  int i_sum = 0;
+  int i;
+
+  for (i = 0; i < N; i++)
+    i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler {\tvabdl\.s8\t} } } */
+/* { dg-final { scan-assembler {\tvabal\.s8\t} } } */
+/* { dg-final { scan-assembler {\tvpadal\.s16\t} } } */
+
+/* { dg-final { scan-assembler-not {\tvmovl} } } */
+/* { dg-final { scan-assembler-not {\tvsub} } } */
+/* { dg-final { scan-assembler-not {\tvabs} } } */
diff --git a/gcc/testsuite/gcc.target/arm/usadv16qi.c b/gcc/testsuite/gcc.target/arm/usadv16qi.c
new file mode 100644 (file)
index 0000000..d744bcb
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+
+#define N 1024
+
+unsigned char pix1[N], pix2[N];
+
+int
+foo (void)
+{
+  int i_sum = 0;
+  int i;
+
+  for (i = 0; i < N; i++)
+    i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler {\tvabdl\.u8\t} } } */
+/* { dg-final { scan-assembler {\tvabal\.u8\t} } } */
+/* { dg-final { scan-assembler {\tvpadal\.u16\t} } } */
+
+/* { dg-final { scan-assembler-not {\tvmovl} } } */
+/* { dg-final { scan-assembler-not {\tvsub} } } */
+/* { dg-final { scan-assembler-not {\tvabs} } } */