This patch adds support to vectorize sum of absolute differences (SAD_EXPR)
authorAlejandro Martinez <alejandro.martinezvicente@arm.com>
Tue, 7 May 2019 16:34:20 +0000 (16:34 +0000)
committerAlejandro Martinez <alejandro@gcc.gnu.org>
Tue, 7 May 2019 16:34:20 +0000 (16:34 +0000)
using SVE.

Given this input code:

/* Return the sum of |x[i] - y[i]| over the first N elements of X and Y.
   The result is 0 when N is not positive.  */
int
sum_abs (uint8_t *restrict x, uint8_t *restrict y, int n)
{
  int total = 0;
  int idx = 0;

  while (idx < n)
    {
      /* The operands promote to int, so the difference cannot wrap.  */
      int delta = x[idx] - y[idx];
      total += __builtin_abs (delta);
      ++idx;
    }

  return total;
}

The resulting SVE code is:

0000000000000000 <sum_abs>:
   0: 7100005f  cmp w2, #0x0
   4: 5400026d  b.le 50 <sum_abs+0x50>
   8: d2800003  mov x3, #0x0                    // #0
   c: 93407c42  sxtw x2, w2
  10: 2538c002  mov z2.b, #0
  14: 25221fe0  whilelo p0.b, xzr, x2
  18: 2538c023  mov z3.b, #1
  1c: 2518e3e1  ptrue p1.b
  20: a4034000  ld1b {z0.b}, p0/z, [x0, x3]
  24: a4034021  ld1b {z1.b}, p0/z, [x1, x3]
  28: 0430e3e3  incb x3
  2c: 0520c021  sel z1.b, p0, z1.b, z0.b
  30: 25221c60  whilelo p0.b, x3, x2
  34: 040d0420  uabd z0.b, p1/m, z0.b, z1.b
  38: 44830402  udot z2.s, z0.b, z3.b
  3c: 54ffff21  b.ne 20 <sum_abs+0x20>  // b.any
  40: 2598e3e0  ptrue p0.s
  44: 04812042  uaddv d2, p0, z2.s
  48: 1e260040  fmov w0, s2
  4c: d65f03c0  ret
  50: 1e2703e2  fmov s2, wzr
  54: 1e260040  fmov w0, s2
  58: d65f03c0  ret

Notice how udot is used inside a fully masked loop.

gcc/Changelog:

2019-05-07  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

* config/aarch64/aarch64-sve.md (<su>abd<mode>_3): New define_expand.
(aarch64_<su>abd<mode>_3): Likewise.
(*aarch64_<su>abd<mode>_3): New define_insn.
(<sur>sad<vsi2qi>): New define_expand.
* config/aarch64/iterators.md: Added MAX_OPP attribute.
* tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR.
(build_vect_cond_expr): Likewise.

gcc/testsuite/Changelog:

2019-05-07  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

* gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute
differences.

From-SVN: r270975

gcc/ChangeLog
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/sad_1.c [new file with mode: 0644]
gcc/tree-vect-loop.c

index cf2ea44035b5a34d9dbd46c091a0ea8261e561e9..d55adb244864169b4b7152a2b3eb98c443aed802 100644 (file)
@@ -1,3 +1,13 @@
+2019-05-07  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * config/aarch64/aarch64-sve.md (<su>abd<mode>_3): New define_expand.
+       (aarch64_<su>abd<mode>_3): Likewise.
+       (*aarch64_<su>abd<mode>_3): New define_insn.
+       (<sur>sad<vsi2qi>): New define_expand.
+       * config/aarch64/iterators.md: Added MAX_OPP attribute.
+       * tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR.
+       (build_vect_cond_expr): Likewise.
+
 2019-05-07  Uroš Bizjak  <ubizjak@gmail.com>
 
        * cfgexpand.c (asm_clobber_reg_is_valid): Reject
index 02d33b7276fb690ab97adcec623a65aabd5b8994..e94801d9f8690f90181b8a63c8fac4857fbe16d5 100644 (file)
    movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
   [(set_attr "movprfx" "*,yes")]
 )
+
+;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers
+;; the hassle of constructing the other arm of the MINUS.
+(define_expand "<su>abd<mode>_3"
+  [(use (match_operand:SVE_I 0 "register_operand"))
+   (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand")
+               (match_operand:SVE_I 2 "register_operand"))]
+  "TARGET_SVE"
+  {
+    rtx pred = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+    rtx other_arm = gen_rtx_<MAX_OPP> (<MODE>mode, operands[1], operands[2]);
+    emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], pred, operands[1],
+                                           operands[2], other_arm));
+    DONE;
+  }
+)
+
+;; Predicated integer absolute difference: MAX (a, b) - MIN (a, b).
+(define_insn "aarch64_<su>abd<mode>_3"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (minus:SVE_I
+            (USMAX:SVE_I
+              (match_operand:SVE_I 2 "register_operand" "0, w")
+              (match_operand:SVE_I 3 "register_operand" "w, w"))
+            (match_operator 4 "aarch64_<max_opp>"
+              [(match_dup 2)
+               (match_dup 3)]))]
+         UNSPEC_MERGE_PTRUE))]
+  "TARGET_SVE"
+  "@
+   <su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %2\;<su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in
+;; operands 1 and 2.  The sequence also has to perform a widening reduction of
+;; the difference into a vector and accumulate that into operand 3 before
+;; copying that into the result operand 0.
+;; Perform that with a sequence of:
+;; MOV         ones.b, #1
+;; [SU]ABD     diff.b, p0/m, op1.b, op2.b
+;; MOVPRFX     op0, op3        // If necessary
+;; UDOT                op0.s, diff.b, ones.b
+
+(define_expand "<sur>sad<vsi2qi>"
+  [(use (match_operand:SVE_SDI 0 "register_operand"))
+   (unspec:<VSI2QI> [(use (match_operand:<VSI2QI> 1 "register_operand"))
+                   (use (match_operand:<VSI2QI> 2 "register_operand"))] ABAL)
+   (use (match_operand:SVE_SDI 3 "register_operand"))]
+  "TARGET_SVE"
+  {
+    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+    rtx diff = gen_reg_rtx (<VSI2QI>mode);
+    emit_insn (gen_<sur>abd<vsi2qi>_3 (diff, operands[1], operands[2]));
+    emit_insn (gen_udot_prod<vsi2qi> (operands[0], diff, ones, operands[3]));
+    DONE;
+  }
+)
index b3b2d6e470a81c9727c0e8452e14794ef594bf04..20aa0e9d2b80192ff7ffbac00d1bae79026a8753 100644 (file)
 ;; Map smax to smin and umax to umin.
 (define_code_attr max_opp [(smax "smin") (umax "umin")])
 
+;; Same as above, but in upper case, for use in gen_rtx_<MAX_OPP>.
+(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
+
 ;; The number of subvectors in an SVE_STRUCT.
 (define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
                                (VNx8SI  "2") (VNx4DI  "2")
index ae46a72613963405e514eb76e398240d1c52d3a1..ae3e09aae465782d23966616791444e8838701d5 100644 (file)
@@ -1,3 +1,8 @@
+2019-05-07  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute
+       differences.
+
 2019-05-07  Uroš Bizjak  <ubizjak@gmail.com>
 
        * gcc.target/i386/asm-7.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
new file mode 100644 (file)
index 0000000..e7bf64a
--- /dev/null
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_SAD(TYPE1, TYPE2)                                          \
+TYPE1 __attribute__ ((noinline, noclone))                              \
+sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n)        \
+{                                                                      \
+  TYPE1 sum = 0;                                                       \
+  for (int i = 0; i < n; i++)                                          \
+    {                                                                  \
+      sum += __builtin_abs (x[i] - y[i]);                              \
+    }                                                                  \
+  return sum;                                                          \
+}
+
+DEF_SAD(int32_t, uint8_t)
+DEF_SAD(int32_t, int8_t)
+DEF_SAD(int64_t, uint16_t)
+DEF_SAD(int64_t, int16_t)
+
+/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
index 493c1ab8c71ed1dce8f4e4c254498350d0c6ad3b..057a87426779881064cddcfae5213c290365951d 100644 (file)
@@ -5973,6 +5973,7 @@ use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
   switch (code)
     {
     case DOT_PROD_EXPR:
+    case SAD_EXPR:
       return true;
 
     default:
@@ -6002,6 +6003,17 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
        break;
       }
 
+    case SAD_EXPR:
+      {
+       tree vectype = TREE_TYPE (vop[1]);
+       tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+       gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+                                              mask, vop[1], vop[0]);
+       gsi_insert_before (gsi, select, GSI_SAME_STMT);
+       vop[1] = masked_op1;
+       break;
+      }
+
     default:
       gcc_unreachable ();
     }