From a9fad8fe6c84de272f2a56d462e67d53c9f4a73d Mon Sep 17 00:00:00 2001 From: Alejandro Martinez Date: Tue, 7 May 2019 16:34:20 +0000 Subject: [PATCH] This patch adds support to vectorize sum of absolute differences (SAD_EXPR) using SVE. Given this input code: int sum_abs (uint8_t *restrict x, uint8_t *restrict y, int n) { int sum = 0; for (int i = 0; i < n; i++) { sum += __builtin_abs (x[i] - y[i]); } return sum; } The resulting SVE code is: 0000000000000000 : 0: 7100005f cmp w2, #0x0 4: 5400026d b.le 50 8: d2800003 mov x3, #0x0 // #0 c: 93407c42 sxtw x2, w2 10: 2538c002 mov z2.b, #0 14: 25221fe0 whilelo p0.b, xzr, x2 18: 2538c023 mov z3.b, #1 1c: 2518e3e1 ptrue p1.b 20: a4034000 ld1b {z0.b}, p0/z, [x0, x3] 24: a4034021 ld1b {z1.b}, p0/z, [x1, x3] 28: 0430e3e3 incb x3 2c: 0520c021 sel z1.b, p0, z1.b, z0.b 30: 25221c60 whilelo p0.b, x3, x2 34: 040d0420 uabd z0.b, p1/m, z0.b, z1.b 38: 44830402 udot z2.s, z0.b, z3.b 3c: 54ffff21 b.ne 20 // b.any 40: 2598e3e0 ptrue p0.s 44: 04812042 uaddv d2, p0, z2.s 48: 1e260040 fmov w0, s2 4c: d65f03c0 ret 50: 1e2703e2 fmov s2, wzr 54: 1e260040 fmov w0, s2 58: d65f03c0 ret Notice how udot is used inside a fully masked loop. gcc/Changelog: 2019-05-07 Alejandro Martinez * config/aarch64/aarch64-sve.md (abd_3): New define_expand. (aarch64_abd_3): Likewise. (*aarch64_abd_3): New define_insn. (sad): New define_expand. * config/aarch64/iterators.md: Added MAX_OPP attribute. * tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR. (build_vect_cond_expr): Likewise. gcc/testsuite/Changelog: 2019-05-07 Alejandro Martinez * gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute differences. 
From-SVN: r270975 --- gcc/ChangeLog | 10 ++++ gcc/config/aarch64/aarch64-sve.md | 61 ++++++++++++++++++++ gcc/config/aarch64/iterators.md | 3 + gcc/testsuite/ChangeLog | 5 ++ gcc/testsuite/gcc.target/aarch64/sve/sad_1.c | 28 +++++++++ gcc/tree-vect-loop.c | 12 ++++ 6 files changed, 119 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/sad_1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index cf2ea44035b..d55adb24486 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2019-05-07 Alejandro Martinez + + * config/aarch64/aarch64-sve.md (abd_3): New define_expand. + (aarch64_abd_3): Likewise. + (*aarch64_abd_3): New define_insn. + (sad): New define_expand. + * config/aarch64/iterators.md: Added MAX_OPP attribute. + * tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR. + (build_vect_cond_expr): Likewise. + 2019-05-07 Uroš Bizjak * cfgexpand.c (asm_clobber_reg_is_valid): Reject diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 02d33b7276f..e94801d9f86 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3148,3 +3148,64 @@ movprfx\t%0, %3\;dot\\t%0., %1., %2." [(set_attr "movprfx" "*,yes")] ) + +;; Helper expander for aarch64_abd_3 to save the callers +;; the hassle of constructing the other arm of the MINUS. +(define_expand "abd_3" + [(use (match_operand:SVE_I 0 "register_operand")) + (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand") + (match_operand:SVE_I 2 "register_operand"))] + "TARGET_SVE" + { + rtx pred = force_reg (mode, CONSTM1_RTX (mode)); + rtx other_arm = gen_rtx_ (mode, operands[1], operands[2]); + emit_insn (gen_aarch64_abd_3 (operands[0], pred, operands[1], + operands[2], other_arm)); + DONE; + } +) + +;; Predicated integer absolute difference. 
+(define_insn "aarch64_abd_3" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (minus:SVE_I + (USMAX:SVE_I + (match_operand:SVE_I 2 "register_operand" "0, w") + (match_operand:SVE_I 3 "register_operand" "w, w")) + (match_operator 4 "aarch64_" + [(match_dup 2) + (match_dup 3)]))] + UNSPEC_MERGE_PTRUE))] + "TARGET_SVE" + "@ + abd\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;abd\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in +;; operands 1 and 2. The sequence also has to perform a widening reduction of +;; the difference into a vector and accumulate that into operand 3 before +;; copying that into the result operand 0. +;; Perform that with a sequence of: +;; MOV ones.b, #1 +;; [SU]ABD diff.b, p0/m, op1.b, op2.b +;; MOVPRFX op0, op3 // If necessary +;; UDOT op0.s, diff.b, ones.b + +(define_expand "sad" + [(use (match_operand:SVE_SDI 0 "register_operand")) + (unspec: [(use (match_operand: 1 "register_operand")) + (use (match_operand: 2 "register_operand"))] ABAL) + (use (match_operand:SVE_SDI 3 "register_operand"))] + "TARGET_SVE" + { + rtx ones = force_reg (mode, CONST1_RTX (mode)); + rtx diff = gen_reg_rtx (mode); + emit_insn (gen_abd_3 (diff, operands[1], operands[2])); + emit_insn (gen_udot_prod (operands[0], diff, ones, operands[3])); + DONE; + } +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index b3b2d6e470a..20aa0e9d2b8 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1060,6 +1060,9 @@ ;; Map smax to smin and umax to umin. (define_code_attr max_opp [(smax "smin") (umax "umin")]) +;; Same as above, but louder. +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")]) + ;; The number of subvectors in an SVE_STRUCT. 
(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2") (VNx8SI "2") (VNx4DI "2") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ae46a726139..ae3e09aae46 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-05-07 Alejandro Martinez + + * gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute + differences. + 2019-05-07 Uroš Bizjak * gcc.target/i386/asm-7.c: New test. diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c new file mode 100644 index 00000000000..e7bf64a57b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_SAD(TYPE1, TYPE2) \ +TYPE1 __attribute__ ((noinline, noclone)) \ +sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \ +{ \ + TYPE1 sum = 0; \ + for (int i = 0; i < n; i++) \ + { \ + sum += __builtin_abs (x[i] - y[i]); \ + } \ + return sum; \ +} + +DEF_SAD(int32_t, uint8_t) +DEF_SAD(int32_t, int8_t) +DEF_SAD(int64_t, uint16_t) +DEF_SAD(int64_t, int16_t) + +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 493c1ab8c71..057a8742677 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -5973,6 +5973,7 @@ use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, 
switch (code) { case DOT_PROD_EXPR: + case SAD_EXPR: return true; default: @@ -6002,6 +6003,17 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, break; } + case SAD_EXPR: + { + tree vectype = TREE_TYPE (vop[1]); + tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); + gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, + mask, vop[1], vop[0]); + gsi_insert_before (gsi, select, GSI_SAME_STMT); + vop[1] = masked_op1; + break; + } + default: gcc_unreachable (); } -- 2.30.2