From 58cc98767aa1d8136d36467b892dc4adaf427acc Mon Sep 17 00:00:00 2001
From: Yuliang Wang <yuliang.wang@arm.com>
Date: Thu, 12 Sep 2019 09:59:58 +0000
Subject: [PATCH] Vectorise multiply high with scaling operations (PR 89386)

2019-09-12  Yuliang Wang  <yuliang.wang@arm.com>

gcc/
	PR tree-optimization/89386
	* config/aarch64/aarch64-sve2.md (<su>mull<bt><mode>)
	(<r>shrnb<mode>, <r>shrnt<mode>): New SVE2 patterns.
	(<su>mulh<r>s<mode>3): New pattern for MULHRS.
	* config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT)
	(UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT)
	(UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS)
	(UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs.
	(MULLBT, SHRNB, SHRNT, MULHRS): New int iterators.
	(su, r): Handle the unspecs above.
	(bt): New int attribute.
	* internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions.
	* internal-fn.c (first_commutative_argument): Commutativity info for
	above.
	* optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab)
	(umulhrs_optab): New optabs.
	* doc/md.texi (smulhs@var{m3}, umulhs@var{m3})
	(smulhrs@var{m3}, umulhrs@var{m3}): Documentation for the above.
	* tree-vect-patterns.c (vect_recog_mulhs_pattern): New pattern
	function.
	(vect_vect_recog_func_ptrs): Add it.
	* testsuite/gcc.target/aarch64/sve2/mulhrs_1.c: New test.
	* testsuite/gcc.dg/vect/vect-mulhrs-1.c: As above.
	* testsuite/gcc.dg/vect/vect-mulhrs-2.c: As above.
	* testsuite/gcc.dg/vect/vect-mulhrs-3.c: As above.
	* testsuite/gcc.dg/vect/vect-mulhrs-4.c: As above.
	* doc/sourcebuild.texi (vect_mulhrs_hi): Document new target selector.
	* testsuite/lib/target-supports.exp
	(check_effective_target_vect_mulhrs_hi): Return true for AArch64
	with SVE2.

From-SVN: r275682
---
 gcc/ChangeLog                                 |  33 ++++
 gcc/config/aarch64/aarch64-sve2.md            |  60 +++++++
 gcc/config/aarch64/iterators.md               |  35 +++-
 gcc/doc/md.texi                               |  27 +++
 gcc/doc/sourcebuild.texi                      |   4 +
 gcc/internal-fn.c                             |   2 +
 gcc/internal-fn.def                           |   5 +
 gcc/optabs.def                                |   4 +
 gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c     |  49 +++++
 gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c     |   9 +
 gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c     |   9 +
 gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c     |  10 ++
 .../gcc.target/aarch64/sve2/mulhrs_1.c        |  63 +++++++
 gcc/testsuite/lib/target-supports.exp         |   9 +
 gcc/tree-vect-patterns.c                      | 170 ++++++++++++++++++
 15 files changed, 488 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/mulhrs_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7f7f6514c39..7aba409e362 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,36 @@
+2019-09-12  Yuliang Wang  <yuliang.wang@arm.com>
+
+	PR tree-optimization/89386
+	* config/aarch64/aarch64-sve2.md (<su>mull<bt><mode>)
+	(<r>shrnb<mode>, <r>shrnt<mode>): New SVE2 patterns.
+	(<su>mulh<r>s<mode>3): New pattern for MULHRS.
+	* config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT)
+	(UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT)
+	(UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS)
+	(UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs.
+	(MULLBT, SHRNB, SHRNT, MULHRS): New int iterators.
+	(su, r): Handle the unspecs above.
+	(bt): New int attribute.
+	* internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions.
+	* internal-fn.c (first_commutative_argument): Commutativity info for
+	above.
+	* optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab)
+	(umulhrs_optab): New optabs.
+	* doc/md.texi (smulhs@var{m3}, umulhs@var{m3})
+	(smulhrs@var{m3}, umulhrs@var{m3}): Documentation for the above.
+	* tree-vect-patterns.c (vect_recog_mulhs_pattern): New pattern
+	function.
+	(vect_vect_recog_func_ptrs): Add it.
+	* testsuite/gcc.target/aarch64/sve2/mulhrs_1.c: New test.
+	* testsuite/gcc.dg/vect/vect-mulhrs-1.c: As above.
+	* testsuite/gcc.dg/vect/vect-mulhrs-2.c: As above.
+	* testsuite/gcc.dg/vect/vect-mulhrs-3.c: As above.
+	* testsuite/gcc.dg/vect/vect-mulhrs-4.c: As above.
+	* doc/sourcebuild.texi (vect_mulhrs_hi): Document new target selector.
+	* testsuite/lib/target-supports.exp
+	(check_effective_target_vect_mulhrs_hi): Return true for AArch64
+	with SVE2.
+
 2019-09-11  Michael Meissner  <meissner@linux.ibm.com>
 
 	* config/rs6000/predicates.md (non_add_cint_operand): Simplify the
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 2334e5a7b7d..ee9acdcfbca 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -63,3 +63,63 @@
    movprfx\t%0, %2\;<sur>h<addsub>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
   [(set_attr "movprfx" "*,yes")]
 )
+
+;; Multiply long top / bottom.
+(define_insn "<su>mull<bt><mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(unspec:<VWIDE> [(match_operand:SVE_BHSI 1 "register_operand" "w")
+			 (match_operand:SVE_BHSI 2 "register_operand" "w")]
+			MULLBT))]
+  "TARGET_SVE2"
+  "<su>mull<bt>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
+)
+
+;; (Rounding) Right shift narrow bottom.
+(define_insn "<r>shrnb<mode>"
+  [(set (match_operand:SVE_BHSI 0 "register_operand" "=w")
+	(unspec:SVE_BHSI
+	  [(match_operand:<VWIDE> 1 "register_operand" "w")
+	   (match_operand 2 "aarch64_simd_shift_imm_offset_<Vel>" "")]
+	  SHRNB))]
+  "TARGET_SVE2"
+  "<r>shrnb\t%0.<Vetype>, %1.<Vewtype>, #%2"
+)
+
+;; (Rounding) Right shift narrow top.
+(define_insn "<r>shrnt<mode>"
+  [(set (match_operand:SVE_BHSI 0 "register_operand" "=w")
+	(unspec:SVE_BHSI
+	  [(match_operand:SVE_BHSI 1 "register_operand" "0")
+	   (match_operand:<VWIDE> 2 "register_operand" "w")
+	   (match_operand 3 "aarch64_simd_shift_imm_offset_<Vel>" "i")]
+	  SHRNT))]
+  "TARGET_SVE2"
+  "<r>shrnt\t%0.<Vetype>, %2.<Vewtype>, #%3"
+)
+
+;; Unpredicated integer multiply-high-with-(round-and-)scale.
+(define_expand "<su>mulh<r>s<mode>3"
+  [(set (match_operand:SVE_BHSI 0 "register_operand")
+	(unspec:SVE_BHSI
+	  [(match_dup 3)
+	   (unspec:SVE_BHSI [(match_operand:SVE_BHSI 1 "register_operand")
+			     (match_operand:SVE_BHSI 2 "register_operand")]
+			    MULHRS)]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE2"
+  {
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+
+    rtx prod_b = gen_reg_rtx (<VWIDE>mode);
+    rtx prod_t = gen_reg_rtx (<VWIDE>mode);
+    emit_insn (gen_<su>mullb<mode> (prod_b, operands[1], operands[2]));
+    emit_insn (gen_<su>mullt<mode> (prod_t, operands[1], operands[2]));
+
+    rtx shift = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1);
+    emit_insn (gen_<r>shrnb<mode> (operands[0], prod_b, shift));
+    emit_insn (gen_<r>shrnt<mode> (operands[0], operands[0], prod_t, shift));
+
+    DONE;
+  }
+)
+
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 49d227f6746..d23f0fcbc2f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -378,6 +378,10 @@
     UNSPEC_RSUBHN2	; Used in aarch64-simd.md.
     UNSPEC_SQDMULH	; Used in aarch64-simd.md.
     UNSPEC_SQRDMULH	; Used in aarch64-simd.md.
+    UNSPEC_SMULLB	; Used in aarch64-sve2.md.
+    UNSPEC_SMULLT	; Used in aarch64-sve2.md.
+    UNSPEC_UMULLB	; Used in aarch64-sve2.md.
+    UNSPEC_UMULLT	; Used in aarch64-sve2.md.
     UNSPEC_PMUL		; Used in aarch64-simd.md.
     UNSPEC_FMULX	; Used in aarch64-simd.md.
     UNSPEC_USQADD	; Used in aarch64-simd.md.
@@ -400,6 +404,10 @@
     UNSPEC_UQSHRN	; Used in aarch64-simd.md.
     UNSPEC_SQRSHRN	; Used in aarch64-simd.md.
     UNSPEC_UQRSHRN	; Used in aarch64-simd.md.
+    UNSPEC_SHRNB	; Used in aarch64-sve2.md.
+    UNSPEC_SHRNT	; Used in aarch64-sve2.md.
+    UNSPEC_RSHRNB	; Used in aarch64-sve2.md.
+    UNSPEC_RSHRNT	; Used in aarch64-sve2.md.
     UNSPEC_SSHL		; Used in aarch64-simd.md.
     UNSPEC_USHL		; Used in aarch64-simd.md.
     UNSPEC_SRSHL	; Used in aarch64-simd.md.
@@ -523,6 +531,10 @@
     UNSPEC_FCMLA90	; Used in aarch64-simd.md.
     UNSPEC_FCMLA180	; Used in aarch64-simd.md.
     UNSPEC_FCMLA270	; Used in aarch64-simd.md.
+    UNSPEC_SMULHS	; Used in aarch64-sve2.md.
+    UNSPEC_SMULHRS	; Used in aarch64-sve2.md.
+    UNSPEC_UMULHS	; Used in aarch64-sve2.md.
+    UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -1588,6 +1600,13 @@
 (define_int_iterator RHADD [UNSPEC_SRHADD UNSPEC_URHADD])
 
+(define_int_iterator MULLBT [UNSPEC_SMULLB UNSPEC_UMULLB
+			     UNSPEC_SMULLT UNSPEC_UMULLT])
+
+(define_int_iterator SHRNB [UNSPEC_SHRNB UNSPEC_RSHRNB])
+
+(define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT])
+
 (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
 
 (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
@@ -1607,6 +1626,9 @@
 (define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH])
 
+(define_int_iterator MULHRS [UNSPEC_SMULHS UNSPEC_UMULHS
+			     UNSPEC_SMULHRS UNSPEC_UMULHRS])
+
 (define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD])
 
 (define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN])
@@ -1872,7 +1894,11 @@
 		    (UNSPEC_COND_FCVTZS "s")
 		    (UNSPEC_COND_FCVTZU "u")
 		    (UNSPEC_COND_SCVTF "s")
-		    (UNSPEC_COND_UCVTF "u")])
+		    (UNSPEC_COND_UCVTF "u")
+		    (UNSPEC_SMULLB "s") (UNSPEC_UMULLB "u")
+		    (UNSPEC_SMULLT "s") (UNSPEC_UMULLT "u")
+		    (UNSPEC_SMULHS "s") (UNSPEC_UMULHS "u")
+		    (UNSPEC_SMULHRS "s") (UNSPEC_UMULHRS "u")])
 
 (define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u")
 		      (UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur")
@@ -1910,6 +1936,10 @@
 		   (UNSPEC_SQRSHRN "r") (UNSPEC_UQRSHRN "r")
 		   (UNSPEC_SQSHL "") (UNSPEC_UQSHL "")
 		   (UNSPEC_SQRSHL "r")(UNSPEC_UQRSHL "r")
+		   (UNSPEC_SHRNB "") (UNSPEC_SHRNT "")
+		   (UNSPEC_RSHRNB "r") (UNSPEC_RSHRNT "r")
+		   (UNSPEC_SMULHS "") (UNSPEC_UMULHS "")
+		   (UNSPEC_SMULHRS "r") (UNSPEC_UMULHRS "r")
 ])
 
 (define_int_attr lr [(UNSPEC_SSLI "l") (UNSPEC_USLI "l")
@@ -1922,6 +1952,9 @@
 		    (UNSPEC_SHADD "") (UNSPEC_UHADD "u")
 		    (UNSPEC_SRHADD "") (UNSPEC_URHADD "u")])
 
+(define_int_attr bt [(UNSPEC_SMULLB "b") (UNSPEC_UMULLB "b")
+		     (UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")])
+
 (define_int_attr addsub [(UNSPEC_SHADD "add")
			  (UNSPEC_UHADD "add")
			  (UNSPEC_SRHADD "add")
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index fa4ae14534b..f35fd2b1b19 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5387,6 +5387,33 @@ operand 1.
 Add operand 1 to operand 2 and place the widened result in operand 0.
 (This is used express accumulation of elements into an accumulator of
 a wider mode.)
 
+@cindex @code{smulhs@var{m3}} instruction pattern
+@item @samp{smulhs@var{m3}}
+@cindex @code{umulhs@var{m3}} instruction pattern
+@itemx @samp{umulhs@var{m3}}
+Signed/unsigned multiply high with scale. This is equivalent to the C code:
+@smallexample
+narrow op0, op1, op2;
+@dots{}
+op0 = (narrow) (((wide) op1 * (wide) op2) >> (N / 2 - 1));
+@end smallexample
+where the sign of @samp{narrow} determines whether this is a signed
+or unsigned operation, and @var{N} is the size of @samp{wide} in bits.
+
+@cindex @code{smulhrs@var{m3}} instruction pattern
+@item @samp{smulhrs@var{m3}}
+@cindex @code{umulhrs@var{m3}} instruction pattern
+@itemx @samp{umulhrs@var{m3}}
+Signed/unsigned multiply high with round and scale. This is
+equivalent to the C code:
+@smallexample
+narrow op0, op1, op2;
+@dots{}
+op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 - 2)) + 1) >> 1);
+@end smallexample
+where the sign of @samp{narrow} determines whether this is a signed
+or unsigned operation, and @var{N} is the size of @samp{wide} in bits.
+
 @cindex @code{vec_shl_insert_@var{m}} instruction pattern
 @item @samp{vec_shl_insert_@var{m}}
 Shift the elements in vector input operand 1 left one element (i.e.@:
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index e4180ccaabb..4ace224a8ff 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1442,6 +1442,10 @@ vector alignment.
 Target supports both signed and unsigned averaging operations on vectors
 of bytes.
 
+@item vect_mulhrs_hi
+Target supports both signed and unsigned multiply-high-with-round-and-scale
+operations on vectors of half-words.
+
 @item vect_condition
 Target supports vector conditional operations.
 
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index ad86b9afd54..549d6f1153b 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -3210,6 +3210,8 @@ first_commutative_argument (internal_fn fn)
     case IFN_FNMS:
     case IFN_AVG_FLOOR:
     case IFN_AVG_CEIL:
+    case IFN_MULHS:
+    case IFN_MULHRS:
     case IFN_FMIN:
     case IFN_FMAX:
       return 0;
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index b5a6ca33223..49f57978c88 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -149,6 +149,11 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_FLOOR, ECF_CONST | ECF_NOTHROW, first,
 DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first,
			      savg_ceil, uavg_ceil, binary)
 
+DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
+			      smulhs, umulhs, binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
+			      smulhrs, umulhrs, binary)
+
 DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary)
 DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary)
 DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 0860b38badb..308696846d4 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -343,6 +343,10 @@ OPTAB_D (udot_prod_optab, "udot_prod$I$a")
 OPTAB_D (usum_widen_optab, "widen_usum$I$a3")
 OPTAB_D (usad_optab, "usad$I$a")
 OPTAB_D (ssad_optab, "ssad$I$a")
+OPTAB_D (smulhs_optab, "smulhs$a3")
+OPTAB_D (smulhrs_optab, "smulhrs$a3")
+OPTAB_D (umulhs_optab, "umulhs$a3")
+OPTAB_D (umulhrs_optab, "umulhrs$a3")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
 OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
 OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c
new file mode 100644
index 00000000000..8e46ff6b01f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c
@@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+#ifndef SIGNEDNESS
+#define SIGNEDNESS signed
+#endif
+#ifndef BIAS
+#define BIAS 0
+#endif
+
+#define HRS(x) ((((x) >> (15 - BIAS)) + BIAS) >> BIAS)
+
+void __attribute__ ((noipa))
+f (SIGNEDNESS short *restrict a, SIGNEDNESS short *restrict b,
+   SIGNEDNESS short *restrict c, __INTPTR_TYPE__ n)
+{
+  for (__INTPTR_TYPE__ i = 0; i < n; ++i)
+    a[i] = HRS((SIGNEDNESS int) b[i] * (SIGNEDNESS int) c[i]);
+}
+
+#define N 50
+#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4)
+#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26)
+#define CONST1 0x01AB
+#define CONST2 0x01CD
+
+int
+main (void)
+{
+  check_vect ();
+
+  SIGNEDNESS short a[N], b[N], c[N];
+  for (int i = 0; i < N; ++i)
+    {
+      b[i] = BASE1 + i * CONST1;
+      c[i] = BASE2 + i * CONST2;
+      asm volatile ("" ::: "memory");
+    }
+  f (a, b, c, N);
+  for (int i = 0; i < N; ++i)
+    if (a[i] != HRS(BASE1 * BASE2 + i * i * (CONST1 * CONST2)
+		    + i * (BASE1 * CONST2 + BASE2 * CONST1)))
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c
new file mode 100644
index 00000000000..a16e71c6a37
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c
@@ -0,0 +1,9 @@
+/* { dg-require-effective-target vect_int } */
+
+#define SIGNEDNESS unsigned
+
+#include "vect-mulhrs-1.c"
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c
new file mode 100644
index 00000000000..e7d44d75d6c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c
@@ -0,0 +1,9 @@
+/* { dg-require-effective-target vect_int } */
+
+#define BIAS 1
+
+#include "vect-mulhrs-1.c"
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c
new file mode 100644
index 00000000000..e121763352e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c
@@ -0,0 +1,10 @@
+/* { dg-require-effective-target vect_int } */
+
+#define SIGNEDNESS unsigned
+#define BIAS 1
+
+#include "vect-mulhrs-1.c"
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/mulhrs_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/mulhrs_1.c
new file mode 100644
index 00000000000..7970d681c9d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/mulhrs_1.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
+
+#include <stdint.h>
+
+#define MULTHI(TYPE, BIGGER, RND)			\
+TYPE __attribute__ ((noinline, noclone))		\
+mulhs_##TYPE##_##RND (TYPE *restrict x,			\
+		      TYPE *restrict y, TYPE *restrict z, int n) \
+{							\
+  for (int i = 0; i < n; i++)				\
+    {							\
+      z[i] = ((((BIGGER)x[i] * (BIGGER)y[i]) >>		\
+	       (sizeof(BIGGER)*8/2-2)) + RND) >> 1;	\
+    }							\
+}
+
+MULTHI (int8_t, int16_t, 0)
+MULTHI (int16_t, int32_t, 0)
+MULTHI (int32_t, int64_t, 0)
+
+MULTHI (uint8_t, uint16_t, 0)
+MULTHI (uint16_t, uint32_t, 0)
+MULTHI (uint32_t, uint64_t, 0)
+
+MULTHI (int8_t, int16_t, 1)
+MULTHI (int16_t, int32_t, 1)
+MULTHI (int32_t, int64_t, 1)
+
+MULTHI (uint8_t, uint16_t, 1)
+MULTHI (uint16_t, uint32_t, 1)
+MULTHI (uint32_t, uint64_t, 1)
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 12 "vect" } } */
+
+/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 4f7d6cbdd42..f05a0930fbd 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6175,6 +6175,15 @@ proc check_effective_target_vect_avg_qi {} {
 	     && ![check_effective_target_aarch64_sve1_only] }]
 }
 
+# Return 1 if the target plus current options supports both signed
+# and unsigned multiply-high-with-round-and-scale operations
+# on vectors of half-words.
+
+proc check_effective_target_vect_mulhrs_hi {} {
+    return [expr { [istarget aarch64*-*-*]
+		   && [check_effective_target_aarch64_sve2] }]
+}
+
 # Return 1 if the target plus current options supports a vector
 # demotion (packing) of shorts (to chars) and ints (to shorts)
 # using modulo arithmetic, 0 otherwise.
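For reference, the scalar semantics behind the new optabs, as specified by
the md.texi additions above and exercised by the HRS macro in
vect-mulhrs-1.c, can be sketched in plain C for 16-bit elements (N = 32).
This sketch is illustrative only and is not part of the patch; the function
names are assumptions:

    #include <stdint.h>

    /* smulhs: multiply high with scale -- widen, multiply, shift the
       product right by N/2 - 1 = 15 and keep the narrow low half.  */
    static int16_t
    scalar_smulhs (int16_t a, int16_t b)
    {
      return (int16_t) (((int32_t) a * (int32_t) b) >> 15);
    }

    /* smulhrs: multiply high with round and scale -- shift right by
       N/2 - 2 = 14, add the rounding bit, then shift by 1.  */
    static int16_t
    scalar_smulhrs (int16_t a, int16_t b)
    {
      return (int16_t) (((((int32_t) a * (int32_t) b) >> 14) + 1) >> 1);
    }

The unsigned variants are identical with uint16_t/uint32_t.  These match
the HRS macro with BIAS = 0 (IFN_MULHS) and BIAS = 1 (IFN_MULHRS).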
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index ccb2e1edecd..2f86f9e4fc7 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -1723,6 +1723,175 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out)
   return pattern_stmt;
 }
 
+/* Recognize the following patterns:
+
+     ATYPE a;  // narrower than TYPE
+     BTYPE b;  // narrower than TYPE
+
+   1) Multiply high with scaling
+     TYPE res = ((TYPE) a * (TYPE) b) >> c;
+   2) ... or also with rounding
+     TYPE res = ((((TYPE) a * (TYPE) b) >> d) + 1) >> 1;
+
+   where only the bottom half of res is used.  */
+
+static gimple *
+vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out)
+{
+  /* Check for a right shift.  */
+  gassign *last_stmt = dyn_cast <gassign *> (last_stmt_info->stmt);
+  if (!last_stmt
+      || gimple_assign_rhs_code (last_stmt) != RSHIFT_EXPR)
+    return NULL;
+  vec_info *vinfo = last_stmt_info->vinfo;
+
+  /* Check that the shift result is wider than the users of the
+     result need (i.e. that narrowing would be a natural choice).  */
+  tree lhs_type = TREE_TYPE (gimple_assign_lhs (last_stmt));
+  unsigned int target_precision
+    = vect_element_precision (last_stmt_info->min_output_precision);
+  if (!INTEGRAL_TYPE_P (lhs_type)
+      || target_precision >= TYPE_PRECISION (lhs_type))
+    return NULL;
+
+  /* Look through any change in sign on the outer shift input.  */
+  vect_unpromoted_value unprom_rshift_input;
+  tree rshift_input = vect_look_through_possible_promotion
+    (vinfo, gimple_assign_rhs1 (last_stmt), &unprom_rshift_input);
+  if (!rshift_input
+      || TYPE_PRECISION (TREE_TYPE (rshift_input))
+	   != TYPE_PRECISION (lhs_type))
+    return NULL;
+
+  /* Get the definition of the shift input.  */
+  stmt_vec_info rshift_input_stmt_info
+    = vect_get_internal_def (vinfo, rshift_input);
+  if (!rshift_input_stmt_info)
+    return NULL;
+  gassign *rshift_input_stmt
+    = dyn_cast <gassign *> (rshift_input_stmt_info->stmt);
+  if (!rshift_input_stmt)
+    return NULL;
+
+  stmt_vec_info mulh_stmt_info;
+  tree scale_term;
+  internal_fn ifn;
+  unsigned int expect_offset;
+
+  /* Check for the presence of the rounding term.  */
+  if (gimple_assign_rhs_code (rshift_input_stmt) == PLUS_EXPR)
+    {
+      /* Check that the outer shift was by 1.  */
+      if (!integer_onep (gimple_assign_rhs2 (last_stmt)))
+	return NULL;
+
+      /* Check that the second operand of the PLUS_EXPR is 1.  */
+      if (!integer_onep (gimple_assign_rhs2 (rshift_input_stmt)))
+	return NULL;
+
+      /* Look through any change in sign on the addition input.  */
+      vect_unpromoted_value unprom_plus_input;
+      tree plus_input = vect_look_through_possible_promotion
+	(vinfo, gimple_assign_rhs1 (rshift_input_stmt), &unprom_plus_input);
+      if (!plus_input
+	  || TYPE_PRECISION (TREE_TYPE (plus_input))
+	       != TYPE_PRECISION (TREE_TYPE (rshift_input)))
+	return NULL;
+
+      /* Get the definition of the multiply-high-scale part.  */
+      stmt_vec_info plus_input_stmt_info
+	= vect_get_internal_def (vinfo, plus_input);
+      if (!plus_input_stmt_info)
+	return NULL;
+      gassign *plus_input_stmt
+	= dyn_cast <gassign *> (plus_input_stmt_info->stmt);
+      if (!plus_input_stmt
+	  || gimple_assign_rhs_code (plus_input_stmt) != RSHIFT_EXPR)
+	return NULL;
+
+      /* Look through any change in sign on the scaling input.  */
+      vect_unpromoted_value unprom_scale_input;
+      tree scale_input = vect_look_through_possible_promotion
+	(vinfo, gimple_assign_rhs1 (plus_input_stmt), &unprom_scale_input);
+      if (!scale_input
+	  || TYPE_PRECISION (TREE_TYPE (scale_input))
+	       != TYPE_PRECISION (TREE_TYPE (plus_input)))
+	return NULL;
+
+      /* Get the definition of the multiply-high part.  */
+      mulh_stmt_info = vect_get_internal_def (vinfo, scale_input);
+      if (!mulh_stmt_info)
+	return NULL;
+
+      /* Get the scaling term.  */
+      scale_term = gimple_assign_rhs2 (plus_input_stmt);
+
+      expect_offset = target_precision + 2;
+      ifn = IFN_MULHRS;
+    }
+  else
+    {
+      mulh_stmt_info = rshift_input_stmt_info;
+      scale_term = gimple_assign_rhs2 (last_stmt);
+
+      expect_offset = target_precision + 1;
+      ifn = IFN_MULHS;
+    }
+
+  /* Check that the scaling factor is correct.  */
+  if (TREE_CODE (scale_term) != INTEGER_CST
+      || wi::to_widest (scale_term) + expect_offset
+	   != TYPE_PRECISION (lhs_type))
+    return NULL;
+
+  /* Check whether the scaling input term can be seen as two widened
+     inputs multiplied together.  */
+  vect_unpromoted_value unprom_mult[2];
+  tree new_type;
+  unsigned int nops
+    = vect_widened_op_tree (mulh_stmt_info, MULT_EXPR, WIDEN_MULT_EXPR,
+			    false, 2, unprom_mult, &new_type);
+  if (nops != 2)
+    return NULL;
+
+  vect_pattern_detected ("vect_recog_mulhs_pattern", last_stmt);
+
+  /* Adjust output precision.  */
+  if (TYPE_PRECISION (new_type) < target_precision)
+    new_type = build_nonstandard_integer_type
+      (target_precision, TYPE_UNSIGNED (new_type));
+
+  /* Check for target support.  */
+  tree new_vectype = get_vectype_for_scalar_type (new_type);
+  if (!new_vectype
+      || !direct_internal_fn_supported_p
+	    (ifn, new_vectype, OPTIMIZE_FOR_SPEED))
+    return NULL;
+
+  /* The IR requires a valid vector type for the cast result, even though
+     it's likely to be discarded.  */
+  *type_out = get_vectype_for_scalar_type (lhs_type);
+  if (!*type_out)
+    return NULL;
+
+  /* Generate the IFN_MULHRS call.  */
+  tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
+  tree new_ops[2];
+  vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
+		       unprom_mult, new_vectype);
+  gcall *mulhrs_stmt
+    = gimple_build_call_internal (ifn, 2, new_ops[0], new_ops[1]);
+  gimple_call_set_lhs (mulhrs_stmt, new_var);
+  gimple_set_location (mulhrs_stmt, gimple_location (last_stmt));
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "created pattern stmt: %G", mulhrs_stmt);
+
+  return vect_convert_output (last_stmt_info, lhs_type,
+			      mulhrs_stmt, new_vectype);
+}
+
 /* Recognize the patterns:
 
      ATYPE a;  // narrower than TYPE
@@ -4713,6 +4882,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   /* Must come after over_widening, which narrows the shift as much
      as possible beforehand.  */
   { vect_recog_average_pattern, "average" },
+  { vect_recog_mulhs_pattern, "mult_high" },
   { vect_recog_cast_forwprop_pattern, "cast_forwprop" },
   { vect_recog_widen_mult_pattern, "widen_mult" },
   { vect_recog_dot_prod_pattern, "dot_prod" },
-- 
2.30.2
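A closing note on the lowering strategy: the <su>mulh<r>s<mode>3 expander
above needs no dedicated instruction.  MULLB/MULLT multiply the even/odd
(bottom/top) element pairs into double-width lanes, and SHRNB/SHRNT shift
each wide product right by the element width minus one and narrow it back
into place.  A scalar C model of the non-rounding case for 16-bit elements
(illustrative only; the loop shape and names are assumptions, not the
generated code):

    #include <stdint.h>

    static void
    model_mulhs (int16_t *z, const int16_t *x, const int16_t *y, int n)
    {
      for (int i = 0; i + 1 < n; i += 2)
	{
	  int32_t prod_b = (int32_t) x[i] * (int32_t) y[i];	    /* MULLB */
	  int32_t prod_t = (int32_t) x[i + 1] * (int32_t) y[i + 1]; /* MULLT */
	  z[i] = (int16_t) (prod_b >> 15);			    /* SHRNB */
	  z[i + 1] = (int16_t) (prod_t >> 15);			    /* SHRNT */
	}
    }

For the rounding form only the narrowing step changes: RSHRNB/RSHRNT
compute ((prod >> 14) + 1) >> 1 per element, which is why mulhrs_1.c
expects the same shift counts (#7, #15, #31) for both variants.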