+2019-09-12  Yuliang Wang  <yuliang.wang@arm.com>
+
+ PR tree-optimization/89386
+ * config/aarch64/aarch64-sve2.md (<su>mull<bt><Vwide>)
+ (<r>shrnb<mode>, <r>shrnt<mode>): New SVE2 patterns.
+ (<su>mulh<r>s<mode>3): New pattern for MULHRS.
+ * config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT)
+ (UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT)
+ (UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS)
+ (UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs.
+ (MULLBT, SHRNB, SHRNT, MULHRS): New int iterators.
+ (su, r): Handle the unspecs above.
+ (bt): New int attribute.
+ * internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions.
+ * internal-fn.c (first_commutative_argument): Commutativity info for
+ above.
+ * optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab)
+ (umulhrs_optab): New optabs.
+ * doc/md.texi (smulhs@var{m3}, umulhs@var{m3})
+ (smulhrs@var{m3}, umulhrs@var{m3}): Documentation for the above.
+ * tree-vect-patterns.c (vect_recog_mulhs_pattern): New pattern
+ function.
+ (vect_vect_recog_func_ptrs): Add it.
+ * testsuite/gcc.target/aarch64/sve2/mulhrs_1.c: New test.
+ * testsuite/gcc.dg/vect/vect-mulhrs-1.c: As above.
+ * testsuite/gcc.dg/vect/vect-mulhrs-2.c: As above.
+ * testsuite/gcc.dg/vect/vect-mulhrs-3.c: As above.
+ * testsuite/gcc.dg/vect/vect-mulhrs-4.c: As above.
+ * doc/sourcebuild.texi (vect_mulhrs_hi): Document new target selector.
+ * testsuite/lib/target-supports.exp
+ (check_effective_target_vect_mulhrs_hi): Return true for AArch64
+ with SVE2.
+
2019-09-11 Michael Meissner <meissner@linux.ibm.com>
* config/rs6000/predicates.md (non_add_cint_operand): Simplify the
movprfx\t%0, %2\;<sur>h<addsub>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
[(set_attr "movprfx" "*,yes")]
)
+
+;; Multiply long top / bottom.
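+;; The "bottom" (B) forms operate on the even-indexed elements of the
+;; inputs and the "top" (T) forms on the odd-indexed elements; each
+;; produces results twice the width of its inputs.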
+(define_insn "<su>mull<bt><Vwide>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+ (unspec:<VWIDE> [(match_operand:SVE_BHSI 1 "register_operand" "w")
+ (match_operand:SVE_BHSI 2 "register_operand" "w")]
+ MULLBT))]
+ "TARGET_SVE2"
+ "<su>mull<bt>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
+)
+
+;; (Rounding) Right shift narrow bottom.
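+;; The narrowed results are written to the even-indexed elements of the
+;; destination; the odd-indexed elements are set to zero.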
+(define_insn "<r>shrnb<mode>"
+ [(set (match_operand:SVE_BHSI 0 "register_operand" "=w")
+ (unspec:SVE_BHSI
+ [(match_operand:<VWIDE> 1 "register_operand" "w")
+ (match_operand 2 "aarch64_simd_shift_imm_offset_<Vel>" "")]
+ SHRNB))]
+ "TARGET_SVE2"
+ "<r>shrnb\t%0.<Vetype>, %1.<Vewtype>, #%2"
+)
+
+;; (Rounding) Right shift narrow top.
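+;; The narrowed results are written to the odd-indexed elements of the
+;; destination, which is why operand 1 is tied to the output: the
+;; even-indexed elements are left unchanged.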
+(define_insn "<r>shrnt<mode>"
+ [(set (match_operand:SVE_BHSI 0 "register_operand" "=w")
+ (unspec:SVE_BHSI
+ [(match_operand:SVE_BHSI 1 "register_operand" "0")
+ (match_operand:<VWIDE> 2 "register_operand" "w")
+ (match_operand 3 "aarch64_simd_shift_imm_offset_<Vel>" "i")]
+ SHRNT))]
+ "TARGET_SVE2"
+ "<r>shrnt\t%0.<Vetype>, %2.<Vewtype>, #%3"
+)
+
+;; Unpredicated integer multiply-high-with-(round-and-)scale.
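+;; Illustrative note: per element of the narrow mode (esize bits wide)
+;; this expansion computes
+;;   mulhs:  op0 = ((wide) op1 * (wide) op2) >> (esize - 1)
+;;   mulhrs: op0 = ((((wide) op1 * (wide) op2) >> (esize - 2)) + 1) >> 1
+;; by forming the even-indexed and odd-indexed widened products with
+;; MULLB/MULLT and narrowing them back into the even and odd lanes of
+;; the result with (R)SHRNB/(R)SHRNT.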
+(define_expand "<su>mulh<r>s<mode>3"
+ [(set (match_operand:SVE_BHSI 0 "register_operand")
+ (unspec:SVE_BHSI
+ [(match_dup 3)
+ (unspec:SVE_BHSI [(match_operand:SVE_BHSI 1 "register_operand")
+ (match_operand:SVE_BHSI 2 "register_operand")]
+ MULHRS)]
+ UNSPEC_PRED_X))]
+ "TARGET_SVE2"
+ {
+ operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+
+ rtx prod_b = gen_reg_rtx (<VWIDE>mode);
+ rtx prod_t = gen_reg_rtx (<VWIDE>mode);
+ emit_insn (gen_<su>mullb<Vwide> (prod_b, operands[1], operands[2]));
+ emit_insn (gen_<su>mullt<Vwide> (prod_t, operands[1], operands[2]));
+
+ rtx shift = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1);
+ emit_insn (gen_<r>shrnb<mode> (operands[0], prod_b, shift));
+ emit_insn (gen_<r>shrnt<mode> (operands[0], operands[0], prod_t, shift));
+
+ DONE;
+ }
+)
+
UNSPEC_RSUBHN2 ; Used in aarch64-simd.md.
UNSPEC_SQDMULH ; Used in aarch64-simd.md.
UNSPEC_SQRDMULH ; Used in aarch64-simd.md.
+ UNSPEC_SMULLB ; Used in aarch64-sve2.md.
+ UNSPEC_SMULLT ; Used in aarch64-sve2.md.
+ UNSPEC_UMULLB ; Used in aarch64-sve2.md.
+ UNSPEC_UMULLT ; Used in aarch64-sve2.md.
UNSPEC_PMUL ; Used in aarch64-simd.md.
UNSPEC_FMULX ; Used in aarch64-simd.md.
UNSPEC_USQADD ; Used in aarch64-simd.md.
UNSPEC_UQSHRN ; Used in aarch64-simd.md.
UNSPEC_SQRSHRN ; Used in aarch64-simd.md.
UNSPEC_UQRSHRN ; Used in aarch64-simd.md.
+ UNSPEC_SHRNB ; Used in aarch64-sve2.md.
+ UNSPEC_SHRNT ; Used in aarch64-sve2.md.
+ UNSPEC_RSHRNB ; Used in aarch64-sve2.md.
+ UNSPEC_RSHRNT ; Used in aarch64-sve2.md.
UNSPEC_SSHL ; Used in aarch64-simd.md.
UNSPEC_USHL ; Used in aarch64-simd.md.
UNSPEC_SRSHL ; Used in aarch64-simd.md.
UNSPEC_FCMLA90 ; Used in aarch64-simd.md.
UNSPEC_FCMLA180 ; Used in aarch64-simd.md.
UNSPEC_FCMLA270 ; Used in aarch64-simd.md.
+ UNSPEC_SMULHS ; Used in aarch64-sve2.md.
+ UNSPEC_SMULHRS ; Used in aarch64-sve2.md.
+ UNSPEC_UMULHS ; Used in aarch64-sve2.md.
+ UNSPEC_UMULHRS ; Used in aarch64-sve2.md.
])
;; ------------------------------------------------------------------
(define_int_iterator RHADD [UNSPEC_SRHADD UNSPEC_URHADD])
+(define_int_iterator MULLBT [UNSPEC_SMULLB UNSPEC_UMULLB
+ UNSPEC_SMULLT UNSPEC_UMULLT])
+
+(define_int_iterator SHRNB [UNSPEC_SHRNB UNSPEC_RSHRNB])
+
+(define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT])
+
(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
(define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
(define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH])
+(define_int_iterator MULHRS [UNSPEC_SMULHS UNSPEC_UMULHS
+ UNSPEC_SMULHRS UNSPEC_UMULHRS])
+
(define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD])
(define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN])
(UNSPEC_COND_FCVTZS "s")
(UNSPEC_COND_FCVTZU "u")
(UNSPEC_COND_SCVTF "s")
- (UNSPEC_COND_UCVTF "u")])
+ (UNSPEC_COND_UCVTF "u")
+ (UNSPEC_SMULLB "s") (UNSPEC_UMULLB "u")
+ (UNSPEC_SMULLT "s") (UNSPEC_UMULLT "u")
+ (UNSPEC_SMULHS "s") (UNSPEC_UMULHS "u")
+ (UNSPEC_SMULHRS "s") (UNSPEC_UMULHRS "u")])
(define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u")
(UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur")
(UNSPEC_SQRSHRN "r") (UNSPEC_UQRSHRN "r")
(UNSPEC_SQSHL "") (UNSPEC_UQSHL "")
(UNSPEC_SQRSHL "r")(UNSPEC_UQRSHL "r")
+ (UNSPEC_SHRNB "") (UNSPEC_SHRNT "")
+ (UNSPEC_RSHRNB "r") (UNSPEC_RSHRNT "r")
+ (UNSPEC_SMULHS "") (UNSPEC_UMULHS "")
+ (UNSPEC_SMULHRS "r") (UNSPEC_UMULHRS "r")
])
(define_int_attr lr [(UNSPEC_SSLI "l") (UNSPEC_USLI "l")
(UNSPEC_SHADD "") (UNSPEC_UHADD "u")
(UNSPEC_SRHADD "") (UNSPEC_URHADD "u")])
+(define_int_attr bt [(UNSPEC_SMULLB "b") (UNSPEC_UMULLB "b")
+ (UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")])
+
(define_int_attr addsub [(UNSPEC_SHADD "add")
(UNSPEC_UHADD "add")
(UNSPEC_SRHADD "add")
operand 0. (This is used to express accumulation of elements into an accumulator
of a wider mode.)
+@cindex @code{smulhs@var{m3}} instruction pattern
+@item @samp{smulhs@var{m3}}
+@cindex @code{umulhs@var{m3}} instruction pattern
+@itemx @samp{umulhs@var{m3}}
+Signed/unsigned multiply high with scale. This is equivalent to the C code:
+@smallexample
+narrow op0, op1, op2;
+@dots{}
+op0 = (narrow) (((wide) op1 * (wide) op2) >> (N / 2 - 1));
+@end smallexample
+where the signedness of @samp{narrow} determines whether the operation
+is signed or unsigned, and @var{N} is the size of @samp{wide} in bits.
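+For example, with 16-bit @samp{narrow} elements (@var{N} equal to 32),
+this shifts each 32-bit product right by 15.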
+
+@cindex @code{smulhrs@var{m3}} instruction pattern
+@item @samp{smulhrs@var{m3}}
+@cindex @code{umulhrs@var{m3}} instruction pattern
+@itemx @samp{umulhrs@var{m3}}
+Signed/unsigned multiply high with round and scale. This is
+equivalent to the C code:
+@smallexample
+narrow op0, op1, op2;
+@dots{}
+op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 - 2)) + 1) >> 1);
+@end smallexample
+where the signedness of @samp{narrow} determines whether the operation
+is signed or unsigned, and @var{N} is the size of @samp{wide} in bits.
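+For example, with 16-bit @samp{narrow} elements (@var{N} equal to 32),
+this shifts each 32-bit product right by 14, adds 1 and shifts right by
+one more bit.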
+
@cindex @code{vec_shl_insert_@var{m}} instruction pattern
@item @samp{vec_shl_insert_@var{m}}
Shift the elements in vector input operand 1 left one element (i.e.@:
Target supports both signed and unsigned averaging operations on vectors
of bytes.
+@item vect_mulhrs_hi
+Target supports both signed and unsigned multiply-high-with-round-and-scale
+operations on vectors of half-words.
+
@item vect_condition
Target supports vector conditional operations.
case IFN_FNMS:
case IFN_AVG_FLOOR:
case IFN_AVG_CEIL:
+ case IFN_MULHS:
+ case IFN_MULHRS:
case IFN_FMIN:
case IFN_FMAX:
return 0;
DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first,
savg_ceil, uavg_ceil, binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
+ smulhs, umulhs, binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
+ smulhrs, umulhrs, binary)
+
DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary)
OPTAB_D (usum_widen_optab, "widen_usum$I$a3")
OPTAB_D (usad_optab, "usad$I$a")
OPTAB_D (ssad_optab, "ssad$I$a")
+OPTAB_D (smulhs_optab, "smulhs$a3")
+OPTAB_D (smulhrs_optab, "smulhrs$a3")
+OPTAB_D (umulhs_optab, "umulhs$a3")
+OPTAB_D (umulhrs_optab, "umulhrs$a3")
OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+#ifndef SIGNEDNESS
+#define SIGNEDNESS signed
+#endif
+#ifndef BIAS
+#define BIAS 0
+#endif
+
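+/* With BIAS == 0 this reduces to (x) >> 15 (multiply high with scale,
+   IFN_MULHS); with BIAS == 1 it is (((x) >> 14) + 1) >> 1 (the rounding
+   form, IFN_MULHRS).  */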
+#define HRS(x) ((((x) >> (15 - BIAS)) + BIAS) >> BIAS)
+
+void __attribute__ ((noipa))
+f (SIGNEDNESS short *restrict a, SIGNEDNESS short *restrict b,
+ SIGNEDNESS short *restrict c, __INTPTR_TYPE__ n)
+{
+ for (__INTPTR_TYPE__ i = 0; i < n; ++i)
+ a[i] = HRS((SIGNEDNESS int) b[i] * (SIGNEDNESS int) c[i]);
+}
+
+#define N 50
+#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4)
+#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26)
+#define CONST1 0x01AB
+#define CONST2 0x01CD
+
+int
+main (void)
+{
+ check_vect ();
+
+ SIGNEDNESS short a[N], b[N], c[N];
+ for (int i = 0; i < N; ++i)
+ {
+ b[i] = BASE1 + i * CONST1;
+ c[i] = BASE2 + i * CONST2;
+ asm volatile ("" ::: "memory");
+ }
+ f (a, b, c, N);
+ for (int i = 0; i < N; ++i)
+ if (a[i] != HRS(BASE1 * BASE2 + i * i * (CONST1 * CONST2)
+ + i * (BASE1 * CONST2 + BASE2 * CONST1)))
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c
+/* { dg-require-effective-target vect_int } */
+
+#define SIGNEDNESS unsigned
+
+#include "vect-mulhrs-1.c"
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c
+/* { dg-require-effective-target vect_int } */
+
+#define BIAS 1
+
+#include "vect-mulhrs-1.c"
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c
+/* { dg-require-effective-target vect_int } */
+
+#define SIGNEDNESS unsigned
+#define BIAS 1
+
+#include "vect-mulhrs-1.c"
+
+/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/mulhrs_1.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
+
+#include <stdint.h>
+
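+/* With RND == 0 the expression below reduces to
+   ((BIGGER) x * (BIGGER) y) >> (sizeof (BIGGER) * 8 / 2 - 1), i.e.
+   multiply high with scale; with RND == 1 it is the rounding form
+   matched by the smulhrs/umulhrs optabs.  */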
+#define MULTHI(TYPE, BIGGER, RND) \
+void __attribute__ ((noinline, noclone)) \
+mulhs_##TYPE##_##RND (TYPE *restrict x, \
+ TYPE *restrict y, TYPE *restrict z, int n) \
+{ \
+ for (int i = 0; i < n; i++) \
+ { \
+ z[i] = ((((BIGGER)x[i] * (BIGGER)y[i]) >> \
+ (sizeof(BIGGER)*8/2-2)) + RND) >> 1; \
+ } \
+}
+
+MULTHI (int8_t, int16_t, 0)
+MULTHI (int16_t, int32_t, 0)
+MULTHI (int32_t, int64_t, 0)
+
+MULTHI (uint8_t, uint16_t, 0)
+MULTHI (uint16_t, uint32_t, 0)
+MULTHI (uint32_t, uint64_t, 0)
+
+MULTHI (int8_t, int16_t, 1)
+MULTHI (int16_t, int32_t, 1)
+MULTHI (int32_t, int64_t, 1)
+
+MULTHI (uint8_t, uint16_t, 1)
+MULTHI (uint16_t, uint32_t, 1)
+MULTHI (uint32_t, uint64_t, 1)
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 12 "vect" } } */
+
+/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
+
&& ![check_effective_target_aarch64_sve1_only] }]
}
+# Return 1 if the target plus current options supports both signed
+# and unsigned multiply-high-with-round-and-scale operations
+# on vectors of half-words.
+
+proc check_effective_target_vect_mulhrs_hi {} {
+ return [expr { [istarget aarch64*-*-*]
+ && [check_effective_target_aarch64_sve2] }]
+}
+
# Return 1 if the target plus current options supports a vector
# demotion (packing) of shorts (to chars) and ints (to shorts)
# using modulo arithmetic, 0 otherwise.
return pattern_stmt;
}
+/* Recognize the following patterns:
+
+ ATYPE a; // narrower than TYPE
+ BTYPE b; // narrower than TYPE
+
+ 1) Multiply high with scaling
+ TYPE res = ((TYPE) a * (TYPE) b) >> c;
+ 2) ... or also with rounding
+     TYPE res = ((((TYPE) a * (TYPE) b) >> d) + 1) >> 1;
+
+ where only the bottom half of res is used. */
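+
+/* For example (illustrative only), with ATYPE = BTYPE = short and
+   TYPE = int, pattern 2 corresponds to source such as
+
+     short res = ((((int) a * (int) b) >> 14) + 1) >> 1;
+
+   which is replaced by a call to IFN_MULHRS on the short inputs.  */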
+
+static gimple *
+vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out)
+{
+ /* Check for a right shift. */
+ gassign *last_stmt = dyn_cast <gassign *> (last_stmt_info->stmt);
+ if (!last_stmt
+ || gimple_assign_rhs_code (last_stmt) != RSHIFT_EXPR)
+ return NULL;
+ vec_info *vinfo = last_stmt_info->vinfo;
+
+ /* Check that the shift result is wider than the users of the
+ result need (i.e. that narrowing would be a natural choice). */
+ tree lhs_type = TREE_TYPE (gimple_assign_lhs (last_stmt));
+ unsigned int target_precision
+ = vect_element_precision (last_stmt_info->min_output_precision);
+ if (!INTEGRAL_TYPE_P (lhs_type)
+ || target_precision >= TYPE_PRECISION (lhs_type))
+ return NULL;
+
+ /* Look through any change in sign on the outer shift input. */
+ vect_unpromoted_value unprom_rshift_input;
+ tree rshift_input = vect_look_through_possible_promotion
+ (vinfo, gimple_assign_rhs1 (last_stmt), &unprom_rshift_input);
+ if (!rshift_input
+ || TYPE_PRECISION (TREE_TYPE (rshift_input))
+ != TYPE_PRECISION (lhs_type))
+ return NULL;
+
+ /* Get the definition of the shift input. */
+ stmt_vec_info rshift_input_stmt_info
+ = vect_get_internal_def (vinfo, rshift_input);
+ if (!rshift_input_stmt_info)
+ return NULL;
+ gassign *rshift_input_stmt
+ = dyn_cast <gassign *> (rshift_input_stmt_info->stmt);
+ if (!rshift_input_stmt)
+ return NULL;
+
+ stmt_vec_info mulh_stmt_info;
+ tree scale_term;
+ internal_fn ifn;
+ unsigned int expect_offset;
+
+ /* Check for the presence of the rounding term. */
+ if (gimple_assign_rhs_code (rshift_input_stmt) == PLUS_EXPR)
+ {
+ /* Check that the outer shift was by 1. */
+ if (!integer_onep (gimple_assign_rhs2 (last_stmt)))
+ return NULL;
+
+ /* Check that the second operand of the PLUS_EXPR is 1. */
+ if (!integer_onep (gimple_assign_rhs2 (rshift_input_stmt)))
+ return NULL;
+
+ /* Look through any change in sign on the addition input. */
+ vect_unpromoted_value unprom_plus_input;
+ tree plus_input = vect_look_through_possible_promotion
+ (vinfo, gimple_assign_rhs1 (rshift_input_stmt), &unprom_plus_input);
+ if (!plus_input
+ || TYPE_PRECISION (TREE_TYPE (plus_input))
+ != TYPE_PRECISION (TREE_TYPE (rshift_input)))
+ return NULL;
+
+ /* Get the definition of the multiply-high-scale part. */
+ stmt_vec_info plus_input_stmt_info
+ = vect_get_internal_def (vinfo, plus_input);
+ if (!plus_input_stmt_info)
+ return NULL;
+ gassign *plus_input_stmt
+ = dyn_cast <gassign *> (plus_input_stmt_info->stmt);
+ if (!plus_input_stmt
+ || gimple_assign_rhs_code (plus_input_stmt) != RSHIFT_EXPR)
+ return NULL;
+
+ /* Look through any change in sign on the scaling input. */
+ vect_unpromoted_value unprom_scale_input;
+ tree scale_input = vect_look_through_possible_promotion
+ (vinfo, gimple_assign_rhs1 (plus_input_stmt), &unprom_scale_input);
+ if (!scale_input
+ || TYPE_PRECISION (TREE_TYPE (scale_input))
+ != TYPE_PRECISION (TREE_TYPE (plus_input)))
+ return NULL;
+
+ /* Get the definition of the multiply-high part. */
+ mulh_stmt_info = vect_get_internal_def (vinfo, scale_input);
+ if (!mulh_stmt_info)
+ return NULL;
+
+ /* Get the scaling term. */
+ scale_term = gimple_assign_rhs2 (plus_input_stmt);
+
+ expect_offset = target_precision + 2;
+ ifn = IFN_MULHRS;
+ }
+ else
+ {
+ mulh_stmt_info = rshift_input_stmt_info;
+ scale_term = gimple_assign_rhs2 (last_stmt);
+
+ expect_offset = target_precision + 1;
+ ifn = IFN_MULHS;
+ }
+
+ /* Check that the scaling factor is correct. */
+ if (TREE_CODE (scale_term) != INTEGER_CST
+ || wi::to_widest (scale_term) + expect_offset
+ != TYPE_PRECISION (lhs_type))
+ return NULL;
+
+ /* Check whether the scaling input term can be seen as two widened
+ inputs multiplied together. */
+ vect_unpromoted_value unprom_mult[2];
+ tree new_type;
+ unsigned int nops
+ = vect_widened_op_tree (mulh_stmt_info, MULT_EXPR, WIDEN_MULT_EXPR,
+ false, 2, unprom_mult, &new_type);
+ if (nops != 2)
+ return NULL;
+
+ vect_pattern_detected ("vect_recog_mulhs_pattern", last_stmt);
+
+ /* Adjust output precision. */
+ if (TYPE_PRECISION (new_type) < target_precision)
+ new_type = build_nonstandard_integer_type
+ (target_precision, TYPE_UNSIGNED (new_type));
+
+ /* Check for target support. */
+ tree new_vectype = get_vectype_for_scalar_type (new_type);
+ if (!new_vectype
+ || !direct_internal_fn_supported_p
+ (ifn, new_vectype, OPTIMIZE_FOR_SPEED))
+ return NULL;
+
+ /* The IR requires a valid vector type for the cast result, even though
+ it's likely to be discarded. */
+ *type_out = get_vectype_for_scalar_type (lhs_type);
+ if (!*type_out)
+ return NULL;
+
+ /* Generate the IFN_MULHRS call. */
+ tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
+ tree new_ops[2];
+ vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
+ unprom_mult, new_vectype);
+ gcall *mulhrs_stmt
+ = gimple_build_call_internal (ifn, 2, new_ops[0], new_ops[1]);
+ gimple_call_set_lhs (mulhrs_stmt, new_var);
+ gimple_set_location (mulhrs_stmt, gimple_location (last_stmt));
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "created pattern stmt: %G", mulhrs_stmt);
+
+ return vect_convert_output (last_stmt_info, lhs_type,
+ mulhrs_stmt, new_vectype);
+}
+
/* Recognize the patterns:
ATYPE a; // narrower than TYPE
/* Must come after over_widening, which narrows the shift as much as
possible beforehand. */
{ vect_recog_average_pattern, "average" },
+ { vect_recog_mulhs_pattern, "mult_high" },
{ vect_recog_cast_forwprop_pattern, "cast_forwprop" },
{ vect_recog_widen_mult_pattern, "widen_mult" },
{ vect_recog_dot_prod_pattern, "dot_prod" },