[AArch64] Use SVE MLA, MLS, MAD and MSB for conditional arithmetic
authorRichard Sandiford <richard.sandiford@arm.com>
Thu, 15 Aug 2019 08:22:07 +0000 (08:22 +0000)
committerRichard Sandiford <rsandifo@gcc.gnu.org>
Thu, 15 Aug 2019 08:22:07 +0000 (08:22 +0000)
This patch uses predicated MLA, MLS, MAD and MSB to implement
conditional "FMA"s on integers.  This also requires providing
the unpredicated optabs (fma and fnma) since otherwise
tree-ssa-math-opts.c won't try to use the conditional forms.

We still want to use shifts and adds in preference to multiplications,
so the patch makes the optab expanders check for that.

The tests cover floating-point types too, which are already handled,
and which were already tested to some extent by gcc.dg/vect.

2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

gcc/
* config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
(aarch64_prepare_sve_cond_int_fma): Declare.
* config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
(aarch64_prepare_sve_int_fma): New functions.
(aarch64_prepare_sve_cond_int_fma): Likewise.
* config/aarch64/aarch64-sve.md
(cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
(fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
(*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
(cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
(*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
(*madd<mode>): Rename to...
(*fma<mode>4): ...this.
(*msub<mode>): Rename to...
(*fnma<mode>4): ...this.

gcc/testsuite/
* gcc.target/aarch64/sve/cond_mla_1.c: New test.
* gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
* gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.

Co-Authored-By: Kugan Vivekanandarajah <kuganv@linaro.org>
From-SVN: r274509

21 files changed:
gcc/ChangeLog
gcc/config/aarch64/aarch64-protos.h
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64.c
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c [new file with mode: 0644]

index 52ab8e5d3702811a13a6764046345ed6e150bac8..66631f6b37c141608362e9ab52122f524aa8aec7 100644 (file)
@@ -1,3 +1,22 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+           Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+       * config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
+       (aarch64_prepare_sve_cond_int_fma): Declare.
+       * config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
+       (aarch64_prepare_sve_int_fma): New functions.
+       (aarch64_prepare_sve_cond_int_fma): Likewise.
+       * config/aarch64/aarch64-sve.md
+       (cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
+       (fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
+       (*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
+       (cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
+       (*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
+       (*madd<mode>): Rename to...
+       (*fma<mode>4): ...this.
+       (*msub<mode>): Rename to...
+       (*fnma<mode>4): ...this.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
            Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
 
index 858c4500406873cb1a0c0f619c5854e61c9ab5f6..24e724004f833e16ba2e3dc4f06bba1069cf8f98 100644 (file)
@@ -630,6 +630,9 @@ bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
 void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
 bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
 void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
+
+bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
+bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
 #endif /* RTX_CODE */
 
 void aarch64_init_builtins (void);
index d43ce521a799bf3385f0ca1ffd14599b18c27be3..93f55360d69652e1ca434682e92440c014ce84fe 100644 (file)
 )
 
 ;; Predicated integer operations with merging.
-(define_expand "cond_<optab><mode>"
+(define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_I 0 "register_operand")
        (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand")
 ;; - MLA
 ;; -------------------------------------------------------------------------
 
+;; Unpredicated integer addition of product.
+(define_expand "fma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (plus:SVE_I
+         (unspec:SVE_I
+           [(match_dup 4)
+            (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+                        (match_operand:SVE_I 2 "nonmemory_operand"))]
+           UNSPEC_PRED_X)
+         (match_operand:SVE_I 3 "register_operand")))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, PLUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer addition of product.
-(define_insn "*madd<mode>"
+(define_insn "*fma<mode>4"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
        (plus:SVE_I
          (unspec:SVE_I
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Predicated integer addition of product with merging.
+(define_expand "cond_fma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+                        (match_operand:SVE_I 3 "general_operand"))
+            (match_operand:SVE_I 4 "register_operand"))
+          (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer addition of product, merging with the first input.
+(define_insn "*cond_fma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w"))
+            (match_operand:SVE_I 4 "register_operand" "w, w"))
+          (match_dup 2)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with the third input.
+(define_insn "*cond_fma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w"))
+            (match_operand:SVE_I 4 "register_operand" "0, w"))
+          (match_dup 4)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with an independent value.
+(define_insn_and_rewrite "*cond_fma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+          (plus:SVE_I
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w"))
+            (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
+          (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+         UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+                                            operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] MLS and MSB
 ;; -------------------------------------------------------------------------
 ;; - MSB
 ;; -------------------------------------------------------------------------
 
+;; Unpredicated integer subtraction of product.
+(define_expand "fnma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (minus:SVE_I
+         (match_operand:SVE_I 3 "register_operand")
+         (unspec:SVE_I
+           [(match_dup 4)
+            (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+                        (match_operand:SVE_I 2 "general_operand"))]
+           UNSPEC_PRED_X)))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, MINUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer subtraction of product.
-(define_insn "*msub<mode>3"
+(define_insn "*fnma<mode>3"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
        (minus:SVE_I
          (match_operand:SVE_I 4 "register_operand" "w, 0, w")
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Predicated integer subtraction of product with merging.
+(define_expand "cond_fnma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+   (unspec:SVE_I
+       [(match_operand:<VPRED> 1 "register_operand")
+        (minus:SVE_I
+          (match_operand:SVE_I 4 "register_operand")
+          (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+                      (match_operand:SVE_I 3 "general_operand")))
+        (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+       UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer subtraction of product, merging with the first input.
+(define_insn "*cond_fnma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (minus:SVE_I
+            (match_operand:SVE_I 4 "register_operand" "w, w")
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w")))
+          (match_dup 2)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with the third input.
+(define_insn "*cond_fnma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+          (minus:SVE_I
+            (match_operand:SVE_I 4 "register_operand" "0, w")
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w")))
+          (match_dup 4)]
+         UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with an
+;; independent value.
+(define_insn_and_rewrite "*cond_fnma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+          (minus:SVE_I
+            (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
+            (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+                        (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w")))
+          (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+         UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+                                            operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Dot product
 ;; -------------------------------------------------------------------------
index 81a267bde54372d999f4851d537d6885f9abd4ce..ec787727aa69290ea6202999b99b8658cfdb9f2f 100644 (file)
@@ -16469,6 +16469,98 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
 }
 
+/* Check whether VALUE is a vector constant in which every element
+   is either a power of 2 or a negated power of 2.  If so, return
+   a constant vector of log2s, and flip CODE between PLUS and MINUS
+   if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
+
+static rtx
+aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
+{
+  if (GET_CODE (value) != CONST_VECTOR)
+    return NULL_RTX;
+
+  rtx_vector_builder builder;
+  if (!builder.new_unary_operation (GET_MODE (value), value, false))
+    return NULL_RTX;
+
+  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
+  /* 1 if the result of the multiplication must be negated,
+     0 if it mustn't, or -1 if we don't yet care.  */
+  int negate = -1;
+  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
+      if (!CONST_SCALAR_INT_P (elt))
+       return NULL_RTX;
+      rtx_mode_t val (elt, int_mode);
+      wide_int pow2 = wi::neg (val);
+      if (val != pow2)
+       {
+         /* It matters whether we negate or not.  Make that choice,
+            and make sure that it's consistent with previous elements.  */
+         if (negate == !wi::neg_p (val))
+           return NULL_RTX;
+         negate = wi::neg_p (val);
+         if (!negate)
+           pow2 = val;
+       }
+      /* POW2 is now the value that we want to be a power of 2.  */
+      int shift = wi::exact_log2 (pow2);
+      if (shift < 0)
+       return NULL_RTX;
+      builder.quick_push (gen_int_mode (shift, int_mode));
+    }
+  if (negate == -1)
+    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
+    code = PLUS;
+  else if (negate == 1)
+    code = code == PLUS ? MINUS : PLUS;
+  return builder.build ();
+}
+
+/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
+   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
+   operands array, in the same order as for fma_optab.  Return true if
+   the function emitted all the necessary instructions, false if the caller
+   should generate the pattern normally with the new OPERANDS array.  */
+
+bool
+aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
+                                 NULL_RTX, true, OPTAB_DIRECT);
+      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
+                         operands[3], product, operands[0], true,
+                         OPTAB_DIRECT);
+      return true;
+    }
+  operands[2] = force_reg (mode, operands[2]);
+  return false;
+}
+
+/* Likewise, but for a conditional pattern.  */
+
+bool
+aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
+                                 NULL_RTX, true, OPTAB_DIRECT);
+      emit_insn (gen_cond (code, mode, operands[0], operands[1],
+                          operands[4], product, operands[5]));
+      return true;
+    }
+  operands[3] = force_reg (mode, operands[3]);
+  return false;
+}
+
 static unsigned HOST_WIDE_INT
 aarch64_shift_truncation_mask (machine_mode mode)
 {
index aad8e04c32ca396b7f682368469a58798ca212be..63596d8497e4b5da42de8cca81190d3de8352135 100644 (file)
@@ -1,3 +1,23 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+           Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+       * gcc.target/aarch64/sve/cond_mla_1.c: New test.
+       * gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
+       * gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
            Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c
new file mode 100644 (file)
index 0000000..cb01d50
--- /dev/null
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i];   \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c
new file mode 100644 (file)
index 0000000..bcfc622
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_1.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c
new file mode 100644 (file)
index 0000000..b6ea1a3
--- /dev/null
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c;      \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c
new file mode 100644 (file)
index 0000000..79998b8
--- /dev/null
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_2.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected = (pred[i] != 1                           \
+                        ? a[i] OP b[i] * (TYPE) FACTOR         \
+                        : (TYPE) FACTOR);                      \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c
new file mode 100644 (file)
index 0000000..085fccf
--- /dev/null
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i];   \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c
new file mode 100644 (file)
index 0000000..cbd1185
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_3.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i]; \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c
new file mode 100644 (file)
index 0000000..ed9f73e
--- /dev/null
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i];        \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c
new file mode 100644 (file)
index 0000000..5e07859
--- /dev/null
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_4.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3;                                        \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected = (pred[i] == 1                           \
+                        ? a[i] OP b[i] * (TYPE) FACTOR         \
+                        : pred[i]);                            \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c
new file mode 100644 (file)
index 0000000..e71f2b0
--- /dev/null
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] ? a[i] OP b[i] * c : 0;           \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c
new file mode 100644 (file)
index 0000000..9de46e3
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_5.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0;         \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c
new file mode 100644 (file)
index 0000000..832bdb3
--- /dev/null
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)                       \
+  void __attribute__ ((noipa))                         \
+  test_##TYPE##_##NAME (TYPE *__restrict r,            \
+                       TYPE *__restrict a,             \
+                       TYPE *__restrict b, TYPE c,     \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      r[i] = pred[i] ? a[i] OP b[i] * c : 5;           \
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c
new file mode 100644 (file)
index 0000000..59f57a2
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_6.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)                              \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);           \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 5; \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c
new file mode 100644 (file)
index 0000000..5561f42
--- /dev/null
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST)                                \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r,          \
+                                 TYPE *__restrict a,           \
+                                 TYPE *__restrict b,           \
+                                 TYPE *__restrict pred, int n) \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];       \
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+  T (TYPE, add, +, CONST) \
+  T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c
new file mode 100644 (file)
index 0000000..b094f40
--- /dev/null
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_7.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST)                       \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);         \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];         \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c
new file mode 100644 (file)
index 0000000..d554927
--- /dev/null
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST)                                \
+  void __attribute__ ((noipa))                                 \
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r,          \
+                                 TYPE *__restrict a,           \
+                                 TYPE *__restrict b,           \
+                                 TYPE *__restrict pred, int n) \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];      \
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+  T (TYPE, add, +, CONST) \
+  T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c
new file mode 100644 (file)
index 0000000..7fb58aa
--- /dev/null
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_8.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST)                       \
+  {                                                            \
+    TYPE r[N], a[N], b[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       a[i] = (i & 1 ? i : 3 * i);                             \
+       b[i] = (i >> 4) << (i & 15);                            \
+       pred[i] = i % 3 < i % 5;                                \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);         \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected                                           \
+         = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];        \
+       if (r[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}