+2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
+ Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
+
+ * config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
+ (aarch64_prepare_sve_cond_int_fma): Declare.
+ * config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
+ (aarch64_prepare_sve_int_fma): New functions.
+ (aarch64_prepare_sve_cond_int_fma): Likewise.
+ * config/aarch64/aarch64-sve.md
+ (cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
+ (fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
+ (*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
+ (cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
+ (*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
+ (*madd<mode>): Rename to...
+ (*fma<mode>4): ...this.
+ (*msub<mode>3): Rename to...
+ (*fnma<mode>4): ...this.
+
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
+
+bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
+bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
#endif /* RTX_CODE */
void aarch64_init_builtins (void);
)
;; Predicated integer operations with merging.
-(define_expand "cond_<optab><mode>"
+(define_expand "@cond_<optab><mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand")
;; - MLA
;; -------------------------------------------------------------------------
+;; Unpredicated integer addition of product.
+(define_expand "fma<mode>4"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (plus:SVE_I
+ (unspec:SVE_I
+ [(match_dup 4)
+ (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+ (match_operand:SVE_I 2 "nonmemory_operand"))]
+ UNSPEC_PRED_X)
+ (match_operand:SVE_I 3 "register_operand")))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_int_fma (operands, PLUS))
+ DONE;
+ operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ }
+)
+
;; Predicated integer addition of product.
-(define_insn "*madd<mode>"
+(define_insn "*fma<mode>4"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
(plus:SVE_I
(unspec:SVE_I
[(set_attr "movprfx" "*,*,yes")]
)
+;; Predicated integer addition of product with merging.
+(define_expand "cond_fma<mode>"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+ (match_operand:SVE_I 3 "general_operand"))
+ (match_operand:SVE_I 4 "register_operand"))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
+ DONE;
+ /* Swap the multiplication operands if the fallback value is the
+ second of the two. */
+ if (rtx_equal_p (operands[3], operands[5]))
+ std::swap (operands[2], operands[3]);
+ }
+)
+
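Why the swap helps, as a sketch (not part of the patch; cond_mla_1.c in the
testsuite changes below exercises exactly this shape): when the else value of
the conditional multiply-add matches one of the multiplication inputs,
normalising it into operand 2 lets the *cond_fma<mode>_2 pattern below tie it
to the destination and emit a single MAD rather than an MLA plus a select.
The cond_fnma<mode> expander later in the file does the same for MSB.

#include <stdint.h>

/* Illustrative loop only (mirrors cond_mla_1.c): the else value b[i] is
   also a multiplication input, so once it is (or has been swapped into)
   operand 2, *cond_fma<mode>_2 can emit "mad" with no movprfx or sel.  */
void
g (uint32_t *restrict r, uint32_t *restrict a, uint32_t *restrict b,
   uint32_t c, uint32_t *restrict pred, int n)
{
  for (int i = 0; i < n; ++i)
    r[i] = pred[i] != 1 ? a[i] + b[i] * c : b[i];
}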
+;; Predicated integer addition of product, merging with the first input.
+(define_insn "*cond_fma<mode>_2"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w"))
+ (match_operand:SVE_I 4 "register_operand" "w, w"))
+ (match_dup 2)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with the third input.
+(define_insn "*cond_fma<mode>_4"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w"))
+ (match_operand:SVE_I 4 "register_operand" "0, w"))
+ (match_dup 4)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with an independent value.
+(define_insn_and_rewrite "*cond_fma<mode>_any"
+ [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w"))
+ (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && !rtx_equal_p (operands[2], operands[5])
+ && !rtx_equal_p (operands[3], operands[5])
+ && !rtx_equal_p (operands[4], operands[5])"
+ "@
+ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ #"
+ "&& reload_completed
+ && register_operand (operands[5], <MODE>mode)
+ && !rtx_equal_p (operands[0], operands[5])"
+ {
+ emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+ operands[5], operands[1]));
+ operands[5] = operands[4] = operands[0];
+ }
+ [(set_attr "movprfx" "yes")]
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] MLS and MSB
;; -------------------------------------------------------------------------
;; - MSB
;; -------------------------------------------------------------------------
+;; Unpredicated integer subtraction of product.
+(define_expand "fnma<mode>4"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (minus:SVE_I
+ (match_operand:SVE_I 3 "register_operand")
+ (unspec:SVE_I
+ [(match_dup 4)
+ (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+ (match_operand:SVE_I 2 "general_operand"))]
+ UNSPEC_PRED_X)))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_int_fma (operands, MINUS))
+ DONE;
+ operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ }
+)
+
;; Predicated integer subtraction of product.
-(define_insn "*msub<mode>3"
+(define_insn "*fnma<mode>3"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand" "w, 0, w")
[(set_attr "movprfx" "*,*,yes")]
)
+;; Predicated integer subtraction of product with merging.
+(define_expand "cond_fnma<mode>"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+ (match_operand:SVE_I 3 "general_operand")))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
+ DONE;
+ /* Swap the multiplication operands if the fallback value is the
+ second of the two. */
+ if (rtx_equal_p (operands[3], operands[5]))
+ std::swap (operands[2], operands[3]);
+ }
+)
+
+;; Predicated integer subtraction of product, merging with the first input.
+(define_insn "*cond_fnma<mode>_2"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand" "w, w")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w")))
+ (match_dup 2)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with the third input.
+(define_insn "*cond_fnma<mode>_4"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand" "0, w")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w")))
+ (match_dup 4)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with an
+;; independent value.
+(define_insn_and_rewrite "*cond_fnma<mode>_any"
+ [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w")))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && !rtx_equal_p (operands[2], operands[5])
+ && !rtx_equal_p (operands[3], operands[5])
+ && !rtx_equal_p (operands[4], operands[5])"
+ "@
+ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ #"
+ "&& reload_completed
+ && register_operand (operands[5], <MODE>mode)
+ && !rtx_equal_p (operands[0], operands[5])"
+ {
+ emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+ operands[5], operands[1]));
+ operands[5] = operands[4] = operands[0];
+ }
+ [(set_attr "movprfx" "yes")]
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] Dot product
;; -------------------------------------------------------------------------
aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
}
+/* Check whether VALUE is a vector constant in which every element
+ is either a power of 2 or a negated power of 2. If so, return
+ a constant vector of log2s, and flip CODE between PLUS and MINUS
+ if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
+
+static rtx
+aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
+{
+ if (GET_CODE (value) != CONST_VECTOR)
+ return NULL_RTX;
+
+ rtx_vector_builder builder;
+ if (!builder.new_unary_operation (GET_MODE (value), value, false))
+ return NULL_RTX;
+
+ scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
+ /* 1 if the result of the multiplication must be negated,
+ 0 if it mustn't, or -1 if we don't yet care. */
+ int negate = -1;
+ unsigned int encoded_nelts = const_vector_encoded_nelts (value);
+ for (unsigned int i = 0; i < encoded_nelts; ++i)
+ {
+ rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
+ if (!CONST_SCALAR_INT_P (elt))
+ return NULL_RTX;
+ rtx_mode_t val (elt, int_mode);
+ wide_int pow2 = wi::neg (val);
+ if (val != pow2)
+ {
+ /* It matters whether we negate or not. Make that choice,
+ and make sure that it's consistent with previous elements. */
+ if (negate == !wi::neg_p (val))
+ return NULL_RTX;
+ negate = wi::neg_p (val);
+ if (!negate)
+ pow2 = val;
+ }
+ /* POW2 is now the value that we want to be a power of 2. */
+ int shift = wi::exact_log2 (pow2);
+ if (shift < 0)
+ return NULL_RTX;
+ builder.quick_push (gen_int_mode (shift, int_mode));
+ }
+ if (negate == -1)
+ /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
+ code = PLUS;
+ else if (negate == 1)
+ code = code == PLUS ? MINUS : PLUS;
+ return builder.build ();
+}
+
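A minimal standalone sketch of the per-element decision above, specialised to
8-bit elements; it omits the cross-element consistency check and the
rtx_vector_builder plumbing, and models wi::neg, wi::neg_p and wi::exact_log2
with plain unsigned char arithmetic:

#include <stdio.h>

/* Illustrative model only, not part of the patch.  NEGATE uses the same
   tri-state as aarch64_convert_mult_to_shift: -1 = don't care yet,
   0 = keep CODE, 1 = flip PLUS<->MINUS.  Returns the shift amount, or -1
   if VAL is neither a power of 2 nor a negated power of 2.  */
static int
element_to_shift (unsigned char val, int *negate)
{
  unsigned char pow2 = (unsigned char) -val;	/* wi::neg (val) */
  if (val != pow2)
    {
      /* Negating matters, so record the choice.  */
      *negate = (signed char) val < 0;		/* wi::neg_p (val) */
      if (!*negate)
	pow2 = val;
    }
  /* POW2 must have exactly one bit set.  */
  if (pow2 == 0 || (pow2 & (pow2 - 1)) != 0)
    return -1;
  return __builtin_ctz (pow2);			/* wi::exact_log2 (pow2) */
}

int
main (void)
{
  int negate, shift;

  negate = -1;
  shift = element_to_shift (4, &negate);	/* shift 2, negate 0 */
  printf ("4 -> shift %d, negate %d\n", shift, negate);

  negate = -1;
  shift = element_to_shift (-4, &negate);	/* shift 2, negate 1 */
  printf ("-4 -> shift %d, negate %d\n", shift, negate);

  /* 0x80 is its own negation in 8 bits, so NEGATE stays -1 and PLUS and
     MINUS are interchangeable; the real function then canonicalizes CODE
     on PLUS.  */
  negate = -1;
  shift = element_to_shift (0x80, &negate);	/* shift 7, negate -1 */
  printf ("0x80 -> shift %d, negate %d\n", shift, negate);
  return 0;
}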
+/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
+ CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
+ operands array, in the same order as for fma_optab. Return true if
+ the function emitted all the necessary instructions, false if the caller
+ should generate the pattern normally with the new OPERANDS array. */
+
+bool
+aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
+{
+ machine_mode mode = GET_MODE (operands[0]);
+ if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
+ {
+ rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
+ NULL_RTX, true, OPTAB_DIRECT);
+ force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
+ operands[3], product, operands[0], true,
+ OPTAB_DIRECT);
+ return true;
+ }
+ operands[2] = force_reg (mode, operands[2]);
+ return false;
+}
+
+/* Likewise, but for a conditional pattern. */
+
+bool
+aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
+{
+ machine_mode mode = GET_MODE (operands[0]);
+ if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
+ {
+ rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
+ NULL_RTX, true, OPTAB_DIRECT);
+ emit_insn (gen_cond (code, mode, operands[0], operands[1],
+ operands[4], product, operands[5]));
+ return true;
+ }
+ operands[3] = force_reg (mode, operands[3]);
+ return false;
+}
+
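A source-level sketch of what the two prepare routines buy when the
multiplier is a vector of powers of 2 (illustrative only; the real coverage
is in cond_mla_7.c and cond_mla_8.c below): the multiplication is
strength-reduced to an unpredicated shift, so no MUL or MLA is needed.

#include <stdint.h>

/* Mirrors the uint32_t/add/4 case from cond_mla_7.c.  b[i] * 4 becomes
   b[i] << 2 via aarch64_convert_mult_to_shift, so the expected SVE code is
   an unpredicated LSL feeding a predicated ADD (see the lsl and add scan
   directives in that test), with no movprfx or sel.  */
void
f (uint32_t *restrict r, uint32_t *restrict a, uint32_t *restrict b,
   uint32_t *restrict pred, int n)
{
  for (int i = 0; i < n; ++i)
    r[i] = pred[i] != 1 ? a[i] + b[i] * 4 : a[i];
}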
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
+2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
+ Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
+
+ * gcc.target/aarch64/sve/cond_mla_1.c: New test.
+ * gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.
+
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i]; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_1.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_2.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = (pred[i] != 1 \
+ ? a[i] OP b[i] * (TYPE) FACTOR \
+ : (TYPE) FACTOR); \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i]; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_3.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i]; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_4.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = (pred[i] == 1 \
+ ? a[i] OP b[i] * (TYPE) FACTOR \
+ : pred[i]); \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] ? a[i] OP b[i] * c : 0; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_5.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] ? a[i] OP b[i] * c : 5; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_6.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 5; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \
+ }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+ T (TYPE, add, +, CONST) \
+ T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+ TEST_COUNT (T, TYPE, 2) \
+ TEST_COUNT (T, TYPE, 4) \
+ TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t, 0x80) \
+ TEST_TYPE (T, uint16_t, 0x8000) \
+ TEST_TYPE (T, uint32_t, 0x80000000) \
+ TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_7.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \
+ }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+ T (TYPE, add, +, CONST) \
+ T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+ TEST_COUNT (T, TYPE, 2) \
+ TEST_COUNT (T, TYPE, 4) \
+ TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t, 0x80) \
+ TEST_TYPE (T, uint16_t, 0x8000) \
+ TEST_TYPE (T, uint32_t, 0x80000000) \
+ TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_8.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}