From: Richard Sandiford <richard.sandiford@arm.com>
Date: Thu, 15 Aug 2019 08:22:07 +0000 (+0000)
Subject: [AArch64] Use SVE MLA, MLS, MAD and MSB for conditional arithmetic
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b6c3aea1892c148c21f8b87668f344b2397f4aa5;p=gcc.git

[AArch64] Use SVE MLA, MLS, MAD and MSB for conditional arithmetic

This patch uses predicated MLA, MLS, MAD and MSB to implement
conditional "FMA"s on integers.  This also requires providing
the unpredicated optabs (fma and fnma) since otherwise
tree-ssa-math-opts.c won't try to use the conditional forms.

We still want to use shifts and adds in preference to multiplications,
so the patch makes the optab expanders check for that.

The tests cover floating-point types too, which are already handled,
and which were already tested to some extent by gcc.dg/vect.

2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
	(aarch64_prepare_sve_cond_int_fma): Declare.
	* config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
	(aarch64_prepare_sve_int_fma): New functions.
	(aarch64_prepare_sve_cond_int_fma): Likewise.
	* config/aarch64/aarch64-sve.md
	(cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
	(fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
	(*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
	(cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
	(*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
	(*madd<mode>): Rename to...
	(*fma<mode>4): ...this.
	(*msub<mode>3): Rename to...
	(*fnma<mode>3): ...this.

gcc/testsuite/
	* gcc.target/aarch64/sve/cond_mla_1.c: New test.
	* gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
	* gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.

Co-Authored-By: Kugan Vivekanandarajah <kuganv@linaro.org>

From-SVN: r274509
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 52ab8e5d370..66631f6b37c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,22 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+	* config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
+	(aarch64_prepare_sve_cond_int_fma): Declare.
+	* config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
+	(aarch64_prepare_sve_int_fma): New functions.
+	(aarch64_prepare_sve_cond_int_fma): Likewise.
+	* config/aarch64/aarch64-sve.md
+	(cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
+	(fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
+	(*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
+	(cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
+	(*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
+	(*madd<mode>): Rename to...
+	(*fma<mode>4): ...this.
+	(*msub<mode>3): Rename to...
+	(*fnma<mode>3): ...this.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 858c4500406..24e724004f8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -630,6 +630,9 @@ bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
 void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
 bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
 void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
+
+bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
+bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
 #endif /* RTX_CODE */
 
 void aarch64_init_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index d43ce521a79..93f55360d69 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1844,7 +1844,7 @@
 )
 
 ;; Predicated integer operations with merging.
-(define_expand "cond_<optab><mode>"
+(define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_I 0 "register_operand")
 	(unspec:SVE_I
 	  [(match_operand:<VPRED> 1 "register_operand")
@@ -3384,8 +3384,26 @@
 ;; - MLA
 ;; -------------------------------------------------------------------------
 
+;; Unpredicated integer addition of product.
+(define_expand "fma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(plus:SVE_I
+	  (unspec:SVE_I
+	    [(match_dup 4)
+	     (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+			 (match_operand:SVE_I 2 "nonmemory_operand"))]
+	    UNSPEC_PRED_X)
+	  (match_operand:SVE_I 3 "register_operand")))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, PLUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer addition of product.
-(define_insn "*madd<mode>"
+(define_insn "*fma<mode>4"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(plus:SVE_I
 	  (unspec:SVE_I
@@ -3402,6 +3420,97 @@
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Predicated integer addition of product with merging.
+(define_expand "cond_fma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+			 (match_operand:SVE_I 3 "general_operand"))
+	     (match_operand:SVE_I 4 "register_operand"))
+	   (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer addition of product, merging with the first input.
+(define_insn "*cond_fma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w"))
+	     (match_operand:SVE_I 4 "register_operand" "w, w"))
+	   (match_dup 2)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with the third input.
+(define_insn "*cond_fma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w"))
+	     (match_operand:SVE_I 4 "register_operand" "0, w"))
+	   (match_dup 4)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with an independent value.
+(define_insn_and_rewrite "*cond_fma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+	   (plus:SVE_I
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w"))
+	     (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
+	   (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+	  UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+					     operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] MLS and MSB
 ;; -------------------------------------------------------------------------
@@ -3410,8 +3519,26 @@
 ;; - MSB
 ;; -------------------------------------------------------------------------
 
+;; Unpredicated integer subtraction of product.
+(define_expand "fnma<mode>4"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(minus:SVE_I
+	  (match_operand:SVE_I 3 "register_operand")
+	  (unspec:SVE_I
+	    [(match_dup 4)
+	     (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+			 (match_operand:SVE_I 2 "general_operand"))]
+	    UNSPEC_PRED_X)))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_int_fma (operands, MINUS))
+      DONE;
+    operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
 ;; Predicated integer subtraction of product.
-(define_insn "*msub<mode>3"
+(define_insn "*fnma<mode>3"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(minus:SVE_I
 	  (match_operand:SVE_I 4 "register_operand" "w, 0, w")
@@ -3428,6 +3555,98 @@
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Predicated integer subtraction of product with merging.
+(define_expand "cond_fnma<mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+   (unspec:SVE_I
+	[(match_operand:<VPRED> 1 "register_operand")
+	 (minus:SVE_I
+	   (match_operand:SVE_I 4 "register_operand")
+	   (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+		       (match_operand:SVE_I 3 "general_operand")))
+	 (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+	UNSPEC_SEL))]
+  "TARGET_SVE"
+  {
+    if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
+      DONE;
+    /* Swap the multiplication operands if the fallback value is the
+       second of the two.  */
+    if (rtx_equal_p (operands[3], operands[5]))
+      std::swap (operands[2], operands[3]);
+  }
+)
+
+;; Predicated integer subtraction of product, merging with the first input.
+(define_insn "*cond_fnma<mode>_2"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (minus:SVE_I
+	     (match_operand:SVE_I 4 "register_operand" "w, w")
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w")))
+	   (match_dup 2)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with the third input.
+(define_insn "*cond_fnma<mode>_4"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (minus:SVE_I
+	     (match_operand:SVE_I 4 "register_operand" "0, w")
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w")))
+	   (match_dup 4)]
+	  UNSPEC_SEL))]
+  "TARGET_SVE"
+  "@
+   mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+  [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with an
+;; independent value.
+(define_insn_and_rewrite "*cond_fnma<mode>_any"
+  [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+	(unspec:SVE_I
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+	   (minus:SVE_I
+	     (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
+	     (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+			 (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w")))
+	   (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+	  UNSPEC_SEL))]
+  "TARGET_SVE
+   && !rtx_equal_p (operands[2], operands[5])
+   && !rtx_equal_p (operands[3], operands[5])
+   && !rtx_equal_p (operands[4], operands[5])"
+  "@
+   movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+   movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+   #"
+  "&& reload_completed
+   && register_operand (operands[5], <MODE>mode)
+   && !rtx_equal_p (operands[0], operands[5])"
+  {
+    emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+					     operands[5], operands[1]));
+    operands[5] = operands[4] = operands[0];
+  }
+  [(set_attr "movprfx" "yes")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Dot product
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 81a267bde54..ec787727aa6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -16469,6 +16469,98 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals)
     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
 }
 
+/* Check whether VALUE is a vector constant in which the elements are
+   either all powers of 2 or all negated powers of 2.  If so, return a
+   constant vector of log2s, and flip CODE between PLUS and MINUS if the
+   elements are negated powers of 2.  Return NULL_RTX otherwise.  */
+
+static rtx
+aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
+{
+  if (GET_CODE (value) != CONST_VECTOR)
+    return NULL_RTX;
+
+  rtx_vector_builder builder;
+  if (!builder.new_unary_operation (GET_MODE (value), value, false))
+    return NULL_RTX;
+
+  scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
+  /* 1 if the result of the multiplication must be negated,
+     0 if it mustn't, or -1 if we don't yet care.  */
+  int negate = -1;
+  unsigned int encoded_nelts = const_vector_encoded_nelts (value);
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
+      if (!CONST_SCALAR_INT_P (elt))
+	return NULL_RTX;
+      rtx_mode_t val (elt, int_mode);
+      wide_int pow2 = wi::neg (val);
+      if (val != pow2)
+	{
+	  /* It matters whether we negate or not.  Make that choice,
+	     and make sure that it's consistent with previous elements.  */
+	  if (negate == !wi::neg_p (val))
+	    return NULL_RTX;
+	  negate = wi::neg_p (val);
+	  if (!negate)
+	    pow2 = val;
+	}
+      /* POW2 is now the value that we want to be a power of 2.  */
+      int shift = wi::exact_log2 (pow2);
+      if (shift < 0)
+	return NULL_RTX;
+      builder.quick_push (gen_int_mode (shift, int_mode));
+    }
+  if (negate == -1)
+    /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
+    code = PLUS;
+  else if (negate == 1)
+    code = code == PLUS ? MINUS : PLUS;
+  return builder.build ();
+}
+
+/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
+   CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
+   operands array, in the same order as for fma_optab.  Return true if
+   the function emitted all the necessary instructions, false if the caller
+   should generate the pattern normally with the new OPERANDS array.  */
+
+bool
+aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
+				  NULL_RTX, true, OPTAB_DIRECT);
+      force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
+			  operands[3], product, operands[0], true,
+			  OPTAB_DIRECT);
+      return true;
+    }
+  operands[2] = force_reg (mode, operands[2]);
+  return false;
+}
+
+/* Likewise for a conditional pattern; OPERANDS is as for cond_fma<mode>.  */
+
+bool
+aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
+    {
+      rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
+				  NULL_RTX, true, OPTAB_DIRECT);
+      emit_insn (gen_cond (code, mode, operands[0], operands[1],
+			   operands[4], product, operands[5]));
+      return true;
+    }
+  operands[3] = force_reg (mode, operands[3]);
+  return false;
+}
+
 static unsigned HOST_WIDE_INT
 aarch64_shift_truncation_mask (machine_mode mode)
 {
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index aad8e04c32c..63596d8497e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,23 @@
+2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
+	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
+
+	* gcc.target/aarch64/sve/cond_mla_1.c: New test.
+	* gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
+	* gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 	    Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c
new file mode 100644
index 00000000000..cb01d50f3a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i];	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c
new file mode 100644
index 00000000000..bcfc6228066
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_1.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i];	\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c
new file mode 100644
index 00000000000..b6ea1a3e21f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c;	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c
new file mode 100644
index 00000000000..79998b84eea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_2.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected = (pred[i] != 1				\
+			 ? a[i] OP b[i] * (TYPE) FACTOR		\
+			 : (TYPE) FACTOR);			\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c
new file mode 100644
index 00000000000..085fccf53e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i];	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c
new file mode 100644
index 00000000000..cbd1185b257
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_3.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP)				\
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	a[i] = (i & 1 ? i : 3 * i);				\
+	b[i] = (i >> 4) << (i & 15);				\
+	pred[i] = i % 3 < i % 5;				\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected						\
+	  = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i];	\
+	if (r[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c
new file mode 100644
index 00000000000..ed9f73e9c48
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP)			\
+  void __attribute__ ((noipa))				\
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i];	\
+  }
+
+#define TEST_TYPE(T, TYPE) \
+  T (TYPE, add, +) \
+  T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c
new file mode 100644
index 00000000000..5e078594a1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_4.c"
+
+#define FACTOR 17 /* Passed as the fourth argument of each test function and used in the expected value.  */
+#define N 99 /* Length of the r, a, b and pred arrays, and the element count passed to the test function.  */
+
+#define TEST_LOOP(TYPE, NAME, OP) /* Run test_TYPE_NAME and verify r.  */ \
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int j = 0; j < N; j++) /* Populate the inputs.  */	\
+      {								\
+	pred[j] = j % 3;					\
+	b[j] = (j / 16) << (j % 16);				\
+	a[j] = (j % 2 == 0 ? 3 * j : j);			\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int j = 0; j < N; j++) /* Check every element.  */	\
+      {								\
+	TYPE want = (pred[j] == 1				\
+		     ? a[j] OP b[j] * (TYPE) FACTOR		\
+		     : pred[j]);				\
+	if (!(r[j] == want))					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP) /* Expand TEST_LOOP for every case TEST_ALL lists; a mismatch calls __builtin_abort.  */
+  return 0; /* Reached only when no check aborted.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c
new file mode 100644
index 00000000000..e71f2b0336f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) /* Define test_<TYPE>_<NAME>: for each i in [0, n), r[i] = a[i] OP b[i] * c if pred[i] is nonzero, else 0.  */ \
+  void __attribute__ ((noipa)) /* noipa: GCC attribute that disables interprocedural optimization between this function and its callers.  */ \
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i) /* The assembler scans in this file expect predicated MLA/MAD, MLS/MSB, FMLA and FMLS, with zeroing MOVPRFX and no SEL.  */ \
+      r[i] = pred[i] ? a[i] OP b[i] * c : 0;		\
+  }
+
+#define TEST_TYPE(T, TYPE) /* Apply T to TYPE once per operation.  */ \
+  T (TYPE, add, +) /* NAME is add, OP is +.  */ \
+  T (TYPE, sub, -) /* NAME is sub, OP is -.  */
+
+#define TEST_ALL(T) /* Apply T, via TEST_TYPE, to four unsigned integer types and three floating-point types.  */ \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP) /* Emit the add and sub test functions for every type listed in TEST_ALL.  */
+
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c
new file mode 100644
index 00000000000..9de46e30f9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_5.c"
+
+#define FACTOR 17 /* Passed as the fourth argument of each test function and used in the expected value.  */
+#define N 99 /* Length of the r, a, b and pred arrays, and the element count passed to the test function.  */
+
+#define TEST_LOOP(TYPE, NAME, OP) /* Run test_TYPE_NAME and verify r.  */ \
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int j = 0; j < N; j++) /* Populate the inputs.  */	\
+      {								\
+	pred[j] = j % 5 > j % 3;				\
+	b[j] = (j / 16) << (j % 16);				\
+	a[j] = (j % 2 == 0 ? 3 * j : j);			\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int j = 0; j < N; j++) /* Check every element.  */	\
+      {								\
+	TYPE want						\
+	  = pred[j] ? a[j] OP b[j] * (TYPE) FACTOR : 0;		\
+	if (!(r[j] == want))					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP) /* Expand TEST_LOOP for every case TEST_ALL lists; a mismatch calls __builtin_abort.  */
+  return 0; /* Reached only when no check aborted.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c
new file mode 100644
index 00000000000..832bdb3d83c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) /* Define test_<TYPE>_<NAME>: for each i in [0, n), r[i] = a[i] OP b[i] * c if pred[i] is nonzero, else 5.  */ \
+  void __attribute__ ((noipa)) /* noipa: GCC attribute that disables interprocedural optimization between this function and its callers.  */ \
+  test_##TYPE##_##NAME (TYPE *__restrict r,		\
+			TYPE *__restrict a,		\
+			TYPE *__restrict b, TYPE c,	\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i) /* The assembler scans in this file expect predicated MLA, MLS, FMLA and FMLS, 14 SEL instructions and no MOVPRFX.  */ \
+      r[i] = pred[i] ? a[i] OP b[i] * c : 5;		\
+  }
+
+#define TEST_TYPE(T, TYPE) /* Apply T to TYPE once per operation.  */ \
+  T (TYPE, add, +) /* NAME is add, OP is +.  */ \
+  T (TYPE, sub, -) /* NAME is sub, OP is -.  */
+
+#define TEST_ALL(T) /* Apply T, via TEST_TYPE, to four unsigned integer types and three floating-point types.  */ \
+  TEST_TYPE (T, uint8_t) \
+  TEST_TYPE (T, uint16_t) \
+  TEST_TYPE (T, uint32_t) \
+  TEST_TYPE (T, uint64_t) \
+  TEST_TYPE (T, _Float16) \
+  TEST_TYPE (T, float) \
+  TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP) /* Emit the add and sub test functions for every type listed in TEST_ALL.  */
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c
new file mode 100644
index 00000000000..59f57a2db13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_6.c"
+
+#define FACTOR 17 /* Passed as the fourth argument of each test function and used in the expected value.  */
+#define N 99 /* Length of the r, a, b and pred arrays, and the element count passed to the test function.  */
+
+#define TEST_LOOP(TYPE, NAME, OP) /* Run test_TYPE_NAME and verify r.  */ \
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int j = 0; j < N; j++) /* Populate the inputs.  */	\
+      {								\
+	pred[j] = j % 5 > j % 3;				\
+	b[j] = (j / 16) << (j % 16);				\
+	a[j] = (j % 2 == 0 ? 3 * j : j);			\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N);		\
+    for (int j = 0; j < N; j++) /* Check every element.  */	\
+      {								\
+	TYPE want						\
+	  = pred[j] ? a[j] OP b[j] * (TYPE) FACTOR : 5;	\
+	if (!(r[j] == want))					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP) /* Expand TEST_LOOP for every case TEST_ALL lists; a mismatch calls __builtin_abort.  */
+  return 0; /* Reached only when no check aborted.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c
new file mode 100644
index 00000000000..5561f421979
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST) /* Define test_<TYPE>_<NAME>_<CONST>: for each i in [0, n), r[i] = a[i] OP b[i] * CONST unless pred[i] equals 1, in which case r[i] = a[i].  */ \
+  void __attribute__ ((noipa)) /* noipa: GCC attribute that disables interprocedural optimization between this function and its callers.  */ \
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, 		\
+				  TYPE *__restrict a,		\
+				  TYPE *__restrict b,		\
+				  TYPE *__restrict pred, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i) /* The assembler scans in this file look for LSL and predicated ADD/SUB instructions, with no MOVPRFX or SEL.  */ \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i];	\
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) /* Apply T to TYPE and CONST once per operation.  */ \
+  T (TYPE, add, +, CONST) /* NAME is add, OP is +.  */ \
+  T (TYPE, sub, -, CONST) /* NAME is sub, OP is -.  */
+
+#define TEST_TYPE(T, TYPE, CONST) /* Run TEST_COUNT for the multipliers 2 and 4 and for the caller-supplied CONST.  */ \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) /* Apply T to each unsigned integer type; the third argument is that type's top-bit value, 2^(width-1).  */ \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP) /* Emit the add and sub test functions for every type and multiplier.  */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c
new file mode 100644
index 00000000000..b094f40a28a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_7.c"
+
+#define N 99 /* Length of the r, a, b and pred arrays, and the element count passed to the test function.  */
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST) /* Run test_TYPE_NAME_CONST and verify r.  */ \
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int j = 0; j < N; j++) /* Populate the inputs.  */	\
+      {								\
+	pred[j] = j % 5 > j % 3;				\
+	b[j] = (j / 16) << (j % 16);				\
+	a[j] = (j % 2 == 0 ? 3 * j : j);			\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);		\
+    for (int j = 0; j < N; j++) /* Check every element.  */	\
+      {								\
+	TYPE want						\
+	  = pred[j] != 1 ? a[j] OP b[j] * CONST : a[j];		\
+	if (!(r[j] == want))					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP) /* Expand TEST_LOOP for every case TEST_ALL lists; a mismatch calls __builtin_abort.  */
+  return 0; /* Reached only when no check aborted.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c
new file mode 100644
index 00000000000..d5549272e57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST) /* Define test_<TYPE>_<NAME>_<CONST>: for each i in [0, n), r[i] = a[i] OP b[i] * -CONST unless pred[i] equals 1, in which case r[i] = a[i].  */ \
+  void __attribute__ ((noipa)) /* noipa: GCC attribute that disables interprocedural optimization between this function and its callers.  */ \
+  test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, 		\
+				  TYPE *__restrict a,		\
+				  TYPE *__restrict b,		\
+				  TYPE *__restrict pred, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i) /* The multiplier is the negated CONST; the assembler scans in this file look for LSL and predicated ADD/SUB, with no MOVPRFX or SEL.  */ \
+      r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i];	\
+  }
+
+#define TEST_COUNT(T, TYPE, CONST) /* Apply T to TYPE and CONST once per operation.  */ \
+  T (TYPE, add, +, CONST) /* NAME is add, OP is +.  */ \
+  T (TYPE, sub, -, CONST) /* NAME is sub, OP is -.  */
+
+#define TEST_TYPE(T, TYPE, CONST) /* Run TEST_COUNT for the constants 2 and 4 and for the caller-supplied CONST.  */ \
+  TEST_COUNT (T, TYPE, 2) \
+  TEST_COUNT (T, TYPE, 4) \
+  TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) /* Apply T to each unsigned integer type; the third argument is that type's top-bit value, 2^(width-1).  */ \
+  TEST_TYPE (T, uint8_t, 0x80) \
+  TEST_TYPE (T, uint16_t, 0x8000) \
+  TEST_TYPE (T, uint32_t, 0x80000000) \
+  TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP) /* Emit the add and sub test functions for every type and constant.  */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c
new file mode 100644
index 00000000000..7fb58aa70de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_8.c"
+
+#define N 99 /* Length of the r, a, b and pred arrays, and the element count passed to the test function.  */
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST) /* Run test_TYPE_NAME_CONST and verify r.  */ \
+  {								\
+    TYPE r[N], a[N], b[N], pred[N];				\
+    for (int j = 0; j < N; j++) /* Populate the inputs.  */	\
+      {								\
+	pred[j] = j % 5 > j % 3;				\
+	b[j] = (j / 16) << (j % 16);				\
+	a[j] = (j % 2 == 0 ? 3 * j : j);			\
+	asm volatile ("" ::: "memory");				\
+      }								\
+    test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N);		\
+    for (int j = 0; j < N; j++) /* Check every element.  */	\
+      {								\
+	TYPE want						\
+	  = pred[j] != 1 ? a[j] OP b[j] * -CONST : a[j];	\
+	if (!(r[j] == want))					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL (TEST_LOOP) /* Expand TEST_LOOP for every case TEST_ALL lists; a mismatch calls __builtin_abort.  */
+  return 0; /* Reached only when no check aborted.  */
+}