+2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
+ Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
+
+ * config/aarch64/aarch64-protos.h (aarch64_prepare_sve_int_fma)
+ (aarch64_prepare_sve_cond_int_fma): Declare.
+ * config/aarch64/aarch64.c (aarch64_convert_mult_to_shift)
+ (aarch64_prepare_sve_int_fma): New functions.
+ (aarch64_prepare_sve_cond_int_fma): Likewise.
+ * config/aarch64/aarch64-sve.md
+ (cond_<SVE_INT_BINARY:optab><SVE_I:mode>): Add a "@" marker.
+ (fma<SVE_I:mode>4, cond_fma<SVE_I:mode>, *cond_fma<SVE_I:mode>_2)
+ (*cond_fma<SVE_I:mode>_4, *cond_fma<SVE_I:mode>_any, fnma<SVE_I:mode>4)
+ (cond_fnma<SVE_I:mode>, *cond_fnma<SVE_I:mode>_2)
+ (*cond_fnma<SVE_I:mode>_4, *cond_fnma<SVE_I:mode>_any): New patterns.
+ (*madd<mode>): Rename to...
+ (*fma<mode>4): ...this.
+ (*msub<mode>3): Rename to...
+ (*fnma<mode>4): ...this.
+
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *);
+
+bool aarch64_prepare_sve_int_fma (rtx *, rtx_code);
+bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code);
#endif /* RTX_CODE */
void aarch64_init_builtins (void);
)
;; Predicated integer operations with merging.
-(define_expand "cond_<optab><mode>"
+(define_expand "@cond_<optab><mode>"
[(set (match_operand:SVE_I 0 "register_operand")
(unspec:SVE_I
[(match_operand:<VPRED> 1 "register_operand")
;; - MLA
;; -------------------------------------------------------------------------
+;; Unpredicated integer addition of product.
+(define_expand "fma<mode>4"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (plus:SVE_I
+ (unspec:SVE_I
+ [(match_dup 4)
+ (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+ (match_operand:SVE_I 2 "nonmemory_operand"))]
+ UNSPEC_PRED_X)
+ (match_operand:SVE_I 3 "register_operand")))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_int_fma (operands, PLUS))
+ DONE;
+ operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ }
+)
+
;; Predicated integer addition of product.
-(define_insn "*madd<mode>"
+(define_insn "*fma<mode>4"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
(plus:SVE_I
(unspec:SVE_I
[(set_attr "movprfx" "*,*,yes")]
)
+;; Predicated integer addition of product with merging.
+(define_expand "cond_fma<mode>"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+ (match_operand:SVE_I 3 "general_operand"))
+ (match_operand:SVE_I 4 "register_operand"))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_cond_int_fma (operands, PLUS))
+ DONE;
+ /* Swap the multiplication operands if the fallback value is the
+ second of the two. */
+ if (rtx_equal_p (operands[3], operands[5]))
+ std::swap (operands[2], operands[3]);
+ }
+)
+
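Why the swap helps, as a sketch (not part of the patch; cond_mla_1.c in the
testsuite changes below exercises exactly this shape): when the else value of
the conditional multiply-add matches one of the multiplication inputs,
normalising it into operand 2 lets the *cond_fma<mode>_2 pattern below tie it
to the destination and emit a single MAD rather than an MLA plus a select.
The cond_fnma<mode> expander later in the file does the same for MSB.

#include <stdint.h>

/* Illustrative loop only (mirrors cond_mla_1.c): the else value b[i] is
   also a multiplication input, so once it is (or has been swapped into)
   operand 2, *cond_fma<mode>_2 can emit "mad" with no movprfx or sel.  */
void
g (uint32_t *restrict r, uint32_t *restrict a, uint32_t *restrict b,
   uint32_t c, uint32_t *restrict pred, int n)
{
  for (int i = 0; i < n; ++i)
    r[i] = pred[i] != 1 ? a[i] + b[i] * c : b[i];
}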
+;; Predicated integer addition of product, merging with the first input.
+(define_insn "*cond_fma<mode>_2"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w"))
+ (match_operand:SVE_I 4 "register_operand" "w, w"))
+ (match_dup 2)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0, %2\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with the third input.
+(define_insn "*cond_fma<mode>_4"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w"))
+ (match_operand:SVE_I 4 "register_operand" "0, w"))
+ (match_dup 4)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0, %4\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer addition of product, merging with an independent value.
+(define_insn_and_rewrite "*cond_fma<mode>_any"
+ [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+ (plus:SVE_I
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w"))
+ (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w"))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && !rtx_equal_p (operands[2], operands[5])
+ && !rtx_equal_p (operands[3], operands[5])
+ && !rtx_equal_p (operands[4], operands[5])"
+ "@
+ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mad\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mla\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ #"
+ "&& reload_completed
+ && register_operand (operands[5], <MODE>mode)
+ && !rtx_equal_p (operands[0], operands[5])"
+ {
+ emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+ operands[5], operands[1]));
+ operands[5] = operands[4] = operands[0];
+ }
+ [(set_attr "movprfx" "yes")]
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] MLS and MSB
;; -------------------------------------------------------------------------
;; - MSB
;; -------------------------------------------------------------------------
+;; Unpredicated integer subtraction of product.
+(define_expand "fnma<mode>4"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (minus:SVE_I
+ (match_operand:SVE_I 3 "register_operand")
+ (unspec:SVE_I
+ [(match_dup 4)
+ (mult:SVE_I (match_operand:SVE_I 1 "register_operand")
+ (match_operand:SVE_I 2 "general_operand"))]
+ UNSPEC_PRED_X)))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_int_fma (operands, MINUS))
+ DONE;
+ operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ }
+)
+
;; Predicated integer subtraction of product.
-(define_insn "*msub<mode>3"
+(define_insn "*fnma<mode>3"
[(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
(minus:SVE_I
(match_operand:SVE_I 4 "register_operand" "w, 0, w")
[(set_attr "movprfx" "*,*,yes")]
)
+;; Predicated integer subtraction of product with merging.
+(define_expand "cond_fnma<mode>"
+ [(set (match_operand:SVE_I 0 "register_operand")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand")
+ (match_operand:SVE_I 3 "general_operand")))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ {
+ if (aarch64_prepare_sve_cond_int_fma (operands, MINUS))
+ DONE;
+ /* Swap the multiplication operands if the fallback value is the
+ second of the two. */
+ if (rtx_equal_p (operands[3], operands[5]))
+ std::swap (operands[2], operands[3]);
+ }
+)
+
+;; Predicated integer subtraction of product, merging with the first input.
+(define_insn "*cond_fnma<mode>_2"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand" "w, w")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "0, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w")))
+ (match_dup 2)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0, %2\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with the third input.
+(define_insn "*cond_fnma<mode>_4"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand" "0, w")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w")))
+ (match_dup 4)]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ "@
+ mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0, %4\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Predicated integer subtraction of product, merging with an
+;; independent value.
+(define_insn_and_rewrite "*cond_fnma<mode>_any"
+ [(set (match_operand:SVE_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+ (minus:SVE_I
+ (match_operand:SVE_I 4 "register_operand" "w, 0, w, w, w, w")
+ (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w, 0, w, w, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w, w, 0, w, w")))
+ (match_operand:SVE_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && !rtx_equal_p (operands[2], operands[5])
+ && !rtx_equal_p (operands[3], operands[5])
+ && !rtx_equal_p (operands[4], operands[5])"
+ "@
+ movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;msb\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+ movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;mls\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ #"
+ "&& reload_completed
+ && register_operand (operands[5], <MODE>mode)
+ && !rtx_equal_p (operands[0], operands[5])"
+ {
+ emit_insn (gen_vcond_mask_<mode><vpred> (operands[0], operands[4],
+ operands[5], operands[1]));
+ operands[5] = operands[4] = operands[0];
+ }
+ [(set_attr "movprfx" "yes")]
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] Dot product
;; -------------------------------------------------------------------------
aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
}
+/* Check whether VALUE is a vector constant in which every element
+ is either a power of 2 or a negated power of 2. If so, return
+ a constant vector of log2s, and flip CODE between PLUS and MINUS
+ if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
+
+static rtx
+aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
+{
+ if (GET_CODE (value) != CONST_VECTOR)
+ return NULL_RTX;
+
+ rtx_vector_builder builder;
+ if (!builder.new_unary_operation (GET_MODE (value), value, false))
+ return NULL_RTX;
+
+ scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
+ /* 1 if the result of the multiplication must be negated,
+ 0 if it mustn't, or -1 if we don't yet care. */
+ int negate = -1;
+ unsigned int encoded_nelts = const_vector_encoded_nelts (value);
+ for (unsigned int i = 0; i < encoded_nelts; ++i)
+ {
+ rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
+ if (!CONST_SCALAR_INT_P (elt))
+ return NULL_RTX;
+ rtx_mode_t val (elt, int_mode);
+ wide_int pow2 = wi::neg (val);
+ if (val != pow2)
+ {
+ /* It matters whether we negate or not. Make that choice,
+ and make sure that it's consistent with previous elements. */
+ if (negate == !wi::neg_p (val))
+ return NULL_RTX;
+ negate = wi::neg_p (val);
+ if (!negate)
+ pow2 = val;
+ }
+ /* POW2 is now the value that we want to be a power of 2. */
+ int shift = wi::exact_log2 (pow2);
+ if (shift < 0)
+ return NULL_RTX;
+ builder.quick_push (gen_int_mode (shift, int_mode));
+ }
+ if (negate == -1)
+ /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
+ code = PLUS;
+ else if (negate == 1)
+ code = code == PLUS ? MINUS : PLUS;
+ return builder.build ();
+}
+
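A minimal standalone sketch of the per-element decision above, specialised to
8-bit elements; it omits the cross-element consistency check and the
rtx_vector_builder plumbing, and models wi::neg, wi::neg_p and wi::exact_log2
with plain unsigned char arithmetic:

#include <stdio.h>

/* Illustrative model only, not part of the patch.  NEGATE uses the same
   tri-state as aarch64_convert_mult_to_shift: -1 = don't care yet,
   0 = keep CODE, 1 = flip PLUS<->MINUS.  Returns the shift amount, or -1
   if VAL is neither a power of 2 nor a negated power of 2.  */
static int
element_to_shift (unsigned char val, int *negate)
{
  unsigned char pow2 = (unsigned char) -val;	/* wi::neg (val) */
  if (val != pow2)
    {
      /* Negating matters, so record the choice.  */
      *negate = (signed char) val < 0;		/* wi::neg_p (val) */
      if (!*negate)
	pow2 = val;
    }
  /* POW2 must have exactly one bit set.  */
  if (pow2 == 0 || (pow2 & (pow2 - 1)) != 0)
    return -1;
  return __builtin_ctz (pow2);			/* wi::exact_log2 (pow2) */
}

int
main (void)
{
  int negate, shift;

  negate = -1;
  shift = element_to_shift (4, &negate);	/* shift 2, negate 0 */
  printf ("4 -> shift %d, negate %d\n", shift, negate);

  negate = -1;
  shift = element_to_shift (-4, &negate);	/* shift 2, negate 1 */
  printf ("-4 -> shift %d, negate %d\n", shift, negate);

  /* 0x80 is its own negation in 8 bits, so NEGATE stays -1 and PLUS and
     MINUS are interchangeable; the real function then canonicalizes CODE
     on PLUS.  */
  negate = -1;
  shift = element_to_shift (0x80, &negate);	/* shift 7, negate -1 */
  printf ("0x80 -> shift %d, negate %d\n", shift, negate);
  return 0;
}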
+/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
+ CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
+ operands array, in the same order as for fma_optab. Return true if
+ the function emitted all the necessary instructions, false if the caller
+ should generate the pattern normally with the new OPERANDS array. */
+
+bool
+aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
+{
+ machine_mode mode = GET_MODE (operands[0]);
+ if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
+ {
+ rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
+ NULL_RTX, true, OPTAB_DIRECT);
+ force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
+ operands[3], product, operands[0], true,
+ OPTAB_DIRECT);
+ return true;
+ }
+ operands[2] = force_reg (mode, operands[2]);
+ return false;
+}
+
+/* Likewise, but for a conditional pattern. */
+
+bool
+aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
+{
+ machine_mode mode = GET_MODE (operands[0]);
+ if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
+ {
+ rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
+ NULL_RTX, true, OPTAB_DIRECT);
+ emit_insn (gen_cond (code, mode, operands[0], operands[1],
+ operands[4], product, operands[5]));
+ return true;
+ }
+ operands[3] = force_reg (mode, operands[3]);
+ return false;
+}
+
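A source-level sketch of what the two prepare routines buy when the
multiplier is a vector of powers of 2 (illustrative only; the real coverage
is in cond_mla_7.c and cond_mla_8.c below): the multiplication is
strength-reduced to an unpredicated shift, so no MUL or MLA is needed.

#include <stdint.h>

/* Mirrors the uint32_t/add/4 case from cond_mla_7.c.  b[i] * 4 becomes
   b[i] << 2 via aarch64_convert_mult_to_shift, so the expected SVE code is
   an unpredicated LSL feeding a predicated ADD (see the lsl and add scan
   directives in that test), with no movprfx or sel.  */
void
f (uint32_t *restrict r, uint32_t *restrict a, uint32_t *restrict b,
   uint32_t *restrict pred, int n)
{
  for (int i = 0; i < n; ++i)
    r[i] = pred[i] != 1 ? a[i] + b[i] * 4 : a[i];
}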
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
+2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
+ Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
+
+ * gcc.target/aarch64/sve/cond_mla_1.c: New test.
+ * gcc.target/aarch64/sve/cond_mla_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_2.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_2_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_3.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_3_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_4.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_4_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_5.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_5_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_6.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_6_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_7.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_7_run.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_8.c: Likewise.
+ * gcc.target/aarch64/sve/cond_mla_8_run.c: Likewise.
+
2019-08-15 Richard Sandiford <richard.sandiford@arm.com>
Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i]; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_1.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_2.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = (pred[i] != 1 \
+ ? a[i] OP b[i] * (TYPE) FACTOR \
+ : (TYPE) FACTOR); \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i]; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_3.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i]; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_4.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = (pred[i] == 1 \
+ ? a[i] OP b[i] * (TYPE) FACTOR \
+ : pred[i]); \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] ? a[i] OP b[i] * c : 0; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */
+/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_5.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] ? a[i] OP b[i] * c : 5; \
+ }
+
+#define TEST_TYPE(T, TYPE) \
+ T (TYPE, add, +) \
+ T (TYPE, sub, -)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_6.c"
+
+#define FACTOR 17
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 5; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \
+ }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+ T (TYPE, add, +, CONST) \
+ T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+ TEST_COUNT (T, TYPE, 2) \
+ TEST_COUNT (T, TYPE, 4) \
+ TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t, 0x80) \
+ TEST_TYPE (T, uint16_t, 0x8000) \
+ TEST_TYPE (T, uint32_t, 0x80000000) \
+ TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_7.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE, NAME, OP, CONST) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \
+ }
+
+#define TEST_COUNT(T, TYPE, CONST) \
+ T (TYPE, add, +, CONST) \
+ T (TYPE, sub, -, CONST)
+
+#define TEST_TYPE(T, TYPE, CONST) \
+ TEST_COUNT (T, TYPE, 2) \
+ TEST_COUNT (T, TYPE, 4) \
+ TEST_COUNT (T, TYPE, CONST)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, uint8_t, 0x80) \
+ TEST_TYPE (T, uint16_t, 0x8000) \
+ TEST_TYPE (T, uint32_t, 0x80000000) \
+ TEST_TYPE (T, uint64_t, 0x8000000000000000ULL)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_mla_8.c"
+
+#define N 99
+
+#define TEST_LOOP(TYPE, NAME, OP, CONST) \
+ { \
+ TYPE r[N], a[N], b[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = (i & 1 ? i : 3 * i); \
+ b[i] = (i >> 4) << (i & 15); \
+ pred[i] = i % 3 < i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected \
+ = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \
+ if (r[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL (TEST_LOOP)
+ return 0;
+}