+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
+ * doc/md.texi (vec_mask_load_lanes@var{m}@var{n}): Document.
+ (vec_mask_store_lanes@var{m}@var{n}): Likewise.
+ * optabs.def (vec_mask_load_lanes_optab): New optab.
+ (vec_mask_store_lanes_optab): Likewise.
+ * internal-fn.def (MASK_LOAD_LANES): New internal function.
+ (MASK_STORE_LANES): Likewise.
+ * internal-fn.c (mask_load_lanes_direct): New macro.
+ (mask_store_lanes_direct): Likewise.
+ (expand_mask_load_optab_fn): Also handle masked lanes operations.
+ (expand_mask_load_lanes_optab_fn): New macro.
+ (expand_mask_store_optab_fn): Also handle masked lanes operations.
+ (expand_mask_store_lanes_optab_fn): New macro.
+ (direct_mask_load_lanes_optab_supported_p): Likewise.
+ (direct_mask_store_lanes_optab_supported_p): Likewise.
+ * tree-vectorizer.h (vect_store_lanes_supported): Take a masked_p
+ parameter.
+ (vect_load_lanes_supported): Likewise.
+ * tree-vect-data-refs.c (strip_conversion): New function.
+ (can_group_stmts_p): Likewise.
+ (vect_analyze_data_ref_accesses): Use it instead of checking
+ for a pair of assignments.
+ (vect_store_lanes_supported): Take a masked_p parameter.
+ (vect_load_lanes_supported): Likewise.
+ * tree-vect-loop.c (vect_analyze_loop_2): Update calls to
+ vect_store_lanes_supported and vect_load_lanes_supported.
+ * tree-vect-slp.c (vect_analyze_slp_instance): Likewise.
+ * tree-vect-stmts.c (get_group_load_store_type): Take a masked_p
+ parameter. Don't allow gaps for masked accesses.
+ Use vect_get_store_rhs. Update calls to vect_store_lanes_supported
+ and vect_load_lanes_supported.
+ (get_load_store_type): Take a masked_p parameter and update
+ call to get_group_load_store_type.
+ (vectorizable_store): Update call to get_load_store_type.
+ Handle IFN_MASK_STORE_LANES.
+ (vectorizable_load): Update call to get_load_store_type.
+ Handle IFN_MASK_LOAD_LANES.
+
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
This pattern is not allowed to @code{FAIL}.
+@cindex @code{vec_mask_load_lanes@var{m}@var{n}} instruction pattern
+@item @samp{vec_mask_load_lanes@var{m}@var{n}}
+Like @samp{vec_load_lanes@var{m}@var{n}}, but takes an additional
+mask operand (operand 2) that specifies which elements of the destination
+vectors should be loaded. Other elements of the destination
+vectors are set to zero. The operation is equivalent to:
+
+@smallexample
+int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n});
+for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++)
+ if (operand2[j])
+ for (i = 0; i < c; i++)
+ operand0[i][j] = operand1[j * c + i];
+ else
+ for (i = 0; i < c; i++)
+ operand0[i][j] = 0;
+@end smallexample
+
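+As an illustration, a pattern of this kind allows the vectorizer to
+handle conditional accesses to interleaved data, as in the following
+sketch (with illustrative arrays @code{dest}, @code{src} and
+@code{cond}):
+
+@smallexample
+for (i = 0; i < n; i++)
+  if (cond[i])
+    dest[i] = src[i * 2] + src[i * 2 + 1];
+@end smallexample
+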
+This pattern is not allowed to @code{FAIL}.
+
@cindex @code{vec_store_lanes@var{m}@var{n}} instruction pattern
@item @samp{vec_store_lanes@var{m}@var{n}}
Equivalent to @samp{vec_load_lanes@var{m}@var{n}}, with the memory
This pattern is not allowed to @code{FAIL}.
+@cindex @code{vec_mask_store_lanes@var{m}@var{n}} instruction pattern
+@item @samp{vec_mask_store_lanes@var{m}@var{n}}
+Like @samp{vec_store_lanes@var{m}@var{n}}, but takes an additional
+mask operand (operand 2) that specifies which elements of the source
+vectors should be stored. The operation is equivalent to:
+
+@smallexample
+int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n});
+for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++)
+ if (operand2[j])
+ for (i = 0; i < c; i++)
+ operand0[j * c + i] = operand1[i][j];
+@end smallexample
+
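+As an illustration, a pattern of this kind allows the vectorizer to
+handle conditional stores to interleaved data, as in the following
+sketch (with illustrative arrays @code{dest}, @code{src} and
+@code{cond}):
+
+@smallexample
+for (i = 0; i < n; i++)
+  if (cond[i])
+    @{
+      dest[i * 2] = src[i];
+      dest[i * 2 + 1] = src[i];
+    @}
+@end smallexample
+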
+This pattern is not allowed to @code{FAIL}.
+
@cindex @code{vec_set@var{m}} instruction pattern
@item @samp{vec_set@var{m}}
Set given field in the vector value. Operand 0 is the vector to modify,
#define not_direct { -2, -2, false }
#define mask_load_direct { -1, 2, false }
#define load_lanes_direct { -1, -1, false }
+#define mask_load_lanes_direct { -1, -1, false }
#define mask_store_direct { 3, 2, false }
#define store_lanes_direct { 0, 0, false }
+#define mask_store_lanes_direct { 0, 0, false }
#define unary_direct { 0, 0, true }
#define binary_direct { 0, 0, true }
gcc_unreachable ();
}
-/* Expand MASK_LOAD call STMT using optab OPTAB. */
+/* Expand MASK_LOAD{,_LANES} call STMT using optab OPTAB. */
static void
expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
tree type, lhs, rhs, maskt, ptr;
rtx mem, target, mask;
unsigned align;
+ insn_code icode;
maskt = gimple_call_arg (stmt, 2);
lhs = gimple_call_lhs (stmt);
type = build_aligned_type (type, align);
rhs = fold_build2 (MEM_REF, type, gimple_call_arg (stmt, 0), ptr);
+ if (optab == vec_mask_load_lanes_optab)
+ icode = get_multi_vector_move (type, optab);
+ else
+ icode = convert_optab_handler (optab, TYPE_MODE (type),
+ TYPE_MODE (TREE_TYPE (maskt)));
+
mem = expand_expr (rhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
gcc_assert (MEM_P (mem));
mask = expand_normal (maskt);
create_output_operand (&ops[0], target, TYPE_MODE (type));
create_fixed_operand (&ops[1], mem);
create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
- expand_insn (convert_optab_handler (optab, TYPE_MODE (type),
- TYPE_MODE (TREE_TYPE (maskt))),
- 3, ops);
+ expand_insn (icode, 3, ops);
}
-/* Expand MASK_STORE call STMT using optab OPTAB. */
+#define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn
+
+/* Expand MASK_STORE{,_LANES} call STMT using optab OPTAB. */
static void
expand_mask_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
tree type, lhs, rhs, maskt, ptr;
rtx mem, reg, mask;
unsigned align;
+ insn_code icode;
maskt = gimple_call_arg (stmt, 2);
rhs = gimple_call_arg (stmt, 3);
type = build_aligned_type (type, align);
lhs = fold_build2 (MEM_REF, type, gimple_call_arg (stmt, 0), ptr);
+ if (optab == vec_mask_store_lanes_optab)
+ icode = get_multi_vector_move (type, optab);
+ else
+ icode = convert_optab_handler (optab, TYPE_MODE (type),
+ TYPE_MODE (TREE_TYPE (maskt)));
+
mem = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
gcc_assert (MEM_P (mem));
mask = expand_normal (maskt);
create_fixed_operand (&ops[0], mem);
create_input_operand (&ops[1], reg, TYPE_MODE (type));
create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
- expand_insn (convert_optab_handler (optab, TYPE_MODE (type),
- TYPE_MODE (TREE_TYPE (maskt))),
- 3, ops);
+ expand_insn (icode, 3, ops);
}
+#define expand_mask_store_lanes_optab_fn expand_mask_store_optab_fn
+
static void
expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
{
#define direct_binary_optab_supported_p direct_optab_supported_p
#define direct_mask_load_optab_supported_p direct_optab_supported_p
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
+#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_store_optab_supported_p direct_optab_supported_p
#define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
+#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
/* Return the optab used by internal function FN. */
- mask_load: currently just maskload
- load_lanes: currently just vec_load_lanes
+ - mask_load_lanes: currently just vec_mask_load_lanes
- mask_store: currently just maskstore
- store_lanes: currently just vec_store_lanes
+ - mask_store_lanes: currently just vec_mask_store_lanes
DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
maps to one of two optabs, depending on the signedness of an input.
DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load)
DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes)
+DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
+ vec_mask_load_lanes, mask_load_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
+DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
+ vec_mask_store_lanes, mask_store_lanes)
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
OPTAB_CD(usmsub_widen_optab, "usmsub$a$b4")
OPTAB_CD(vec_load_lanes_optab, "vec_load_lanes$a$b")
OPTAB_CD(vec_store_lanes_optab, "vec_store_lanes$a$b")
+OPTAB_CD(vec_mask_load_lanes_optab, "vec_mask_load_lanes$a$b")
+OPTAB_CD(vec_mask_store_lanes_optab, "vec_mask_store_lanes$a$b")
OPTAB_CD(vcond_optab, "vcond$a$b")
OPTAB_CD(vcondu_optab, "vcondu$a$b")
OPTAB_CD(vcondeq_optab, "vcondeq$a$b")
+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
+ * gcc.dg/vect/vect-ooo-group-1.c: New test.
+ * gcc.target/aarch64/sve/mask_struct_load_1.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_2.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_2_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_3.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_3_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_4.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_5.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_6.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_7.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_load_8.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_1.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_2.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_2_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_3.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_4.c: Likewise.
+
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
--- /dev/null
+/* { dg-do compile } */
+
+void
+f (int *restrict a, int *restrict b, int *restrict c)
+{
+ for (int i = 0; i < 100; ++i)
+ if (c[i])
+ {
+ a[i * 2] = b[i * 5 + 2];
+ a[i * 2 + 1] = b[i * 5];
+ }
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = src[i * 2] + src[i * 2 + 1]; \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld2b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 2 2 2 2
+    16 |  2  1  1  1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld2h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tld2w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tld2d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "mask_struct_load_1.c"
+
+#define N 100
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ { \
+ OUTTYPE out[N]; \
+ INTYPE in[N * 2]; \
+ MASKTYPE mask[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ mask[i] = i % 5 <= i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 2; ++i) \
+ in[i] = i * 9 / 2; \
+ NAME##_2 (out, in, mask, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ OUTTYPE if_true = in[i * 2] + in[i * 2 + 1]; \
+ OUTTYPE if_false = i * 7 / 2; \
+ if (out[i] != (mask[i] ? if_true : if_false)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = (src[i * 3] \
+ + src[i * 3 + 1] \
+ + src[i * 3 + 2]); \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld3b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 2 2 2 2
+ 16 | 2 1 1 1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld3h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tld3w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tld3d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "mask_struct_load_2.c"
+
+#define N 100
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ { \
+ OUTTYPE out[N]; \
+ INTYPE in[N * 3]; \
+ MASKTYPE mask[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ mask[i] = i % 5 <= i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 3; ++i) \
+ in[i] = i * 9 / 2; \
+ NAME##_3 (out, in, mask, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ OUTTYPE if_true = (in[i * 3] \
+ + in[i * 3 + 1] \
+ + in[i * 3 + 2]); \
+ OUTTYPE if_false = i * 7 / 2; \
+ if (out[i] != (mask[i] ? if_true : if_false)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = (src[i * 4] \
+ + src[i * 4 + 1] \
+ + src[i * 4 + 2] \
+ + src[i * 4 + 3]); \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld4b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 2 2 2 2
+    16 |  2  1  1  1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld4h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tld4w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tld4d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "mask_struct_load_3.c"
+
+#define N 100
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ { \
+ OUTTYPE out[N]; \
+ INTYPE in[N * 4]; \
+ MASKTYPE mask[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ mask[i] = i % 5 <= i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 4; ++i) \
+ in[i] = i * 9 / 2; \
+ NAME##_4 (out, in, mask, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ OUTTYPE if_true = (in[i * 4] \
+ + in[i * 4 + 1] \
+ + in[i * 4 + 2] \
+ + in[i * 4 + 3]); \
+ OUTTYPE if_false = i * 7 / 2; \
+ if (out[i] != (mask[i] ? if_true : if_false)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = src[i * 3] + src[i * 3 + 2]; \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld3b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 2 2 2 2
+    16 |  2  1  1  1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld3h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tld3w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tld3d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = src[i * 4] + src[i * 4 + 3]; \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld4b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 2 2 2 2
+    16 |  2  1  1  1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tld4h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tld4w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ Out 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tld4d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = src[i * 2]; \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* { dg-final { scan-assembler-not {\tld2b\t} } } */
+/* { dg-final { scan-assembler-not {\tld2h\t} } } */
+/* { dg-final { scan-assembler-not {\tld2w\t} } } */
+/* { dg-final { scan-assembler-not {\tld2d\t} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = src[i * 3] + src[i * 3 + 1]; \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* { dg-final { scan-assembler-not {\tld3b\t} } } */
+/* { dg-final { scan-assembler-not {\tld3h\t} } } */
+/* { dg-final { scan-assembler-not {\tld3w\t} } } */
+/* { dg-final { scan-assembler-not {\tld3d\t} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cond[i]) \
+ dest[i] = src[i * 4] + src[i * 4 + 2]; \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* { dg-final { scan-assembler-not {\tld4b\t} } } */
+/* { dg-final { scan-assembler-not {\tld4h\t} } } */
+/* { dg-final { scan-assembler-not {\tld4w\t} } } */
+/* { dg-final { scan-assembler-not {\tld4d\t} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, INTYPE bias, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ INTYPE value = src[i] + bias; \
+ if (cond[i]) \
+ { \
+ dest[i * 2] = value; \
+ dest[i * 2 + 1] = value; \
+ } \
+ } \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tst2b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 2 2 2 2
+ 16 | 2 1 1 1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tst2h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tst2w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tst2d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "mask_struct_store_1.c"
+
+#define N 100
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ { \
+ OUTTYPE out[N * 2]; \
+ INTYPE in[N]; \
+ MASKTYPE mask[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ in[i] = i * 7 / 2; \
+ mask[i] = i % 5 <= i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 2; ++i) \
+ out[i] = i * 9 / 2; \
+ NAME##_2 (out, in, mask, 17, N); \
+ for (int i = 0; i < N * 2; ++i) \
+ { \
+ OUTTYPE if_true = (INTYPE) (in[i / 2] + 17); \
+ OUTTYPE if_false = i * 9 / 2; \
+ if (out[i] != (mask[i / 2] ? if_true : if_false)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, INTYPE bias, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ INTYPE value = src[i] + bias; \
+ if (cond[i]) \
+ { \
+ dest[i * 3] = value; \
+ dest[i * 3 + 1] = value; \
+ dest[i * 3 + 2] = value; \
+ } \
+ } \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tst3b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 2 2 2 2
+ 16 | 2 1 1 1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tst3h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tst3w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tst3d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "mask_struct_store_2.c"
+
+#define N 100
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ { \
+ OUTTYPE out[N * 3]; \
+ INTYPE in[N]; \
+ MASKTYPE mask[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ in[i] = i * 7 / 2; \
+ mask[i] = i % 5 <= i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 3; ++i) \
+ out[i] = i * 9 / 2; \
+ NAME##_3 (out, in, mask, 11, N); \
+ for (int i = 0; i < N * 3; ++i) \
+ { \
+ OUTTYPE if_true = (INTYPE) (in[i / 3] + 11); \
+ OUTTYPE if_false = i * 9 / 2; \
+ if (out[i] != (mask[i / 3] ? if_true : if_false)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, INTYPE bias, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ INTYPE value = src[i] + bias; \
+ if (cond[i]) \
+ { \
+ dest[i * 4] = value; \
+ dest[i * 4 + 1] = value; \
+ dest[i * 4 + 2] = value; \
+ dest[i * 4 + 3] = value; \
+ } \
+ } \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 1 1 1 1
+ 16 | 1 1 1 1
+ 32 | 1 1 1 1
+ 64 | 1 1 1 1. */
+/* { dg-final { scan-assembler-times {\tst4b\t.z[0-9]} 16 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 2 2 2 2
+    16 |  2  1  1  1 x2 (for _Float16)
+ 32 | 2 1 1 1
+ 64 | 2 1 1 1. */
+/* { dg-final { scan-assembler-times {\tst4h\t.z[0-9]} 28 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 4 4 4 4
+ 16 | 4 2 2 2
+ 32 | 4 2 1 1 x2 (for float)
+ 64 | 4 2 1 1. */
+/* { dg-final { scan-assembler-times {\tst4w\t.z[0-9]} 50 } } */
+
+/* Mask | 8 16 32 64
+ -------+------------
+ In 8 | 8 8 8 8
+ 16 | 8 4 4 4
+ 32 | 8 4 2 2
+ 64 | 8 4 2 1 x2 (for double). */
+/* { dg-final { scan-assembler-times {\tst4d\t.z[0-9]} 98 } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "mask_struct_store_3.c"
+
+#define N 100
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ { \
+ OUTTYPE out[N * 4]; \
+ INTYPE in[N]; \
+ MASKTYPE mask[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ in[i] = i * 7 / 2; \
+ mask[i] = i % 5 <= i % 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 4; ++i) \
+ out[i] = i * 9 / 2; \
+ NAME##_4 (out, in, mask, 42, N); \
+ for (int i = 0; i < N * 4; ++i) \
+ { \
+ OUTTYPE if_true = (INTYPE) (in[i / 4] + 42); \
+ OUTTYPE if_false = i * 9 / 2; \
+ if (out[i] != (mask[i / 4] ? if_true : if_false)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \
+ MASKTYPE *__restrict cond, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ if (cond[i] < 8) \
+ dest[i * 2] = src[i]; \
+ if (cond[i] > 2) \
+ dest[i * 2 + 1] = src[i]; \
+ } \
+ }
+
+#define TEST2(NAME, OUTTYPE, INTYPE) \
+ TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \
+ TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \
+ TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \
+ TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double)
+
+#define TEST1(NAME, OUTTYPE) \
+ TEST2 (NAME##_i8, OUTTYPE, signed char) \
+ TEST2 (NAME##_i16, OUTTYPE, unsigned short) \
+ TEST2 (NAME##_i32, OUTTYPE, int) \
+ TEST2 (NAME##_i64, OUTTYPE, unsigned long)
+
+#define TEST(NAME) \
+ TEST1 (NAME##_i8, signed char) \
+ TEST1 (NAME##_i16, unsigned short) \
+ TEST1 (NAME##_i32, int) \
+ TEST1 (NAME##_i64, unsigned long) \
+ TEST2 (NAME##_f16_f16, _Float16, _Float16) \
+ TEST2 (NAME##_f32_f32, float, float) \
+ TEST2 (NAME##_f64_f64, double, double)
+
+TEST (test)
+
+/* { dg-final { scan-assembler-not {\tst2b\t.z[0-9]} } } */
+/* { dg-final { scan-assembler-not {\tst2h\t.z[0-9]} } } */
+/* { dg-final { scan-assembler-not {\tst2w\t.z[0-9]} } } */
+/* { dg-final { scan-assembler-not {\tst2d\t.z[0-9]} } } */
return cmp;
}
+/* If OP is the result of a conversion, return the unconverted value,
+   otherwise return null.
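+
+   For example, given a hypothetical conversion statement
+
+     mask_1 = (othertype) mask_2;
+
+   strip_conversion (mask_1) returns mask_2.  */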
+
+static tree
+strip_conversion (tree op)
+{
+ if (TREE_CODE (op) != SSA_NAME)
+ return NULL_TREE;
+ gimple *stmt = SSA_NAME_DEF_STMT (op);
+ if (!is_gimple_assign (stmt)
+ || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
+ return NULL_TREE;
+ return gimple_assign_rhs1 (stmt);
+}
+
+/* Return true if vectorizable_* routines can handle statements STMT1
+   and STMT2 being in a single group.
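+
+   For example, two IFN_MASK_LOAD calls can be grouped even if their
+   masks are separate conversions of the same boolean vector, as in
+   this hypothetical sketch:
+
+     mask_1 = (masktype) cond_1;
+     lhs_1 = IFN_MASK_LOAD (base_1, align_1, mask_1);
+     mask_2 = (masktype) cond_1;
+     lhs_2 = IFN_MASK_LOAD (base_2, align_2, mask_2);  */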
+
+static bool
+can_group_stmts_p (gimple *stmt1, gimple *stmt2)
+{
+ if (gimple_assign_single_p (stmt1))
+ return gimple_assign_single_p (stmt2);
+
+ if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
+ {
+ /* Check for two masked loads or two masked stores. */
+ if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
+ return false;
+ internal_fn ifn = gimple_call_internal_fn (stmt1);
+ if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
+ return false;
+ if (ifn != gimple_call_internal_fn (stmt2))
+ return false;
+
+ /* Check that the masks are the same. Cope with casts of masks,
+ like those created by build_mask_conversion. */
+ tree mask1 = gimple_call_arg (stmt1, 2);
+ tree mask2 = gimple_call_arg (stmt2, 2);
+ if (!operand_equal_p (mask1, mask2, 0))
+ {
+ mask1 = strip_conversion (mask1);
+ if (!mask1)
+ return false;
+ mask2 = strip_conversion (mask2);
+ if (!mask2)
+ return false;
+ if (!operand_equal_p (mask1, mask2, 0))
+ return false;
+ }
+ return true;
+ }
+
+ return false;
+}
+
/* Function vect_analyze_data_ref_accesses.
Analyze the access pattern of all the data references in the loop.
|| data_ref_compare_tree (DR_BASE_ADDRESS (dra),
DR_BASE_ADDRESS (drb)) != 0
|| data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
- || !gimple_assign_single_p (DR_STMT (dra))
- || !gimple_assign_single_p (DR_STMT (drb)))
+ || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
break;
/* Check that the data-refs have the same constant size. */
}
-/* Return TRUE if vec_store_lanes is available for COUNT vectors of
- type VECTYPE. */
+/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
+ type VECTYPE. MASKED_P says whether the masked form is needed. */
bool
-vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
+vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
+ bool masked_p)
{
- return vect_lanes_optab_supported_p ("vec_store_lanes",
- vec_store_lanes_optab,
- vectype, count);
+ if (masked_p)
+ return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
+ vec_mask_store_lanes_optab,
+ vectype, count);
+ else
+ return vect_lanes_optab_supported_p ("vec_store_lanes",
+ vec_store_lanes_optab,
+ vectype, count);
}
return false;
}
-/* Return TRUE if vec_load_lanes is available for COUNT vectors of
- type VECTYPE. */
+/* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
+ type VECTYPE. MASKED_P says whether the masked form is needed. */
bool
-vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
+vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
+ bool masked_p)
{
- return vect_lanes_optab_supported_p ("vec_load_lanes",
- vec_load_lanes_optab,
- vectype, count);
+ if (masked_p)
+ return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+ vec_mask_load_lanes_optab,
+ vectype, count);
+ else
+ return vect_lanes_optab_supported_p ("vec_load_lanes",
+ vec_load_lanes_optab,
+ vectype, count);
}
/* Function vect_permute_load_chain.
vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
tree vectype = STMT_VINFO_VECTYPE (vinfo);
- if (! vect_store_lanes_supported (vectype, size)
+ if (! vect_store_lanes_supported (vectype, size, false)
&& ! vect_grouped_store_supported (vectype, size))
return false;
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
size = STMT_VINFO_GROUP_SIZE (vinfo);
vectype = STMT_VINFO_VECTYPE (vinfo);
- if (! vect_load_lanes_supported (vectype, size)
+ if (! vect_load_lanes_supported (vectype, size, false)
&& ! vect_grouped_load_supported (vectype, single_element_p,
size))
return false;
instructions do not generate this SLP instance. */
if (is_a <loop_vec_info> (vinfo)
&& loads_permuted
- && dr && vect_store_lanes_supported (vectype, group_size))
+ && dr && vect_store_lanes_supported (vectype, group_size, false))
{
slp_tree load_node;
FOR_EACH_VEC_ELT (loads, i, load_node)
if (STMT_VINFO_STRIDED_P (stmt_vinfo)
|| ! vect_load_lanes_supported
(STMT_VINFO_VECTYPE (stmt_vinfo),
- GROUP_SIZE (stmt_vinfo)))
+ GROUP_SIZE (stmt_vinfo), false))
break;
}
if (i == loads.length ())
static bool
get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
- vec_load_store_type vls_type,
+ bool masked_p, vec_load_store_type vls_type,
vect_memory_access_type *memory_access_type)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
/* True if we can cope with such overrun by peeling for gaps, so that
there is at least one final scalar iteration after the vector loop. */
- bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner);
+ bool can_overrun_p = (!masked_p
+ && vls_type == VLS_LOAD
+ && loop_vinfo
+ && !loop->inner);
/* There can only be a gap at the end of the group if the stride is
known at compile time. */
and so we are guaranteed to access a non-gap element in the
same B-sized block. */
if (would_overrun_p
+ && !masked_p
&& gap < (vect_known_alignment_in_bytes (first_dr)
/ vect_get_scalar_dr_size (first_dr)))
would_overrun_p = false;
/* Otherwise try using LOAD/STORE_LANES. */
if (*memory_access_type == VMAT_ELEMENTWISE
&& (vls_type == VLS_LOAD
- ? vect_load_lanes_supported (vectype, group_size)
- : vect_store_lanes_supported (vectype, group_size)))
+ ? vect_load_lanes_supported (vectype, group_size, masked_p)
+ : vect_store_lanes_supported (vectype, group_size,
+ masked_p)))
{
*memory_access_type = VMAT_LOAD_STORE_LANES;
overrun_p = would_overrun_p;
gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
while (next_stmt)
{
- gcc_assert (gimple_assign_single_p (next_stmt));
- tree op = gimple_assign_rhs1 (next_stmt);
+ tree op = vect_get_store_rhs (next_stmt);
gimple *def_stmt;
enum vect_def_type dt;
if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
or scatters, fill in GS_INFO accordingly.
SLP says whether we're performing SLP rather than loop vectorization.
+ MASKED_P is true if the statement is conditional on a vectorized mask.
VECTYPE is the vector type that the vectorized statements will use.
NCOPIES is the number of vector statements that will be needed. */
static bool
-get_load_store_type (gimple *stmt, tree vectype, bool slp,
+get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
vec_load_store_type vls_type, unsigned int ncopies,
vect_memory_access_type *memory_access_type,
gather_scatter_info *gs_info)
}
else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
- if (!get_group_load_store_type (stmt, vectype, slp, vls_type,
+ if (!get_group_load_store_type (stmt, vectype, slp, masked_p, vls_type,
memory_access_type))
return false;
}
return false;
vect_memory_access_type memory_access_type;
- if (!get_load_store_type (stmt, vectype, slp, vls_type, ncopies,
+ if (!get_load_store_type (stmt, vectype, slp, mask, vls_type, ncopies,
&memory_access_type, &gs_info))
return false;
if (mask)
{
- if (memory_access_type != VMAT_CONTIGUOUS)
+ if (memory_access_type == VMAT_CONTIGUOUS)
+ {
+ if (!VECTOR_MODE_P (vec_mode)
+ || !can_vec_mask_load_store_p (vec_mode,
+ TYPE_MODE (mask_vectype), false))
+ return false;
+ }
+ else if (memory_access_type != VMAT_LOAD_STORE_LANES)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"unsupported access type for masked store.\n");
return false;
}
- if (!VECTOR_MODE_P (vec_mode)
- || !can_vec_mask_load_store_p (vec_mode, TYPE_MODE (mask_vectype),
- false))
- return false;
}
else
{
write_vector_array (stmt, gsi, vec_oprnd, vec_array, i);
}
- /* Emit:
- MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). */
- data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
- gcall *call = gimple_build_call_internal (IFN_STORE_LANES, 1,
- vec_array);
- gimple_call_set_lhs (call, data_ref);
+ gcall *call;
+ if (mask)
+ {
+ /* Emit:
+ MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
+ VEC_ARRAY). */
+ unsigned int align = TYPE_ALIGN_UNIT (TREE_TYPE (vectype));
+ tree alias_ptr = build_int_cst (ref_type, align);
+ call = gimple_build_call_internal (IFN_MASK_STORE_LANES, 4,
+ dataref_ptr, alias_ptr,
+ vec_mask, vec_array);
+ }
+ else
+ {
+ /* Emit:
+ MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). */
+ data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
+ call = gimple_build_call_internal (IFN_STORE_LANES, 1,
+ vec_array);
+ gimple_call_set_lhs (call, data_ref);
+ }
gimple_call_set_nothrow (call, true);
new_stmt = call;
vect_finish_stmt_generation (stmt, new_stmt, gsi);
}
vect_memory_access_type memory_access_type;
- if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD, ncopies,
+ if (!get_load_store_type (stmt, vectype, slp, mask, VLS_LOAD, ncopies,
&memory_access_type, &gs_info))
return false;
{
if (memory_access_type == VMAT_CONTIGUOUS)
{
- if (!VECTOR_MODE_P (TYPE_MODE (vectype))
- || !can_vec_mask_load_store_p (TYPE_MODE (vectype),
+ machine_mode vec_mode = TYPE_MODE (vectype);
+ if (!VECTOR_MODE_P (vec_mode)
+ || !can_vec_mask_load_store_p (vec_mode,
TYPE_MODE (mask_vectype), true))
return false;
}
return false;
}
}
- else
+ else if (memory_access_type != VMAT_LOAD_STORE_LANES)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
vec_array = create_vector_array (vectype, vec_num);
- /* Emit:
- VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
- data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
- gcall *call = gimple_build_call_internal (IFN_LOAD_LANES, 1,
- data_ref);
+ gcall *call;
+ if (mask)
+ {
+ /* Emit:
+ VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
+ VEC_MASK). */
+ unsigned int align = TYPE_ALIGN_UNIT (TREE_TYPE (vectype));
+ tree alias_ptr = build_int_cst (ref_type, align);
+ call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
+ dataref_ptr, alias_ptr,
+ vec_mask);
+ }
+ else
+ {
+ /* Emit:
+ VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
+ data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
+ call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
+ }
gimple_call_set_lhs (call, vec_array);
gimple_call_set_nothrow (call, true);
new_stmt = call;
tree);
extern tree vect_create_destination_var (tree, tree);
extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
-extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT);
+extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
-extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT);
+extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
extern void vect_permute_store_chain (vec<tree> ,unsigned int, gimple *,
gimple_stmt_iterator *, vec<tree> *);
extern tree vect_setup_realignment (gimple *, gimple_stmt_iterator *, tree *,