From 7e11fc7f5cecffe650b672ac1af212d4bd9f1335 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Sat, 13 Jan 2018 17:57:57 +0000 Subject: [PATCH] Add support for masked load/store_lanes This patch adds support for vectorising groups of IFN_MASK_LOADs and IFN_MASK_STOREs using conditional load/store-lanes instructions. This requires new internal functions to represent the result (IFN_MASK_{LOAD,STORE}_LANES), as well as associated optabs. The normal IFN_{LOAD,STORE}_LANES functions are const operations that logically just perform the permute: the load or store is encoded as a MEM operand to the call statement. In contrast, the IFN_MASK_{LOAD,STORE}_LANES functions use the same kind of interface as IFN_MASK_{LOAD,STORE}, since the memory is only conditionally accessed. The AArch64 patterns were added as part of the main LD[234]/ST[234] patch. 2018-01-13 Richard Sandiford Alan Hayward David Sherwood gcc/ * doc/md.texi (vec_mask_load_lanes@var{m}@var{n}): Document. (vec_mask_store_lanes@var{m}@var{n}): Likewise. * optabs.def (vec_mask_load_lanes_optab): New optab. (vec_mask_store_lanes_optab): Likewise. * internal-fn.def (MASK_LOAD_LANES): New internal function. (MASK_STORE_LANES): Likewise. * internal-fn.c (mask_load_lanes_direct): New macro. (mask_store_lanes_direct): Likewise. (expand_mask_load_optab_fn): Handle masked operations. (expand_mask_load_lanes_optab_fn): New macro. (expand_mask_store_optab_fn): Handle masked operations. (expand_mask_store_lanes_optab_fn): New macro. (direct_mask_load_lanes_optab_supported_p): Likewise. (direct_mask_store_lanes_optab_supported_p): Likewise. * tree-vectorizer.h (vect_store_lanes_supported): Take a masked_p parameter. (vect_load_lanes_supported): Likewise. * tree-vect-data-refs.c (strip_conversion): New function. (can_group_stmts_p): Likewise. (vect_analyze_data_ref_accesses): Use it instead of checking for a pair of assignments. (vect_store_lanes_supported): Take a masked_p parameter. 
(vect_load_lanes_supported): Likewise. * tree-vect-loop.c (vect_analyze_loop_2): Update calls to vect_store_lanes_supported and vect_load_lanes_supported. * tree-vect-slp.c (vect_analyze_slp_instance): Likewise. * tree-vect-stmts.c (get_group_load_store_type): Take a masked_p parameter. Don't allow gaps for masked accesses. Use vect_get_store_rhs. Update calls to vect_store_lanes_supported and vect_load_lanes_supported. (get_load_store_type): Take a masked_p parameter and update call to get_group_load_store_type. (vectorizable_store): Update call to get_load_store_type. Handle IFN_MASK_STORE_LANES. (vectorizable_load): Update call to get_load_store_type. Handle IFN_MASK_LOAD_LANES. gcc/testsuite/ * gcc.dg/vect/vect-ooo-group-1.c: New test. * gcc.target/aarch64/sve/mask_struct_load_1.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_1_run.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_2.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_2_run.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_3.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_3_run.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_4.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_5.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_6.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_7.c: Likewise. * gcc.target/aarch64/sve/mask_struct_load_8.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_1.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_1_run.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_2.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_2_run.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_3.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise. * gcc.target/aarch64/sve/mask_struct_store_4.c: Likewise. 
Co-Authored-By: Alan Hayward Co-Authored-By: David Sherwood From-SVN: r256620 --- gcc/ChangeLog | 41 ++++++++ gcc/doc/md.texi | 36 +++++++ gcc/internal-fn.c | 34 +++++-- gcc/internal-fn.def | 6 ++ gcc/optabs.def | 2 + gcc/testsuite/ChangeLog | 24 +++++ gcc/testsuite/gcc.dg/vect/vect-ooo-group-1.c | 12 +++ .../aarch64/sve/mask_struct_load_1.c | 67 +++++++++++++ .../aarch64/sve/mask_struct_load_1_run.c | 38 ++++++++ .../aarch64/sve/mask_struct_load_2.c | 69 +++++++++++++ .../aarch64/sve/mask_struct_load_2_run.c | 40 ++++++++ .../aarch64/sve/mask_struct_load_3.c | 70 ++++++++++++++ .../aarch64/sve/mask_struct_load_3_run.c | 41 ++++++++ .../aarch64/sve/mask_struct_load_4.c | 67 +++++++++++++ .../aarch64/sve/mask_struct_load_5.c | 67 +++++++++++++ .../aarch64/sve/mask_struct_load_6.c | 40 ++++++++ .../aarch64/sve/mask_struct_load_7.c | 40 ++++++++ .../aarch64/sve/mask_struct_load_8.c | 40 ++++++++ .../aarch64/sve/mask_struct_store_1.c | 73 ++++++++++++++ .../aarch64/sve/mask_struct_store_1_run.c | 38 ++++++++ .../aarch64/sve/mask_struct_store_2.c | 74 ++++++++++++++ .../aarch64/sve/mask_struct_store_2_run.c | 38 ++++++++ .../aarch64/sve/mask_struct_store_3.c | 75 +++++++++++++++ .../aarch64/sve/mask_struct_store_3_run.c | 38 ++++++++ .../aarch64/sve/mask_struct_store_4.c | 44 +++++++++ gcc/tree-vect-data-refs.c | 95 +++++++++++++++--- gcc/tree-vect-loop.c | 4 +- gcc/tree-vect-slp.c | 4 +- gcc/tree-vect-stmts.c | 96 +++++++++++++------ gcc/tree-vectorizer.h | 4 +- 30 files changed, 1260 insertions(+), 57 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-ooo-group-1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3.c 
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_6.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_7.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_4.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index fba27e19675..454af82a477 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,44 @@ +2018-01-13 Richard Sandiford + Alan Hayward + David Sherwood + + * doc/md.texi (vec_mask_load_lanes@var{m}@var{n}): Document. + (vec_mask_store_lanes@var{m}@var{n}): Likewise. + * optabs.def (vec_mask_load_lanes_optab): New optab. + (vec_mask_store_lanes_optab): Likewise. + * internal-fn.def (MASK_LOAD_LANES): New internal function. + (MASK_STORE_LANES): Likewise. + * internal-fn.c (mask_load_lanes_direct): New macro. + (mask_store_lanes_direct): Likewise. + (expand_mask_load_optab_fn): Handle masked operations. + (expand_mask_load_lanes_optab_fn): New macro. + (expand_mask_store_optab_fn): Handle masked operations. + (expand_mask_store_lanes_optab_fn): New macro. + (direct_mask_load_lanes_optab_supported_p): Likewise. + (direct_mask_store_lanes_optab_supported_p): Likewise. 
+ * tree-vectorizer.h (vect_store_lanes_supported): Take a masked_p + parameter. + (vect_load_lanes_supported): Likewise. + * tree-vect-data-refs.c (strip_conversion): New function. + (can_group_stmts_p): Likewise. + (vect_analyze_data_ref_accesses): Use it instead of checking + for a pair of assignments. + (vect_store_lanes_supported): Take a masked_p parameter. + (vect_load_lanes_supported): Likewise. + * tree-vect-loop.c (vect_analyze_loop_2): Update calls to + vect_store_lanes_supported and vect_load_lanes_supported. + * tree-vect-slp.c (vect_analyze_slp_instance): Likewise. + * tree-vect-stmts.c (get_group_load_store_type): Take a masked_p + parameter. Don't allow gaps for masked accesses. + Use vect_get_store_rhs. Update calls to vect_store_lanes_supported + and vect_load_lanes_supported. + (get_load_store_type): Take a masked_p parameter and update + call to get_group_load_store_type. + (vectorizable_store): Update call to get_load_store_type. + Handle IFN_MASK_STORE_LANES. + (vectorizable_load): Update call to get_load_store_type. + Handle IFN_MASK_LOAD_LANES. + 2018-01-13 Richard Sandiford Alan Hayward David Sherwood diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index e956c751b57..213ee9997e3 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -4855,6 +4855,26 @@ loads for vectors of mode @var{n}. This pattern is not allowed to @code{FAIL}. +@cindex @code{vec_mask_load_lanes@var{m}@var{n}} instruction pattern +@item @samp{vec_mask_load_lanes@var{m}@var{n}} +Like @samp{vec_load_lanes@var{m}@var{n}}, but takes an additional +mask operand (operand 2) that specifies which elements of the destination +vectors should be loaded. Other elements of the destination +vectors are set to zero. 
The operation is equivalent to: + +@smallexample +int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n}); +for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++) + if (operand2[j]) + for (i = 0; i < c; i++) + operand0[i][j] = operand1[j * c + i]; + else + for (i = 0; i < c; i++) + operand0[i][j] = 0; +@end smallexample + +This pattern is not allowed to @code{FAIL}. + @cindex @code{vec_store_lanes@var{m}@var{n}} instruction pattern @item @samp{vec_store_lanes@var{m}@var{n}} Equivalent to @samp{vec_load_lanes@var{m}@var{n}}, with the memory @@ -4872,6 +4892,22 @@ for a memory operand 0 and register operand 1. This pattern is not allowed to @code{FAIL}. +@cindex @code{vec_mask_store_lanes@var{m}@var{n}} instruction pattern +@item @samp{vec_mask_store_lanes@var{m}@var{n}} +Like @samp{vec_store_lanes@var{m}@var{n}}, but takes an additional +mask operand (operand 2) that specifies which elements of the source +vectors should be stored. The operation is equivalent to: + +@smallexample +int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n}); +for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++) + if (operand2[j]) + for (i = 0; i < c; i++) + operand0[j * c + i] = operand1[i][j]; +@end smallexample + +This pattern is not allowed to @code{FAIL}. + @cindex @code{vec_set@var{m}} instruction pattern @item @samp{vec_set@var{m}} Set given field in the vector value. 
Operand 0 is the vector to modify, diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 6483fe69425..d9c7a168554 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -82,8 +82,10 @@ init_internal_fns () #define not_direct { -2, -2, false } #define mask_load_direct { -1, 2, false } #define load_lanes_direct { -1, -1, false } +#define mask_load_lanes_direct { -1, -1, false } #define mask_store_direct { 3, 2, false } #define store_lanes_direct { 0, 0, false } +#define mask_store_lanes_direct { 0, 0, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } @@ -2408,7 +2410,7 @@ expand_LOOP_DIST_ALIAS (internal_fn, gcall *) gcc_unreachable (); } -/* Expand MASK_LOAD call STMT using optab OPTAB. */ +/* Expand MASK_LOAD{,_LANES} call STMT using optab OPTAB. */ static void expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) @@ -2417,6 +2419,7 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) tree type, lhs, rhs, maskt, ptr; rtx mem, target, mask; unsigned align; + insn_code icode; maskt = gimple_call_arg (stmt, 2); lhs = gimple_call_lhs (stmt); @@ -2429,6 +2432,12 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) type = build_aligned_type (type, align); rhs = fold_build2 (MEM_REF, type, gimple_call_arg (stmt, 0), ptr); + if (optab == vec_mask_load_lanes_optab) + icode = get_multi_vector_move (type, optab); + else + icode = convert_optab_handler (optab, TYPE_MODE (type), + TYPE_MODE (TREE_TYPE (maskt))); + mem = expand_expr (rhs, NULL_RTX, VOIDmode, EXPAND_WRITE); gcc_assert (MEM_P (mem)); mask = expand_normal (maskt); @@ -2436,12 +2445,12 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) create_output_operand (&ops[0], target, TYPE_MODE (type)); create_fixed_operand (&ops[1], mem); create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt))); - expand_insn (convert_optab_handler (optab, TYPE_MODE (type), - TYPE_MODE (TREE_TYPE 
(maskt))), - 3, ops); + expand_insn (icode, 3, ops); } -/* Expand MASK_STORE call STMT using optab OPTAB. */ +#define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + +/* Expand MASK_STORE{,_LANES} call STMT using optab OPTAB. */ static void expand_mask_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab) @@ -2450,6 +2459,7 @@ expand_mask_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab) tree type, lhs, rhs, maskt, ptr; rtx mem, reg, mask; unsigned align; + insn_code icode; maskt = gimple_call_arg (stmt, 2); rhs = gimple_call_arg (stmt, 3); @@ -2460,6 +2470,12 @@ expand_mask_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab) type = build_aligned_type (type, align); lhs = fold_build2 (MEM_REF, type, gimple_call_arg (stmt, 0), ptr); + if (optab == vec_mask_store_lanes_optab) + icode = get_multi_vector_move (type, optab); + else + icode = convert_optab_handler (optab, TYPE_MODE (type), + TYPE_MODE (TREE_TYPE (maskt))); + mem = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); gcc_assert (MEM_P (mem)); mask = expand_normal (maskt); @@ -2467,11 +2483,11 @@ expand_mask_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab) create_fixed_operand (&ops[0], mem); create_input_operand (&ops[1], reg, TYPE_MODE (type)); create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt))); - expand_insn (convert_optab_handler (optab, TYPE_MODE (type), - TYPE_MODE (TREE_TYPE (maskt))), - 3, ops); + expand_insn (icode, 3, ops); } +#define expand_mask_store_lanes_optab_fn expand_mask_store_optab_fn + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -2871,8 +2887,10 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_binary_optab_supported_p direct_optab_supported_p #define direct_mask_load_optab_supported_p direct_optab_supported_p #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p +#define direct_mask_load_lanes_optab_supported_p 
multi_vector_optab_supported_p #define direct_mask_store_optab_supported_p direct_optab_supported_p #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p +#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 76c87846424..4dc07c985a7 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -47,9 +47,11 @@ along with GCC; see the file COPYING3. If not see - mask_load: currently just maskload - load_lanes: currently just vec_load_lanes + - mask_load_lanes: currently just vec_mask_load_lanes - mask_store: currently just maskstore - store_lanes: currently just vec_store_lanes + - mask_store_lanes: currently just vec_mask_store_lanes DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that maps to one of two optabs, depending on the signedness of an input. @@ -106,9 +108,13 @@ along with GCC; see the file COPYING3. If not see DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) +DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + vec_mask_load_lanes, mask_load_lanes) DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store) DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes) +DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0, + vec_mask_store_lanes, mask_store_lanes) DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary) diff --git a/gcc/optabs.def b/gcc/optabs.def index d9ec1eacd07..c22708b6943 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -80,6 +80,8 @@ OPTAB_CD(ssmsub_widen_optab, "ssmsub$b$a4") OPTAB_CD(usmsub_widen_optab, "usmsub$a$b4") OPTAB_CD(vec_load_lanes_optab, "vec_load_lanes$a$b") OPTAB_CD(vec_store_lanes_optab, "vec_store_lanes$a$b") +OPTAB_CD(vec_mask_load_lanes_optab, "vec_mask_load_lanes$a$b") +OPTAB_CD(vec_mask_store_lanes_optab, "vec_mask_store_lanes$a$b") 
OPTAB_CD(vcond_optab, "vcond$a$b") OPTAB_CD(vcondu_optab, "vcondu$a$b") OPTAB_CD(vcondeq_optab, "vcondeq$a$b") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 735a4bed9d8..4b1b97425bb 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,27 @@ +2018-01-13 Richard Sandiford + Alan Hayward + David Sherwood + + * gcc.dg/vect/vect-ooo-group-1.c: New test. + * gcc.target/aarch64/sve/mask_struct_load_1.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_1_run.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_2.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_2_run.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_3.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_3_run.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_4.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_5.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_6.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_7.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_load_8.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_1.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_1_run.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_2.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_2_run.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_3.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise. + * gcc.target/aarch64/sve/mask_struct_store_4.c: Likewise. 
+ 2018-01-13 Richard Sandiford Alan Hayward David Sherwood diff --git a/gcc/testsuite/gcc.dg/vect/vect-ooo-group-1.c b/gcc/testsuite/gcc.dg/vect/vect-ooo-group-1.c new file mode 100644 index 00000000000..416198354ff --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-ooo-group-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +void +f (int *restrict a, int *restrict b, int *restrict c) +{ + for (int i = 0; i < 100; ++i) + if (c[i]) + { + a[i * 2] = b[i * 5 + 2]; + a[i * 2 + 1] = b[i * 5]; + } +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1.c new file mode 100644 index 00000000000..c3c335e3350 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1.c @@ -0,0 +1,67 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = src[i * 2] + src[i * 2 + 1]; \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 1 1 
1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld2b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for half float) + 32 | 2 1 1 1 + 64 | 2 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld2h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tld2w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tld2d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1_run.c new file mode 100644 index 00000000000..7df791f04fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_1_run.c @@ -0,0 +1,38 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "mask_struct_load_1.c" + +#define N 100 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + { \ + OUTTYPE out[N]; \ + INTYPE in[N * 2]; \ + MASKTYPE mask[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + out[i] = i * 7 / 2; \ + mask[i] = i % 5 <= i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 2; ++i) \ + in[i] = i * 9 / 2; \ + NAME##_2 (out, in, mask, N); \ + for (int i = 0; i < N; ++i) \ + { \ + OUTTYPE if_true = in[i * 2] + in[i * 2 + 1]; \ + OUTTYPE if_false = i * 7 / 2; \ + if (out[i] != (mask[i] ? 
if_true : if_false)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2.c new file mode 100644 index 00000000000..1afb21fc2e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = (src[i * 3] \ + + src[i * 3 + 1] \ + + src[i * 3 + 2]); \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld3b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for _Float16) + 32 | 2 1 1 1 + 64 | 2 1 1 1. 
*/ +/* { dg-final { scan-assembler-times {\tld3h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tld3w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tld3d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2_run.c new file mode 100644 index 00000000000..d9dcab38713 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_2_run.c @@ -0,0 +1,40 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "mask_struct_load_2.c" + +#define N 100 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + { \ + OUTTYPE out[N]; \ + INTYPE in[N * 3]; \ + MASKTYPE mask[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + out[i] = i * 7 / 2; \ + mask[i] = i % 5 <= i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 3; ++i) \ + in[i] = i * 9 / 2; \ + NAME##_3 (out, in, mask, N); \ + for (int i = 0; i < N; ++i) \ + { \ + OUTTYPE if_true = (in[i * 3] \ + + in[i * 3 + 1] \ + + in[i * 3 + 2]); \ + OUTTYPE if_false = i * 7 / 2; \ + if (out[i] != (mask[i] ? 
if_true : if_false)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3.c new file mode 100644 index 00000000000..a5b386d19b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = (src[i * 4] \ + + src[i * 4 + 1] \ + + src[i * 4 + 2] \ + + src[i * 4 + 3]); \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld4b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for half float) + 32 | 2 1 1 1 + 64 | 2 1 1 1. 
*/ +/* { dg-final { scan-assembler-times {\tld4h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tld4w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tld4d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3_run.c new file mode 100644 index 00000000000..8bc3b08fcf4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_3_run.c @@ -0,0 +1,41 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "mask_struct_load_3.c" + +#define N 100 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + { \ + OUTTYPE out[N]; \ + INTYPE in[N * 4]; \ + MASKTYPE mask[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + out[i] = i * 7 / 2; \ + mask[i] = i % 5 <= i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 4; ++i) \ + in[i] = i * 9 / 2; \ + NAME##_4 (out, in, mask, N); \ + for (int i = 0; i < N; ++i) \ + { \ + OUTTYPE if_true = (in[i * 4] \ + + in[i * 4 + 1] \ + + in[i * 4 + 2] \ + + in[i * 4 + 3]); \ + OUTTYPE if_false = i * 7 / 2; \ + if (out[i] != (mask[i] ? 
if_true : if_false)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_4.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_4.c new file mode 100644 index 00000000000..9c66643c1e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_4.c @@ -0,0 +1,67 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = src[i * 3] + src[i * 3 + 2]; \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld3b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for half float) + 32 | 2 1 1 1 + 64 | 2 1 1 1. 
*/ +/* { dg-final { scan-assembler-times {\tld3h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tld3w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tld3d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c new file mode 100644 index 00000000000..41412350633 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c @@ -0,0 +1,67 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = src[i * 4] + src[i * 4 + 3]; \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 
| 1 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld4b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for half float) + 32 | 2 1 1 1 + 64 | 2 1 1 1. */ +/* { dg-final { scan-assembler-times {\tld4h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tld4w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + Out 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tld4d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_6.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_6.c new file mode 100644 index 00000000000..805622624aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_6.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = src[i * 2]; \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + 
TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* { dg-final { scan-assembler-not {\tld2b\t} } } */ +/* { dg-final { scan-assembler-not {\tld2h\t} } } */ +/* { dg-final { scan-assembler-not {\tld2w\t} } } */ +/* { dg-final { scan-assembler-not {\tld2d\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_7.c new file mode 100644 index 00000000000..982fa8b6222 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_7.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = src[i * 3] + src[i * 3 + 1]; \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* { dg-final { scan-assembler-not {\tld3b\t} } } */ +/* { dg-final { scan-assembler-not {\tld3h\t} } } */ +/* { dg-final { scan-assembler-not {\tld3w\t} } } */ +/* { dg-final { scan-assembler-not {\tld3d\t} } } */ diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_8.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_8.c new file mode 100644 index 00000000000..c1da197cab6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_8.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cond[i]) \ + dest[i] = src[i * 4] + src[i * 4 + 2]; \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* { dg-final { scan-assembler-not {\tld4b\t} } } */ +/* { dg-final { scan-assembler-not {\tld4h\t} } } */ +/* { dg-final { scan-assembler-not {\tld4w\t} } } */ +/* { dg-final { scan-assembler-not {\tld4d\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1.c new file mode 100644 index 00000000000..47ad135ecdf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize 
-ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, INTYPE bias, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + INTYPE value = src[i] + bias; \ + if (cond[i]) \ + { \ + dest[i * 2] = value; \ + dest[i * 2 + 1] = value; \ + } \ + } \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. */ +/* { dg-final { scan-assembler-times {\tst2b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for _Float16) + 32 | 2 1 1 1 + 64 | 2 1 1 1. */ +/* { dg-final { scan-assembler-times {\tst2h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tst2w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). 
*/ +/* { dg-final { scan-assembler-times {\tst2d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1_run.c new file mode 100644 index 00000000000..88f248c7304 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_1_run.c @@ -0,0 +1,38 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "mask_struct_store_1.c" + +#define N 100 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + { \ + OUTTYPE out[N * 2]; \ + INTYPE in[N]; \ + MASKTYPE mask[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + in[i] = i * 7 / 2; \ + mask[i] = i % 5 <= i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 2; ++i) \ + out[i] = i * 9 / 2; \ + NAME##_2 (out, in, mask, 17, N); \ + for (int i = 0; i < N * 2; ++i) \ + { \ + OUTTYPE if_true = (INTYPE) (in[i / 2] + 17); \ + OUTTYPE if_false = i * 9 / 2; \ + if (out[i] != (mask[i / 2] ? 
if_true : if_false)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2.c new file mode 100644 index 00000000000..e87a31c765e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_3 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, INTYPE bias, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + INTYPE value = src[i] + bias; \ + if (cond[i]) \ + { \ + dest[i * 3] = value; \ + dest[i * 3 + 1] = value; \ + dest[i * 3 + 2] = value; \ + } \ + } \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. 
*/ +/* { dg-final { scan-assembler-times {\tst3b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for _Float16) + 32 | 2 1 1 1 + 64 | 2 1 1 1. */ +/* { dg-final { scan-assembler-times {\tst3h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tst3w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tst3d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2_run.c new file mode 100644 index 00000000000..9fd45ffd4dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_2_run.c @@ -0,0 +1,38 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "mask_struct_store_2.c" + +#define N 100 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + { \ + OUTTYPE out[N * 3]; \ + INTYPE in[N]; \ + MASKTYPE mask[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + in[i] = i * 7 / 2; \ + mask[i] = i % 5 <= i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 3; ++i) \ + out[i] = i * 9 / 2; \ + NAME##_3 (out, in, mask, 11, N); \ + for (int i = 0; i < N * 3; ++i) \ + { \ + OUTTYPE if_true = (INTYPE) (in[i / 3] + 11); \ + OUTTYPE if_false = i * 9 / 2; \ + if (out[i] != (mask[i / 3] ? 
if_true : if_false)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c new file mode 100644 index 00000000000..908b12b502e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c @@ -0,0 +1,75 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_4 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, INTYPE bias, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + INTYPE value = src[i] + bias; \ + if (cond[i]) \ + { \ + dest[i * 4] = value; \ + dest[i * 4 + 1] = value; \ + dest[i * 4 + 2] = value; \ + dest[i * 4 + 3] = value; \ + } \ + } \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 1 1 1 1 + 16 | 1 1 1 1 + 32 | 1 1 1 1 + 64 | 1 1 1 1. 
*/ +/* { dg-final { scan-assembler-times {\tst4b\t.z[0-9]} 16 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 2 2 2 2 + 16 | 2 1 1 1 x2 (for half float) + 32 | 2 1 1 1 + 64 | 2 1 1 1. */ +/* { dg-final { scan-assembler-times {\tst4h\t.z[0-9]} 28 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 4 4 4 4 + 16 | 4 2 2 2 + 32 | 4 2 1 1 x2 (for float) + 64 | 4 2 1 1. */ +/* { dg-final { scan-assembler-times {\tst4w\t.z[0-9]} 50 } } */ + +/* Mask | 8 16 32 64 + -------+------------ + In 8 | 8 8 8 8 + 16 | 8 4 4 4 + 32 | 8 4 2 2 + 64 | 8 4 2 1 x2 (for double). */ +/* { dg-final { scan-assembler-times {\tst4d\t.z[0-9]} 98 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c new file mode 100644 index 00000000000..31d661b6594 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c @@ -0,0 +1,38 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "mask_struct_store_3.c" + +#define N 100 + +#undef TEST_LOOP +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + { \ + OUTTYPE out[N * 4]; \ + INTYPE in[N]; \ + MASKTYPE mask[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + in[i] = i * 7 / 2; \ + mask[i] = i % 5 <= i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 4; ++i) \ + out[i] = i * 9 / 2; \ + NAME##_4 (out, in, mask, 42, N); \ + for (int i = 0; i < N * 4; ++i) \ + { \ + OUTTYPE if_true = (INTYPE) (in[i / 4] + 42); \ + OUTTYPE if_false = i * 9 / 2; \ + if (out[i] != (mask[i / 4] ? 
if_true : if_false)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST (test); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_4.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_4.c new file mode 100644 index 00000000000..f7b63b51488 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_4.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE) \ + void __attribute__ ((noinline, noclone)) \ + NAME##_2 (OUTTYPE *__restrict dest, INTYPE *__restrict src, \ + MASKTYPE *__restrict cond, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + if (cond[i] < 8) \ + dest[i * 2] = src[i]; \ + if (cond[i] > 2) \ + dest[i * 2 + 1] = src[i]; \ + } \ + } + +#define TEST2(NAME, OUTTYPE, INTYPE) \ + TEST_LOOP (NAME##_i8, OUTTYPE, INTYPE, signed char) \ + TEST_LOOP (NAME##_i16, OUTTYPE, INTYPE, unsigned short) \ + TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, float) \ + TEST_LOOP (NAME##_f64, OUTTYPE, INTYPE, double) + +#define TEST1(NAME, OUTTYPE) \ + TEST2 (NAME##_i8, OUTTYPE, signed char) \ + TEST2 (NAME##_i16, OUTTYPE, unsigned short) \ + TEST2 (NAME##_i32, OUTTYPE, int) \ + TEST2 (NAME##_i64, OUTTYPE, unsigned long) + +#define TEST(NAME) \ + TEST1 (NAME##_i8, signed char) \ + TEST1 (NAME##_i16, unsigned short) \ + TEST1 (NAME##_i32, int) \ + TEST1 (NAME##_i64, unsigned long) \ + TEST2 (NAME##_f16_f16, _Float16, _Float16) \ + TEST2 (NAME##_f32_f32, float, float) \ + TEST2 (NAME##_f64_f64, double, double) + +TEST (test) + +/* { dg-final { scan-assembler-not {\tst2b\t.z[0-9]} } } */ +/* { dg-final { scan-assembler-not {\tst2h\t.z[0-9]} } } */ +/* { dg-final { scan-assembler-not {\tst2w\t.z[0-9]} } } */ +/* { dg-final { scan-assembler-not {\tst2d\t.z[0-9]} } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 
759c1e30edf..23b10844ffc 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2780,6 +2780,62 @@ dr_group_sort_cmp (const void *dra_, const void *drb_) return cmp; } +/* If OP is the result of a conversion, return the unconverted value, + otherwise return null. */ + +static tree +strip_conversion (tree op) +{ + if (TREE_CODE (op) != SSA_NAME) + return NULL_TREE; + gimple *stmt = SSA_NAME_DEF_STMT (op); + if (!is_gimple_assign (stmt) + || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) + return NULL_TREE; + return gimple_assign_rhs1 (stmt); +} + +/* Return true if vectorizable_* routines can handle statements STMT1 + and STMT2 being in a single group. */ + +static bool +can_group_stmts_p (gimple *stmt1, gimple *stmt2) +{ + if (gimple_assign_single_p (stmt1)) + return gimple_assign_single_p (stmt2); + + if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1)) + { + /* Check for two masked loads or two masked stores. */ + if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2)) + return false; + internal_fn ifn = gimple_call_internal_fn (stmt1); + if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) + return false; + if (ifn != gimple_call_internal_fn (stmt2)) + return false; + + /* Check that the masks are the same. Cope with casts of masks, + like those created by build_mask_conversion. */ + tree mask1 = gimple_call_arg (stmt1, 2); + tree mask2 = gimple_call_arg (stmt2, 2); + if (!operand_equal_p (mask1, mask2, 0)) + { + mask1 = strip_conversion (mask1); + if (!mask1) + return false; + mask2 = strip_conversion (mask2); + if (!mask2) + return false; + if (!operand_equal_p (mask1, mask2, 0)) + return false; + } + return true; + } + + return false; +} + /* Function vect_analyze_data_ref_accesses. Analyze the access pattern of all the data references in the loop. 
@@ -2846,8 +2902,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) || data_ref_compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)) != 0 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 - || !gimple_assign_single_p (DR_STMT (dra)) - || !gimple_assign_single_p (DR_STMT (drb))) + || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb))) break; /* Check that the data-refs have the same constant size. */ @@ -4684,15 +4739,21 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) } -/* Return TRUE if vec_store_lanes is available for COUNT vectors of - type VECTYPE. */ +/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of + type VECTYPE. MASKED_P says whether the masked form is needed. */ bool -vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) +vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, + bool masked_p) { - return vect_lanes_optab_supported_p ("vec_store_lanes", - vec_store_lanes_optab, - vectype, count); + if (masked_p) + return vect_lanes_optab_supported_p ("vec_mask_store_lanes", + vec_mask_store_lanes_optab, + vectype, count); + else + return vect_lanes_optab_supported_p ("vec_store_lanes", + vec_store_lanes_optab, + vectype, count); } @@ -5283,15 +5344,21 @@ vect_grouped_load_supported (tree vectype, bool single_element_p, return false; } -/* Return TRUE if vec_load_lanes is available for COUNT vectors of - type VECTYPE. */ +/* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of + type VECTYPE. MASKED_P says whether the masked form is needed. 
*/ bool -vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) +vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, + bool masked_p) { - return vect_lanes_optab_supported_p ("vec_load_lanes", - vec_load_lanes_optab, - vectype, count); + if (masked_p) + return vect_lanes_optab_supported_p ("vec_mask_load_lanes", + vec_mask_load_lanes_optab, + vectype, count); + else + return vect_lanes_optab_supported_p ("vec_load_lanes", + vec_load_lanes_optab, + vectype, count); } /* Function vect_permute_load_chain. diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index c6fa5198055..8d1e003048a 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2250,7 +2250,7 @@ again: vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo); tree vectype = STMT_VINFO_VECTYPE (vinfo); - if (! vect_store_lanes_supported (vectype, size) + if (! vect_store_lanes_supported (vectype, size, false) && ! vect_grouped_store_supported (vectype, size)) return false; FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) @@ -2260,7 +2260,7 @@ again: bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo); size = STMT_VINFO_GROUP_SIZE (vinfo); vectype = STMT_VINFO_VECTYPE (vinfo); - if (! vect_load_lanes_supported (vectype, size) + if (! vect_load_lanes_supported (vectype, size, false) && ! vect_grouped_load_supported (vectype, single_element_p, size)) return false; diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 2a6d9244109..f52d8276573 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -2189,7 +2189,7 @@ vect_analyze_slp_instance (vec_info *vinfo, instructions do not generate this SLP instance. 
*/ if (is_a (vinfo) && loads_permuted - && dr && vect_store_lanes_supported (vectype, group_size)) + && dr && vect_store_lanes_supported (vectype, group_size, false)) { slp_tree load_node; FOR_EACH_VEC_ELT (loads, i, load_node) @@ -2202,7 +2202,7 @@ vect_analyze_slp_instance (vec_info *vinfo, if (STMT_VINFO_STRIDED_P (stmt_vinfo) || ! vect_load_lanes_supported (STMT_VINFO_VECTYPE (stmt_vinfo), - GROUP_SIZE (stmt_vinfo))) + GROUP_SIZE (stmt_vinfo), false)) break; } if (i == loads.length ()) diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 50b35fc60af..d9d747ab9a0 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1757,7 +1757,7 @@ vect_get_store_rhs (gimple *stmt) static bool get_group_load_store_type (gimple *stmt, tree vectype, bool slp, - vec_load_store_type vls_type, + bool masked_p, vec_load_store_type vls_type, vect_memory_access_type *memory_access_type) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); @@ -1778,7 +1778,10 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp, /* True if we can cope with such overrun by peeling for gaps, so that there is at least one final scalar iteration after the vector loop. */ - bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner); + bool can_overrun_p = (!masked_p + && vls_type == VLS_LOAD + && loop_vinfo + && !loop->inner); /* There can only be a gap at the end of the group if the stride is known at compile time. */ @@ -1841,6 +1844,7 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp, and so we are guaranteed to access a non-gap element in the same B-sized block. */ if (would_overrun_p + && !masked_p && gap < (vect_known_alignment_in_bytes (first_dr) / vect_get_scalar_dr_size (first_dr))) would_overrun_p = false; @@ -1857,8 +1861,9 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp, /* Otherwise try using LOAD/STORE_LANES. */ if (*memory_access_type == VMAT_ELEMENTWISE && (vls_type == VLS_LOAD - ? 
vect_load_lanes_supported (vectype, group_size) - : vect_store_lanes_supported (vectype, group_size))) + ? vect_load_lanes_supported (vectype, group_size, masked_p) + : vect_store_lanes_supported (vectype, group_size, + masked_p))) { *memory_access_type = VMAT_LOAD_STORE_LANES; overrun_p = would_overrun_p; @@ -1884,8 +1889,7 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp, gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info); while (next_stmt) { - gcc_assert (gimple_assign_single_p (next_stmt)); - tree op = gimple_assign_rhs1 (next_stmt); + tree op = vect_get_store_rhs (next_stmt); gimple *def_stmt; enum vect_def_type dt; if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt)) @@ -1969,11 +1973,12 @@ get_negative_load_store_type (gimple *stmt, tree vectype, or scatters, fill in GS_INFO accordingly. SLP says whether we're performing SLP rather than loop vectorization. + MASKED_P is true if the statement is conditional on a vectorized mask. VECTYPE is the vector type that the vectorized statements will use. NCOPIES is the number of vector statements that will be needed. 
*/ static bool -get_load_store_type (gimple *stmt, tree vectype, bool slp, +get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p, vec_load_store_type vls_type, unsigned int ncopies, vect_memory_access_type *memory_access_type, gather_scatter_info *gs_info) @@ -2001,7 +2006,7 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, } else if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) { - if (!get_group_load_store_type (stmt, vectype, slp, vls_type, + if (!get_group_load_store_type (stmt, vectype, slp, masked_p, vls_type, memory_access_type)) return false; } @@ -5762,23 +5767,26 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; vect_memory_access_type memory_access_type; - if (!get_load_store_type (stmt, vectype, slp, vls_type, ncopies, + if (!get_load_store_type (stmt, vectype, slp, mask, vls_type, ncopies, &memory_access_type, &gs_info)) return false; if (mask) { - if (memory_access_type != VMAT_CONTIGUOUS) + if (memory_access_type == VMAT_CONTIGUOUS) + { + if (!VECTOR_MODE_P (vec_mode) + || !can_vec_mask_load_store_p (vec_mode, + TYPE_MODE (mask_vectype), false)) + return false; + } + else if (memory_access_type != VMAT_LOAD_STORE_LANES) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "unsupported access type for masked store.\n"); return false; } - if (!VECTOR_MODE_P (vec_mode) - || !can_vec_mask_load_store_p (vec_mode, TYPE_MODE (mask_vectype), - false)) - return false; } else { @@ -6421,12 +6429,27 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, write_vector_array (stmt, gsi, vec_oprnd, vec_array, i); } - /* Emit: - MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). 
*/ - data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); - gcall *call = gimple_build_call_internal (IFN_STORE_LANES, 1, - vec_array); - gimple_call_set_lhs (call, data_ref); + gcall *call; + if (mask) + { + /* Emit: + MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK, + VEC_ARRAY). */ + unsigned int align = TYPE_ALIGN_UNIT (TREE_TYPE (vectype)); + tree alias_ptr = build_int_cst (ref_type, align); + call = gimple_build_call_internal (IFN_MASK_STORE_LANES, 4, + dataref_ptr, alias_ptr, + vec_mask, vec_array); + } + else + { + /* Emit: + MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). */ + data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); + call = gimple_build_call_internal (IFN_STORE_LANES, 1, + vec_array); + gimple_call_set_lhs (call, data_ref); + } gimple_call_set_nothrow (call, true); new_stmt = call; vect_finish_stmt_generation (stmt, new_stmt, gsi); @@ -6870,7 +6893,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } vect_memory_access_type memory_access_type; - if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD, ncopies, + if (!get_load_store_type (stmt, vectype, slp, mask, VLS_LOAD, ncopies, &memory_access_type, &gs_info)) return false; @@ -6878,8 +6901,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, { if (memory_access_type == VMAT_CONTIGUOUS) { - if (!VECTOR_MODE_P (TYPE_MODE (vectype)) - || !can_vec_mask_load_store_p (TYPE_MODE (vectype), + machine_mode vec_mode = TYPE_MODE (vectype); + if (!VECTOR_MODE_P (vec_mode) + || !can_vec_mask_load_store_p (vec_mode, TYPE_MODE (mask_vectype), true)) return false; } @@ -6897,7 +6921,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; } } - else + else if (memory_access_type != VMAT_LOAD_STORE_LANES) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -7447,11 +7471,25 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, 
gimple **vec_stmt, vec_array = create_vector_array (vectype, vec_num); - /* Emit: - VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */ - data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); - gcall *call = gimple_build_call_internal (IFN_LOAD_LANES, 1, - data_ref); + gcall *call; + if (mask) + { + /* Emit: + VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, + VEC_MASK). */ + unsigned int align = TYPE_ALIGN_UNIT (TREE_TYPE (vectype)); + tree alias_ptr = build_int_cst (ref_type, align); + call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, + dataref_ptr, alias_ptr, + vec_mask); + } + else + { + /* Emit: + VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */ + data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); + call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref); + } gimple_call_set_lhs (call, vec_array); gimple_call_set_nothrow (call, true); new_stmt = call; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 129cde07cbf..000688d05d9 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1293,9 +1293,9 @@ extern tree bump_vector_ptr (tree, gimple *, gimple_stmt_iterator *, gimple *, tree); extern tree vect_create_destination_var (tree, tree); extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT); -extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT); +extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT); -extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT); +extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); extern void vect_permute_store_chain (vec ,unsigned int, gimple *, gimple_stmt_iterator *, vec *); extern tree vect_setup_realignment (gimple *, gimple_stmt_iterator *, tree *, -- 2.30.2