+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
+ * doc/sourcebuild.texi (vect_scatter_store): Document.
+ * optabs.def (scatter_store_optab, mask_scatter_store_optab): New
+ optabs.
+ * doc/md.texi (scatter_store@var{m}, mask_scatter_store@var{m}):
+ Document.
+ * genopinit.c (main): Add supports_vec_scatter_store and
+ supports_vec_scatter_store_cached to target_optabs.
+ * gimple.h (gimple_expr_type): Handle IFN_SCATTER_STORE and
+ IFN_MASK_SCATTER_STORE.
+ * internal-fn.def (SCATTER_STORE, MASK_SCATTER_STORE): New internal
+ functions.
+ * internal-fn.h (internal_store_fn_p): Declare.
+ (internal_fn_stored_value_index): Likewise.
+ * internal-fn.c (scatter_store_direct): New macro.
+ (expand_scatter_store_optab_fn): New function.
+ (direct_scatter_store_optab_supported_p): New macro.
+ (internal_store_fn_p): New function.
+ (internal_gather_scatter_fn_p): Handle IFN_SCATTER_STORE and
+ IFN_MASK_SCATTER_STORE.
+ (internal_fn_mask_index): Likewise.
+ (internal_fn_stored_value_index): New function.
+ (internal_gather_scatter_fn_supported_p): Adjust operand numbers
+ for scatter stores.
+ * optabs-query.h (supports_vec_scatter_store_p): Declare.
+ * optabs-query.c (supports_vec_scatter_store_p): New function.
+ * tree-vectorizer.h (vect_get_store_rhs): Declare.
+ * tree-vect-data-refs.c (vect_analyze_data_ref_access): Return
+ true for scatter stores.
+ (vect_gather_scatter_fn_p): Handle scatter stores too.
+ (vect_check_gather_scatter): Consider using scatter stores if
+ supports_vec_scatter_store_p.
+ * tree-vect-patterns.c (vect_try_gather_scatter_pattern): Handle
+ scatter stores too.
+ * tree-vect-stmts.c (exist_non_indexing_operands_for_use_p): Use
+ internal_fn_stored_value_index.
+ (check_load_store_masking): Handle scatter stores too.
+ (vect_get_store_rhs): Make public.
+ (vectorizable_call): Use internal_store_fn_p.
+ (vectorizable_store): Handle scatter store internal functions.
+ (vect_transform_stmt): Compare GROUP_STORE_COUNT with GROUP_SIZE
+ when deciding whether the end of the group has been reached.
+ * config/aarch64/aarch64.md (UNSPEC_ST1_SCATTER): New unspec.
+ * config/aarch64/aarch64-sve.md (scatter_store<mode>): New expander.
+ (mask_scatter_store<mode>): New insns.
+
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
)
+;; Unpredicated scatter store.
+(define_expand "scatter_store<mode>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_dup 5)
+ (match_operand:DI 0 "aarch64_reg_or_zero")
+ (match_operand:<V_INT_EQUIV> 1 "register_operand")
+ (match_operand:DI 2 "const_int_operand")
+ (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>")
+ (match_operand:SVE_SD 4 "register_operand")]
+ UNSPEC_ST1_SCATTER))]
+ "TARGET_SVE"
+ {
+ operands[5] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+ }
+)
+
+;; Predicated scatter stores for 32-bit elements. Operand 2 is true for
+;; unsigned extension and false for signed extension.
+(define_insn "mask_scatter_store<mode>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl, Upl, Upl")
+ (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk")
+ (match_operand:<V_INT_EQUIV> 1 "register_operand" "w, w, w, w, w")
+ (match_operand:DI 2 "const_int_operand" "i, Z, Ui1, Z, Ui1")
+ (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i")
+ (match_operand:SVE_S 4 "register_operand" "w, w, w, w, w")]
+ UNSPEC_ST1_SCATTER))]
+ "TARGET_SVE"
+ "@
+ st1w\t%4.s, %5, [%1.s]
+ st1w\t%4.s, %5, [%0, %1.s, sxtw]
+ st1w\t%4.s, %5, [%0, %1.s, uxtw]
+ st1w\t%4.s, %5, [%0, %1.s, sxtw %p3]
+ st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]"
+)
+
+;; Predicated scatter stores for 64-bit elements. The value of operand 2
+;; doesn't matter in this case.
+(define_insn "mask_scatter_store<mode>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl")
+ (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk")
+ (match_operand:<V_INT_EQUIV> 1 "register_operand" "w, w, w")
+ (match_operand:DI 2 "const_int_operand")
+ (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i")
+ (match_operand:SVE_D 4 "register_operand" "w, w, w")]
+ UNSPEC_ST1_SCATTER))]
+ "TARGET_SVE"
+ "@
+ st1d\t%4.d, %5, [%1.d]
+ st1d\t%4.d, %5, [%0, %1.d]
+ st1d\t%4.d, %5, [%0, %1.d, lsl %p3]"
+)
+
;; SVE structure moves.
(define_expand "mov<mode>"
[(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
UNSPEC_ST1_SVE
UNSPEC_LD1RQ
UNSPEC_LD1_GATHER
+ UNSPEC_ST1_SCATTER
UNSPEC_MERGE_PTRUE
UNSPEC_PTEST_PTRUE
UNSPEC_UNPACKSHI
of the result should be loaded from memory and clear if element @var{i}
of the result should be set to zero.
+@cindex @code{scatter_store@var{m}} instruction pattern
+@item @samp{scatter_store@var{m}}
+Store a vector of mode @var{m} into several distinct memory locations.
+Operand 0 is a scalar base address and operand 1 is a vector of offsets
+from that base. Operand 4 is the vector of values that should be stored,
+which has the same number of elements as the offset vector. For each element
+index @var{i}:
+
+@itemize @bullet
+@item
+extend the offset element @var{i} to address width, using zero
+extension if operand 2 is 1 and sign extension if operand 2 is zero;
+@item
+multiply the extended offset by operand 3;
+@item
+add the result to the base; and
+@item
+store element @var{i} of operand 4 to that address.
+@end itemize
+
+The value of operand 2 does not matter if the offsets are already
+address width. (A scalar C sketch of these steps is given after the
+@samp{mask_scatter_store@var{m}} entry below.)
+
+@cindex @code{mask_scatter_store@var{m}} instruction pattern
+@item @samp{mask_scatter_store@var{m}}
+Like @samp{scatter_store@var{m}}, but takes an extra mask operand as
+operand 5. Bit @var{i} of the mask is set if element @var{i}
+of operand 4 should be stored to memory.
+
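The per-element steps documented above map directly onto scalar C. The following sketch is purely illustrative and not part of the patch: the function name, the fixed element count NELTS, and the choice of 32-bit elements are assumptions made for the example. The unpredicated scatter_store form behaves as if every mask element were true.

    #include <stdbool.h>
    #include <stdint.h>

    /* Scalar model of mask_scatter_store<m> for 32-bit elements:
       operand 0 = BASE, operand 1 = OFFSETS, operand 2 = ZERO_EXTEND_P,
       operand 3 = SCALE, operand 4 = VALUES, operand 5 = MASK.
       NELTS stands in for the number of vector elements.  */
    #define NELTS 4

    void
    mask_scatter_store_model (char *base, const int32_t offsets[NELTS],
                              int zero_extend_p, int64_t scale,
                              const int32_t values[NELTS],
                              const bool mask[NELTS])
    {
      for (int i = 0; i < NELTS; ++i)
        if (mask[i])
          {
            /* Extend offset element I to address width.  */
            int64_t offset = (zero_extend_p
                              ? (int64_t) (uint32_t) offsets[i]
                              : (int64_t) offsets[i]);
            /* Multiply by the scale, add to the base, and store element I
               of the value vector to that address.  */
            *(int32_t *) (base + offset * scale) = values[i];
          }
    }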
@cindex @code{vec_set@var{m}} instruction pattern
@item @samp{vec_set@var{m}}
Set given field in the vector value. Operand 0 is the vector to modify,
@item vect_masked_store
Target supports vector masked stores.
+@item vect_scatter_store
+Target supports vector scatter stores.
+
@item vect_aligned_arrays
Target aligns arrays to vector alignment boundary.
" mode. */\n"
" bool supports_vec_gather_load;\n"
" bool supports_vec_gather_load_cached;\n"
+ " bool supports_vec_scatter_store;\n"
+ " bool supports_vec_scatter_store_cached;\n"
"};\n"
"extern void init_all_optabs (struct target_optabs *);\n"
"\n"
if (code == GIMPLE_CALL)
{
const gcall *call_stmt = as_a <const gcall *> (stmt);
- if (gimple_call_internal_p (call_stmt)
- && gimple_call_internal_fn (call_stmt) == IFN_MASK_STORE)
- return TREE_TYPE (gimple_call_arg (call_stmt, 3));
- else
- return gimple_call_return_type (call_stmt);
+ if (gimple_call_internal_p (call_stmt))
+ switch (gimple_call_internal_fn (call_stmt))
+ {
+ case IFN_MASK_STORE:
+ case IFN_SCATTER_STORE:
+ return TREE_TYPE (gimple_call_arg (call_stmt, 3));
+ case IFN_MASK_SCATTER_STORE:
+ return TREE_TYPE (gimple_call_arg (call_stmt, 4));
+ default:
+ break;
+ }
+ return gimple_call_return_type (call_stmt);
}
else if (code == GIMPLE_ASSIGN)
{
#define mask_store_direct { 3, 2, false }
#define store_lanes_direct { 0, 0, false }
#define mask_store_lanes_direct { 0, 0, false }
+#define scatter_store_direct { 3, 3, false }
#define unary_direct { 0, 0, true }
#define binary_direct { 0, 0, true }
#define cond_unary_direct { 1, 1, true }
expand_assignment (lhs, gimple_call_arg (call, 0), false);
}
+/* Expand {MASK_,}SCATTER_STORE call CALL using optab OPTAB. */
+
+static void
+expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
+{
+ internal_fn ifn = gimple_call_internal_fn (stmt);
+ int rhs_index = internal_fn_stored_value_index (ifn);
+ int mask_index = internal_fn_mask_index (ifn);
+ tree base = gimple_call_arg (stmt, 0);
+ tree offset = gimple_call_arg (stmt, 1);
+ tree scale = gimple_call_arg (stmt, 2);
+ tree rhs = gimple_call_arg (stmt, rhs_index);
+
+ rtx base_rtx = expand_normal (base);
+ rtx offset_rtx = expand_normal (offset);
+ HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+ rtx rhs_rtx = expand_normal (rhs);
+
+ struct expand_operand ops[6];
+ int i = 0;
+ create_address_operand (&ops[i++], base_rtx);
+ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], scale_int);
+ create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
+ if (mask_index >= 0)
+ {
+ tree mask = gimple_call_arg (stmt, mask_index);
+ rtx mask_rtx = expand_normal (mask);
+ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
+ }
+
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)));
+ expand_insn (icode, i, ops);
+}
+
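For orientation only (not part of the patch): given the argument indices read directly in expand_scatter_store_optab_fn above, together with internal_fn_stored_value_index and internal_fn_mask_index defined later in this patch, the calls it expands have the following shapes; this is a summary, not a new interface.

    /* Call shapes consumed by expand_scatter_store_optab_fn:

         SCATTER_STORE (base, offsets, scale, values)
         MASK_SCATTER_STORE (base, offsets, scale, values, mask)

       base    = gimple_call_arg (stmt, 0)
       offsets = gimple_call_arg (stmt, 1)
       scale   = gimple_call_arg (stmt, 2)
       values  = gimple_call_arg (stmt, internal_fn_stored_value_index (ifn))
       mask    = gimple_call_arg (stmt, internal_fn_mask_index (ifn)), if any.  */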
/* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB. */
static void
#define direct_mask_store_optab_supported_p direct_optab_supported_p
#define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
+#define direct_scatter_store_optab_supported_p direct_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
#define direct_fold_left_optab_supported_p direct_optab_supported_p
}
}
+/* Return true if IFN is some form of store to memory. */
+
+bool
+internal_store_fn_p (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_STORE:
+ case IFN_STORE_LANES:
+ case IFN_MASK_STORE_LANES:
+ case IFN_SCATTER_STORE:
+ case IFN_MASK_SCATTER_STORE:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
/* Return true if IFN is some form of gather load or scatter store. */
bool
{
case IFN_GATHER_LOAD:
case IFN_MASK_GATHER_LOAD:
+ case IFN_SCATTER_STORE:
+ case IFN_MASK_SCATTER_STORE:
return true;
default:
case IFN_MASK_GATHER_LOAD:
return 3;
+ case IFN_MASK_SCATTER_STORE:
+ return 4;
+
+ default:
+ return -1;
+ }
+}
+
+/* If FN takes a value that should be stored to memory, return the index
+ of that argument, otherwise return -1. */
+
+int
+internal_fn_stored_value_index (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_STORE:
+ case IFN_SCATTER_STORE:
+ case IFN_MASK_SCATTER_STORE:
+ return 3;
+
default:
return -1;
}
return false;
optab optab = direct_internal_fn_optab (ifn);
insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type));
+ int output_ops = internal_load_fn_p (ifn) ? 1 : 0;
return (icode != CODE_FOR_nothing
- && insn_operand_matches (icode, 3, GEN_INT (offset_sign == UNSIGNED))
- && insn_operand_matches (icode, 4, GEN_INT (scale)));
+ && insn_operand_matches (icode, 2 + output_ops,
+ GEN_INT (offset_sign == UNSIGNED))
+ && insn_operand_matches (icode, 3 + output_ops,
+ GEN_INT (scale)));
}
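For reference, the operand-number adjustment above follows from the optab interfaces documented in md.texi (the gather_load layout was already documented; the scatter_store layout is added by this patch). The note below is explanatory only:

    /* Operand layout behind "2 + output_ops" and "3 + output_ops":

         gather_load<m>:   0 dest, 1 base, 2 offsets, 3 extension sign, 4 scale
         scatter_store<m>: 0 base, 1 offsets, 2 extension sign, 3 scale, 4 values

       Loads have one output operand (output_ops == 1), so their
       extension-sign and scale operands sit one slot later than the
       corresponding store operands.  */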
/* Expand STMT as though it were a call to internal function FN. */
- mask_store: currently just maskstore
- store_lanes: currently just vec_store_lanes
- mask_store_lanes: currently just vec_mask_store_lanes
+ - scatter_store: used for {mask_,}scatter_store
- unary: a normal unary optab, such as vec_reverse_<mode>
- binary: a normal binary optab, such as vec_interleave_lo_<mode>
DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
mask_gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
+DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
+ mask_scatter_store, scatter_store)
+
DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
extern internal_fn get_conditional_internal_fn (tree_code);
extern bool internal_load_fn_p (internal_fn);
+extern bool internal_store_fn_p (internal_fn);
extern bool internal_gather_scatter_fn_p (internal_fn);
extern int internal_fn_mask_index (internal_fn);
+extern int internal_fn_stored_value_index (internal_fn);
extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
tree, signop, int);
return this_fn_optabs->supports_vec_gather_load;
}
+
+/* Return true if vec_scatter_store is available for at least one vector
+ mode. */
+
+bool
+supports_vec_scatter_store_p ()
+{
+ if (this_fn_optabs->supports_vec_scatter_store_cached)
+ return this_fn_optabs->supports_vec_scatter_store;
+
+ this_fn_optabs->supports_vec_scatter_store_cached = true;
+
+ this_fn_optabs->supports_vec_scatter_store
+ = supports_at_least_one_mode_p (scatter_store_optab);
+
+ return this_fn_optabs->supports_vec_scatter_store;
+}
+
bool can_atomic_load_p (machine_mode);
bool lshift_cheap_p (bool);
bool supports_vec_gather_load_p ();
+bool supports_vec_scatter_store_p ();
/* Version of find_widening_optab_handler_and_mode that operates on
specific mode types. */
OPTAB_D (gather_load_optab, "gather_load$a")
OPTAB_D (mask_gather_load_optab, "mask_gather_load$a")
+OPTAB_D (scatter_store_optab, "scatter_store$a")
+OPTAB_D (mask_scatter_store_optab, "mask_scatter_store$a")
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
+ * lib/target-supports.exp (check_effective_target_vect_scatter_store):
+ New proc.
+ * gcc.dg/vect/pr25413a.c: Expect both loops to be optimized on
+ targets with scatter stores.
+ * gcc.dg/vect/vect-71.c: Restrict XFAIL to targets without scatter
+ stores.
+ * gcc.target/aarch64/sve/mask_scatter_store_1.c: New test.
+ * gcc.target/aarch64/sve/mask_scatter_store_2.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_1.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_2.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_3.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_4.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_5.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_6.c: Likewise.
+ * gcc.target/aarch64/sve/scatter_store_7.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_1.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_2.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_3.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_4.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_5.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_6.c: Likewise.
+ * gcc.target/aarch64/sve/strided_store_7.c: Likewise.
+
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! vect_scatter_store } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_scatter_store } } } */
/* { dg-final { scan-tree-dump-times "vector alignment may not be reachable" 1 "vect" { target { ! vector_alignment_reachable } } } } */
/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { ! vector_alignment_reachable } } } } */
return main1 ();
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { ! vect_scatter_store } } } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *restrict cmp1, CMP_TYPE *restrict cmp2, \
+ INDEX##BITS *restrict indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[indices[i]] = src[i] + 1; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, 32) \
+ T (DATA_TYPE, uint32_t, 32) \
+ T (DATA_TYPE, float, 32)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, 64) \
+ T (DATA_TYPE, uint64_t, 64) \
+ T (DATA_TYPE, double, 64)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "mask_scatter_store_1.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[indices[i]] = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "scatter_store_1.c"
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ *(DATA_TYPE *) ((char *) dest + indices[i]) = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "scatter_store_3.c"
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict *dest, DATA_TYPE *restrict src, \
+ int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ *dest[i] = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[z[0-9]+.d\]\n} 3 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, INDEX##BITS mask, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[(INDEX##BITS) (indices[i] | mask)] = src[i] + 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 16) \
+ T (uint32_t, 16) \
+ T (float, 16) \
+ T (int64_t, 32) \
+ T (uint64_t, 32) \
+ T (double, 32)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+
+#include "scatter_store_6.c"
+
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* Either extension type is OK here. */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, [us]xtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, \
+ INDEX##BITS stride, INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ dest[i * stride] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "strided_store_1.c"
+
+/* 8 and 16 bits are signed because the multiplication promotes to int.
+ Using uxtw for all 9 would be OK. */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* The 32-bit loop needs to honor the defined overflow in uint32_t,
+ so we vectorize the offset calculation. This means that the
+ 64-bit version needs two copies. */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
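A standalone illustration of the promotion point made in the comments above; it is not part of the test, and the values are arbitrary:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint8_t i = 10, stride = 200;
      /* The usual arithmetic conversions promote both uint8_t operands to
         int, so "i * stride" has type int: the index expression is a signed
         32-bit value even though its operands are unsigned, which is why the
         8-bit and 16-bit cases above are matched with sxtw offsets.  */
      puts (_Generic (i * stride, int: "int", default: "something else"));
      printf ("%d\n", i * stride);  /* 2000, computed as a signed int */
      return 0;
    }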
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, OTHER_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, \
+ OTHER_TYPE *restrict other, \
+ OTHER_TYPE mask, \
+ int stride, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i * stride] = src[i] + (OTHER_TYPE) (other[i] | mask); \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, int16_t) \
+ T (uint32_t, int16_t) \
+ T (float, int16_t) \
+ T (int64_t, int32_t) \
+ T (uint64_t, int32_t) \
+ T (double, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i * SCALE] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src, long n) \
+ { \
+ for (long i = 0; i < n; ++i) \
+ dest[i * SCALE] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable --save-temps" } */
+
+#include "strided_store_5.c"
+
+/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+ DATA_TYPE *restrict src) \
+ { \
+ for (long i = 0; i < 1000; ++i) \
+ dest[i * SCALE] = src[i] + 1; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 5, 5) \
+ T (DATA_TYPE, 7, 7) \
+ T (DATA_TYPE, 11, 11) \
+ T (DATA_TYPE, 200, 200) \
+ T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
return [check_effective_target_aarch64_sve]
}
+# Return 1 if the target supports vector scatter stores.
+
+proc check_effective_target_vect_scatter_store { } {
+ return [check_effective_target_aarch64_sve]
+}
+
# Return 1 if the target supports vector conditional operations, 0 otherwise.
proc check_effective_target_vect_condition { } {
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = NULL;
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ return true;
+
if (loop_vinfo)
loop = LOOP_VINFO_LOOP (loop_vinfo);
if (read_p)
ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
else
- return false;
+ ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
/* Test whether the target supports this combination. */
if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
/* True if we should aim to use internal functions rather than
built-in functions. */
bool use_ifn_p = (DR_IS_READ (dr)
- && supports_vec_gather_load_p ());
+ ? supports_vec_gather_load_p ()
+ : supports_vec_scatter_store_p ());
base = DR_REF (dr);
/* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
bool maybe_scatter
= DR_IS_WRITE (dr)
&& !TREE_THIS_VOLATILE (DR_REF (dr))
- && targetm.vectorize.builtin_scatter != NULL;
+ && (targetm.vectorize.builtin_scatter != NULL
+ || supports_vec_scatter_store_p ());
bool maybe_simd_lane_access
= is_a <loop_vec_info> (vinfo) && loop->simduid;
if (!dr || !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
return NULL;
- /* Reject stores for now. */
- if (!DR_IS_READ (dr))
- return NULL;
-
/* Get the boolean that controls whether the load or store happens.
This is null if the operation is unconditional. */
tree mask = vect_get_load_store_mask (stmt);
gimple_call_set_lhs (pattern_stmt, load_lhs);
}
else
- /* Not yet supported. */
- gcc_unreachable ();
+ {
+ tree rhs = vect_get_store_rhs (stmt);
+ if (mask != NULL)
+ pattern_stmt = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5,
+ base, offset, scale, rhs,
+ mask);
+ else
+ pattern_stmt = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
+ base, offset, scale, rhs);
+ }
gimple_call_set_nothrow (pattern_stmt, true);
/* Copy across relevant vectorization info and associate DR with the
if (mask_index >= 0
&& use == gimple_call_arg (stmt, mask_index))
return true;
+ int stored_value_index = internal_fn_stored_value_index (ifn);
+ if (stored_value_index >= 0
+ && use == gimple_call_arg (stmt, stored_value_index))
+ return true;
if (internal_gather_scatter_fn_p (ifn)
&& use == gimple_call_arg (stmt, 1))
return true;
- if (ifn == IFN_MASK_STORE
- && use == gimple_call_arg (stmt, 3))
- return true;
}
return false;
}
if (memory_access_type == VMAT_GATHER_SCATTER)
{
- gcc_assert (is_load);
+ internal_fn ifn = (is_load
+ ? IFN_MASK_GATHER_LOAD
+ : IFN_MASK_SCATTER_STORE);
tree offset_type = TREE_TYPE (gs_info->offset);
- if (!internal_gather_scatter_fn_supported_p (IFN_MASK_GATHER_LOAD,
- vectype,
+ if (!internal_gather_scatter_fn_supported_p (ifn, vectype,
gs_info->memory_type,
TYPE_SIGN (offset_type),
gs_info->scale))
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"can't use a fully-masked loop because the"
" target doesn't have an appropriate masked"
- " gather load instruction.\n");
+ " gather load or scatter store instruction.\n");
LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
return;
}
/* STMT is either a masked or unconditional store. Return the value
being stored. */
-static tree
+tree
vect_get_store_rhs (gimple *stmt)
{
if (gassign *assign = dyn_cast <gassign *> (stmt))
if (gcall *call = dyn_cast <gcall *> (stmt))
{
internal_fn ifn = gimple_call_internal_fn (call);
- gcc_assert (ifn == IFN_MASK_STORE);
- return gimple_call_arg (stmt, 3);
+ int index = internal_fn_stored_value_index (ifn);
+ gcc_assert (index >= 0);
+ return gimple_call_arg (stmt, index);
}
gcc_unreachable ();
}
if (gimple_call_internal_p (stmt)
&& (internal_load_fn_p (gimple_call_internal_fn (stmt))
- || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+ || internal_store_fn_p (gimple_call_internal_fn (stmt))))
/* Handled by vectorizable_load and vectorizable_store. */
return false;
else
{
gcall *call = dyn_cast <gcall *> (stmt);
- if (!call || !gimple_call_internal_p (call, IFN_MASK_STORE))
+ if (!call || !gimple_call_internal_p (call))
+ return false;
+
+ internal_fn ifn = gimple_call_internal_fn (call);
+ if (!internal_store_fn_p (ifn))
return false;
if (slp_node != NULL)
return false;
}
- ref_type = TREE_TYPE (gimple_call_arg (call, 1));
- mask = gimple_call_arg (call, 2);
- if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
- return false;
+ int mask_index = internal_fn_mask_index (ifn);
+ if (mask_index >= 0)
+ {
+ mask = gimple_call_arg (call, mask_index);
+ if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
+ return false;
+ }
}
op = vect_get_store_rhs (stmt);
TYPE_MODE (mask_vectype), false))
return false;
}
- else if (memory_access_type != VMAT_LOAD_STORE_LANES)
+ else if (memory_access_type != VMAT_LOAD_STORE_LANES
+ && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
return false;
}
- grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
+ grouped_store = (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && memory_access_type != VMAT_GATHER_SCATTER);
if (grouped_store)
{
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
ensure_base_align (dr);
- if (memory_access_type == VMAT_GATHER_SCATTER)
+ if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
{
tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src;
tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
return true;
}
- if (grouped_store)
+ if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
- GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
+ gimple *group_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+ GROUP_STORE_COUNT (vinfo_for_stmt (group_stmt))++;
+ }
+ if (grouped_store)
+ {
/* FORNOW */
gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt));
|| memory_access_type == VMAT_CONTIGUOUS_REVERSE)
offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
- if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ tree bump;
+ tree vec_offset = NULL_TREE;
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ aggr_type = NULL_TREE;
+ bump = NULL_TREE;
+ }
+ else if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ aggr_type = elem_type;
+ vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
+ &bump, &vec_offset);
+ }
else
- aggr_type = vectype;
+ {
+ if (memory_access_type == VMAT_LOAD_STORE_LANES)
+ aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ else
+ aggr_type = vectype;
+ bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+ }
if (mask)
LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
dataref_offset = build_int_cst (ref_type, 0);
inv_p = false;
}
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ vect_get_gather_scatter_ops (loop, stmt, &gs_info,
+ &dataref_ptr, &vec_offset);
+ inv_p = false;
+ }
else
dataref_ptr
= vect_create_data_ref_ptr (first_stmt, aggr_type,
simd_lane_access_p ? loop : NULL,
offset, &dummy, gsi, &ptr_incr,
- simd_lane_access_p, &inv_p);
+ simd_lane_access_p, &inv_p,
+ NULL_TREE, bump);
gcc_assert (bb_vinfo || !inv_p);
}
else
}
if (dataref_offset)
dataref_offset
- = int_const_binop (PLUS_EXPR, dataref_offset,
- TYPE_SIZE_UNIT (aggr_type));
+ = int_const_binop (PLUS_EXPR, dataref_offset, bump);
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ gimple *def_stmt;
+ vect_def_type dt;
+ vect_is_simple_use (vec_offset, loop_vinfo, &def_stmt, &dt);
+ vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
+ }
else
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
- TYPE_SIZE_UNIT (aggr_type));
+ bump);
}
if (memory_access_type == VMAT_LOAD_STORE_LANES)
final_mask = prepare_load_store_mask (mask_vectype, final_mask,
vec_mask, gsi);
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ tree scale = size_int (gs_info.scale);
+ gcall *call;
+ if (masked_loop_p)
+ call = gimple_build_call_internal
+ (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
+ scale, vec_oprnd, final_mask);
+ else
+ call = gimple_build_call_internal
+ (IFN_SCATTER_STORE, 4, dataref_ptr, vec_offset,
+ scale, vec_oprnd);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ break;
+ }
+
if (i > 0)
/* Bump the vector pointer. */
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
- stmt, NULL_TREE);
+ stmt, bump);
if (slp)
vec_oprnd = vec_oprnds[i];
one are skipped, and there vec_stmt_info shouldn't be freed
meanwhile. */
*grouped_store = true;
- if (STMT_VINFO_VEC_STMT (stmt_info))
+ stmt_vec_info group_info
+ = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+ if (GROUP_STORE_COUNT (group_info) == GROUP_SIZE (group_info))
is_store = true;
- }
+ }
else
is_store = true;
break;
extern void vect_finish_stmt_generation (gimple *, gimple *,
gimple_stmt_iterator *);
extern bool vect_mark_stmts_to_be_vectorized (loop_vec_info);
+extern tree vect_get_store_rhs (gimple *);
extern tree vect_get_vec_def_for_operand_1 (gimple *, enum vect_def_type);
extern tree vect_get_vec_def_for_operand (tree, gimple *, tree = NULL);
extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,