+2019-06-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
+
+ PR target/88837
+ * vector-builder.h (vector_builder::count_dups): New method.
+	* config/aarch64/aarch64-protos.h (aarch64_sve_expand_vector_init):
+	Declare.
+	* config/aarch64/aarch64-sve.md (*aarch64_sve_rev<mode>): Use @.
+ (vec_init<mode><Vel>): New pattern.
+ * config/aarch64/aarch64.c (emit_insr): New function.
+ (aarch64_sve_expand_vector_init_handle_trailing_constants): Likewise.
+ (aarch64_sve_expand_vector_init_insert_elems): Likewise.
+ (aarch64_sve_expand_vector_init_handle_trailing_same_elem): Likewise.
+ (aarch64_sve_expand_vector_init): Define two overloaded functions.
+
2019-06-03 Alejandro Martinez <alejandro.martinezvicente@arm.com>
PR tree-optimization/90681
void aarch64_split_sve_subreg_move (rtx, rtx, rtx);
void aarch64_expand_prologue (void);
void aarch64_expand_vector_init (rtx, rtx);
+void aarch64_sve_expand_vector_init (rtx, rtx);
void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
const_tree, unsigned);
void aarch64_init_expanders (void);
"revb\t%0.h, %1/m, %2.h"
)
-(define_insn "*aarch64_sve_rev<mode>"
+(define_insn "@aarch64_sve_rev<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")]
UNSPEC_REV))]
DONE;
}
)
+
+;; Standard pattern name vec_init<mode><Vel>.
+(define_expand "vec_init<mode><Vel>"
+ [(match_operand:SVE_ALL 0 "register_operand" "")
+ (match_operand 1 "" "")]
+ "TARGET_SVE"
+ {
+ aarch64_sve_expand_vector_init (operands[0], operands[1]);
+ DONE;
+ }
+)
}
}
+/* Emit RTL corresponding to:
+   insr TARGET, ELEM
+   i.e. shift the existing elements of TARGET up by one lane and insert
+   ELEM into lane 0.  */
+
+static void
+emit_insr (rtx target, rtx elem)
+{
+ machine_mode mode = GET_MODE (target);
+ scalar_mode elem_mode = GET_MODE_INNER (mode);
+ elem = force_reg (elem_mode, elem);
+
+ insn_code icode = optab_handler (vec_shl_insert_optab, mode);
+ gcc_assert (icode != CODE_FOR_nothing);
+ emit_insn (GEN_FCN (icode) (target, target, elem));
+}
+
+/* Subroutine of aarch64_sve_expand_vector_init for handling
+   trailing constants.
+   This function works as follows:
+   (a) Create a new vector consisting of the trailing constants.
+   (b) Initialize TARGET with the constant vector using emit_move_insn.
+   (c) Insert the remaining elements into TARGET using insr.
+   NELTS is the total number of elements in the original vector, while
+   NELTS_REQD is the number of elements that are actually
+   significant.
+
+   ??? The heuristic used is to do the above only if the number of
+   trailing constants is at least half the number of significant
+   elements.  May need fine tuning.  */
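+
+/* For example (mirroring the gcc.target/aarch64/sve/init_2.c test below),
+   with BUILDER == { a, b, 2, 3, 2, 3, 2, 3 } and NELTS == NELTS_REQD == 8,
+   the six trailing constants satisfy the heuristic.  TARGET is first
+   loaded with a constant vector whose low six lanes are { 2, 3, 2, 3, 2, 3 }
+   (the top two lanes are shifted out by the insrs below, so their values
+   do not matter), after which the variable elements are shifted in:
+     insr TARGET, b
+     insr TARGET, a
+   leaving TARGET == { a, b, 2, 3, 2, 3, 2, 3 }.  */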
+
+static bool
+aarch64_sve_expand_vector_init_handle_trailing_constants
+ (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
+{
+ machine_mode mode = GET_MODE (target);
+ scalar_mode elem_mode = GET_MODE_INNER (mode);
+ int n_trailing_constants = 0;
+
+ for (int i = nelts_reqd - 1;
+ i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
+ i--)
+ n_trailing_constants++;
+
+ if (n_trailing_constants >= nelts_reqd / 2)
+ {
+ rtx_vector_builder v (mode, 1, nelts);
+ for (int i = 0; i < nelts; i++)
+ v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
+ rtx const_vec = v.build ();
+ emit_move_insn (target, const_vec);
+
+ for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
+ emit_insr (target, builder.elt (i));
+
+ return true;
+ }
+
+ return false;
+}
+
+/* Subroutine of aarch64_sve_expand_vector_init.
+   Works as follows:
+   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
+   (b) Skip the trailing elements of BUILDER that are the same as
+   element NELTS_REQD - 1.
+   (c) Insert the earlier elements into TARGET in reverse order using insr.  */
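+
+/* As an illustration (the element names stand for arbitrary register
+   values, as in the init_7.c test below): for
+   BUILDER == { a, b, c, d, e, f, g, h } and NELTS_REQD == 8, there are no
+   duplicate trailing elements, so the routine broadcasts h into TARGET
+   and then emits
+     insr TARGET, g
+     insr TARGET, f
+     ...
+     insr TARGET, a
+   leaving TARGET == { a, b, c, d, e, f, g, h }.  */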
+
+static void
+aarch64_sve_expand_vector_init_insert_elems (rtx target,
+ const rtx_vector_builder &builder,
+ int nelts_reqd)
+{
+ machine_mode mode = GET_MODE (target);
+ scalar_mode elem_mode = GET_MODE_INNER (mode);
+
+ struct expand_operand ops[2];
+ enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+ gcc_assert (icode != CODE_FOR_nothing);
+
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
+ expand_insn (icode, 2, ops);
+
+ int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
+ for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
+ emit_insr (target, builder.elt (i));
+}
+
+/* Subroutine of aarch64_sve_expand_vector_init to handle the case
+   in which the trailing elements of BUILDER are all the same.
+   This works as follows:
+   (a) Use the expand_insn interface to broadcast the last vector element
+   into TARGET.
+   (b) Insert the remaining elements into TARGET using insr.
+
+   ??? The heuristic used is to do the above if the number of identical
+   trailing elements is at least 3/4 of the total number of elements,
+   loosely based on the heuristic in mostly_zeros_p.  May need fine-tuning.  */
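+
+/* For instance (mirroring the init_5.c test below), with
+   BUILDER == { a, b, c, c, c, c, c, c } and NELTS_REQD == 8, the six
+   trailing copies of c meet the 3/4 threshold, so c is broadcast into
+   TARGET and the remaining elements are shifted in:
+     insr TARGET, b
+     insr TARGET, a  */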
+
+static bool
+aarch64_sve_expand_vector_init_handle_trailing_same_elem
+ (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
+{
+ int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
+ if (ndups >= (3 * nelts_reqd) / 4)
+ {
+ aarch64_sve_expand_vector_init_insert_elems (target, builder,
+ nelts_reqd - ndups + 1);
+ return true;
+ }
+
+ return false;
+}
+
+/* Initialize register TARGET from BUILDER.  NELTS is the constant number
+   of elements in BUILDER, and NELTS_REQD is the number of elements that
+   are actually significant.
+
+ The function tries to initialize TARGET from BUILDER if it fits one
+ of the special cases outlined below.
+
+ Failing that, the function divides BUILDER into two sub-vectors:
+ v_even = even elements of BUILDER;
+ v_odd = odd elements of BUILDER;
+
+ and recursively calls itself with v_even and v_odd.
+
+ if (recursive call succeeded for v_even or v_odd)
+ TARGET = zip (v_even, v_odd)
+
+ The function returns true if it managed to build TARGET from BUILDER
+ with one of the special cases, false otherwise.
+
+ Example: {a, 1, b, 2, c, 3, d, 4}
+
+ The vector gets divided into:
+ v_even = {a, b, c, d}
+ v_odd = {1, 2, 3, 4}
+
+   aarch64_sve_expand_vector_init (v_odd) hits case 1 and
+   initializes tmp2 from the constant vector v_odd using emit_move_insn.
+
+   aarch64_sve_expand_vector_init (v_even) fails since v_even contains
+   only 4 elements and matches none of the special cases, so we construct
+   tmp1 from v_even using insr:
+ tmp1 = dup(d)
+ insr tmp1, c
+ insr tmp1, b
+ insr tmp1, a
+
+ And finally:
+ TARGET = zip (tmp1, tmp2)
+ which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
+
+static bool
+aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
+ int nelts, int nelts_reqd)
+{
+ machine_mode mode = GET_MODE (target);
+
+ /* Case 1: Vector contains trailing constants. */
+
+ if (aarch64_sve_expand_vector_init_handle_trailing_constants
+ (target, builder, nelts, nelts_reqd))
+ return true;
+
+ /* Case 2: Vector contains leading constants. */
+
+ rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
+ for (int i = 0; i < nelts_reqd; i++)
+ rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
+ rev_builder.finalize ();
+
+ if (aarch64_sve_expand_vector_init_handle_trailing_constants
+ (target, rev_builder, nelts, nelts_reqd))
+ {
+ emit_insn (gen_aarch64_sve_rev (mode, target, target));
+ return true;
+ }
+
+ /* Case 3: Vector contains trailing same element. */
+
+ if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
+ (target, builder, nelts_reqd))
+ return true;
+
+ /* Case 4: Vector contains leading same element. */
+
+ if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
+ (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
+ {
+ emit_insn (gen_aarch64_sve_rev (mode, target, target));
+ return true;
+ }
+
+  /* Avoid recursing below 4 elements.
+     ??? The threshold of 4 may need fine-tuning.  */
+
+ if (nelts_reqd <= 4)
+ return false;
+
+ rtx_vector_builder v_even (mode, 1, nelts);
+ rtx_vector_builder v_odd (mode, 1, nelts);
+
+ for (int i = 0; i < nelts * 2; i += 2)
+ {
+ v_even.quick_push (builder.elt (i));
+ v_odd.quick_push (builder.elt (i + 1));
+ }
+
+ v_even.finalize ();
+ v_odd.finalize ();
+
+ rtx tmp1 = gen_reg_rtx (mode);
+ bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
+ nelts, nelts_reqd / 2);
+
+ rtx tmp2 = gen_reg_rtx (mode);
+ bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
+ nelts, nelts_reqd / 2);
+
+ if (!did_even_p && !did_odd_p)
+ return false;
+
+  /* Initialize tmp1 and tmp2 from v_even and v_odd using insr for
+     whichever of them did not match any of the special cases, and then
+     zip them together into TARGET.  */
+
+ if (!did_even_p)
+ aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
+
+ if (!did_odd_p)
+ aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
+
+ rtvec v = gen_rtvec (2, tmp1, tmp2);
+ emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+ return true;
+}
+
+/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
+
+void
+aarch64_sve_expand_vector_init (rtx target, rtx vals)
+{
+ machine_mode mode = GET_MODE (target);
+ int nelts = XVECLEN (vals, 0);
+
+ rtx_vector_builder v (mode, 1, nelts);
+ for (int i = 0; i < nelts; i++)
+ v.quick_push (XVECEXP (vals, 0, i));
+ v.finalize ();
+
+  /* If neither sub-vector of v could be initialized specially,
+     then use insr to insert all elements of v into TARGET.
+     ??? This might not be optimal for vectors with large
+     initializers (16 elements or more).
+     For nelts < 4, it probably isn't useful to handle specially.  */
+
+ if (nelts < 4
+ || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
+ aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
+}
+
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
+2019-06-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
+
+ PR target/88837
+ * gcc.target/aarch64/sve/init_1.c: New test.
+ * gcc.target/aarch64/sve/init_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_2.c: Likewise.
+ * gcc.target/aarch64/sve/init_2_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_3.c: Likewise.
+ * gcc.target/aarch64/sve/init_3_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_4.c: Likewise.
+ * gcc.target/aarch64/sve/init_4_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_5.c: Likewise.
+ * gcc.target/aarch64/sve/init_5_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_6.c: Likewise.
+ * gcc.target/aarch64/sve/init_6_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_7.c: Likewise.
+ * gcc.target/aarch64/sve/init_7_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_8.c: Likewise.
+ * gcc.target/aarch64/sve/init_8_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_9.c: Likewise.
+ * gcc.target/aarch64/sve/init_9_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_10.c: Likewise.
+ * gcc.target/aarch64/sve/init_10_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_11.c: Likewise.
+ * gcc.target/aarch64/sve/init_11_run.c: Likewise.
+ * gcc.target/aarch64/sve/init_12.c: Likewise.
+ * gcc.target/aarch64/sve/init_12_run.c: Likewise.
+
2019-06-03 Alejandro Martinez <alejandro.martinezvicente@arm.com>
PR tree-optimization/90681
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 1.1: Trailing constants with stepped sequence. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b)
+{
+ return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ ptrue p0.s, vl8
+ index z0.s, #1, #1
+ insr z0.s, w1
+ insr z0.s, w0
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #1, #1\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 5.4: Interleaved repeating elements and non-repeating elements. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int c, int f)
+{
+ return (vnx4si) { a, f, b, f, c, f, c, f };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w2
+ mov z1.s, w3
+ insr z0.s, w1
+ ptrue p0.s, vl8
+ insr z0.s, w0
+ zip1 z0.s, z0.s, z1.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_10.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int c = 12;
+ int f = 13;
+
+ vnx4si v = foo (a, b, c, f);
+ int expected[] = { a, f, b, f, c, f, c, f };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 5.5: Interleaved repeating elements and trailing same elements. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int f)
+{
+ return (vnx4si) { a, f, b, f, b, f, b, f };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w1
+ mov z1.s, w2
+ insr z0.s, w0
+ ptrue p0.s, vl8
+ zip1 z0.s, z0.s, z1.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_11.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int f = 12;
+
+ vnx4si v = foo (a, b, f);
+ int expected[] = { a, f, b, f, b, f, b, f };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 5.6: Interleaved repeating elements and leading same elements.  */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int f)
+{
+ return (vnx4si) { b, f, b, f, b, f, a, f };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w0
+ mov z1.s, w2
+ insr z0.s, w1
+ ptrue p0.s, vl8
+ insr z0.s, w1
+ insr z0.s, w1
+ zip1 z0.s, z0.s, z1.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_12.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int f = 12;
+
+ vnx4si v = foo (a, b, f);
+ int expected[] = { b, f, b, f, b, f, a, f };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_1.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+
+ vnx4si v = foo (a, b);
+ int expected[] = { a, b, 1, 2, 3, 4, 5, 6 };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 1.2: Trailing constants with repeating sequence. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b)
+{
+ return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ ptrue p0.s, vl8
+ adrp x2, .LANCHOR0
+ add x2, x2, :lo12:.LANCHOR0
+ ld1w z0.s, p0/z, [x2]
+ insr z0.s, w1
+ insr z0.s, w0
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_2.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+
+ vnx4si v = foo (a, b);
+ int expected[] = { a, b, 2, 3, 2, 3, 2, 3 };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 2.1: Leading constants with stepped sequence. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b)
+{
+ return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ ptrue p0.s, vl8
+ index z0.s, #6, #-1
+ insr z0.s, w0
+ insr z0.s, w1
+ rev z0.s, z0.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #6, #-1\n\tinsr\t\1, w0\n\tinsr\t\1, w1\n\trev\t\1, \1} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_3.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+
+ vnx4si v = foo (a, b);
+ int expected[] = { 1, 2, 3, 4, 5, 6, a, b };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 2.2: Leading constants with repeating sequence.  */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b)
+{
+ return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ ptrue p0.s, vl8
+ adrp x2, .LANCHOR0
+ add x2, x2, :lo12:.LANCHOR0
+ ld1w z0.s, p0/z, [x2]
+ insr z0.s, w1
+ insr z0.s, w0
+ rev z0.s, z0.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_4.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+
+ vnx4si v = foo (a, b);
+ int expected[] = { 3, 2, 3, 2, 3, 2, b, a };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 3: Trailing same element. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int c)
+{
+ return (vnx4si) { a, b, c, c, c, c, c, c };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w2
+ ptrue p0.s, vl8
+ insr z0.s, w1
+ insr z0.s, w0
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_5.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int c = 12;
+
+ vnx4si v = foo (a, b, c);
+ int expected[] = { a, b, c, c, c, c, c, c };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 4: Leading same element.  */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int c)
+{
+ return (vnx4si) { c, c, c, c, c, c, b, a };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w2
+ ptrue p0.s, vl8
+ insr z0.s, w1
+ insr z0.s, w0
+ rev z0.s, z0.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_6.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int c = 12;
+
+ vnx4si v = foo (a, b, c);
+ int expected[] = { c, c, c, c, c, c, b, a };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 5.1: All elements. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h)
+{
+ return (vnx4si) { a, b, c, d, e, f, g, h };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w7
+ ptrue p0.s, vl8
+ insr z0.s, w6
+ insr z0.s, w5
+ insr z0.s, w4
+ insr z0.s, w3
+ insr z0.s, w2
+ insr z0.s, w1
+ insr z0.s, w0
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_7.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int c = 12;
+ int d = 13;
+ int e = 14;
+ int f = 15;
+ int g = 16;
+ int h = 17;
+
+ vnx4si v = foo (a, b, c, d, e, f, g, h);
+ int expected[] = { a, b, c, d, e, f, g, h };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 5.2: Interleaved elements and constants. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b, int c, int d)
+{
+ return (vnx4si) { a, 1, b, 2, c, 3, d, 4 };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ ptrue p0.s, vl8
+ mov z0.s, w3
+ adrp x3, .LANCHOR0
+ insr z0.s, w2
+ add x3, x3, :lo12:.LANCHOR0
+ insr z0.s, w1
+ ld1w z1.s, p0/z, [x3]
+ insr z0.s, w0
+ zip1 z0.s, z0.s, z1.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_8.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+ int c = 12;
+ int d = 13;
+
+ vnx4si v = foo (a, b, c, d);
+ int expected[] = { a, 1, b, 2, c, 3, d, 4 };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
+
+/* Case 5.3: Repeated elements. */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+__attribute__((noipa))
+vnx4si foo(int a, int b)
+{
+ return (vnx4si) { a, b, a, b, a, b, a, b };
+}
+
+/*
+foo:
+.LFB0:
+ .cfi_startproc
+ mov z0.s, w0
+ mov z1.s, w1
+ ptrue p0.s, vl8
+ zip1 z0.s, z0.s, z1.s
+ ret
+*/
+
+/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */
--- /dev/null
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include "init_9.c"
+
+int main()
+{
+ int a = 10;
+ int b = 11;
+
+ vnx4si v = foo (a, b);
+ int expected[] = { a, b, a, b, a, b, a, b };
+
+ for (int i = 0; i < 8; i++)
+ if (v[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
unsigned int encoded_nelts () const;
bool encoded_full_vector_p () const;
T elt (unsigned int) const;
+ unsigned int count_dups (int, int, int) const;
bool operator == (const Derived &) const;
bool operator != (const Derived &x) const { return !operator == (x); }
derived ()->step (prev, final));
}
+/* Return the number of leading duplicate elements in the range
+ [START:END:STEP]. The value is always at least 1. */
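+
+/* For instance, for the elements { 1, 2, 3, 3, 3 } (an illustrative
+   sequence), count_dups (4, -1, -1) walks backwards from the last element
+   and returns 3, since the last three elements are equal.  */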
+
+template<typename T, typename Derived>
+unsigned int
+vector_builder<T, Derived>::count_dups (int start, int end, int step) const
+{
+ gcc_assert ((end - start) % step == 0);
+
+ unsigned int ndups = 1;
+ for (int i = start + step;
+ i != end && derived ()->equal_p (elt (i), elt (start));
+ i += step)
+ ndups++;
+ return ndups;
+}
+
/* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each,
but without changing the underlying vector. */