+2018-05-17 James Greenhalgh <james.greenhalgh@arm.com>
+ Kyrylo Tkachov <kyrylo.tkachov@arm.com>
+
+ * config/aarch64/aarch64.c (aarch64_expand_vector_init): Modify
+ code generation for cases where splatting a value is not useful.
+ * simplify-rtx.c (simplify_ternary_operation): Simplify
+ vec_merge across a vec_duplicate and a paradoxical subreg
+ forming a vector mode to a vec_concat.
+
2018-05-17 Olga Makhotina <olga.makhotina@intel.com>
* config.gcc: Support "goldmont-plus".
maxv = matches[i][1];
}
- /* Create a duplicate of the most common element. */
- rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
- aarch64_emit_move (target, gen_vec_duplicate (mode, x));
+ /* Create a duplicate of the most common element, unless all elements
+ are equally useless to us, in which case just immediately set the
+ vector register using the first element. */
+
+ if (maxv == 1)
+ {
+ /* For vectors of two 64-bit elements, we can do even better. */
+ if (n_elts == 2
+ && (inner_mode == E_DImode
+ || inner_mode == E_DFmode))
+ {
+ rtx x0 = XVECEXP (vals, 0, 0);
+ rtx x1 = XVECEXP (vals, 0, 1);
+ /* Combine can pick up this case, but handling it directly
+ here leaves clearer RTL.
+
+ This is load_pair_lanes<mode>, and also gives us a clean-up
+ for store_pair_lanes<mode>. */
+ if (memory_operand (x0, inner_mode)
+ && memory_operand (x1, inner_mode)
+ && !STRICT_ALIGNMENT
+ && rtx_equal_p (XEXP (x1, 0),
+ plus_constant (Pmode,
+ XEXP (x0, 0),
+ GET_MODE_SIZE (inner_mode))))
+ {
+ rtx t;
+ if (inner_mode == DFmode)
+ t = gen_load_pair_lanesdf (target, x0, x1);
+ else
+ t = gen_load_pair_lanesdi (target, x0, x1);
+ emit_insn (t);
+ return;
+ }
+ }
+ /* The subreg-move sequence below will move into lane zero of the
+ vector register. For big-endian we want that position to hold
+ the last element of VALS. */
+ maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
+ rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+ aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
+ }
+ else
+ {
+ rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+ aarch64_emit_move (target, gen_vec_duplicate (mode, x));
+ }
/* Insert the rest. */
for (int i = 0; i < n_elts; i++)
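(Illustration, not part of the patch; the function name and the use of
<arm_neon.h> are assumptions.)  A sketch of what the new maxv == 1 path
enables: a two-lane V2DF initializer whose elements are loads from
adjacent doubles passes the memory_operand and address-adjacency tests
above, so it can expand through load_pair_lanesdf as a single
q-register load instead of a dup of one element plus a lane insert.

    #include <arm_neon.h>

    /* Illustrative only: both lanes come from consecutive DFmode
       loads, so aarch64_expand_vector_init can use load_pair_lanesdf
       (a single q-register load) rather than splatting p[0] and then
       inserting p[1].  Assumes !STRICT_ALIGNMENT, as checked above.  */
    float64x2_t
    make_pair (const double *p)
    {
      return (float64x2_t) { p[0], p[1] };
    }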
return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
}
+ /* Replace:
+
+ (vec_merge:outer (vec_duplicate:outer x:inner)
+ (subreg:outer y:inner 0)
+ (const_int N))
+
+ with (vec_concat:outer x:inner y:inner) if N == 1,
+ or (vec_concat:outer y:inner x:inner) if N == 2.
+
+ Implicitly, the mode checks mean the subreg must be paradoxical,
+ but the explicit check is cheap, so make it anyway.
+
+ Only applies for vectors of two elements. */
+ if (GET_CODE (op0) == VEC_DUPLICATE
+ && GET_CODE (op1) == SUBREG
+ && GET_MODE (op1) == GET_MODE (op0)
+ && GET_MODE (SUBREG_REG (op1)) == GET_MODE (XEXP (op0, 0))
+ && paradoxical_subreg_p (op1)
+ && subreg_lowpart_p (op1)
+ && known_eq (GET_MODE_NUNITS (GET_MODE (op0)), 2)
+ && known_eq (GET_MODE_NUNITS (GET_MODE (op1)), 2)
+ && IN_RANGE (sel, 1, 2))
+ {
+ rtx newop0 = XEXP (op0, 0);
+ rtx newop1 = SUBREG_REG (op1);
+ if (sel == 2)
+ std::swap (newop0, newop1);
+ return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
+ }
+
+ /* Same as above but with switched operands:
+ Replace (vec_merge:outer (subreg:outer x:inner 0)
+ (vec_duplicate:outer y:inner)
+ (const_int N))
+
+ with (vec_concat:outer x:inner y:inner) if N == 1,
+ or (vec_concat:outer y:inner x:inner) if N == 2. */
+ if (GET_CODE (op1) == VEC_DUPLICATE
+ && GET_CODE (op0) == SUBREG
+ && GET_MODE (op0) == GET_MODE (op1)
+ && GET_MODE (SUBREG_REG (op0)) == GET_MODE (XEXP (op1, 0))
+ && paradoxical_subreg_p (op0)
+ && subreg_lowpart_p (op0)
+ && known_eq (GET_MODE_NUNITS (GET_MODE (op1)), 2)
+ && known_eq (GET_MODE_NUNITS (GET_MODE (op0)), 2)
+ && IN_RANGE (sel, 1, 2))
+ {
+ rtx newop0 = SUBREG_REG (op0);
+ rtx newop1 = XEXP (op1, 0);
+ if (sel == 2)
+ std::swap (newop0, newop1);
+ return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
+ }
+
/* Replace (vec_merge (vec_duplicate x) (vec_duplicate y)
(const_int n))
with (vec_concat x y) or (vec_concat y x) depending on value
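(Illustration, not part of the patch; V2DI/DI are example modes, not
the only ones that match.)  A worked instance of the two new cases
above, in the notation of the comments.  With sel == 1, lane 0 of the
merge comes from the vec_duplicate and lane 1 from the high part of
the paradoxical subreg; that high part is undefined, so substituting
y for it is a legal refinement:

    (vec_merge:V2DI (vec_duplicate:V2DI x:DI)
                    (subreg:V2DI y:DI 0)
                    (const_int 1))
    => (vec_concat:V2DI x:DI y:DI)

With sel == 2 the lanes swap roles: lane 0 takes the subreg's defined
low part (y) and lane 1 takes the duplicated x, giving
(vec_concat:V2DI y:DI x:DI), which is what the std::swap produces.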
+2018-05-17 James Greenhalgh <james.greenhalgh@arm.com>
+
+ * gcc.target/aarch64/vect-slp-dup.c: New.
+
2018-05-17 Paolo Carlini <paolo.carlini@oracle.com>
PR c++/85713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c
+/* { dg-do compile } */
+
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+void bar (double);
+
+void
+foo (double *restrict in, double *restrict in2,
+ double *restrict out1, double *restrict out2)
+{
+ for (int i = 0; i < 1024; i++)
+ {
+ out1[i] = in[i] + 2.0 * in[i+128];
+ out1[i+1] = in[i+1] + 2.0 * in2[i];
+ bar (in[i]);
+ }
+}
+
+/* { dg-final { scan-assembler-not "dup\tv\[0-9\]+.2d, v\[0-9\]+" } } */
+