+2019-08-13 Richard Sandiford <richard.sandiford@arm.com>
+
+ * machmode.h (opt_mode::else_mode): New function.
+ (opt_mode::else_blk): Use it.
+ * config/aarch64/aarch64-protos.h (aarch64_vq_mode): Declare.
+ (aarch64_full_sve_mode, aarch64_sve_ld1rq_operand_p): Likewise.
+ (aarch64_gen_stepped_int_parallel): Likewise.
+ (aarch64_stepped_int_parallel_p): Likewise.
+ (aarch64_expand_mov_immediate): Remove the optional gen_vec_duplicate
+ argument.
+ * config/aarch64/aarch64.c
+ (aarch64_expand_sve_widened_duplicate): Delete.
+ (aarch64_expand_sve_dupq, aarch64_expand_sve_ld1rq): New functions.
+ (aarch64_expand_sve_const_vector): Rewrite to handle more cases.
+ (aarch64_expand_mov_immediate): Remove the optional gen_vec_duplicate
+ argument. Use early returns in the !CONST_INT_P handling.
+ Pass all SVE data vectors to aarch64_expand_sve_const_vector rather
+ than handling some inline.
+ (aarch64_full_sve_mode, aarch64_vq_mode): New functions, split out
+ from...
+ (aarch64_simd_container_mode): ...here.
+ (aarch64_gen_stepped_int_parallel, aarch64_stepped_int_parallel_p)
+ (aarch64_sve_ld1rq_operand_p): New functions.
+ * config/aarch64/predicates.md (descending_int_parallel)
+ (aarch64_sve_ld1rq_operand): New predicates.
+ * config/aarch64/constraints.md (UtQ): New constraint.
+ * config/aarch64/aarch64.md (UNSPEC_REINTERPRET): New unspec.
+	* config/aarch64/aarch64-sve.md (mov<SVE_ALL:mode>): Remove the
+	gen_vec_duplicate argument from the call to
+	aarch64_expand_mov_immediate.
+ (@aarch64_sve_reinterpret<mode>): New expander.
+ (*aarch64_sve_reinterpret<mode>): New pattern.
+ (@aarch64_vec_duplicate_vq<mode>_le): New pattern.
+ (@aarch64_vec_duplicate_vq<mode>_be): Likewise.
+ (*sve_ld1rq<Vesize>): Replace with...
+ (@aarch64_sve_ld1rq<mode>): ...this new pattern.
+
2019-08-13 Wilco Dijkstra <wdijkstr@arm.com>
* config/aarch64/aarch64.c (generic_tunings): Set function alignment to
bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
int aarch64_branch_cost (bool, bool);
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
+opt_machine_mode aarch64_vq_mode (scalar_mode);
+opt_machine_mode aarch64_full_sve_mode (scalar_mode);
bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
bool aarch64_simd_mem_operand_p (rtx);
bool aarch64_sve_ld1r_operand_p (rtx);
+bool aarch64_sve_ld1rq_operand_p (rtx);
bool aarch64_sve_ldr_operand_p (rtx);
bool aarch64_sve_struct_memory_operand_p (rtx);
rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
+rtx aarch64_gen_stepped_int_parallel (unsigned int, int, int);
+bool aarch64_stepped_int_parallel_p (rtx, int);
rtx aarch64_tls_get_addr (void);
tree aarch64_fold_builtin (tree, int, tree *, bool);
unsigned aarch64_dbx_register_number (unsigned);
const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
void aarch64_err_no_fpadvsimd (machine_mode);
void aarch64_expand_epilogue (bool);
-void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
+void aarch64_expand_mov_immediate (rtx, rtx);
rtx aarch64_ptrue_reg (machine_mode);
rtx aarch64_pfalse_reg (machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
if (CONSTANT_P (operands[1]))
{
- aarch64_expand_mov_immediate (operands[0], operands[1],
- gen_vec_duplicate<mode>);
+ aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
}
}
)
+;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
+;; This is equivalent to a subreg on little-endian targets but not for
+;; big-endian; see the comment at the head of the file for details.
+(define_expand "@aarch64_sve_reinterpret<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand")
+ (unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand")]
+ UNSPEC_REINTERPRET))]
+ "TARGET_SVE"
+ {
+ if (!BYTES_BIG_ENDIAN)
+ {
+ emit_move_insn (operands[0], gen_lowpart (<MODE>mode, operands[1]));
+ DONE;
+ }
+ }
+)
+
+;; A pattern for handling type punning on big-endian targets. We use a
+;; special predicate for operand 1 to reduce the number of patterns.
+(define_insn_and_split "*aarch64_sve_reinterpret<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand" "0")]
+ UNSPEC_REINTERPRET))]
+ "TARGET_SVE"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 1))]
+ {
+ emit_note (NOTE_INSN_DELETED);
+ DONE;
+ }
+)
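+
+;; For example, aarch64_expand_mov_immediate uses this expander when
+;; aarch64_expand_sve_const_vector returns its result in a different
+;; register (and possibly a different SVE data mode) from the
+;; destination:
+;;
+;;   emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));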
+
;; -------------------------------------------------------------------------
;; ---- Moves of multiple vectors
;; -------------------------------------------------------------------------
[(set_attr "length" "4,4,8")]
)
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version).
+(define_insn "@aarch64_vec_duplicate_vq<mode>_le"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (vec_duplicate:SVE_ALL
+ (match_operand:<V128> 1 "register_operand" "w")))]
+ "TARGET_SVE && !BYTES_BIG_ENDIAN"
+ {
+ operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+ return "dup\t%0.q, %1.q[0]";
+ }
+)
+
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
+;; The SVE register layout puts memory lane N into (architectural)
+;; register lane N, whereas the Advanced SIMD layout puts the memory
+;; lsb into the register lsb. We therefore have to describe this in rtl
+;; terms as a reverse of the V128 vector followed by a duplicate.
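+;;
+;; For example, with a VNx8HI destination <V128> is V8HI, so operand 2
+;; must be (parallel [7 6 5 4 3 2 1 0]); descending_int_parallel checks
+;; that the step is -1 and the insn condition checks that the series
+;; starts at the final lane, so the vec_select describes a full reverse
+;; of operand 1.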
+(define_insn "@aarch64_vec_duplicate_vq<mode>_be"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (vec_duplicate:SVE_ALL
+ (vec_select:<V128>
+ (match_operand:<V128> 1 "register_operand" "w")
+ (match_operand 2 "descending_int_parallel"))))]
+ "TARGET_SVE
+ && BYTES_BIG_ENDIAN
+ && known_eq (INTVAL (XVECEXP (operands[2], 0, 0)),
+ GET_MODE_NUNITS (<V128>mode) - 1)"
+ {
+ operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+ return "dup\t%0.q, %1.q[0]";
+ }
+)
+
;; This is used for vec_duplicate<mode>s from memory, but can also
;; be used by combine to optimize selects of a vec_duplicate<mode>
;; with zero.
"ld1r<Vesize>\t%0.<Vetype>, %1/z, %2"
)
-;; Load 128 bits from memory and duplicate to fill a vector. Since there
-;; are so few operations on 128-bit "elements", we don't define a VNx1TI
-;; and simply use vectors of bytes instead.
-(define_insn "*sve_ld1rq<Vesize>"
+;; Load 128 bits from memory under predicate control and duplicate to
+;; fill a vector.
+(define_insn "@aarch64_sve_ld1rq<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
- [(match_operand:<VPRED> 1 "register_operand" "Upl")
- (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
+ [(match_operand:<VPRED> 2 "register_operand" "Upl")
+ (match_operand:<V128> 1 "aarch64_sve_ld1rq_operand" "UtQ")]
UNSPEC_LD1RQ))]
"TARGET_SVE"
- "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
+ {
+ operands[1] = gen_rtx_MEM (<VEL>mode, XEXP (operands[1], 0));
+ return "ld1rq<Vesize>\t%0.<Vetype>, %2/z, %1";
+ }
)
;; -------------------------------------------------------------------------
emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
-/* Try to duplicate SRC into SVE register DEST, given that SRC is an
- integer of mode INT_MODE. Return true on success. */
+/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
+ register of mode MODE. Use TARGET for the result if it's nonnull
+ and convenient.
+
+ The two vector modes must have the same element mode. The behavior
+ is to duplicate architectural lane N of SRC into architectural lanes
+ N + I * STEP of the result. On big-endian targets, architectural
+ lane 0 of an Advanced SIMD vector is the last element of the vector
+ in memory layout, so for big-endian targets this operation has the
+ effect of reversing SRC before duplicating it. Callers need to
+ account for this. */
-static bool
-aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
- rtx src)
-{
- /* If the constant is smaller than 128 bits, we can do the move
- using a vector of SRC_MODEs. */
- if (src_mode != TImode)
- {
- poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
- GET_MODE_SIZE (src_mode));
- machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
- emit_move_insn (gen_lowpart (dup_mode, dest),
- gen_const_vec_duplicate (dup_mode, src));
- return true;
+rtx
+aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
+{
+ machine_mode src_mode = GET_MODE (src);
+ gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
+ insn_code icode = (BYTES_BIG_ENDIAN
+ ? code_for_aarch64_vec_duplicate_vq_be (mode)
+ : code_for_aarch64_vec_duplicate_vq_le (mode));
+
+ unsigned int i = 0;
+ expand_operand ops[3];
+ create_output_operand (&ops[i++], target, mode);
+ create_output_operand (&ops[i++], src, src_mode);
+ if (BYTES_BIG_ENDIAN)
+ {
+ /* Create a PARALLEL describing the reversal of SRC. */
+ unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
+ rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
+ nelts_per_vq - 1, -1);
+ create_fixed_operand (&ops[i++], sel);
}
+ expand_insn (icode, i, ops);
+ return ops[0].value;
+}
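+
+/* Note that the result above is ops[0].value rather than necessarily
+   TARGET; aarch64_expand_mov_immediate copes with this by emitting an
+   aarch64_sve_reinterpret when the register returned by
+   aarch64_expand_sve_const_vector differs from the destination.  */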
- /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
- src = force_const_mem (src_mode, src);
+/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
+ the memory image into DEST. Return true on success. */
+
+static bool
+aarch64_expand_sve_ld1rq (rtx dest, rtx src)
+{
+ src = force_const_mem (GET_MODE (src), src);
if (!src)
return false;
/* Make sure that the address is legitimate. */
- if (!aarch64_sve_ld1r_operand_p (src))
+ if (!aarch64_sve_ld1rq_operand_p (src))
{
rtx addr = force_reg (Pmode, XEXP (src, 0));
src = replace_equiv_address (src, addr);
unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
rtx ptrue = aarch64_ptrue_reg (pred_mode);
- src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
- emit_insn (gen_rtx_SET (dest, src));
+ emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
return true;
}
-/* Expand a move of general CONST_VECTOR SRC into DEST, given that it
- isn't a simple duplicate or series. */
+/* Return a register containing CONST_VECTOR SRC, given that SRC has an
+ SVE data mode and isn't a legitimate constant. Use TARGET for the
+ result if convenient.
-static void
-aarch64_expand_sve_const_vector (rtx dest, rtx src)
+ The returned register can have whatever mode seems most natural
+ given the contents of SRC. */
+
+static rtx
+aarch64_expand_sve_const_vector (rtx target, rtx src)
{
machine_mode mode = GET_MODE (src);
unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
- gcc_assert (npatterns > 1);
+ scalar_mode elt_mode = GET_MODE_INNER (mode);
+ unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
+ unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
+
+ if (nelts_per_pattern == 1 && encoded_bits == 128)
+ {
+ /* The constant is a duplicated quadword but can't be narrowed
+ beyond a quadword. Get the memory image of the first quadword
+ as a 128-bit vector and try using LD1RQ to load it from memory.
+
+ The effect for both endiannesses is to load memory lane N into
+ architectural lanes N + I * STEP of the result. On big-endian
+ targets, the layout of the 128-bit vector in an Advanced SIMD
+ register would be different from its layout in an SVE register,
+ but this 128-bit vector is a memory value only. */
+ machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+ rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
+ if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
+ return target;
+ }
+
+ if (nelts_per_pattern == 1 && encoded_bits < 128)
+ {
+ /* The vector is a repeating sequence of 64 bits or fewer.
+	 See if we can load the repeated part using an Advanced SIMD move and then
+ duplicate it to fill a vector. This is better than using a GPR
+ move because it keeps everything in the same register file. */
+ machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+ rtx_vector_builder builder (vq_mode, npatterns, 1);
+ for (unsigned int i = 0; i < npatterns; ++i)
+ {
+ /* We want memory lane N to go into architectural lane N,
+ so reverse for big-endian targets. The DUP .Q pattern
+ has a compensating reverse built-in. */
+ unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
+ builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
+ }
+ rtx vq_src = builder.build ();
+ if (aarch64_simd_valid_immediate (vq_src, NULL))
+ {
+ vq_src = force_reg (vq_mode, vq_src);
+ return aarch64_expand_sve_dupq (target, mode, vq_src);
+ }
- if (nelts_per_pattern == 1)
- {
-      /* The constant is a repeating sequence of at least two elements,
- where the repeating elements occupy no more than 128 bits.
- Get an integer representation of the replicated value. */
- scalar_int_mode int_mode;
- if (BYTES_BIG_ENDIAN)
- /* For now, always use LD1RQ to load the value on big-endian
- targets, since the handling of smaller integers includes a
- subreg that is semantically an element reverse. */
- int_mode = TImode;
- else
+ /* Get an integer representation of the repeating part of Advanced
+ SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
+ which for big-endian targets is lane-swapped wrt a normal
+ Advanced SIMD vector. This means that for both endiannesses,
+ memory lane N of SVE vector SRC corresponds to architectural
+ lane N of a register holding VQ_SRC. This in turn means that
+ memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
+ as a single 128-bit value) and thus that memory lane 0 of SRC is
+ in the lsb of the integer. Duplicating the integer therefore
+ ensures that memory lane N of SRC goes into architectural lane
+	 N + I * STEP of the SVE register.  */
+ scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
+ rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
+ if (elt_value)
{
- unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
- gcc_assert (int_bits <= 128);
- int_mode = int_mode_for_size (int_bits, 0).require ();
+ /* Pretend that we had a vector of INT_MODE to start with. */
+ elt_mode = int_mode;
+ mode = aarch64_full_sve_mode (int_mode).require ();
+
+ /* If the integer can be moved into a general register by a
+ single instruction, do that and duplicate the result. */
+ if (CONST_INT_P (elt_value)
+ && aarch64_move_imm (INTVAL (elt_value), elt_mode))
+ {
+ elt_value = force_reg (elt_mode, elt_value);
+ return expand_vector_broadcast (mode, elt_value);
+ }
}
- rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
- if (int_value
- && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
- return;
+ else if (npatterns == 1)
+ /* We're duplicating a single value, but can't do better than
+ force it to memory and load from there. This handles things
+ like symbolic constants. */
+ elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
+
+ if (elt_value)
+ {
+ /* Load the element from memory if we can, otherwise move it into
+ a register and use a DUP. */
+ rtx op = force_const_mem (elt_mode, elt_value);
+ if (!op)
+ op = force_reg (elt_mode, elt_value);
+ return expand_vector_broadcast (mode, op);
+ }
+ }
+
+ /* Try using INDEX. */
+ rtx base, step;
+ if (const_vec_series_p (src, &base, &step))
+ {
+ aarch64_expand_vec_series (target, base, step);
+ return target;
}
+ /* From here on, it's better to force the whole constant to memory
+ if we can. */
+ if (GET_MODE_NUNITS (mode).is_constant ())
+ return NULL_RTX;
+
/* Expand each pattern individually. */
+ gcc_assert (npatterns > 1);
rtx_vector_builder builder;
auto_vec<rtx, 16> vectors (npatterns);
for (unsigned int i = 0; i < npatterns; ++i)
npatterns /= 2;
for (unsigned int i = 0; i < npatterns; ++i)
{
- rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
+ rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
vectors[i] = tmp;
}
}
- gcc_assert (vectors[0] == dest);
+ gcc_assert (vectors[0] == target);
+ return target;
}
-/* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
- is a pattern that can be used to set DEST to a replicated scalar
- element. */
+/* Set DEST to immediate IMM. */
void
-aarch64_expand_mov_immediate (rtx dest, rtx imm,
- rtx (*gen_vec_duplicate) (rtx, rtx))
+aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
machine_mode mode = GET_MODE (dest);
if (!CONST_INT_P (imm))
{
- rtx base, step, value;
if (GET_CODE (imm) == HIGH
|| aarch64_simd_valid_immediate (imm, NULL))
- emit_insn (gen_rtx_SET (dest, imm));
- else if (const_vec_series_p (imm, &base, &step))
- aarch64_expand_vec_series (dest, base, step);
- else if (const_vec_duplicate_p (imm, &value))
{
- /* If the constant is out of range of an SVE vector move,
- load it from memory if we can, otherwise move it into
- a register and use a DUP. */
- scalar_mode inner_mode = GET_MODE_INNER (mode);
- rtx op = force_const_mem (inner_mode, value);
- if (!op)
- op = force_reg (inner_mode, value);
- else if (!aarch64_sve_ld1r_operand_p (op))
- {
- rtx addr = force_reg (Pmode, XEXP (op, 0));
- op = replace_equiv_address (op, addr);
- }
- emit_insn (gen_vec_duplicate (dest, op));
- }
- else if (GET_CODE (imm) == CONST_VECTOR
- && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
- aarch64_expand_sve_const_vector (dest, imm);
- else
- {
- rtx mem = force_const_mem (mode, imm);
- gcc_assert (mem);
- emit_move_insn (dest, mem);
+ emit_insn (gen_rtx_SET (dest, imm));
+ return;
}
+ if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
+ if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
+ {
+ if (dest != res)
+ emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
+ return;
+ }
+
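+      /* Fall back to loading the constant from its memory image.  For SVE
+	 data modes this path is taken when aarch64_expand_sve_const_vector
+	 returns null, which it does only when the mode has a constant
+	 number of elements and the constant can therefore be forced to
+	 memory.  */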
+ rtx mem = force_const_mem (mode, imm);
+ gcc_assert (mem);
+ emit_move_insn (dest, mem);
return;
}
return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
+/* Return the full-width SVE vector mode for element mode MODE, if one
+ exists. */
+opt_machine_mode
+aarch64_full_sve_mode (scalar_mode mode)
+{
+ switch (mode)
+ {
+ case E_DFmode:
+ return VNx2DFmode;
+ case E_SFmode:
+ return VNx4SFmode;
+ case E_HFmode:
+ return VNx8HFmode;
+ case E_DImode:
+ return VNx2DImode;
+ case E_SImode:
+ return VNx4SImode;
+ case E_HImode:
+ return VNx8HImode;
+ case E_QImode:
+ return VNx16QImode;
+ default:
+ return opt_machine_mode ();
+ }
+}
+
+/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
+ if it exists. */
+opt_machine_mode
+aarch64_vq_mode (scalar_mode mode)
+{
+ switch (mode)
+ {
+ case E_DFmode:
+ return V2DFmode;
+ case E_SFmode:
+ return V4SFmode;
+ case E_HFmode:
+ return V8HFmode;
+ case E_SImode:
+ return V4SImode;
+ case E_HImode:
+ return V8HImode;
+ case E_QImode:
+ return V16QImode;
+ case E_DImode:
+ return V2DImode;
+ default:
+ return opt_machine_mode ();
+ }
+}
+
/* Return appropriate SIMD container
for MODE within a vector of WIDTH bits. */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
- switch (mode)
- {
- case E_DFmode:
- return VNx2DFmode;
- case E_SFmode:
- return VNx4SFmode;
- case E_HFmode:
- return VNx8HFmode;
- case E_DImode:
- return VNx2DImode;
- case E_SImode:
- return VNx4SImode;
- case E_HImode:
- return VNx8HImode;
- case E_QImode:
- return VNx16QImode;
- default:
- return word_mode;
- }
+ return aarch64_full_sve_mode (mode).else_mode (word_mode);
gcc_assert (known_eq (width, 64) || known_eq (width, 128));
if (TARGET_SIMD)
{
if (known_eq (width, 128))
- switch (mode)
- {
- case E_DFmode:
- return V2DFmode;
- case E_SFmode:
- return V4SFmode;
- case E_HFmode:
- return V8HFmode;
- case E_SImode:
- return V4SImode;
- case E_HImode:
- return V8HImode;
- case E_QImode:
- return V16QImode;
- case E_DImode:
- return V2DImode;
- default:
- break;
- }
+ return aarch64_vq_mode (mode).else_mode (word_mode);
else
switch (mode)
{
return true;
}
+/* Return a PARALLEL containing NELTS elements, with element I equal
+ to BASE + I * STEP. */
+
+rtx
+aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
+{
+ rtvec vec = rtvec_alloc (nelts);
+ for (unsigned int i = 0; i < nelts; ++i)
+ RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
+ return gen_rtx_PARALLEL (VOIDmode, vec);
+}
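+
+/* For example, aarch64_expand_sve_dupq calls this as
+   aarch64_gen_stepped_int_parallel (nelts_per_vq, nelts_per_vq - 1, -1),
+   so with two 64-bit lanes per quadword the result is
+   (parallel [(const_int 1) (const_int 0)]).  */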
+
+/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
+ series with step STEP. */
+
+bool
+aarch64_stepped_int_parallel_p (rtx op, int step)
+{
+ if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
+ return false;
+
+ unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
+ for (int i = 1; i < XVECLEN (op, 0); ++i)
+ if (!CONST_INT_P (XVECEXP (op, 0, i))
+ || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
+ return false;
+
+ return true;
+}
+
/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
HIGH (exclusive). */
void
&& offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}
+/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
+bool
+aarch64_sve_ld1rq_operand_p (rtx op)
+{
+ struct aarch64_address_info addr;
+ scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
+ if (!MEM_P (op)
+ || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
+ return false;
+
+ if (addr.type == ADDRESS_REG_IMM)
+ return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
+
+ if (addr.type == ADDRESS_REG_REG)
+ return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
+
+ return false;
+}
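+
+/* For example, for an LD1RQH the immediate form above accepts offsets
+   that are multiples of 16 bytes in the range [-128, 112] (a signed
+   4-bit offset scaled by the 16-byte size of TImode), and the register
+   form accepts an index shifted left by 1 to match the 2-byte element
+   size.  */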
+
/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
The conditions for STR are the same. */
bool
UNSPEC_CLASTB
UNSPEC_FADDA
UNSPEC_REV_SUBREG
+ UNSPEC_REINTERPRET
UNSPEC_SPECULATION_TRACKER
UNSPEC_COPYSIGN
UNSPEC_TTEST ; Represent transaction test.
(match_test "aarch64_legitimate_address_p (V2DImode,
XEXP (op, 0), 1)")))
+(define_memory_constraint "UtQ"
+ "@internal
+ An address valid for SVE LD1RQs."
+ (and (match_code "mem")
+ (match_test "aarch64_sve_ld1rq_operand_p (op)")))
+
(define_memory_constraint "Uty"
"@internal
An address valid for SVE LD1Rs."
return aarch64_simd_check_vect_par_cnst_half (op, mode, false);
})
+(define_predicate "descending_int_parallel"
+ (match_code "parallel")
+{
+ return aarch64_stepped_int_parallel_p (op, -1);
+})
+
(define_special_predicate "aarch64_simd_lshift_imm"
(match_code "const,const_vector")
{
(and (match_operand 0 "memory_operand")
(match_test "aarch64_sve_ld1r_operand_p (op)")))
+(define_predicate "aarch64_sve_ld1rq_operand"
+ (and (match_code "mem")
+ (match_test "aarch64_sve_ld1rq_operand_p (op)")))
+
;; Like memory_operand, but restricted to addresses that are valid for
;; SVE LDR and STR instructions.
(define_predicate "aarch64_sve_ldr_operand"
ALWAYS_INLINE opt_mode (from_int m) : m_mode (machine_mode (m)) {}
machine_mode else_void () const;
- machine_mode else_blk () const;
+ machine_mode else_blk () const { return else_mode (BLKmode); }
+ machine_mode else_mode (machine_mode) const;
T require () const;
bool exists () const;
return m_mode;
}
-/* If the T exists, return its enum value, otherwise return E_BLKmode. */
+/* If the T exists, return its enum value, otherwise return FALLBACK. */
template<typename T>
inline machine_mode
-opt_mode<T>::else_blk () const
+opt_mode<T>::else_mode (machine_mode fallback) const
{
- return m_mode == E_VOIDmode ? E_BLKmode : m_mode;
+ return m_mode == E_VOIDmode ? fallback : m_mode;
}
/* Assert that the object contains a T and return it. */
+2019-08-13 Richard Sandiford <richard.sandiford@arm.com>
+
+ * gcc.target/aarch64/sve/init_2.c: Expect ld1rd to be used
+ instead of a full vector load.
+ * gcc.target/aarch64/sve/init_4.c: Likewise.
+ * gcc.target/aarch64/sve/ld1r_2.c: Remove constants that no longer
+ need to be loaded from memory.
+ * gcc.target/aarch64/sve/slp_2.c: Expect the same output for
+ big and little endian.
+ * gcc.target/aarch64/sve/slp_3.c: Likewise. Expect 3 of the
+ doubles to be moved via integer registers rather than loaded
+ from memory.
+ * gcc.target/aarch64/sve/slp_4.c: Likewise but for 4 doubles.
+ * gcc.target/aarch64/sve/spill_4.c: Expect 16-bit constants to be
+ loaded via an integer register rather than from memory.
+ * gcc.target/aarch64/sve/const_1.c: New test.
+ * gcc.target/aarch64/sve/const_2.c: Likewise.
+ * gcc.target/aarch64/sve/const_3.c: Likewise.
+
2019-08-13 Jozef Lawrynowicz <jozef.l@mittosystems.com>
* gcc.target/msp430/msp430.exp (msp430_device_permutations_runtest):
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void
+set (uint64_t *dst, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = 0xffff00ff00ffff00ULL;
+}
+
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.2d, 0xffff00ff00ffff00\n.*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, CONST) \
+ void \
+ set_##TYPE (TYPE *dst, int count) \
+ { \
+ for (int i = 0; i < count; ++i) \
+ dst[i] = CONST; \
+ }
+
+TEST (uint16_t, 129)
+TEST (uint32_t, 129)
+TEST (uint64_t, 129)
+
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.8h, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.4s, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 129\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, CONST) \
+ void \
+ set_##TYPE (TYPE *dst, int count) \
+ { \
+ for (int i = 0; i < count; ++i) \
+ dst[i] = CONST; \
+ }
+
+TEST (uint16_t, 0x1234)
+TEST (uint32_t, 0x1234)
+TEST (uint64_t, 0x1234)
+
+/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.h, \1\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.s, \1\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */
/*
** foo:
** ...
-** ld1w (z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
-** insr \1, w1
-** insr \1, w0
+** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\]
+** insr \1\.s, w1
+** insr \1\.s, w0
** ...
*/
__attribute__((noipa))
/*
** foo:
** ...
-** ld1w (z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
-** insr \1, w1
-** insr \1, w0
-** rev \1, \1
+** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\]
+** insr \1\.s, w1
+** insr \1\.s, w0
+** rev \1\.s, \1\.s
** ...
*/
__attribute__((noipa))
T (int64_t)
#define FOR_EACH_LOAD_BROADCAST_IMM(T) \
- T (int16_t, 129, imm_129) \
- T (int32_t, 129, imm_129) \
- T (int64_t, 129, imm_129) \
- \
- T (int16_t, -130, imm_m130) \
- T (int32_t, -130, imm_m130) \
- T (int64_t, -130, imm_m130) \
- \
- T (int16_t, 0x1234, imm_0x1234) \
- T (int32_t, 0x1234, imm_0x1234) \
- T (int64_t, 0x1234, imm_0x1234) \
- \
- T (int16_t, 0xFEDC, imm_0xFEDC) \
- T (int32_t, 0xFEDC, imm_0xFEDC) \
- T (int64_t, 0xFEDC, imm_0xFEDC) \
- \
T (int32_t, 0x12345678, imm_0x12345678) \
T (int64_t, 0x12345678, imm_0x12345678) \
\
FOR_EACH_LOAD_BROADCAST_IMM (DEF_LOAD_BROADCAST_IMM)
/* { dg-final { scan-assembler-times {\tld1rb\tz[0-9]+\.b, p[0-7]/z, } 1 } } */
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 5 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 7 } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 8 } } */
+/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 1 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 4 } } */
TEST_ALL (VEC_PERM)
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
/* { dg-final { scan-assembler-not {\tzip1\t} } } */
/* { dg-final { scan-assembler-not {\tzip2\t} } } */
TEST_ALL (VEC_PERM)
/* 1 for each 8-bit type. */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* 1 for each 16-bit type and 4 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
+/* 1 for each 16-bit type plus 1 for double. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
/* 1 for each 32-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
+/* 3 for double. */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
/* The 64-bit types need:
ZIP1 ZIP1 (2 ZIP2s optimized away)
TEST_ALL (VEC_PERM)
-/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 20 { target aarch64_big_endian } } } */
+/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
/* 1 for each 16-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
+/* 4 for double. */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
/* The 32-bit types need:
ZIP1 ZIP1 (2 ZIP2s optimized away)
TEST_LOOP (uint32_t, 0x12345);
TEST_LOOP (uint64_t, 0x123456);
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h,} 3 } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h,} 3 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */
/* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */