+2014-12-05 Bin Cheng <bin.cheng@arm.com>
+
+ * config/aarch64/aarch64.md (load_pair<mode>): Split to
+ load_pairsi, load_pairdi, load_pairsf and load_pairdf.
+ (load_pairsi, load_pairdi, load_pairsf, load_pairdf): Split
+ from load_pair<mode>. New alternative to support int/fp
+ registers in fp/int mode patterns.
+	(store_pair<mode>): Split to store_pairsi, store_pairdi,
+	store_pairsf and store_pairdf.
+ (store_pairsi, store_pairdi, store_pairsf, store_pairdf): Split
+ from store_pair<mode>. New alternative to support int/fp
+ registers in fp/int mode patterns.
+ (*load_pair_extendsidi2_aarch64): New pattern.
+ (*load_pair_zero_extendsidi2_aarch64): New pattern.
+ (aarch64-ldpstp.md): Include.
+ * config/aarch64/aarch64-ldpstp.md: New file.
+ * config/aarch64/aarch64-protos.h (aarch64_gen_adjusted_ldpstp):
+ New.
+ (extract_base_offset_in_addr): New.
+ (aarch64_operands_ok_for_ldpstp): New.
+ (aarch64_operands_adjust_ok_for_ldpstp): New.
+ * config/aarch64/aarch64.c (enum sched_fusion_type): New enum.
+ (TARGET_SCHED_FUSION_PRIORITY): New hook.
+	(fusion_load_store): New function.
+ (extract_base_offset_in_addr): New function.
+ (aarch64_gen_adjusted_ldpstp): New function.
+ (aarch64_sched_fusion_priority): New function.
+ (aarch64_operands_ok_for_ldpstp): New function.
+ (aarch64_operands_adjust_ok_for_ldpstp): New function.
+
2014-12-05 Olivier Hainque <hainque@adacore.com>

	* defaults.h: (DWARF_REG_TO_UNWIND_COLUMN): Define default.
--- /dev/null
+;; AArch64 ldp/stp peephole optimizations.
+;; Copyright (C) 2014 Free Software Foundation, Inc.
+;; Contributed by ARM Ltd.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
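+;; Merge two consecutive loads or stores of the same mode into a single
+;; ldp or stp instruction.  If the first access has the larger offset,
+;; the operands are swapped so that the merged pattern sees them in
+;; increasing-offset order.
+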
+(define_peephole2
+ [(set (match_operand:GPI 0 "register_operand" "")
+ (match_operand:GPI 1 "aarch64_mem_pair_operand" ""))
+ (set (match_operand:GPI 2 "register_operand" "")
+ (match_operand:GPI 3 "memory_operand" ""))]
+ "aarch64_operands_ok_for_ldpstp (operands, true, <MODE>mode)"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[2];
+ operands[2] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[3];
+ operands[3] = tmp;
+ }
+})
+
+(define_peephole2
+ [(set (match_operand:GPI 0 "aarch64_mem_pair_operand" "")
+ (match_operand:GPI 1 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPI 2 "memory_operand" "")
+ (match_operand:GPI 3 "aarch64_reg_or_zero" ""))]
+ "aarch64_operands_ok_for_ldpstp (operands, false, <MODE>mode)"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[0], &base, &offset_1);
+ extract_base_offset_in_addr (operands[2], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[2];
+ operands[2] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[3];
+ operands[3] = tmp;
+ }
+})
+
+(define_peephole2
+ [(set (match_operand:GPF 0 "register_operand" "")
+ (match_operand:GPF 1 "aarch64_mem_pair_operand" ""))
+ (set (match_operand:GPF 2 "register_operand" "")
+ (match_operand:GPF 3 "memory_operand" ""))]
+ "aarch64_operands_ok_for_ldpstp (operands, true, <MODE>mode)"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[2];
+ operands[2] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[3];
+ operands[3] = tmp;
+ }
+})
+
+(define_peephole2
+ [(set (match_operand:GPF 0 "aarch64_mem_pair_operand" "")
+ (match_operand:GPF 1 "register_operand" ""))
+ (set (match_operand:GPF 2 "memory_operand" "")
+ (match_operand:GPF 3 "register_operand" ""))]
+ "aarch64_operands_ok_for_ldpstp (operands, false, <MODE>mode)"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[0], &base, &offset_1);
+ extract_base_offset_in_addr (operands[2], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[2];
+ operands[2] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[3];
+ operands[3] = tmp;
+ }
+})
+
+;; Handle sign/zero-extended consecutive loads.
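+;; For example, two consecutive ldrsw loads can be merged into a single
+;; ldpsw, and two consecutive zero-extending 32-bit loads into a single
+;; ldp of w registers.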
+
+(define_peephole2
+ [(set (match_operand:DI 0 "register_operand" "")
+ (sign_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "")))
+ (set (match_operand:DI 2 "register_operand" "")
+ (sign_extend:DI (match_operand:SI 3 "memory_operand" "")))]
+ "aarch64_operands_ok_for_ldpstp (operands, true, SImode)"
+ [(parallel [(set (match_dup 0) (sign_extend:DI (match_dup 1)))
+ (set (match_dup 2) (sign_extend:DI (match_dup 3)))])]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[2];
+ operands[2] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[3];
+ operands[3] = tmp;
+ }
+})
+
+(define_peephole2
+ [(set (match_operand:DI 0 "register_operand" "")
+ (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "")))
+ (set (match_operand:DI 2 "register_operand" "")
+ (zero_extend:DI (match_operand:SI 3 "memory_operand" "")))]
+ "aarch64_operands_ok_for_ldpstp (operands, true, SImode)"
+ [(parallel [(set (match_dup 0) (zero_extend:DI (match_dup 1)))
+ (set (match_dup 2) (zero_extend:DI (match_dup 3)))])]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[2];
+ operands[2] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[3];
+ operands[3] = tmp;
+ }
+})
+
+;; Handle consecutive loads/stores whose offsets are out of the range
+;; supported by ldp/ldpsw/stp.  We first compute the adjusted base
+;; address in a scratch register, then merge the accesses into
+;; ldp/ldpsw/stp using the new, smaller offsets from that base.
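+;;
+;; For example:
+;;
+;;   str w1, [xb, 0x100]
+;;   str w1, [xb, 0x104]
+;;   str w1, [xb, 0x108]
+;;   str w1, [xb, 0x10c]
+;;
+;; becomes:
+;;
+;;   add scratch, xb, 0x100
+;;   stp w1, w1, [scratch]
+;;   stp w1, w1, [scratch, 0x8]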
+
+(define_peephole2
+ [(match_scratch:DI 8 "r")
+ (set (match_operand:GPI 0 "register_operand" "")
+ (match_operand:GPI 1 "memory_operand" ""))
+ (set (match_operand:GPI 2 "register_operand" "")
+ (match_operand:GPI 3 "memory_operand" ""))
+ (set (match_operand:GPI 4 "register_operand" "")
+ (match_operand:GPI 5 "memory_operand" ""))
+ (set (match_operand:GPI 6 "register_operand" "")
+ (match_operand:GPI 7 "memory_operand" ""))
+ (match_dup 8)]
+ "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)"
+ [(const_int 0)]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[6];
+ operands[6] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[7];
+ operands[7] = tmp;
+ tmp = operands[2];
+ operands[2] = operands[4];
+ operands[4] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[5];
+ operands[5] = tmp;
+ }
+
+ if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN))
+ DONE;
+ else
+ FAIL;
+})
+
+(define_peephole2
+ [(match_scratch:DI 8 "r")
+ (set (match_operand:GPF 0 "register_operand" "")
+ (match_operand:GPF 1 "memory_operand" ""))
+ (set (match_operand:GPF 2 "register_operand" "")
+ (match_operand:GPF 3 "memory_operand" ""))
+ (set (match_operand:GPF 4 "register_operand" "")
+ (match_operand:GPF 5 "memory_operand" ""))
+ (set (match_operand:GPF 6 "register_operand" "")
+ (match_operand:GPF 7 "memory_operand" ""))
+ (match_dup 8)]
+ "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)"
+ [(const_int 0)]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[6];
+ operands[6] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[7];
+ operands[7] = tmp;
+ tmp = operands[2];
+ operands[2] = operands[4];
+ operands[4] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[5];
+ operands[5] = tmp;
+ }
+
+ if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN))
+ DONE;
+ else
+ FAIL;
+})
+
+(define_peephole2
+ [(match_scratch:DI 8 "r")
+ (set (match_operand:DI 0 "register_operand" "")
+ (sign_extend:DI (match_operand:SI 1 "memory_operand" "")))
+ (set (match_operand:DI 2 "register_operand" "")
+ (sign_extend:DI (match_operand:SI 3 "memory_operand" "")))
+ (set (match_operand:DI 4 "register_operand" "")
+ (sign_extend:DI (match_operand:SI 5 "memory_operand" "")))
+ (set (match_operand:DI 6 "register_operand" "")
+ (sign_extend:DI (match_operand:SI 7 "memory_operand" "")))
+ (match_dup 8)]
+ "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode)"
+ [(const_int 0)]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[6];
+ operands[6] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[7];
+ operands[7] = tmp;
+ tmp = operands[2];
+ operands[2] = operands[4];
+ operands[4] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[5];
+ operands[5] = tmp;
+ }
+
+ if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, SIGN_EXTEND))
+ DONE;
+ else
+ FAIL;
+})
+
+(define_peephole2
+ [(match_scratch:DI 8 "r")
+ (set (match_operand:DI 0 "register_operand" "")
+ (zero_extend:DI (match_operand:SI 1 "memory_operand" "")))
+ (set (match_operand:DI 2 "register_operand" "")
+ (zero_extend:DI (match_operand:SI 3 "memory_operand" "")))
+ (set (match_operand:DI 4 "register_operand" "")
+ (zero_extend:DI (match_operand:SI 5 "memory_operand" "")))
+ (set (match_operand:DI 6 "register_operand" "")
+ (zero_extend:DI (match_operand:SI 7 "memory_operand" "")))
+ (match_dup 8)]
+ "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode)"
+ [(const_int 0)]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[1], &base, &offset_1);
+ extract_base_offset_in_addr (operands[3], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[6];
+ operands[6] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[7];
+ operands[7] = tmp;
+ tmp = operands[2];
+ operands[2] = operands[4];
+ operands[4] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[5];
+ operands[5] = tmp;
+ }
+
+ if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, ZERO_EXTEND))
+ DONE;
+ else
+ FAIL;
+})
+
+(define_peephole2
+ [(match_scratch:DI 8 "r")
+ (set (match_operand:GPI 0 "memory_operand" "")
+ (match_operand:GPI 1 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPI 2 "memory_operand" "")
+ (match_operand:GPI 3 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPI 4 "memory_operand" "")
+ (match_operand:GPI 5 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPI 6 "memory_operand" "")
+ (match_operand:GPI 7 "aarch64_reg_or_zero" ""))
+ (match_dup 8)]
+ "aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)"
+ [(const_int 0)]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[0], &base, &offset_1);
+ extract_base_offset_in_addr (operands[2], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[6];
+ operands[6] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[7];
+ operands[7] = tmp;
+ tmp = operands[2];
+ operands[2] = operands[4];
+ operands[4] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[5];
+ operands[5] = tmp;
+ }
+
+ if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN))
+ DONE;
+ else
+ FAIL;
+})
+
+(define_peephole2
+ [(match_scratch:DI 8 "r")
+ (set (match_operand:GPF 0 "memory_operand" "")
+ (match_operand:GPF 1 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPF 2 "memory_operand" "")
+ (match_operand:GPF 3 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPF 4 "memory_operand" "")
+ (match_operand:GPF 5 "aarch64_reg_or_zero" ""))
+ (set (match_operand:GPF 6 "memory_operand" "")
+ (match_operand:GPF 7 "aarch64_reg_or_zero" ""))
+ (match_dup 8)]
+ "aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)"
+ [(const_int 0)]
+{
+ rtx base, offset_1, offset_2, tmp;
+
+ extract_base_offset_in_addr (operands[0], &base, &offset_1);
+ extract_base_offset_in_addr (operands[2], &base, &offset_2);
+ if (INTVAL (offset_1) > INTVAL (offset_2))
+ {
+ tmp = operands[0];
+ operands[0] = operands[6];
+ operands[6] = tmp;
+ tmp = operands[1];
+ operands[1] = operands[7];
+ operands[7] = tmp;
+ tmp = operands[2];
+ operands[2] = operands[4];
+ operands[4] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[5];
+ operands[5] = tmp;
+ }
+
+ if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN))
+ DONE;
+ else
+ FAIL;
+})
void aarch64_split_compare_and_swap (rtx op[]);
void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
+bool aarch64_gen_adjusted_ldpstp (rtx *, bool, enum machine_mode, RTX_CODE);
#endif /* RTX_CODE */
void aarch64_init_builtins (void);
aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *);
int aarch64_ccmp_mode_to_code (enum machine_mode mode);
+
+bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
+bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
+bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
#endif /* GCC_AARCH64_PROTOS_H */
return false;
}
+/* If MEM is in the form of [base+offset], extract the two parts of the
+ address into BASE and OFFSET and return true; otherwise return false
+ after clearing BASE and OFFSET.  */
+
+bool
+extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
+{
+ rtx addr;
+
+ gcc_assert (MEM_P (mem));
+
+ addr = XEXP (mem, 0);
+
+ if (REG_P (addr))
+ {
+ *base = addr;
+ *offset = const0_rtx;
+ return true;
+ }
+
+ if (GET_CODE (addr) == PLUS
+ && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
+ {
+ *base = XEXP (addr, 0);
+ *offset = XEXP (addr, 1);
+ return true;
+ }
+
+ *base = NULL_RTX;
+ *offset = NULL_RTX;
+
+ return false;
+}
+
+/* Types for scheduling fusion. */
+enum sched_fusion_type
+{
+ SCHED_FUSION_NONE = 0,
+ SCHED_FUSION_LD_SIGN_EXTEND,
+ SCHED_FUSION_LD_ZERO_EXTEND,
+ SCHED_FUSION_LD,
+ SCHED_FUSION_ST,
+ SCHED_FUSION_NUM
+};
+
+/* If INSN is a load or store whose address is in the form of
+ [base+offset], extract the two parts into BASE and OFFSET.  Return
+ the scheduling fusion type of INSN.  */
+
+static enum sched_fusion_type
+fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
+{
+ rtx x, dest, src;
+ enum sched_fusion_type fusion = SCHED_FUSION_LD;
+
+ gcc_assert (INSN_P (insn));
+ x = PATTERN (insn);
+ if (GET_CODE (x) != SET)
+ return SCHED_FUSION_NONE;
+
+ src = SET_SRC (x);
+ dest = SET_DEST (x);
+
+ if (GET_MODE (src) != SImode && GET_MODE (src) != DImode
+ && GET_MODE (src) != SFmode && GET_MODE (src) != DFmode)
+ return SCHED_FUSION_NONE;
+
+ if (GET_CODE (src) == SIGN_EXTEND)
+ {
+ fusion = SCHED_FUSION_LD_SIGN_EXTEND;
+ src = XEXP (src, 0);
+ if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
+ return SCHED_FUSION_NONE;
+ }
+ else if (GET_CODE (src) == ZERO_EXTEND)
+ {
+ fusion = SCHED_FUSION_LD_ZERO_EXTEND;
+ src = XEXP (src, 0);
+ if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
+ return SCHED_FUSION_NONE;
+ }
+
+ if (GET_CODE (src) == MEM && REG_P (dest))
+ extract_base_offset_in_addr (src, base, offset);
+ else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
+ {
+ fusion = SCHED_FUSION_ST;
+ extract_base_offset_in_addr (dest, base, offset);
+ }
+ else
+ return SCHED_FUSION_NONE;
+
+ if (*base == NULL_RTX || *offset == NULL_RTX)
+ fusion = SCHED_FUSION_NONE;
+
+ return fusion;
+}
+
+/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
+
+ Currently we only support fusing ldr and str instructions, so FUSION_PRI
+ and PRI are only calculated for these instructions.  For any other
+ instruction, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
+ future, fusion of other instruction types can be supported by returning
+ different priorities.
+
+ It's important that irrelevant instructions get the largest FUSION_PRI.  */
+
+static void
+aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
+ int *fusion_pri, int *pri)
+{
+ int tmp, off_val;
+ rtx base, offset;
+ enum sched_fusion_type fusion;
+
+ gcc_assert (INSN_P (insn));
+
+ tmp = max_pri - 1;
+ fusion = fusion_load_store (insn, &base, &offset);
+ if (fusion == SCHED_FUSION_NONE)
+ {
+ *pri = tmp;
+ *fusion_pri = tmp;
+ return;
+ }
+
+ /* Set FUSION_PRI according to fusion type and base register. */
+ *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
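+ /* Loads or stores of the same fusion type that use the same base
+    register thus share a FUSION_PRI value and can be brought next to
+    each other by the scheduler; PRI computed below then orders such
+    candidates by offset.  */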
+
+ /* Calculate PRI. */
+ tmp /= 2;
+
+ /* INSN with smaller offset goes first. */
+ off_val = (int)(INTVAL (offset));
+ if (off_val >= 0)
+ tmp -= (off_val & 0xfffff);
+ else
+ tmp += ((- off_val) & 0xfffff);
+
+ *pri = tmp;
+ return;
+}
+
+/* Given OPERANDS of consecutive load/store, check if we can merge
+ them into ldp/stp. LOAD is true if they are load instructions.
+ MODE is the mode of memory operands. */
+
+bool
+aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
+ enum machine_mode mode)
+{
+ HOST_WIDE_INT offval_1, offval_2, msize;
+ enum reg_class rclass_1, rclass_2;
+ rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
+
+ if (load)
+ {
+ mem_1 = operands[1];
+ mem_2 = operands[3];
+ reg_1 = operands[0];
+ reg_2 = operands[2];
+ gcc_assert (REG_P (reg_1) && REG_P (reg_2));
+ if (REGNO (reg_1) == REGNO (reg_2))
+ return false;
+ }
+ else
+ {
+ mem_1 = operands[0];
+ mem_2 = operands[2];
+ reg_1 = operands[1];
+ reg_2 = operands[3];
+ }
+
+ /* Check if the addresses are in the form of [base+offset]. */
+ extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
+ if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
+ return false;
+ extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
+ if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
+ return false;
+
+ /* Check if the bases are the same.  */
+ if (!rtx_equal_p (base_1, base_2))
+ return false;
+
+ offval_1 = INTVAL (offset_1);
+ offval_2 = INTVAL (offset_2);
+ msize = GET_MODE_SIZE (mode);
+ /* Check if the offsets are consecutive. */
+ if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
+ return false;
+
+ /* Check if the addresses are clobbered by load. */
+ if (load)
+ {
+ if (reg_mentioned_p (reg_1, mem_1))
+ return false;
+
+ /* In increasing order, the last load can clobber the address. */
+ if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
+ return false;
+ }
+
+ if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
+ rclass_1 = FP_REGS;
+ else
+ rclass_1 = GENERAL_REGS;
+
+ if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
+ rclass_2 = FP_REGS;
+ else
+ rclass_2 = GENERAL_REGS;
+
+ /* Check if the registers are of the same class.  */
+ if (rclass_1 != rclass_2)
+ return false;
+
+ return true;
+}
+
+/* Given OPERANDS of consecutive load/store, check if we can merge
+ them into ldp/stp by adjusting the offset. LOAD is true if they
+ are load instructions. MODE is the mode of memory operands.
+
+ Given the four consecutive stores below:
+
+ str w1, [xb, 0x100]
+ str w1, [xb, 0x104]
+ str w1, [xb, 0x108]
+ str w1, [xb, 0x10c]
+
+ Though the offsets are out of the range supported by stp, we can
+ still pair them after adjusting the offset, like:
+
+ add scratch, xb, 0x100
+ stp w1, w1, [scratch]
+ stp w1, w1, [scratch, 0x8]
+
+ The peephole patterns detecting this opportunity should guarantee
+ the scratch register is available.  */
+
+bool
+aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
+ enum machine_mode mode)
+{
+ enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
+ HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
+ rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
+ rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
+
+ if (load)
+ {
+ reg_1 = operands[0];
+ mem_1 = operands[1];
+ reg_2 = operands[2];
+ mem_2 = operands[3];
+ reg_3 = operands[4];
+ mem_3 = operands[5];
+ reg_4 = operands[6];
+ mem_4 = operands[7];
+ gcc_assert (REG_P (reg_1) && REG_P (reg_2)
+ && REG_P (reg_3) && REG_P (reg_4));
+ if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
+ return false;
+ }
+ else
+ {
+ mem_1 = operands[0];
+ reg_1 = operands[1];
+ mem_2 = operands[2];
+ reg_2 = operands[3];
+ mem_3 = operands[4];
+ reg_3 = operands[5];
+ mem_4 = operands[6];
+ reg_4 = operands[7];
+ }
+ /* Skip if the memory operand is by itself valid for ldp/stp.  */
+ if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
+ return false;
+
+ /* Check if the addresses are in the form of [base+offset]. */
+ extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
+ if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
+ return false;
+ extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
+ if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
+ return false;
+ extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
+ if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
+ return false;
+ extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
+ if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
+ return false;
+
+ /* Check if the bases are the same.  */
+ if (!rtx_equal_p (base_1, base_2)
+ || !rtx_equal_p (base_2, base_3)
+ || !rtx_equal_p (base_3, base_4))
+ return false;
+
+ offval_1 = INTVAL (offset_1);
+ offval_2 = INTVAL (offset_2);
+ offval_3 = INTVAL (offset_3);
+ offval_4 = INTVAL (offset_4);
+ msize = GET_MODE_SIZE (mode);
+ /* Check if the offsets are consecutive. */
+ if ((offval_1 != (offval_2 + msize)
+ || offval_1 != (offval_3 + msize * 2)
+ || offval_1 != (offval_4 + msize * 3))
+ && (offval_4 != (offval_3 + msize)
+ || offval_4 != (offval_2 + msize * 2)
+ || offval_4 != (offval_1 + msize * 3)))
+ return false;
+
+ /* Check if the addresses are clobbered by load. */
+ if (load)
+ {
+ if (reg_mentioned_p (reg_1, mem_1)
+ || reg_mentioned_p (reg_2, mem_2)
+ || reg_mentioned_p (reg_3, mem_3))
+ return false;
+
+ /* In increasing order, the last load can clobber the address. */
+ if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
+ return false;
+ }
+
+ if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
+ rclass_1 = FP_REGS;
+ else
+ rclass_1 = GENERAL_REGS;
+
+ if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
+ rclass_2 = FP_REGS;
+ else
+ rclass_2 = GENERAL_REGS;
+
+ if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
+ rclass_3 = FP_REGS;
+ else
+ rclass_3 = GENERAL_REGS;
+
+ if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
+ rclass_4 = FP_REGS;
+ else
+ rclass_4 = GENERAL_REGS;
+
+ /* Check if the registers are of the same class.  */
+ if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
+ return false;
+
+ return true;
+}
+
+/* Given OPERANDS of consecutive load/store, this function pairs them
+ into ldp/stp after adjusting the offset.  LOAD is true if they are
+ load instructions.  It relies on the fact that the peephole patterns
+ pass in the load/store addresses in increasing order.  MODE is the
+ mode of the memory operands.  CODE is the rtl operator which should
+ be applied to all memory operands; it is SIGN_EXTEND, ZERO_EXTEND
+ or UNKNOWN.  */
+
+bool
+aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
+ enum machine_mode mode, RTX_CODE code)
+{
+ rtx base, offset, t1, t2;
+ rtx mem_1, mem_2, mem_3, mem_4;
+ HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
+
+ if (load)
+ {
+ mem_1 = operands[1];
+ mem_2 = operands[3];
+ mem_3 = operands[5];
+ mem_4 = operands[7];
+ }
+ else
+ {
+ mem_1 = operands[0];
+ mem_2 = operands[2];
+ mem_3 = operands[4];
+ mem_4 = operands[6];
+ gcc_assert (code == UNKNOWN);
+ }
+
+ extract_base_offset_in_addr (mem_1, &base, &offset);
+ gcc_assert (base != NULL_RTX && offset != NULL_RTX);
+
+ /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
+ msize = GET_MODE_SIZE (mode);
+ stp_off_limit = msize * 0x40;
+ off_val = INTVAL (offset);
+ abs_off = (off_val < 0) ? -off_val : off_val;
+ new_off = abs_off % stp_off_limit;
+ adj_off = abs_off - new_off;
+
+ /* Further adjust to make sure all offsets are OK. */
+ if ((new_off + msize * 2) >= stp_off_limit)
+ {
+ adj_off += stp_off_limit;
+ new_off -= stp_off_limit;
+ }
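+
+ /* For example, for four consecutive SImode stores at offsets
+    0x100 .. 0x10c, stp_off_limit is 4 * 0x40 = 0x100, so new_off
+    becomes 0 and adj_off 0x100; the scratch register then holds
+    base + 0x100 and the two stp instructions use offsets 0 and 8.  */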
+
+ /* Make sure the adjustment can be done with ADD/SUB instructions. */
+ if (adj_off >= 0x1000)
+ return false;
+
+ if (off_val < 0)
+ {
+ adj_off = -adj_off;
+ new_off = -new_off;
+ }
+
+ /* Create new memory references. */
+ mem_1 = change_address (mem_1, VOIDmode,
+ plus_constant (DImode, operands[8], new_off));
+
+ /* Check if the adjusted address is OK for ldp/stp. */
+ if (!aarch64_mem_pair_operand (mem_1, mode))
+ return false;
+
+ msize = GET_MODE_SIZE (mode);
+ mem_2 = change_address (mem_2, VOIDmode,
+ plus_constant (DImode,
+ operands[8],
+ new_off + msize));
+ mem_3 = change_address (mem_3, VOIDmode,
+ plus_constant (DImode,
+ operands[8],
+ new_off + msize * 2));
+ mem_4 = change_address (mem_4, VOIDmode,
+ plus_constant (DImode,
+ operands[8],
+ new_off + msize * 3));
+
+ if (code == ZERO_EXTEND)
+ {
+ mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
+ mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
+ mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
+ mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
+ }
+ else if (code == SIGN_EXTEND)
+ {
+ mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
+ mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
+ mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
+ mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
+ }
+
+ if (load)
+ {
+ operands[1] = mem_1;
+ operands[3] = mem_2;
+ operands[5] = mem_3;
+ operands[7] = mem_4;
+ }
+ else
+ {
+ operands[0] = mem_1;
+ operands[2] = mem_2;
+ operands[4] = mem_3;
+ operands[6] = mem_4;
+ }
+
+ /* Emit adjusting instruction. */
+ emit_insn (gen_rtx_SET (VOIDmode, operands[8],
+ plus_constant (DImode, base, adj_off)));
+ /* Emit ldp/stp instructions. */
+ t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
+ t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
+ t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
+ t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
+ return true;
+}
+
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
+#undef TARGET_SCHED_FUSION_PRIORITY
+#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-aarch64.h"
;; Operands 1 and 3 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
-(define_insn "load_pair<mode>"
- [(set (match_operand:GPI 0 "register_operand" "=r")
- (match_operand:GPI 1 "aarch64_mem_pair_operand" "Ump"))
- (set (match_operand:GPI 2 "register_operand" "=r")
- (match_operand:GPI 3 "memory_operand" "m"))]
+(define_insn "load_pairsi"
+ [(set (match_operand:SI 0 "register_operand" "=r,*w")
+ (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+ (set (match_operand:SI 2 "register_operand" "=r,*w")
+ (match_operand:SI 3 "memory_operand" "m,m"))]
"rtx_equal_p (XEXP (operands[3], 0),
plus_constant (Pmode,
XEXP (operands[1], 0),
- GET_MODE_SIZE (<MODE>mode)))"
- "ldp\\t%<w>0, %<w>2, %1"
- [(set_attr "type" "load2")]
+ GET_MODE_SIZE (SImode)))"
+ "@
+ ldp\\t%w0, %w2, %1
+ ldp\\t%s0, %s2, %1"
+ [(set_attr "type" "load2,neon_load1_2reg")
+ (set_attr "fp" "*,yes")]
)
+(define_insn "load_pairdi"
+ [(set (match_operand:DI 0 "register_operand" "=r,*w")
+ (match_operand:DI 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+ (set (match_operand:DI 2 "register_operand" "=r,*w")
+ (match_operand:DI 3 "memory_operand" "m,m"))]
+ "rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+ GET_MODE_SIZE (DImode)))"
+ "@
+ ldp\\t%x0, %x2, %1
+ ldp\\t%d0, %d2, %1"
+ [(set_attr "type" "load2,neon_load1_2reg")
+ (set_attr "fp" "*,yes")]
+)
+
+
;; Operands 0 and 2 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
-(define_insn "store_pair<mode>"
- [(set (match_operand:GPI 0 "aarch64_mem_pair_operand" "=Ump")
- (match_operand:GPI 1 "register_operand" "r"))
- (set (match_operand:GPI 2 "memory_operand" "=m")
- (match_operand:GPI 3 "register_operand" "r"))]
+(define_insn "store_pairsi"
+ [(set (match_operand:SI 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+ (match_operand:SI 1 "aarch64_reg_or_zero" "rZ,*w"))
+ (set (match_operand:SI 2 "memory_operand" "=m,m")
+ (match_operand:SI 3 "aarch64_reg_or_zero" "rZ,*w"))]
"rtx_equal_p (XEXP (operands[2], 0),
plus_constant (Pmode,
XEXP (operands[0], 0),
- GET_MODE_SIZE (<MODE>mode)))"
- "stp\\t%<w>1, %<w>3, %0"
- [(set_attr "type" "store2")]
+ GET_MODE_SIZE (SImode)))"
+ "@
+ stp\\t%w1, %w3, %0
+ stp\\t%s1, %s3, %0"
+ [(set_attr "type" "store2,neon_store1_2reg")
+ (set_attr "fp" "*,yes")]
+)
+
+(define_insn "store_pairdi"
+ [(set (match_operand:DI 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+ (match_operand:DI 1 "aarch64_reg_or_zero" "rZ,*w"))
+ (set (match_operand:DI 2 "memory_operand" "=m,m")
+ (match_operand:DI 3 "aarch64_reg_or_zero" "rZ,*w"))]
+ "rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode,
+ XEXP (operands[0], 0),
+ GET_MODE_SIZE (DImode)))"
+ "@
+ stp\\t%x1, %x3, %0
+ stp\\t%d1, %d3, %0"
+ [(set_attr "type" "store2,neon_store1_2reg")
+ (set_attr "fp" "*,yes")]
)
;; Operands 1 and 3 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
-(define_insn "load_pair<mode>"
- [(set (match_operand:GPF 0 "register_operand" "=w")
- (match_operand:GPF 1 "aarch64_mem_pair_operand" "Ump"))
- (set (match_operand:GPF 2 "register_operand" "=w")
- (match_operand:GPF 3 "memory_operand" "m"))]
+(define_insn "load_pairsf"
+ [(set (match_operand:SF 0 "register_operand" "=w,*r")
+ (match_operand:SF 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+ (set (match_operand:SF 2 "register_operand" "=w,*r")
+ (match_operand:SF 3 "memory_operand" "m,m"))]
"rtx_equal_p (XEXP (operands[3], 0),
plus_constant (Pmode,
XEXP (operands[1], 0),
- GET_MODE_SIZE (<MODE>mode)))"
- "ldp\\t%<w>0, %<w>2, %1"
- [(set_attr "type" "neon_load1_2reg<q>")]
+ GET_MODE_SIZE (SFmode)))"
+ "@
+ ldp\\t%s0, %s2, %1
+ ldp\\t%w0, %w2, %1"
+ [(set_attr "type" "neon_load1_2reg,load2")
+ (set_attr "fp" "yes,*")]
+)
+
+(define_insn "load_pairdf"
+ [(set (match_operand:DF 0 "register_operand" "=w,*r")
+ (match_operand:DF 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+ (set (match_operand:DF 2 "register_operand" "=w,*r")
+ (match_operand:DF 3 "memory_operand" "m,m"))]
+ "rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+ GET_MODE_SIZE (DFmode)))"
+ "@
+ ldp\\t%d0, %d2, %1
+ ldp\\t%x0, %x2, %1"
+ [(set_attr "type" "neon_load1_2reg,load2")
+ (set_attr "fp" "yes,*")]
)
;; Operands 0 and 2 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
-(define_insn "store_pair<mode>"
- [(set (match_operand:GPF 0 "aarch64_mem_pair_operand" "=Ump")
- (match_operand:GPF 1 "register_operand" "w"))
- (set (match_operand:GPF 2 "memory_operand" "=m")
- (match_operand:GPF 3 "register_operand" "w"))]
+(define_insn "store_pairsf"
+ [(set (match_operand:SF 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+ (match_operand:SF 1 "register_operand" "w,*r"))
+ (set (match_operand:SF 2 "memory_operand" "=m,m")
+ (match_operand:SF 3 "register_operand" "w,*r"))]
"rtx_equal_p (XEXP (operands[2], 0),
plus_constant (Pmode,
XEXP (operands[0], 0),
- GET_MODE_SIZE (<MODE>mode)))"
- "stp\\t%<w>1, %<w>3, %0"
- [(set_attr "type" "neon_store1_2reg<q>")]
+ GET_MODE_SIZE (SFmode)))"
+ "@
+ stp\\t%s1, %s3, %0
+ stp\\t%w1, %w3, %0"
+ [(set_attr "type" "neon_store1_2reg,store2")
+ (set_attr "fp" "yes,*")]
+)
+
+(define_insn "store_pairdf"
+ [(set (match_operand:DF 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+ (match_operand:DF 1 "register_operand" "w,*r"))
+ (set (match_operand:DF 2 "memory_operand" "=m,m")
+ (match_operand:DF 3 "register_operand" "w,*r"))]
+ "rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode,
+ XEXP (operands[0], 0),
+ GET_MODE_SIZE (DFmode)))"
+ "@
+ stp\\t%d1, %d3, %0
+ stp\\t%x1, %x3, %0"
+ [(set_attr "type" "neon_store1_2reg,store2")
+ (set_attr "fp" "yes,*")]
)
;; Load pair with post-index writeback. This is primarily used in function
[(set_attr "type" "extend,load1")]
)
+(define_insn "*load_pair_extendsidi2_aarch64"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (sign_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump")))
+ (set (match_operand:DI 2 "register_operand" "=r")
+ (sign_extend:DI (match_operand:SI 3 "memory_operand" "m")))]
+ "rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+ GET_MODE_SIZE (SImode)))"
+ "ldpsw\\t%0, %2, %1"
+ [(set_attr "type" "load2")]
+)
+
(define_insn "*zero_extendsidi2_aarch64"
[(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
[(set_attr "type" "extend,load1")]
)
+(define_insn "*load_pair_zero_extendsidi2_aarch64"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump")))
+ (set (match_operand:DI 2 "register_operand" "=r")
+ (zero_extend:DI (match_operand:SI 3 "memory_operand" "m")))]
+ "rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+ GET_MODE_SIZE (SImode)))"
+ "ldp\\t%w0, %w2, %1"
+ [(set_attr "type" "load2")]
+)
+
(define_expand "<ANY_EXTEND:optab><SHORT:mode><GPI:mode>2"
[(set (match_operand:GPI 0 "register_operand")
(ANY_EXTEND:GPI (match_operand:SHORT 1 "nonimmediate_operand")))]
;; Atomic Operations
(include "atomics.md")
+
+;; ldp/stp peephole patterns
+(include "aarch64-ldpstp.md")
+2014-12-05 Bin Cheng <bin.cheng@arm.com>
+
+ * gcc.target/aarch64/ldp_stp_1.c: New test.
+ * gcc.target/aarch64/ldp_stp_2.c: New test.
+ * gcc.target/aarch64/ldp_stp_3.c: New test.
+ * gcc.target/aarch64/ldp_stp_4.c: New test.
+ * gcc.target/aarch64/ldp_stp_5.c: New test.
+ * gcc.target/aarch64/lr_free_1.c: Disable scheduling fusion
+ and peephole2 pass.
+
2014-12-05 Sandra Loosemore <sandra@codesourcery.com>

	* gcc.dg/vect/pr63341-1.c: Remove explicit "dg-do run".
--- /dev/null
+/* { dg-options "-O2" } */
+
+int arr[4][4];
+
+void
+foo ()
+{
+ arr[0][1] = 1;
+ arr[1][0] = -1;
+ arr[2][0] = 1;
+ arr[1][1] = -1;
+ arr[0][2] = 1;
+ arr[0][3] = -1;
+ arr[1][2] = 1;
+ arr[2][1] = -1;
+ arr[3][0] = 1;
+ arr[3][1] = -1;
+ arr[2][2] = 1;
+ arr[1][3] = -1;
+ arr[2][3] = 1;
+ arr[3][2] = -1;
+}
+
+/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]" 7 } } */
--- /dev/null
+/* { dg-options "-O2" } */
+
+extern void abort (void);
+
+int arr[4][4] = {{0, 1, 1, -1}, {-1, -1, 1, -1}, {1, -1, 1, 1}, {1, -1, -1, 0}};
+long long
+foo ()
+{
+ long long ll = 0;
+ ll += arr[0][1];
+ ll += arr[1][0];
+ ll += arr[1][1];
+ ll += arr[2][0];
+ return ll;
+}
+
+/* { dg-final { scan-assembler-times "ldpsw\tx\[0-9\]+, x\[0-9\]" 1 } } */
--- /dev/null
+/* { dg-options "-O2" } */
+
+extern void abort (void);
+
+unsigned int arr[4][4] = {{0, 1, 1, 2}, {2, 2, 1, 2}, {1, 2, 1, 1}, {1, 2, 2, 0}};
+unsigned long long
+foo ()
+{
+ unsigned long long ll = 0;
+ ll += arr[0][1];
+ ll += arr[1][0];
+ ll += arr[1][1];
+ ll += arr[2][0];
+ return ll;
+}
+
+/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
--- /dev/null
+/* { dg-options "-O2" } */
+
+float arr[4][4];
+
+void
+foo ()
+{
+ arr[0][1] = 1;
+ arr[1][0] = -1;
+ arr[2][0] = 1;
+ arr[1][1] = -1;
+ arr[0][2] = 1;
+ arr[0][3] = -1;
+ arr[1][2] = 1;
+ arr[2][1] = -1;
+ arr[3][0] = 1;
+ arr[3][1] = -1;
+ arr[2][2] = 1;
+ arr[1][3] = -1;
+ arr[2][3] = 1;
+ arr[3][2] = -1;
+}
+
+/* { dg-final { scan-assembler-times "stp\ts\[0-9\]+, s\[0-9\]" 7 } } */
--- /dev/null
+/* { dg-options "-O2" } */
+
+double arr[4][4];
+
+void
+foo ()
+{
+ arr[0][1] = 1;
+ arr[1][0] = -1;
+ arr[2][0] = 1;
+ arr[1][1] = -1;
+ arr[0][2] = 1;
+ arr[0][3] = -1;
+ arr[1][2] = 1;
+ arr[2][1] = -1;
+ arr[3][0] = 1;
+ arr[3][1] = -1;
+ arr[2][2] = 1;
+ arr[1][3] = -1;
+ arr[2][3] = 1;
+ arr[3][2] = -1;
+}
+
+/* { dg-final { scan-assembler-times "stp\td\[0-9\]+, d\[0-9\]" 7 } } */
/* { dg-do run } */
-/* { dg-options "-fno-inline -O2 -fomit-frame-pointer -ffixed-x2 -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 -ffixed-x7 -ffixed-x8 -ffixed-x9 -ffixed-x10 -ffixed-x11 -ffixed-x12 -ffixed-x13 -ffixed-x14 -ffixed-x15 -ffixed-x16 -ffixed-x17 -ffixed-x18 -ffixed-x19 -ffixed-x20 -ffixed-x21 -ffixed-x22 -ffixed-x23 -ffixed-x24 -ffixed-x25 -ffixed-x26 -ffixed-x27 -ffixed-28 -ffixed-29 --save-temps -mgeneral-regs-only -fno-ipa-cp" } */
+/* { dg-options "-fno-inline -O2 -fomit-frame-pointer -ffixed-x2 -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 -ffixed-x7 -ffixed-x8 -ffixed-x9 -ffixed-x10 -ffixed-x11 -ffixed-x12 -ffixed-x13 -ffixed-x14 -ffixed-x15 -ffixed-x16 -ffixed-x17 -ffixed-x18 -ffixed-x19 -ffixed-x20 -ffixed-x21 -ffixed-x22 -ffixed-x23 -ffixed-x24 -ffixed-x25 -ffixed-x26 -ffixed-x27 -ffixed-28 -ffixed-29 --save-temps -mgeneral-regs-only -fno-ipa-cp -fno-schedule-fusion -fno-peephole2" } */
extern void abort ();