From: Bin Cheng Date: Fri, 5 Dec 2014 17:06:33 +0000 (+0000) Subject: [AArch64] Load/store pair optimization using the sched_fusion pass. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=350013bc494c048acf0fb3041dcff32d54b5f462;p=gcc.git [AArch64] Load/store pair optimization using the sched_fusion pass. From-SVN: r218430 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3fc9bf138ba..88a29ebcd7c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,33 @@ +2014-12-05 Bin Cheng + + * config/aarch64/aarch64.md (load_pair<mode>): Split to + load_pairsi, load_pairdi, load_pairsf and load_pairdf. + (load_pairsi, load_pairdi, load_pairsf, load_pairdf): Split + from load_pair<mode>. New alternative to support int/fp + registers in fp/int mode patterns. + (store_pair<mode>): Split to store_pairsi, store_pairdi, + store_pairsf and store_pairdf. + (store_pairsi, store_pairdi, store_pairsf, store_pairdf): Split + from store_pair<mode>. New alternative to support int/fp + registers in fp/int mode patterns. + (*load_pair_extendsidi2_aarch64): New pattern. + (*load_pair_zero_extendsidi2_aarch64): New pattern. + (aarch64-ldpstp.md): Include. + * config/aarch64/aarch64-ldpstp.md: New file. + * config/aarch64/aarch64-protos.h (aarch64_gen_adjusted_ldpstp): + New. + (extract_base_offset_in_addr): New. + (aarch64_operands_ok_for_ldpstp): New. + (aarch64_operands_adjust_ok_for_ldpstp): New. + * config/aarch64/aarch64.c (enum sched_fusion_type): New enum. + (TARGET_SCHED_FUSION_PRIORITY): New hook. + (fusion_load_store): New function. + (extract_base_offset_in_addr): New function. + (aarch64_gen_adjusted_ldpstp): New function. + (aarch64_sched_fusion_priority): New function. + (aarch64_operands_ok_for_ldpstp): New function. + (aarch64_operands_adjust_ok_for_ldpstp): New function. + 2014-12-05 Olivier Hainque + * defaults.h: (DWARF_REG_TO_UNWIND_COLUMN): Define default. diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md new file mode 100644 index 00000000000..dacfe1c4d01 --- /dev/null +++ b/gcc/config/aarch64/aarch64-ldpstp.md @@ -0,0 +1,410 @@ +;; AArch64 ldp/stp peephole optimizations. +;; Copyright (C) 2014 Free Software Foundation, Inc. +;; Contributed by ARM Ltd. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>.
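+ +;; The peephole2 patterns below fuse two (or, with an offset adjustment, +;; four) consecutive loads or stores into ldp/stp instructions. As an +;; illustrative sketch (register numbers are arbitrary), two adjacent +;; word loads +;; +;; ldr w0, [x2] +;; ldr w1, [x2, 4] +;; +;; become a single +;; +;; ldp w0, w1, [x2]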
+ +(define_peephole2 + [(set (match_operand:GPI 0 "register_operand" "") + (match_operand:GPI 1 "aarch64_mem_pair_operand" "")) + (set (match_operand:GPI 2 "register_operand" "") + (match_operand:GPI 3 "memory_operand" ""))] + "aarch64_operands_ok_for_ldpstp (operands, true, <MODE>mode)" + [(parallel [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))])] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[2]; + operands[2] = tmp; + tmp = operands[1]; + operands[1] = operands[3]; + operands[3] = tmp; + } +}) + +(define_peephole2 + [(set (match_operand:GPI 0 "aarch64_mem_pair_operand" "") + (match_operand:GPI 1 "aarch64_reg_or_zero" "")) + (set (match_operand:GPI 2 "memory_operand" "") + (match_operand:GPI 3 "aarch64_reg_or_zero" ""))] + "aarch64_operands_ok_for_ldpstp (operands, false, <MODE>mode)" + [(parallel [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))])] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[0], &base, &offset_1); + extract_base_offset_in_addr (operands[2], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[2]; + operands[2] = tmp; + tmp = operands[1]; + operands[1] = operands[3]; + operands[3] = tmp; + } +}) + +(define_peephole2 + [(set (match_operand:GPF 0 "register_operand" "") + (match_operand:GPF 1 "aarch64_mem_pair_operand" "")) + (set (match_operand:GPF 2 "register_operand" "") + (match_operand:GPF 3 "memory_operand" ""))] + "aarch64_operands_ok_for_ldpstp (operands, true, <MODE>mode)" + [(parallel [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))])] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[2]; + operands[2] = tmp; + tmp = operands[1]; + operands[1] = operands[3]; + operands[3] = tmp; + } +}) + +(define_peephole2 + [(set (match_operand:GPF 0 "aarch64_mem_pair_operand" "") + (match_operand:GPF 1 "register_operand" "")) + (set (match_operand:GPF 2 "memory_operand" "") + (match_operand:GPF 3 "register_operand" ""))] + "aarch64_operands_ok_for_ldpstp (operands, false, <MODE>mode)" + [(parallel [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))])] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[0], &base, &offset_1); + extract_base_offset_in_addr (operands[2], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[2]; + operands[2] = tmp; + tmp = operands[1]; + operands[1] = operands[3]; + operands[3] = tmp; + } +}) + +;; Handle sign/zero-extended consecutive loads.
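+;; As an illustrative sketch, two adjacent sign-extending loads +;; +;; ldrsw x0, [x2] +;; ldrsw x1, [x2, 4] +;; +;; become a single +;; +;; ldpsw x0, x1, [x2]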
+ +(define_peephole2 + [(set (match_operand:DI 0 "register_operand" "") + (sign_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" ""))) + (set (match_operand:DI 2 "register_operand" "") + (sign_extend:DI (match_operand:SI 3 "memory_operand" "")))] + "aarch64_operands_ok_for_ldpstp (operands, true, SImode)" + [(parallel [(set (match_dup 0) (sign_extend:DI (match_dup 1))) + (set (match_dup 2) (sign_extend:DI (match_dup 3)))])] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[2]; + operands[2] = tmp; + tmp = operands[1]; + operands[1] = operands[3]; + operands[3] = tmp; + } +}) + +(define_peephole2 + [(set (match_operand:DI 0 "register_operand" "") + (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" ""))) + (set (match_operand:DI 2 "register_operand" "") + (zero_extend:DI (match_operand:SI 3 "memory_operand" "")))] + "aarch64_operands_ok_for_ldpstp (operands, true, SImode)" + [(parallel [(set (match_dup 0) (zero_extend:DI (match_dup 1))) + (set (match_dup 2) (zero_extend:DI (match_dup 3)))])] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[2]; + operands[2] = tmp; + tmp = operands[1]; + operands[1] = operands[3]; + operands[3] = tmp; + } +}) + +;; Handle consecutive load/store instructions whose offsets are out of +;; the range supported by ldp/ldpsw/stp. We first compute an adjusted +;; base address in a scratch register, then merge the instructions into +;; ldp/ldpsw/stp using the adjusted offset.
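+;; +;; As an illustrative sketch, four word loads whose offsets exceed the +;; ldp range +;; +;; ldr w0, [x16, 1024] +;; ldr w1, [x16, 1028] +;; ldr w2, [x16, 1032] +;; ldr w3, [x16, 1036] +;; +;; become +;; +;; add x17, x16, 1024 +;; ldp w0, w1, [x17] +;; ldp w2, w3, [x17, 8]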
+ +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:GPI 0 "register_operand" "") + (match_operand:GPI 1 "memory_operand" "")) + (set (match_operand:GPI 2 "register_operand" "") + (match_operand:GPI 3 "memory_operand" "")) + (set (match_operand:GPI 4 "register_operand" "") + (match_operand:GPI 5 "memory_operand" "")) + (set (match_operand:GPI 6 "register_operand" "") + (match_operand:GPI 7 "memory_operand" "")) + (match_dup 8)] + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)" + [(const_int 0)] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[6]; + operands[6] = tmp; + tmp = operands[1]; + operands[1] = operands[7]; + operands[7] = tmp; + tmp = operands[2]; + operands[2] = operands[4]; + operands[4] = tmp; + tmp = operands[3]; + operands[3] = operands[5]; + operands[5] = tmp; + } + + if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:GPF 0 "register_operand" "") + (match_operand:GPF 1 "memory_operand" "")) + (set (match_operand:GPF 2 "register_operand" "") + (match_operand:GPF 3 "memory_operand" "")) + (set (match_operand:GPF 4 "register_operand" "") + (match_operand:GPF 5 "memory_operand" "")) + (set (match_operand:GPF 6 "register_operand" "") + (match_operand:GPF 7 "memory_operand" "")) + (match_dup 8)] + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)" + [(const_int 0)] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[6]; + operands[6] = tmp; + tmp = operands[1]; + operands[1] = operands[7]; + operands[7] = tmp; + tmp = operands[2]; + operands[2] = operands[4]; + operands[4] = tmp; + tmp = operands[3]; + operands[3] = operands[5]; + operands[5] = tmp; + } + + if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:DI 0 "register_operand" "") + (sign_extend:DI (match_operand:SI 1 "memory_operand" ""))) + (set (match_operand:DI 2 "register_operand" "") + (sign_extend:DI (match_operand:SI 3 "memory_operand" ""))) + (set (match_operand:DI 4 "register_operand" "") + (sign_extend:DI (match_operand:SI 5 "memory_operand" ""))) + (set (match_operand:DI 6 "register_operand" "") + (sign_extend:DI (match_operand:SI 7 "memory_operand" ""))) + (match_dup 8)] + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode)" + [(const_int 0)] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[6]; + operands[6] = tmp; + tmp = operands[1]; + operands[1] = operands[7]; + operands[7] = tmp; + tmp = operands[2]; + operands[2] = operands[4]; + operands[4] = tmp; + tmp = operands[3]; + operands[3] = operands[5]; + operands[5] = tmp; + } + + if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, SIGN_EXTEND)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:DI 0 
"register_operand" "") + (zero_extend:DI (match_operand:SI 1 "memory_operand" ""))) + (set (match_operand:DI 2 "register_operand" "") + (zero_extend:DI (match_operand:SI 3 "memory_operand" ""))) + (set (match_operand:DI 4 "register_operand" "") + (zero_extend:DI (match_operand:SI 5 "memory_operand" ""))) + (set (match_operand:DI 6 "register_operand" "") + (zero_extend:DI (match_operand:SI 7 "memory_operand" ""))) + (match_dup 8)] + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode)" + [(const_int 0)] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[1], &base, &offset_1); + extract_base_offset_in_addr (operands[3], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[6]; + operands[6] = tmp; + tmp = operands[1]; + operands[1] = operands[7]; + operands[7] = tmp; + tmp = operands[2]; + operands[2] = operands[4]; + operands[4] = tmp; + tmp = operands[3]; + operands[3] = operands[5]; + operands[5] = tmp; + } + + if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, ZERO_EXTEND)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:GPI 0 "memory_operand" "") + (match_operand:GPI 1 "aarch64_reg_or_zero" "")) + (set (match_operand:GPI 2 "memory_operand" "") + (match_operand:GPI 3 "aarch64_reg_or_zero" "")) + (set (match_operand:GPI 4 "memory_operand" "") + (match_operand:GPI 5 "aarch64_reg_or_zero" "")) + (set (match_operand:GPI 6 "memory_operand" "") + (match_operand:GPI 7 "aarch64_reg_or_zero" "")) + (match_dup 8)] + "aarch64_operands_adjust_ok_for_ldpstp (operands, false, mode)" + [(const_int 0)] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[0], &base, &offset_1); + extract_base_offset_in_addr (operands[2], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[6]; + operands[6] = tmp; + tmp = operands[1]; + operands[1] = operands[7]; + operands[7] = tmp; + tmp = operands[2]; + operands[2] = operands[4]; + operands[4] = tmp; + tmp = operands[3]; + operands[3] = operands[5]; + operands[5] = tmp; + } + + if (aarch64_gen_adjusted_ldpstp (operands, false, mode, UNKNOWN)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:GPF 0 "memory_operand" "") + (match_operand:GPF 1 "aarch64_reg_or_zero" "")) + (set (match_operand:GPF 2 "memory_operand" "") + (match_operand:GPF 3 "aarch64_reg_or_zero" "")) + (set (match_operand:GPF 4 "memory_operand" "") + (match_operand:GPF 5 "aarch64_reg_or_zero" "")) + (set (match_operand:GPF 6 "memory_operand" "") + (match_operand:GPF 7 "aarch64_reg_or_zero" "")) + (match_dup 8)] + "aarch64_operands_adjust_ok_for_ldpstp (operands, false, mode)" + [(const_int 0)] +{ + rtx base, offset_1, offset_2, tmp; + + extract_base_offset_in_addr (operands[0], &base, &offset_1); + extract_base_offset_in_addr (operands[2], &base, &offset_2); + if (INTVAL (offset_1) > INTVAL (offset_2)) + { + tmp = operands[0]; + operands[0] = operands[6]; + operands[6] = tmp; + tmp = operands[1]; + operands[1] = operands[7]; + operands[7] = tmp; + tmp = operands[2]; + operands[2] = operands[4]; + operands[4] = tmp; + tmp = operands[3]; + operands[3] = operands[5]; + operands[5] = tmp; + } + + if (aarch64_gen_adjusted_ldpstp (operands, false, mode, UNKNOWN)) + DONE; + else + FAIL; +}) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index f5db563189d..ec4157a38fe 100644 --- 
a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -293,6 +293,7 @@ void aarch64_expand_compare_and_swap (rtx op[]); void aarch64_split_compare_and_swap (rtx op[]); void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx); +bool aarch64_gen_adjusted_ldpstp (rtx *, bool, enum machine_mode, RTX_CODE); #endif /* RTX_CODE */ void aarch64_init_builtins (void); @@ -316,4 +317,8 @@ extern bool aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel); void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *); int aarch64_ccmp_mode_to_code (enum machine_mode mode); + +bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); +bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode); +bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode); #endif /* GCC_AARCH64_PROTOS_H */ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index d3ef770a6cb..79a8679e748 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -10382,6 +10382,484 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return false; } +/* If MEM is in the form of [base+offset], extract the two parts + of the address and store them in BASE and OFFSET; otherwise clear + BASE and OFFSET and return false. */ + +bool +extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset) +{ + rtx addr; + + gcc_assert (MEM_P (mem)); + + addr = XEXP (mem, 0); + + if (REG_P (addr)) + { + *base = addr; + *offset = const0_rtx; + return true; + } + + if (GET_CODE (addr) == PLUS + && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1))) + { + *base = XEXP (addr, 0); + *offset = XEXP (addr, 1); + return true; + } + + *base = NULL_RTX; + *offset = NULL_RTX; + + return false; +} + +/* Types for scheduling fusion. */ +enum sched_fusion_type +{ + SCHED_FUSION_NONE = 0, + SCHED_FUSION_LD_SIGN_EXTEND, + SCHED_FUSION_LD_ZERO_EXTEND, + SCHED_FUSION_LD, + SCHED_FUSION_ST, + SCHED_FUSION_NUM +}; + +/* If INSN is a load or store with an address in the form of [base+offset], + extract the two parts and store them in BASE and OFFSET. Return the + scheduling fusion type of INSN. */ + +static enum sched_fusion_type +fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset) +{ + rtx x, dest, src; + enum sched_fusion_type fusion = SCHED_FUSION_LD; + + gcc_assert (INSN_P (insn)); + x = PATTERN (insn); + if (GET_CODE (x) != SET) + return SCHED_FUSION_NONE; + + src = SET_SRC (x); + dest = SET_DEST (x); + + if (GET_MODE (src) != SImode && GET_MODE (src) != DImode + && GET_MODE (src) != SFmode && GET_MODE (src) != DFmode) + return SCHED_FUSION_NONE; + + if (GET_CODE (src) == SIGN_EXTEND) + { + fusion = SCHED_FUSION_LD_SIGN_EXTEND; + src = XEXP (src, 0); + if (GET_CODE (src) != MEM || GET_MODE (src) != SImode) + return SCHED_FUSION_NONE; + } + else if (GET_CODE (src) == ZERO_EXTEND) + { + fusion = SCHED_FUSION_LD_ZERO_EXTEND; + src = XEXP (src, 0); + if (GET_CODE (src) != MEM || GET_MODE (src) != SImode) + return SCHED_FUSION_NONE; + } + + if (GET_CODE (src) == MEM && REG_P (dest)) + extract_base_offset_in_addr (src, base, offset); + else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx)) + { + fusion = SCHED_FUSION_ST; + extract_base_offset_in_addr (dest, base, offset); + } + else + return SCHED_FUSION_NONE; + + if (*base == NULL_RTX || *offset == NULL_RTX) + fusion = SCHED_FUSION_NONE; + + return fusion; +} + +/* Implement the TARGET_SCHED_FUSION_PRIORITY hook. 
+ + Currently we only support fusing ldr and str instructions, so FUSION_PRI + and PRI are only calculated for these instructions. For all other + instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the + future, other kinds of instruction fusion can be added by returning + different priorities. + + It's important that irrelevant instructions get the largest FUSION_PRI. */ + +static void +aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri, + int *fusion_pri, int *pri) +{ + int tmp, off_val; + rtx base, offset; + enum sched_fusion_type fusion; + + gcc_assert (INSN_P (insn)); + + tmp = max_pri - 1; + fusion = fusion_load_store (insn, &base, &offset); + if (fusion == SCHED_FUSION_NONE) + { + *pri = tmp; + *fusion_pri = tmp; + return; + } + + /* Set FUSION_PRI according to fusion type and base register. */ + *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base); + + /* Calculate PRI. */ + tmp /= 2; + + /* INSN with smaller offset goes first. */ + off_val = (int)(INTVAL (offset)); + if (off_val >= 0) + tmp -= (off_val & 0xfffff); + else + tmp += ((- off_val) & 0xfffff); + + *pri = tmp; + return; +} + +/* Given OPERANDS of consecutive load/store, check if we can merge + them into ldp/stp. LOAD is true if they are load instructions. + MODE is the mode of memory operands. */ + +bool +aarch64_operands_ok_for_ldpstp (rtx *operands, bool load, + enum machine_mode mode) +{ + HOST_WIDE_INT offval_1, offval_2, msize; + enum reg_class rclass_1, rclass_2; + rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2; + + if (load) + { + mem_1 = operands[1]; + mem_2 = operands[3]; + reg_1 = operands[0]; + reg_2 = operands[2]; + gcc_assert (REG_P (reg_1) && REG_P (reg_2)); + if (REGNO (reg_1) == REGNO (reg_2)) + return false; + } + else + { + mem_1 = operands[0]; + mem_2 = operands[2]; + reg_1 = operands[1]; + reg_2 = operands[3]; + } + + /* Check if the addresses are in the form of [base+offset]. */ + extract_base_offset_in_addr (mem_1, &base_1, &offset_1); + if (base_1 == NULL_RTX || offset_1 == NULL_RTX) + return false; + extract_base_offset_in_addr (mem_2, &base_2, &offset_2); + if (base_2 == NULL_RTX || offset_2 == NULL_RTX) + return false; + + /* Check if the bases are the same. */ + if (!rtx_equal_p (base_1, base_2)) + return false; + + offval_1 = INTVAL (offset_1); + offval_2 = INTVAL (offset_2); + msize = GET_MODE_SIZE (mode); + /* Check if the offsets are consecutive. */ + if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize)) + return false; + + /* Check if the addresses are clobbered by load. */ + if (load) + { + if (reg_mentioned_p (reg_1, mem_1)) + return false; + + /* In increasing order, the last load can clobber the address. */ + if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2)) + return false; + } + + if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1))) + rclass_1 = FP_REGS; + else + rclass_1 = GENERAL_REGS; + + if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2))) + rclass_2 = FP_REGS; + else + rclass_2 = GENERAL_REGS; + + /* Check if the registers are of the same class. */ + if (rclass_1 != rclass_2) + return false; + + return true; +} + +/* Given OPERANDS of consecutive load/store, check if we can merge + them into ldp/stp by adjusting the offset. LOAD is true if they + are load instructions. MODE is the mode of memory operands. 
+ + Given the following consecutive stores: + + str w1, [xb, 0x100] + str w1, [xb, 0x104] + str w1, [xb, 0x108] + str w1, [xb, 0x10c] + + Though the offsets are out of the range supported by stp, we can + still pair them after adjusting the offset, as follows: + + add scratch, xb, 0x100 + stp w1, w1, [scratch] + stp w1, w1, [scratch, 0x8] + + The peephole patterns detecting this opportunity should guarantee + the scratch register is available. */ + +bool +aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, + enum machine_mode mode) +{ + enum reg_class rclass_1, rclass_2, rclass_3, rclass_4; + HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize; + rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4; + rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4; + + if (load) + { + reg_1 = operands[0]; + mem_1 = operands[1]; + reg_2 = operands[2]; + mem_2 = operands[3]; + reg_3 = operands[4]; + mem_3 = operands[5]; + reg_4 = operands[6]; + mem_4 = operands[7]; + gcc_assert (REG_P (reg_1) && REG_P (reg_2) + && REG_P (reg_3) && REG_P (reg_4)); + if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4)) + return false; + } + else + { + mem_1 = operands[0]; + reg_1 = operands[1]; + mem_2 = operands[2]; + reg_2 = operands[3]; + mem_3 = operands[4]; + reg_3 = operands[5]; + mem_4 = operands[6]; + reg_4 = operands[7]; + } + /* Skip if memory operand is by itself valid for ldp/stp. */ + if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode)) + return false; + + /* Check if the addresses are in the form of [base+offset]. */ + extract_base_offset_in_addr (mem_1, &base_1, &offset_1); + if (base_1 == NULL_RTX || offset_1 == NULL_RTX) + return false; + extract_base_offset_in_addr (mem_2, &base_2, &offset_2); + if (base_2 == NULL_RTX || offset_2 == NULL_RTX) + return false; + extract_base_offset_in_addr (mem_3, &base_3, &offset_3); + if (base_3 == NULL_RTX || offset_3 == NULL_RTX) + return false; + extract_base_offset_in_addr (mem_4, &base_4, &offset_4); + if (base_4 == NULL_RTX || offset_4 == NULL_RTX) + return false; + + /* Check if the bases are the same. */ + if (!rtx_equal_p (base_1, base_2) + || !rtx_equal_p (base_2, base_3) + || !rtx_equal_p (base_3, base_4)) + return false; + + offval_1 = INTVAL (offset_1); + offval_2 = INTVAL (offset_2); + offval_3 = INTVAL (offset_3); + offval_4 = INTVAL (offset_4); + msize = GET_MODE_SIZE (mode); + /* Check if the offsets are consecutive. */ + if ((offval_1 != (offval_2 + msize) + || offval_1 != (offval_3 + msize * 2) + || offval_1 != (offval_4 + msize * 3)) + && (offval_4 != (offval_3 + msize) + || offval_4 != (offval_2 + msize * 2) + || offval_4 != (offval_1 + msize * 3))) + return false; + + /* Check if the addresses are clobbered by load. */ + if (load) + { + if (reg_mentioned_p (reg_1, mem_1) + || reg_mentioned_p (reg_2, mem_2) + || reg_mentioned_p (reg_3, mem_3)) + return false; + + /* In increasing order, the last load can clobber the address. */ + if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4)) + return false; + } + + if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1))) + rclass_1 = FP_REGS; + else + rclass_1 = GENERAL_REGS; + + if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2))) + rclass_2 = FP_REGS; + else + rclass_2 = GENERAL_REGS; + + if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3))) + rclass_3 = FP_REGS; + else + rclass_3 = GENERAL_REGS; + + if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4))) + rclass_4 = FP_REGS; + else + rclass_4 = GENERAL_REGS; + + /* Check if the registers are of the same class. 
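+ ldp/stp cannot pair a general-purpose register with an FP/SIMD register.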
*/ + if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4) + return false; + + return true; +} + +/* Given OPERANDS of consecutive load/store, this function pairs them + into ldp/stp after adjusting the offset. It depends on the fact + that addresses of load/store instructions are in increasing order. + MODE is the mode of memory operands. CODE is the rtl operator + that should be applied to all memory operands; it is SIGN_EXTEND, + ZERO_EXTEND or UNKNOWN. */ + +bool +aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, + enum machine_mode mode, RTX_CODE code) +{ + rtx base, offset, t1, t2; + rtx mem_1, mem_2, mem_3, mem_4; + HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize; + + if (load) + { + mem_1 = operands[1]; + mem_2 = operands[3]; + mem_3 = operands[5]; + mem_4 = operands[7]; + } + else + { + mem_1 = operands[0]; + mem_2 = operands[2]; + mem_3 = operands[4]; + mem_4 = operands[6]; + gcc_assert (code == UNKNOWN); + } + + extract_base_offset_in_addr (mem_1, &base, &offset); + gcc_assert (base != NULL_RTX && offset != NULL_RTX); + + /* Adjust the offset so it can fit in an ldp/stp instruction. */ + msize = GET_MODE_SIZE (mode); + stp_off_limit = msize * 0x40; + off_val = INTVAL (offset); + abs_off = (off_val < 0) ? -off_val : off_val; + new_off = abs_off % stp_off_limit; + adj_off = abs_off - new_off; + + /* Further adjust to make sure all offsets are OK. */ + if ((new_off + msize * 2) >= stp_off_limit) + { + adj_off += stp_off_limit; + new_off -= stp_off_limit; + } + + /* Make sure the adjustment can be done with ADD/SUB instructions. */ + if (adj_off >= 0x1000) + return false; + + if (off_val < 0) + { + adj_off = -adj_off; + new_off = -new_off; + } + + /* Create new memory references. */ + mem_1 = change_address (mem_1, VOIDmode, + plus_constant (DImode, operands[8], new_off)); + + /* Check if the adjusted address is OK for ldp/stp. */ + if (!aarch64_mem_pair_operand (mem_1, mode)) + return false; + + msize = GET_MODE_SIZE (mode); + mem_2 = change_address (mem_2, VOIDmode, + plus_constant (DImode, + operands[8], + new_off + msize)); + mem_3 = change_address (mem_3, VOIDmode, + plus_constant (DImode, + operands[8], + new_off + msize * 2)); + mem_4 = change_address (mem_4, VOIDmode, + plus_constant (DImode, + operands[8], + new_off + msize * 3)); + + if (code == ZERO_EXTEND) + { + mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1); + mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2); + mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3); + mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4); + } + else if (code == SIGN_EXTEND) + { + mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1); + mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2); + mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3); + mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4); + } + + if (load) + { + operands[1] = mem_1; + operands[3] = mem_2; + operands[5] = mem_3; + operands[7] = mem_4; + } + else + { + operands[0] = mem_1; + operands[2] = mem_2; + operands[4] = mem_3; + operands[6] = mem_4; + } + + /* Emit the adjusting instruction. */ + emit_insn (gen_rtx_SET (VOIDmode, operands[8], + plus_constant (DImode, base, adj_off))); + /* Emit ldp/stp instructions. 
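+ Each emitted PARALLEL of two SETs is expected to match one of the + load_pair/store_pair patterns in aarch64.md.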
*/ + t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]); + t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2))); + t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]); + t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2))); + return true; +} + #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST aarch64_address_cost @@ -10647,6 +11125,9 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) #undef TARGET_SCHED_MACRO_FUSION_PAIR_P #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p +#undef TARGET_SCHED_FUSION_PRIORITY +#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-aarch64.h" diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 1b0d3025744..46be23999ef 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1081,62 +1081,139 @@ ;; Operands 1 and 3 are tied together by the final condition; so we allow ;; fairly lax checking on the second memory operation. -(define_insn "load_pair<mode>" - [(set (match_operand:GPI 0 "register_operand" "=r") - (match_operand:GPI 1 "aarch64_mem_pair_operand" "Ump")) - (set (match_operand:GPI 2 "register_operand" "=r") - (match_operand:GPI 3 "memory_operand" "m"))] +(define_insn "load_pairsi" + [(set (match_operand:SI 0 "register_operand" "=r,*w") + (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump,Ump")) + (set (match_operand:SI 2 "register_operand" "=r,*w") + (match_operand:SI 3 "memory_operand" "m,m"))] "rtx_equal_p (XEXP (operands[3], 0), plus_constant (Pmode, XEXP (operands[1], 0), - GET_MODE_SIZE (<MODE>mode)))" - "ldp\\t%<w>0, %<w>2, %1" - [(set_attr "type" "load2")] + GET_MODE_SIZE (SImode)))" + "@ + ldp\\t%w0, %w2, %1 + ldp\\t%s0, %s2, %1" + [(set_attr "type" "load2,neon_load1_2reg") + (set_attr "fp" "*,yes")] ) +(define_insn "load_pairdi" + [(set (match_operand:DI 0 "register_operand" "=r,*w") + (match_operand:DI 1 "aarch64_mem_pair_operand" "Ump,Ump")) + (set (match_operand:DI 2 "register_operand" "=r,*w") + (match_operand:DI 3 "memory_operand" "m,m"))] + "rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (DImode)))" + "@ + ldp\\t%x0, %x2, %1 + ldp\\t%d0, %d2, %1" + [(set_attr "type" "load2,neon_load1_2reg") + (set_attr "fp" "*,yes")] +) + + ;; Operands 0 and 2 are tied together by the final condition; so we allow ;; fairly lax checking on the second memory operation. 
-(define_insn "store_pair<mode>" - [(set (match_operand:GPI 0 "aarch64_mem_pair_operand" "=Ump") - (match_operand:GPI 1 "register_operand" "r")) - (set (match_operand:GPI 2 "memory_operand" "=m") - (match_operand:GPI 3 "register_operand" "r"))] +(define_insn "store_pairsi" + [(set (match_operand:SI 0 "aarch64_mem_pair_operand" "=Ump,Ump") + (match_operand:SI 1 "aarch64_reg_or_zero" "rZ,*w")) + (set (match_operand:SI 2 "memory_operand" "=m,m") + (match_operand:SI 3 "aarch64_reg_or_zero" "rZ,*w"))] "rtx_equal_p (XEXP (operands[2], 0), plus_constant (Pmode, XEXP (operands[0], 0), - GET_MODE_SIZE (<MODE>mode)))" - "stp\\t%<w>1, %<w>3, %0" - [(set_attr "type" "store2")] + GET_MODE_SIZE (SImode)))" + "@ + stp\\t%w1, %w3, %0 + stp\\t%s1, %s3, %0" + [(set_attr "type" "store2,neon_store1_2reg") + (set_attr "fp" "*,yes")] +) + +(define_insn "store_pairdi" + [(set (match_operand:DI 0 "aarch64_mem_pair_operand" "=Ump,Ump") + (match_operand:DI 1 "aarch64_reg_or_zero" "rZ,*w")) + (set (match_operand:DI 2 "memory_operand" "=m,m") + (match_operand:DI 3 "aarch64_reg_or_zero" "rZ,*w"))] + "rtx_equal_p (XEXP (operands[2], 0), + plus_constant (Pmode, + XEXP (operands[0], 0), + GET_MODE_SIZE (DImode)))" + "@ + stp\\t%x1, %x3, %0 + stp\\t%d1, %d3, %0" + [(set_attr "type" "store2,neon_store1_2reg") + (set_attr "fp" "*,yes")] ) ;; Operands 1 and 3 are tied together by the final condition; so we allow ;; fairly lax checking on the second memory operation. -(define_insn "load_pair<mode>" - [(set (match_operand:GPF 0 "register_operand" "=w") - (match_operand:GPF 1 "aarch64_mem_pair_operand" "Ump")) - (set (match_operand:GPF 2 "register_operand" "=w") - (match_operand:GPF 3 "memory_operand" "m"))] +(define_insn "load_pairsf" + [(set (match_operand:SF 0 "register_operand" "=w,*r") + (match_operand:SF 1 "aarch64_mem_pair_operand" "Ump,Ump")) + (set (match_operand:SF 2 "register_operand" "=w,*r") + (match_operand:SF 3 "memory_operand" "m,m"))] "rtx_equal_p (XEXP (operands[3], 0), plus_constant (Pmode, XEXP (operands[1], 0), - GET_MODE_SIZE (<MODE>mode)))" - "ldp\\t%<w>0, %<w>2, %1" - [(set_attr "type" "neon_load1_2reg")] + GET_MODE_SIZE (SFmode)))" + "@ + ldp\\t%s0, %s2, %1 + ldp\\t%w0, %w2, %1" + [(set_attr "type" "neon_load1_2reg,load2") + (set_attr "fp" "yes,*")] +) + +(define_insn "load_pairdf" + [(set (match_operand:DF 0 "register_operand" "=w,*r") + (match_operand:DF 1 "aarch64_mem_pair_operand" "Ump,Ump")) + (set (match_operand:DF 2 "register_operand" "=w,*r") + (match_operand:DF 3 "memory_operand" "m,m"))] + "rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (DFmode)))" + "@ + ldp\\t%d0, %d2, %1 + ldp\\t%x0, %x2, %1" + [(set_attr "type" "neon_load1_2reg,load2") + (set_attr "fp" "yes,*")] ) ;; Operands 0 and 2 are tied together by the final condition; so we allow ;; fairly lax checking on the second memory operation. 
-(define_insn "store_pair<mode>" - [(set (match_operand:GPF 0 "aarch64_mem_pair_operand" "=Ump") - (match_operand:GPF 1 "register_operand" "w")) - (set (match_operand:GPF 2 "memory_operand" "=m") - (match_operand:GPF 3 "register_operand" "w"))] +(define_insn "store_pairsf" + [(set (match_operand:SF 0 "aarch64_mem_pair_operand" "=Ump,Ump") + (match_operand:SF 1 "register_operand" "w,*r")) + (set (match_operand:SF 2 "memory_operand" "=m,m") + (match_operand:SF 3 "register_operand" "w,*r"))] "rtx_equal_p (XEXP (operands[2], 0), plus_constant (Pmode, XEXP (operands[0], 0), - GET_MODE_SIZE (<MODE>mode)))" - "stp\\t%<w>1, %<w>3, %0" - [(set_attr "type" "neon_store1_2reg")] + GET_MODE_SIZE (SFmode)))" + "@ + stp\\t%s1, %s3, %0 + stp\\t%w1, %w3, %0" + [(set_attr "type" "neon_store1_2reg,store2") + (set_attr "fp" "yes,*")] +) + +(define_insn "store_pairdf" + [(set (match_operand:DF 0 "aarch64_mem_pair_operand" "=Ump,Ump") + (match_operand:DF 1 "register_operand" "w,*r")) + (set (match_operand:DF 2 "memory_operand" "=m,m") + (match_operand:DF 3 "register_operand" "w,*r"))] + "rtx_equal_p (XEXP (operands[2], 0), + plus_constant (Pmode, + XEXP (operands[0], 0), + GET_MODE_SIZE (DFmode)))" + "@ + stp\\t%d1, %d3, %0 + stp\\t%x1, %x3, %0" + [(set_attr "type" "neon_store1_2reg,store2") + (set_attr "fp" "yes,*")] ) ;; Load pair with post-index writeback. This is primarily used in function @@ -1225,6 +1302,19 @@ [(set_attr "type" "extend,load1")] ) +(define_insn "*load_pair_extendsidi2_aarch64" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump"))) + (set (match_operand:DI 2 "register_operand" "=r") + (sign_extend:DI (match_operand:SI 3 "memory_operand" "m")))] + "rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (SImode)))" + "ldpsw\\t%0, %2, %1" + [(set_attr "type" "load2")] +) + (define_insn "*zero_extendsidi2_aarch64" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m")))] @@ -1235,6 +1325,19 @@ [(set_attr "type" "extend,load1")] ) +(define_insn "*load_pair_zero_extendsidi2_aarch64" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump"))) + (set (match_operand:DI 2 "register_operand" "=r") + (zero_extend:DI (match_operand:SI 3 "memory_operand" "m")))] + "rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (SImode)))" + "ldp\\t%w0, %w2, %1" + [(set_attr "type" "load2")] +) + (define_expand "<optab><SHORT:mode><GPI:mode>2" [(set (match_operand:GPI 0 "register_operand") (ANY_EXTEND:GPI (match_operand:SHORT 1 "nonimmediate_operand")))] @@ -4238,3 +4341,6 @@ ;; Atomic Operations (include "atomics.md") + +;; ldp/stp peephole patterns +(include "aarch64-ldpstp.md") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d944651db7c..f32f5c7c1bc 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2014-12-05 Bin Cheng + + * gcc.target/aarch64/ldp_stp_1.c: New test. + * gcc.target/aarch64/ldp_stp_2.c: New test. + * gcc.target/aarch64/ldp_stp_3.c: New test. + * gcc.target/aarch64/ldp_stp_4.c: New test. + * gcc.target/aarch64/ldp_stp_5.c: New test. + * gcc.target/aarch64/lr_free_1.c: Disable the scheduling fusion + and peephole2 passes. + 2014-12-05 Sandra Loosemore + * gcc.dg/vect/pr63341-1.c: Remove explicit "dg-do run". 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c new file mode 100644 index 00000000000..f02e55f1cc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c @@ -0,0 +1,24 @@ +/* { dg-options "-O2" } */ + +int arr[4][4]; + +void +foo () +{ + arr[0][1] = 1; + arr[1][0] = -1; + arr[2][0] = 1; + arr[1][1] = -1; + arr[0][2] = 1; + arr[0][3] = -1; + arr[1][2] = 1; + arr[2][1] = -1; + arr[3][0] = 1; + arr[3][1] = -1; + arr[2][2] = 1; + arr[1][3] = -1; + arr[2][3] = 1; + arr[3][2] = -1; +} + +/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]" 7 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_2.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_2.c new file mode 100644 index 00000000000..e3b56413390 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_2.c @@ -0,0 +1,17 @@ +/* { dg-options "-O2" } */ + +extern void abort (void); + +int arr[4][4] = {{0, 1, 1, -1}, {-1, -1, 1, -1}, {1, -1, 1, 1}, {1, -1, -1, 0}}; +long long +foo () +{ + long long ll = 0; + ll += arr[0][1]; + ll += arr[1][0]; + ll += arr[1][1]; + ll += arr[2][0]; + return ll; +} + +/* { dg-final { scan-assembler-times "ldpsw\tx\[0-9\]+, x\[0-9\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_3.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_3.c new file mode 100644 index 00000000000..c6c877bf87a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_3.c @@ -0,0 +1,17 @@ +/* { dg-options "-O2" } */ + +extern void abort (void); + +unsigned int arr[4][4] = {{0, 1, 1, 2}, {2, 2, 1, 2}, {1, 2, 1, 1}, {1, 2, 2, 0}}; +unsigned long long +foo () +{ + unsigned long long ll = 0; + ll += arr[0][1]; + ll += arr[1][0]; + ll += arr[1][1]; + ll += arr[2][0]; + return ll; +} + +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c new file mode 100644 index 00000000000..40056b1adeb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c @@ -0,0 +1,24 @@ +/* { dg-options "-O2" } */ + +float arr[4][4]; + +void +foo () +{ + arr[0][1] = 1; + arr[1][0] = -1; + arr[2][0] = 1; + arr[1][1] = -1; + arr[0][2] = 1; + arr[0][3] = -1; + arr[1][2] = 1; + arr[2][1] = -1; + arr[3][0] = 1; + arr[3][1] = -1; + arr[2][2] = 1; + arr[1][3] = -1; + arr[2][3] = 1; + arr[3][2] = -1; +} + +/* { dg-final { scan-assembler-times "stp\ts\[0-9\]+, s\[0-9\]" 7 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c new file mode 100644 index 00000000000..94266181df7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_5.c @@ -0,0 +1,24 @@ +/* { dg-options "-O2" } */ + +double arr[4][4]; + +void +foo () +{ + arr[0][1] = 1; + arr[1][0] = -1; + arr[2][0] = 1; + arr[1][1] = -1; + arr[0][2] = 1; + arr[0][3] = -1; + arr[1][2] = 1; + arr[2][1] = -1; + arr[3][0] = 1; + arr[3][1] = -1; + arr[2][2] = 1; + arr[1][3] = -1; + arr[2][3] = 1; + arr[3][2] = -1; +} + +/* { dg-final { scan-assembler-times "stp\td\[0-9\]+, d\[0-9\]" 7 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/lr_free_1.c b/gcc/testsuite/gcc.target/aarch64/lr_free_1.c index 4c530a2ad5e..84dcc4df011 100644 --- a/gcc/testsuite/gcc.target/aarch64/lr_free_1.c +++ b/gcc/testsuite/gcc.target/aarch64/lr_free_1.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-fno-inline -O2 -fomit-frame-pointer -ffixed-x2 -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 -ffixed-x7 -ffixed-x8 -ffixed-x9 -ffixed-x10 -ffixed-x11 -ffixed-x12 
-ffixed-x13 -ffixed-x14 -ffixed-x15 -ffixed-x16 -ffixed-x17 -ffixed-x18 -ffixed-x19 -ffixed-x20 -ffixed-x21 -ffixed-x22 -ffixed-x23 -ffixed-x24 -ffixed-x25 -ffixed-x26 -ffixed-x27 -ffixed-28 -ffixed-29 --save-temps -mgeneral-regs-only -fno-ipa-cp" } */ +/* { dg-options "-fno-inline -O2 -fomit-frame-pointer -ffixed-x2 -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 -ffixed-x7 -ffixed-x8 -ffixed-x9 -ffixed-x10 -ffixed-x11 -ffixed-x12 -ffixed-x13 -ffixed-x14 -ffixed-x15 -ffixed-x16 -ffixed-x17 -ffixed-x18 -ffixed-x19 -ffixed-x20 -ffixed-x21 -ffixed-x22 -ffixed-x23 -ffixed-x24 -ffixed-x25 -ffixed-x26 -ffixed-x27 -ffixed-28 -ffixed-29 --save-temps -mgeneral-regs-only -fno-ipa-cp -fno-schedule-fusion -fno-peephole2" } */ extern void abort ();