* timevar.def (TV_SCHED_FUSION): New time var.
* passes.def (pass_sched_fusion): New pass.
* config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New.
(extract_base_offset_in_addr, fusion_load_store): New.
(arm_sched_fusion_priority): New.
(arm_option_override): Disable scheduling fusion by default
on non-armv7 processors or when ldrd/strd isn't preferred.
* sched-int.h (struct _haifa_insn_data): New field.
(INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New.
* sched-rgn.c (rest_of_handle_sched_fusion): New.
(pass_data_sched_fusion, pass_sched_fusion): New.
(make_pass_sched_fusion): New.
* haifa-sched.c (sched_fusion): New.
(insn_cost): Handle sched_fusion.
(priority): Handle sched_fusion by calling target hook.
(enum rfs_decision): New enum value.
(rfs_str): New element for RFS_FUSION.
(rank_for_schedule): Support sched_fusion.
(schedule_insn, max_issue, prune_ready_list): Handle sched_fusion.
(schedule_block, fix_tick_ready): Handle sched_fusion.
* common.opt (flag_schedule_fusion): New.
* tree-pass.h (make_pass_sched_fusion): New.
* target.def (fusion_priority): New.
* doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New.
* doc/tm.texi: Regenerated.
* doc/invoke.texi (-fschedule-fusion): New.
testsuite:
* gcc.target/arm/ldrd-strd-pair-1.c: New test.
* gcc.target/arm/vfp-1.c: Improve scanning string.
From-SVN: r217533
+2014-11-14 Bin Cheng <bin.cheng@arm.com>
+
+ * timevar.def (TV_SCHED_FUSION): New time var.
+ * passes.def (pass_sched_fusion): New pass.
+ * config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New.
+ (extract_base_offset_in_addr, fusion_load_store): New.
+ (arm_sched_fusion_priority): New.
+ (arm_option_override): Disable scheduling fusion by default
+ on non-armv7 processors or when ldrd/strd isn't preferred.
+ * sched-int.h (struct _haifa_insn_data): New field.
+ (INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New.
+ * sched-rgn.c (rest_of_handle_sched_fusion): New.
+ (pass_data_sched_fusion, pass_sched_fusion): New.
+ (make_pass_sched_fusion): New.
+ * haifa-sched.c (sched_fusion): New.
+ (insn_cost): Handle sched_fusion.
+ (priority): Handle sched_fusion by calling target hook.
+ (enum rfs_decision): New enum value.
+ (rfs_str): New element for RFS_FUSION.
+ (rank_for_schedule): Support sched_fusion.
+ (schedule_insn, max_issue, prune_ready_list): Handle sched_fusion.
+ (schedule_block, fix_tick_ready): Handle sched_fusion.
+ * common.opt (flag_schedule_fusion): New.
+ * tree-pass.h (make_pass_sched_fusion): New.
+ * target.def (fusion_priority): New.
+ * doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New.
+ * doc/tm.texi: Regenerated.
+ * doc/invoke.texi (-fschedule-fusion): New.
+
2014-11-13 Rong Xu <xur@google.com>
PR debug/63581
Common Report Var(flag_rename_registers) Init(2) Optimization
Perform a register renaming optimization pass
+fschedule-fusion
+Common Report Var(flag_schedule_fusion) Init(2) Optimization
+Perform a target dependent instruction fusion optimization pass
+
freorder-blocks
Common Report Var(flag_reorder_blocks) Optimization
Reorder basic blocks to improve code placement
static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
bool op0_preserve_value);
static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
+
+static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
\f
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
+#undef TARGET_SCHED_FUSION_PRIORITY
+#define TARGET_SCHED_FUSION_PRIORITY arm_sched_fusion_priority
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
/* Obstack for minipool constant handling. */
if (TARGET_THUMB2)
inline_asm_unified = 1;
+ /* Disable scheduling fusion by default if the target is not an armv7
+ processor or does not prefer ldrd/strd. */
+ if (flag_schedule_fusion == 2
+ && (!arm_arch7 || !current_tune->prefer_ldrd_strd))
+ flag_schedule_fusion = 0;
+
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
&& CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)));
}
+/* Decompose the address of MEM into a base register and a constant
+   offset.  An outer CONST wrapper around the address is looked through.
+   A bare register address yields that register with a zero offset; a
+   (plus (reg) (const_int)) address yields its two operands.  On success
+   store the parts in *BASE and *OFFSET and return true.  For any other
+   address form set both *BASE and *OFFSET to NULL_RTX and return
+   false.  */
+
+static bool
+extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
+{
+  gcc_assert (MEM_P (mem));
+
+  rtx addr = XEXP (mem, 0);
+  bool recognized = true;
+
+  /* Look through a wrapper such as (const (addr)).  */
+  if (GET_CODE (addr) == CONST)
+    addr = XEXP (addr, 0);
+
+  if (REG_P (addr))
+    {
+      /* Plain [reg]: the offset is implicitly zero.  */
+      *base = addr;
+      *offset = const0_rtx;
+    }
+  else if (GET_CODE (addr) == PLUS
+           && REG_P (XEXP (addr, 0))
+           && CONST_INT_P (XEXP (addr, 1)))
+    {
+      /* [reg + const_int].  */
+      *base = XEXP (addr, 0);
+      *offset = XEXP (addr, 1);
+    }
+  else
+    {
+      /* Unsupported address form: clear both outputs.  */
+      *base = NULL_RTX;
+      *offset = NULL_RTX;
+      recognized = false;
+    }
+
+  return recognized;
+}
+
+/* Recognize INSN as a single SET that stores a register to memory or
+   loads a register from memory, where the memory address has the form
+   [base+offset].  On a match, store the address parts in *BASE and
+   *OFFSET, set *IS_LOAD to true for a load and false for a store, and
+   return true.  Return false otherwise; when INSN is not a register
+   load/store at all, the output parameters are left untouched.  */
+
+static bool
+fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load)
+{
+  gcc_assert (INSN_P (insn));
+
+  rtx pat = PATTERN (insn);
+  if (GET_CODE (pat) != SET)
+    return false;
+
+  rtx src = SET_SRC (pat);
+  rtx dest = SET_DEST (pat);
+  rtx mem;
+
+  if (REG_P (src) && MEM_P (dest))
+    {
+      /* Store: register to memory.  */
+      *is_load = false;
+      mem = dest;
+    }
+  else if (MEM_P (src) && REG_P (dest))
+    {
+      /* Load: memory to register.  */
+      *is_load = true;
+      mem = src;
+    }
+  else
+    return false;
+
+  /* The helper clears *BASE and *OFFSET when the address form is not
+     supported, which the check below turns into a false result.  */
+  extract_base_offset_in_addr (mem, base, offset);
+
+  return *base != NULL_RTX && *offset != NULL_RTX;
+}
+
+/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
+
+   Only ldr and str instructions are candidates for fusion at present,
+   so distinct FUSION_PRI and PRI values are computed for them alone.
+   Every other instruction gets MAX_PRI - 1 for both values, which is
+   larger than the FUSION_PRI given to any load or store: it is important
+   that irrelevant instructions get the largest FUSION_PRI.  Other kinds
+   of instruction fusion can be supported in the future by returning
+   different priorities.  */
+
+static void
+arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
+                           int *fusion_pri, int *pri)
+{
+  rtx base, offset;
+  bool is_load;
+
+  gcc_assert (INSN_P (insn));
+
+  const int top = max_pri - 1;
+
+  if (!fusion_load_store (insn, &base, &offset, &is_load))
+    {
+      /* Not a fusible load or store.  */
+      *fusion_pri = top;
+      *pri = top;
+      return;
+    }
+
+  /* Loads and stores get separate fusion priorities; load goes first.  */
+  *fusion_pri = is_load ? top - 1 : top - 2;
+
+  /* Halve the starting value before applying the register and offset
+     adjustments below.  */
+  int rank = top / 2;
+
+  /* INSN with smaller base register goes first.  */
+  rank -= ((REGNO (base) & 0xff) << 20);
+
+  /* INSN with smaller offset goes first.  */
+  int off_val = (int)(INTVAL (offset));
+  if (off_val < 0)
+    rank += ((- off_val) & 0xfffff);
+  else
+    rank -= (off_val & 0xfffff);
+
+  *pri = rank;
+}
#include "gt-arm.h"
-fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
-fprofile-generate=@var{path} @gol
-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
--freciprocal-math -free -frename-registers -freorder-blocks @gol
+-freciprocal-math -free -frename-registers -fschedule-fusion -freorder-blocks @gol
-freorder-blocks-and-partition -freorder-functions @gol
-frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
-frounding-math -fsched2-use-superblocks -fsched-pressure @gol
Enabled by default with @option{-funroll-loops} and @option{-fpeel-loops}.
+@item -fschedule-fusion
+@opindex fschedule-fusion
+Performs a target dependent pass over the instruction stream to schedule
+instructions of the same type together, because the target machine can execute
+them more efficiently if they are adjacent to each other in the instruction flow.
+
+Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}.
+
@item -ftracer
@opindex ftracer
Perform tail duplication to enlarge superblock size. This transformation
parallelism required in output calculations chain.
@end deftypefn
+@deftypefn {Target Hook} void TARGET_SCHED_FUSION_PRIORITY (rtx_insn *@var{insn}, int @var{max_pri}, int *@var{fusion_pri}, int *@var{pri})
+This hook is called by scheduling fusion pass. It calculates fusion
+priorities for each instruction passed in by parameter. The priorities
+are returned via pointer parameters.
+
+@var{insn} is the instruction whose priorities need to be calculated.
+@var{max_pri} is the maximum priority that can be returned in any case.
+@var{fusion_pri} is the pointer parameter through which @var{insn}'s
+fusion priority should be calculated and returned.
+@var{pri} is the pointer parameter through which @var{insn}'s priority
+should be calculated and returned.
+
+Same @var{fusion_pri} should be returned for instructions which should
+be scheduled together. Different @var{pri} should be returned for
+instructions with same @var{fusion_pri}. @var{fusion_pri} is the major
+sort key, @var{pri} is the minor sort key. All instructions will be
+scheduled according to the two priorities. All priorities calculated
+should be between 0 (exclusive) and @var{max_pri} (inclusive). To avoid
+false dependencies, @var{fusion_pri} of instructions which need to be
+scheduled together should be smaller than @var{fusion_pri} of irrelevant
+instructions.
+
+Given below example:
+
+ ldr r10, [r1, 4]
+ add r4, r4, r10
+ ldr r15, [r2, 8]
+ sub r5, r5, r15
+ ldr r11, [r1, 0]
+ add r4, r4, r11
+ ldr r16, [r2, 12]
+ sub r5, r5, r16
+
+On targets like ARM/AArch64, the two pairs of consecutive loads should be
+merged. However, the peephole2 pass cannot merge them unless the consecutive
+loads are actually next to each other in the instruction flow. That's where
+this scheduling fusion pass works. This hook calculates priority for each
+instruction based on its fusion type, like:
+
+ ldr r10, [r1, 4] ; fusion_pri=99, pri=96
+ add r4, r4, r10 ; fusion_pri=100, pri=100
+ ldr r15, [r2, 8] ; fusion_pri=98, pri=92
+ sub r5, r5, r15 ; fusion_pri=100, pri=100
+ ldr r11, [r1, 0] ; fusion_pri=99, pri=100
+ add r4, r4, r11 ; fusion_pri=100, pri=100
+ ldr r16, [r2, 12] ; fusion_pri=98, pri=88
+ sub r5, r5, r16 ; fusion_pri=100, pri=100
+
+Scheduling fusion pass then sorts all ready to issue instructions according
+to the priorities. As a result, instructions of same fusion type will be
+pushed together in instruction flow, like:
+
+ ldr r11, [r1, 0]
+ ldr r10, [r1, 4]
+ ldr r15, [r2, 8]
+ ldr r16, [r2, 12]
+ add r4, r4, r10
+ sub r5, r5, r15
+ add r4, r4, r11
+ sub r5, r5, r16
+
+Now peephole2 pass can simply merge the two pairs of loads.
+
+Since scheduling fusion pass relies on peephole2 to do real fusion
+work, it is only enabled by default when peephole2 is in effect.
+
+This was first introduced on ARM/AArch64 targets; please refer to
+the hook implementation for how different fusion types are supported.
+@end deftypefn
+
@node Sections
@section Dividing the Output into Sections (Texts, Data, @dots{})
@c the above section title is WAY too long. maybe cut the part between
@hook TARGET_SCHED_REASSOCIATION_WIDTH
+@hook TARGET_SCHED_FUSION_PRIORITY
+
@node Sections
@section Dividing the Output into Sections (Texts, Data, @dots{})
@c the above section title is WAY too long. maybe cut the part between
{
int cost;
+ if (sched_fusion)
+ return 0;
+
if (sel_sched_p ())
{
if (recog_memoized (insn) < 0)
return nodbgcount;
}
+/* True while the scheduling fusion pass is running (it is set around the
+   call to schedule_insns in rest_of_handle_sched_fusion); the scheduler
+   checks it to select fusion-specific behavior. */
+bool sched_fusion;
+
/* Compute the priority number for INSN. */
static int
priority (rtx_insn *insn)
{
int this_priority = -1;
- if (dep_list_size (insn, SD_LIST_FORW) == 0)
+ if (sched_fusion)
+ {
+ int this_fusion_priority;
+
+ targetm.sched.fusion_priority (insn, FUSION_MAX_PRIORITY,
+ &this_fusion_priority, &this_priority);
+ INSN_FUSION_PRIORITY (insn) = this_fusion_priority;
+ }
+ else if (dep_list_size (insn, SD_LIST_FORW) == 0)
/* ??? We should set INSN_PRIORITY to insn_cost when and insn has
some forward deps but all of them are ignored by
contributes_to_priority hook. At the moment we set priority of
RFS_SCHED_GROUP, RFS_PRESSURE_DELAY, RFS_PRESSURE_TICK,
RFS_FEEDS_BACKTRACK_INSN, RFS_PRIORITY, RFS_SPECULATION,
RFS_SCHED_RANK, RFS_LAST_INSN, RFS_PRESSURE_INDEX,
- RFS_DEP_COUNT, RFS_TIE, RFS_N };
+ RFS_DEP_COUNT, RFS_TIE, RFS_FUSION, RFS_N };
/* Corresponding strings for print outs. */
static const char *rfs_str[RFS_N] = {
"RFS_SCHED_GROUP", "RFS_PRESSURE_DELAY", "RFS_PRESSURE_TICK",
"RFS_FEEDS_BACKTRACK_INSN", "RFS_PRIORITY", "RFS_SPECULATION",
"RFS_SCHED_RANK", "RFS_LAST_INSN", "RFS_PRESSURE_INDEX",
- "RFS_DEP_COUNT", "RFS_TIE" };
+ "RFS_DEP_COUNT", "RFS_TIE", "RFS_FUSION" };
/* Statistical breakdown of rank_for_schedule decisions. */
typedef struct { unsigned stats[RFS_N]; } rank_for_schedule_stats_t;
/* Make sure that priority of TMP and TMP2 are initialized. */
gcc_assert (INSN_PRIORITY_KNOWN (tmp) && INSN_PRIORITY_KNOWN (tmp2));
+ if (sched_fusion)
+ {
+ /* The instruction that has the same fusion priority as the last
+ instruction is the instruction we picked next. If that is not
+ the case, we sort ready list firstly by fusion priority, then
+ by priority, and at last by INSN_LUID. */
+ int a = INSN_FUSION_PRIORITY (tmp);
+ int b = INSN_FUSION_PRIORITY (tmp2);
+ int last = -1;
+
+ if (last_nondebug_scheduled_insn
+ && !NOTE_P (last_nondebug_scheduled_insn)
+ && BLOCK_FOR_INSN (tmp)
+ == BLOCK_FOR_INSN (last_nondebug_scheduled_insn))
+ last = INSN_FUSION_PRIORITY (last_nondebug_scheduled_insn);
+
+ if (a != last && b != last)
+ {
+ if (a == b)
+ {
+ a = INSN_PRIORITY (tmp);
+ b = INSN_PRIORITY (tmp2);
+ }
+ if (a != b)
+ return rfs_result (RFS_FUSION, b - a, tmp, tmp2);
+ else
+ return rfs_result (RFS_FUSION,
+ INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2);
+ }
+ else if (a == b)
+ {
+ gcc_assert (last_nondebug_scheduled_insn
+ && !NOTE_P (last_nondebug_scheduled_insn));
+ last = INSN_PRIORITY (last_nondebug_scheduled_insn);
+
+ a = abs (INSN_PRIORITY (tmp) - last);
+ b = abs (INSN_PRIORITY (tmp2) - last);
+ if (a != b)
+ return rfs_result (RFS_FUSION, a - b, tmp, tmp2);
+ else
+ return rfs_result (RFS_FUSION,
+ INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2);
+ }
+ else if (a == last)
+ return rfs_result (RFS_FUSION, -1, tmp, tmp2);
+ else
+ return rfs_result (RFS_FUSION, 1, tmp, tmp2);
+ }
+
if (sched_pressure != SCHED_PRESSURE_NONE)
{
/* Prefer insn whose scheduling results in the smallest register
gcc_assert (INSN_TICK (insn) >= MIN_TICK);
if (INSN_TICK (insn) > clock_var)
/* INSN has been prematurely moved from the queue to the ready list.
- This is possible only if following flag is set. */
- gcc_assert (flag_sched_stalled_insns);
+ This is possible only if following flags are set. */
+ gcc_assert (flag_sched_stalled_insns || sched_fusion);
/* ??? Probably, if INSN is scheduled prematurely, we should leave
INSN_TICK untouched. This is a machine-dependent issue, actually. */
struct choice_entry *top;
rtx_insn *insn;
+ if (sched_fusion)
+ return 0;
+
n_ready = ready->n_ready;
gcc_assert (dfa_lookahead >= 1 && privileged_n >= 0
&& privileged_n <= n_ready);
bool sched_group_found = false;
int min_cost_group = 1;
+ if (sched_fusion)
+ return;
+
for (i = 0; i < ready.n_ready; i++)
{
rtx_insn *insn = ready_element (&ready, i);
rtx_insn *tail = PREV_INSN (next_tail);
if ((current_sched_info->flags & DONT_BREAK_DEPENDENCIES) == 0
- && sched_pressure != SCHED_PRESSURE_MODEL)
+ && sched_pressure != SCHED_PRESSURE_MODEL && !sched_fusion)
find_modifiable_mems (head, tail);
/* We used to have code to avoid getting parameters moved from hard
{
memcpy (temp_state, curr_state, dfa_state_size);
cost = state_transition (curr_state, insn);
- if (sched_pressure != SCHED_PRESSURE_WEIGHTED)
+ if (sched_pressure != SCHED_PRESSURE_WEIGHTED && !sched_fusion)
gcc_assert (cost < 0);
if (memcmp (temp_state, curr_state, dfa_state_size) != 0)
cycle_issued_insns++;
INSN_TICK (next) = tick;
delay = tick - clock_var;
- if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE)
+ if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE || sched_fusion)
delay = QUEUE_READY;
change_queue_index (next, delay);
NEXT_PASS (pass_stack_adjustments);
NEXT_PASS (pass_jump2);
NEXT_PASS (pass_duplicate_computed_gotos);
+ NEXT_PASS (pass_sched_fusion);
NEXT_PASS (pass_peephole2);
NEXT_PASS (pass_if_after_reload);
NEXT_PASS (pass_regrename);
/* A priority for each insn. */
int priority;
+ /* The fusion priority for each insn. */
+ int fusion_priority;
+
/* The minimum clock tick at which the insn becomes ready. This is
used to note timing constraints for the insns in the pending list. */
int tick;
/* Accessor macros for h_i_d. There are more in haifa-sched.c and
sched-rgn.c. */
#define INSN_PRIORITY(INSN) (HID (INSN)->priority)
+#define INSN_FUSION_PRIORITY(INSN) (HID (INSN)->fusion_priority)
#define INSN_REG_PRESSURE(INSN) (HID (INSN)->reg_pressure)
#define INSN_MAX_REG_PRESSURE(INSN) (HID (INSN)->max_reg_pressure)
#define INSN_REG_USE_LIST(INSN) (HID (INSN)->reg_use_list)
extern void sd_delete_dep (sd_iterator_def);
extern void sd_debug_lists (rtx, sd_list_types_def);
+/* Macros and declarations for scheduling fusion. */
+#define FUSION_MAX_PRIORITY (INT_MAX)
+extern bool sched_fusion;
+
#endif /* INSN_SCHEDULING */
#endif /* GCC_SCHED_INT_H */
return 0;
}
+/* Run the scheduling fusion pass: call schedule_insns with the global
+   flag sched_fusion set, and clear the flag again afterwards. Does
+   nothing when INSN_SCHEDULING is not defined. Always returns 0. */
+static unsigned int
+rest_of_handle_sched_fusion (void)
+{
+#ifdef INSN_SCHEDULING
+ sched_fusion = true;
+ schedule_insns ();
+ sched_fusion = false;
+#endif
+ return 0;
+}
+
namespace {
const pass_data pass_data_live_range_shrinkage =
{
return new pass_sched2 (ctxt);
}
+
+namespace {
+
+/* Pass descriptor for the scheduling fusion pass.  */
+const pass_data pass_data_sched_fusion =
+{
+  RTL_PASS, /* type */
+  "sched_fusion", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_SCHED_FUSION, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+/* RTL pass that runs the scheduler in fusion mode, see
+   rest_of_handle_sched_fusion.  */
+class pass_sched_fusion : public rtl_opt_pass
+{
+public:
+  pass_sched_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_sched_fusion, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *);
+  virtual unsigned int execute (function *)
+    {
+      return rest_of_handle_sched_fusion ();
+    }
+
+}; // class pass_sched_fusion
+
+/* Return true if the scheduling fusion pass should run: the compiler
+   supports instruction scheduling, optimization and peephole2 are
+   enabled, -fschedule-fusion is in effect and the target provides the
+   fusion_priority hook.  */
+bool
+pass_sched_fusion::gate (function *)
+{
+#ifdef INSN_SCHEDULING
+  /* Scheduling fusion relies on peephole2 to do real fusion work,
+     so only enable it if peephole2 is in effect.  */
+  return (optimize > 0 && flag_peephole2
+          && flag_schedule_fusion && targetm.sched.fusion_priority != NULL);
+#else
+  return false;
+#endif
+}
+
+} // anon namespace
+
+/* Create a new instance of the scheduling fusion pass for context CTXT. */
+rtl_opt_pass *
+make_pass_sched_fusion (gcc::context *ctxt)
+{
+ return new pass_sched_fusion (ctxt);
+}
int, (unsigned int opc, machine_mode mode),
hook_int_uint_mode_1)
+/* The following member value is a function that computes the fusion
+ priority and the priority of an instruction, returning both via
+ pointer parameters. */
+DEFHOOK
+(fusion_priority,
+"This hook is called by scheduling fusion pass. It calculates fusion\n\
+priorities for each instruction passed in by parameter. The priorities\n\
+are returned via pointer parameters.\n\
+\n\
+@var{insn} is the instruction whose priorities need to be calculated.\n\
+@var{max_pri} is the maximum priority that can be returned in any case.\n\
+@var{fusion_pri} is the pointer parameter through which @var{insn}'s\n\
+fusion priority should be calculated and returned.\n\
+@var{pri} is the pointer parameter through which @var{insn}'s priority\n\
+should be calculated and returned.\n\
+\n\
+Same @var{fusion_pri} should be returned for instructions which should\n\
+be scheduled together. Different @var{pri} should be returned for\n\
+instructions with same @var{fusion_pri}. @var{fusion_pri} is the major\n\
+sort key, @var{pri} is the minor sort key. All instructions will be\n\
+scheduled according to the two priorities. All priorities calculated\n\
+should be between 0 (exclusive) and @var{max_pri} (inclusive). To avoid\n\
+false dependencies, @var{fusion_pri} of instructions which need to be\n\
+scheduled together should be smaller than @var{fusion_pri} of irrelevant\n\
+instructions.\n\
+\n\
+Given below example:\n\
+\n\
+ ldr r10, [r1, 4]\n\
+ add r4, r4, r10\n\
+ ldr r15, [r2, 8]\n\
+ sub r5, r5, r15\n\
+ ldr r11, [r1, 0]\n\
+ add r4, r4, r11\n\
+ ldr r16, [r2, 12]\n\
+ sub r5, r5, r16\n\
+\n\
+On targets like ARM/AArch64, the two pairs of consecutive loads should be\n\
+merged. However, the peephole2 pass cannot merge them unless the consecutive\n\
+loads are actually next to each other in the instruction flow. That's where\n\
+this scheduling fusion pass works. This hook calculates priority for each\n\
+instruction based on its fusion type, like:\n\
+\n\
+ ldr r10, [r1, 4] ; fusion_pri=99, pri=96 \n\
+ add r4, r4, r10 ; fusion_pri=100, pri=100 \n\
+ ldr r15, [r2, 8] ; fusion_pri=98, pri=92 \n\
+ sub r5, r5, r15 ; fusion_pri=100, pri=100 \n\
+ ldr r11, [r1, 0] ; fusion_pri=99, pri=100 \n\
+ add r4, r4, r11 ; fusion_pri=100, pri=100 \n\
+ ldr r16, [r2, 12] ; fusion_pri=98, pri=88 \n\
+ sub r5, r5, r16 ; fusion_pri=100, pri=100 \n\
+\n\
+Scheduling fusion pass then sorts all ready to issue instructions according\n\
+to the priorities. As a result, instructions of same fusion type will be\n\
+pushed together in instruction flow, like:\n\
+\n\
+ ldr r11, [r1, 0]\n\
+ ldr r10, [r1, 4]\n\
+ ldr r15, [r2, 8]\n\
+ ldr r16, [r2, 12]\n\
+ add r4, r4, r10\n\
+ sub r5, r5, r15\n\
+ add r4, r4, r11\n\
+ sub r5, r5, r16\n\
+\n\
+Now peephole2 pass can simply merge the two pairs of loads.\n\
+\n\
+Since scheduling fusion pass relies on peephole2 to do real fusion\n\
+work, it is only enabled by default when peephole2 is in effect.\n\
+\n\
+This was first introduced on ARM/AArch64 targets; please refer to\n\
+the hook implementation for how different fusion types are supported.",
+void, (rtx_insn *insn, int max_pri, int *fusion_pri, int *pri), NULL)
+
HOOK_VECTOR_END (sched)
/* Functions relating to OpenMP and Cilk Plus SIMD clones. */
+2014-11-14 Bin Cheng <bin.cheng@arm.com>
+
+ * gcc.target/arm/ldrd-strd-pair-1.c: New test.
+ * gcc.target/arm/vfp-1.c: Improve scanning string.
+
2014-11-13 Rong Xu <xur@google.com>
PR debug/63581
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_prefer_ldrd_strd } */
+/* { dg-options "-O2 -mthumb" } */
+
+/* Global object written by foo; x and y are consecutive int members. */
+struct
+{
+ int x;
+ int y;
+ char c;
+ int d;
+}a;
+
+/* Store X to a.x and Y to a.y, with a read of a.x and a store to a.d
+ in between. The dg-final directive after this function expects a
+ strd instruction in the output on Thumb-2 targets. */
+int foo(int x, int y)
+{
+ int c;
+ a.x = x;
+ c = a.x;
+ a.d = c;
+ a.y = y;
+
+ return 0;
+}
+/* { dg-final { scan-assembler "strd\t" { target { arm_thumb2_ok } } } } */
}
void test_ldst (float f[], double d[]) {
- /* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #1020\\\]" } } */
+ /* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #-?\[0-9\]+\\\]" } } */
/* { dg-final { scan-assembler "vldr.32.+ \\\[r\[0-9\], #-1020\\\]" { target { arm32 && { ! arm_thumb2_ok } } } } } */
/* { dg-final { scan-assembler "add.+ r0, #1024" } } */
/* { dg-final { scan-assembler "vstr.32.+ \\\[r\[0-9\]\\\]\n" } } */
DEFTIMEVAR (TV_COMBINE_STACK_ADJUST , "combine stack adjustments")
DEFTIMEVAR (TV_PEEPHOLE2 , "peephole 2")
DEFTIMEVAR (TV_RENAME_REGISTERS , "rename registers")
+DEFTIMEVAR (TV_SCHED_FUSION , "scheduling fusion")
DEFTIMEVAR (TV_CPROP_REGISTERS , "hard reg cprop")
DEFTIMEVAR (TV_SCHED2 , "scheduling 2")
DEFTIMEVAR (TV_MACH_DEP , "machine dep reorg")
extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
*ctxt);
extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
+extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt);
extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt);
extern rtl_opt_pass *make_pass_if_after_reload (gcc::context *ctxt);
extern rtl_opt_pass *make_pass_regrename (gcc::context *ctxt);