* config.gcc (i386, x86_64): Add extra objects.
* i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
(ix86_min_insn_size): Declare.
(ix86_issue_rate): Declare.
(ix86_adjust_cost): Declare.
(ia32_multipass_dfa_lookahead): Declare.
(ix86_macro_fusion_p): Declare.
(ix86_macro_fusion_pair_p): Declare.
(ix86_bd_has_dispatch): Declare.
(ix86_bd_do_dispatch): Declare.
(ix86_core2i7_init_hooks): Declare.
(ix86_atom_sched_reorder): Declare.
* i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
(COSTS_N_BYTES): Move to x86-tune-costs.h.
(DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
(rip_relative_addr_p): Rename to ...
(ix86_rip_relative_addr_p): ... this one; export.
(memory_address_length): Update.
(ix86_issue_rate): Move to x86-tune-sched.c.
(ix86_flags_dependent): Move to x86-tune-sched.c.
(ix86_agi_dependent): Move to x86-tune-sched.c.
(exact_dependency_1): Move to x86-tune-sched.c.
(exact_store_load_dependency): Move to x86-tune-sched.c.
(ix86_adjust_cost): Move to x86-tune-sched.c.
(ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
(ix86_macro_fusion_p): Move to x86-tune-sched.c.
(ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
(do_reorder_for_imul): Move to x86-tune-sched-atom.c.
(swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
(ix86_sched_reorder): Move to x86-tune-sched-atom.c.
(core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
(core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
(min_insn_size): Rename to ...
(ix86_min_insn_size): ... this one; export.
(core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
(ix86_sched_init_global): Break out logic into ix86_core2i7_init_hooks.
(ix86_avoid_jump_mispredicts): Update.
(TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
(TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
(TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
(DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
(MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
(MAX_INSN): Move to x86-tune-sched-bd.c.
(MAX_IMM): Move to x86-tune-sched-bd.c.
(MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
(MAX_IMM_32): Move to x86-tune-sched-bd.c.
(MAX_IMM_64): Move to x86-tune-sched-bd.c.
(MAX_LOAD): Move to x86-tune-sched-bd.c.
(MAX_STORE): Move to x86-tune-sched-bd.c.
(BIG): Move to x86-tune-sched-bd.c.
(enum dispatch_group): Move to x86-tune-sched-bd.c.
(enum insn_path): Move to x86-tune-sched-bd.c.
(get_mem_group): Move to x86-tune-sched-bd.c.
(is_cmp): Move to x86-tune-sched-bd.c.
(dispatch_violation): Move to x86-tune-sched-bd.c.
(is_branch): Move to x86-tune-sched-bd.c.
(is_prefetch): Move to x86-tune-sched-bd.c.
(init_window): Move to x86-tune-sched-bd.c.
(allocate_window): Move to x86-tune-sched-bd.c.
(init_dispatch_sched): Move to x86-tune-sched-bd.c.
(is_end_basic_block): Move to x86-tune-sched-bd.c.
(process_end_window): Move to x86-tune-sched-bd.c.
(allocate_next_window): Move to x86-tune-sched-bd.c.
(find_constant): Move to x86-tune-sched-bd.c.
(get_num_immediates): Move to x86-tune-sched-bd.c.
(has_immediate): Move to x86-tune-sched-bd.c.
(get_insn_path): Move to x86-tune-sched-bd.c.
(get_insn_group): Move to x86-tune-sched-bd.c.
(count_num_restricted): Move to x86-tune-sched-bd.c.
(fits_dispatch_window): Move to x86-tune-sched-bd.c.
(add_insn_window): Move to x86-tune-sched-bd.c.
(add_to_dispatch_window): Move to x86-tune-sched-bd.c.
(debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
(debug_dispatch_window): Move to x86-tune-sched-bd.c.
(debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
(debug_ready_dispatch): Move to x86-tune-sched-bd.c.
(do_dispatch): Move to x86-tune-sched-bd.c.
(has_dispatch): Move to x86-tune-sched-bd.c.
* i386/t-i386: Add new object files.
* i386/x86-tune-costs.h: New file.
* i386/x86-tune-sched-atom.c: New file.
* i386/x86-tune-sched-bd.c: New file.
* i386/x86-tune-sched-core.c: New file.
* i386/x86-tune-sched.c: New file.
From-SVN: r253646
+2017-10-11  Jan Hubicka  <hubicka@ucw.cz>
+
+ * config.gcc (i386, x86_64): Add extra objects.
+ * i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
+ (ix86_min_insn_size): Declare.
+ (ix86_issue_rate): Declare.
+ (ix86_adjust_cost): Declare.
+ (ia32_multipass_dfa_lookahead): Declare.
+ (ix86_macro_fusion_p): Declare.
+ (ix86_macro_fusion_pair_p): Declare.
+ (ix86_bd_has_dispatch): Declare.
+ (ix86_bd_do_dispatch): Declare.
+ (ix86_core2i7_init_hooks): Declare.
+ (ix86_atom_sched_reorder): Declare.
+ * i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
+ (COSTS_N_BYTES): Move to x86-tune-costs.h.
+ (DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
+ (rip_relative_addr_p): Rename to ...
+ (ix86_rip_relative_addr_p): ... this one; export.
+ (memory_address_length): Update.
+ (ix86_issue_rate): Move to x86-tune-sched.c.
+ (ix86_flags_dependent): Move to x86-tune-sched.c.
+ (ix86_agi_dependent): Move to x86-tune-sched.c.
+ (exact_dependency_1): Move to x86-tune-sched.c.
+ (exact_store_load_dependency): Move to x86-tune-sched.c.
+ (ix86_adjust_cost): Move to x86-tune-sched.c.
+ (ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
+ (ix86_macro_fusion_p): Move to x86-tune-sched.c.
+ (ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
+ (do_reorder_for_imul): Move to x86-tune-sched-atom.c.
+ (swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
+ (ix86_sched_reorder): Move to x86-tune-sched-atom.c.
+ (core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
+ (core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
+ (min_insn_size): Rename to ...
+ (ix86_min_insn_size): ... this one; export.
+ (core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
+ (ix86_sched_init_global): Break out logic into ix86_core2i7_init_hooks.
+ (ix86_avoid_jump_mispredicts): Update.
+ (TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
+ (TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
+ (TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
+ (DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
+ (MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
+ (MAX_INSN): Move to x86-tune-sched-bd.c.
+ (MAX_IMM): Move to x86-tune-sched-bd.c.
+ (MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
+ (MAX_IMM_32): Move to x86-tune-sched-bd.c.
+ (MAX_IMM_64): Move to x86-tune-sched-bd.c.
+ (MAX_LOAD): Move to x86-tune-sched-bd.c.
+ (MAX_STORE): Move to x86-tune-sched-bd.c.
+ (BIG): Move to x86-tune-sched-bd.c.
+ (enum dispatch_group): Move to x86-tune-sched-bd.c.
+ (enum insn_path): Move to x86-tune-sched-bd.c.
+ (get_mem_group): Move to x86-tune-sched-bd.c.
+ (is_cmp): Move to x86-tune-sched-bd.c.
+ (dispatch_violation): Move to x86-tune-sched-bd.c.
+ (is_branch): Move to x86-tune-sched-bd.c.
+ (is_prefetch): Move to x86-tune-sched-bd.c.
+ (init_window): Move to x86-tune-sched-bd.c.
+ (allocate_window): Move to x86-tune-sched-bd.c.
+ (init_dispatch_sched): Move to x86-tune-sched-bd.c.
+ (is_end_basic_block): Move to x86-tune-sched-bd.c.
+ (process_end_window): Move to x86-tune-sched-bd.c.
+ (allocate_next_window): Move to x86-tune-sched-bd.c.
+ (find_constant): Move to x86-tune-sched-bd.c.
+ (get_num_immediates): Move to x86-tune-sched-bd.c.
+ (has_immediate): Move to x86-tune-sched-bd.c.
+ (get_insn_path): Move to x86-tune-sched-bd.c.
+ (get_insn_group): Move to x86-tune-sched-bd.c.
+ (count_num_restricted): Move to x86-tune-sched-bd.c.
+ (fits_dispatch_window): Move to x86-tune-sched-bd.c.
+ (add_insn_window): Move to x86-tune-sched-bd.c.
+ (add_to_dispatch_window): Move to x86-tune-sched-bd.c.
+ (debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
+ (debug_dispatch_window): Move to x86-tune-sched-bd.c.
+ (debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
+ (debug_ready_dispatch): Move to x86-tune-sched-bd.c.
+ (do_dispatch): Move to x86-tune-sched-bd.c.
+ (has_dispatch): Move to x86-tune-sched-bd.c.
+ * i386/t-i386: Add new object files.
+ * i386/x86-tune-costs.h: New file.
+ * i386/x86-tune-sched-atom.c: New file.
+ * i386/x86-tune-sched-bd.c: New file.
+ * i386/x86-tune-sched-core.c: New file.
+ * i386/x86-tune-sched.c: New file.
+
2017-10-11  Liu Hao  <lh_mouse@126.com>

* pretty-print.c [_WIN32] (colorize_init): Remove. Use
cpu_type=i386
c_target_objs="i386-c.o"
cxx_target_objs="i386-c.o"
+ extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
extra_options="${extra_options} fused-madd.opt"
extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
c_target_objs="i386-c.o"
cxx_target_objs="i386-c.o"
extra_options="${extra_options} fused-madd.opt"
+ extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
extern bool ix86_target_stack_probe (void);
extern bool ix86_can_use_return_insn_p (void);
extern void ix86_setup_frame_addresses (void);
+extern bool ix86_rip_relative_addr_p (struct ix86_address *parts);
extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int);
extern void ix86_expand_prologue (void);
extern const char * ix86_output_call_insn (rtx_insn *insn, rtx call_op);
extern bool ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
machine_mode mode);
+extern int ix86_min_insn_size (rtx_insn *);
+
+extern int ix86_issue_rate (void);
+extern int ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn,
+ int cost, unsigned int);
+extern int ia32_multipass_dfa_lookahead (void);
+extern bool ix86_macro_fusion_p (void);
+extern bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp);
+
+extern bool ix86_bd_has_dispatch (rtx_insn *insn, int action);
+extern void ix86_bd_do_dispatch (rtx_insn *insn, int mode);
+
+extern void ix86_core2i7_init_hooks (void);
+
+extern int ix86_atom_sched_reorder (FILE *, int, rtx_insn **, int *, int);
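The block of declarations above is the new cross-file interface: i386.c continues to reference these functions (for example from its target-hook definitions), while the implementations move into the x86-tune-sched*.c files. As a rough, self-contained sketch of the pattern those files follow, a switch over the active tuning target; the enum and the return values below are stand-ins for illustration, not GCC's processor_type or the committed numbers:

    #include <stdio.h>

    /* Hypothetical stand-in for GCC's processor_type enumeration.  */
    enum tune_target { TUNE_GENERIC, TUNE_CORE, TUNE_ATOM };
    static enum tune_target tune_model = TUNE_CORE;

    /* Shape of ix86_issue_rate after the split: one switch in
       x86-tune-sched.c returning the issue width of the active CPU.
       The values are illustrative placeholders.  */
    static int issue_rate (void)
    {
      switch (tune_model)
        {
        case TUNE_CORE:
          return 4;
        case TUNE_ATOM:
          return 2;
        default:
          return 3;
        }
    }

    int main (void)
    {
      printf ("issue rate: %d\n", issue_rate ());
      return 0;
    }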
#ifdef RTX_CODE
/* Target data for multipass lookahead scheduling.
/* This file should be included last. */
#include "target-def.h"
+#include "x86-tune-costs.h"
+
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
static rtx legitimize_pe_coff_symbol (rtx, bool);
: (mode) == DImode ? 3 \
: 4)
-/* Processor costs (relative to an add) */
-/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
-#define COSTS_N_BYTES(N) ((N) * 2)
-
-#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
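The two macros above fix the unit conversion for the size-tuning table that follows: the comment states that COSTS_N_INSNS (N) expands to (N) * 4 and that an addition is 2 bytes, so COSTS_N_BYTES scales by 2 and a 2-byte add scores exactly COSTS_N_INSNS (1). A minimal standalone check of that arithmetic, using the same definitions:

    #include <assert.h>

    #define COSTS_N_INSNS(N) ((N) * 4)  /* GCC's speed-cost unit.  */
    #define COSTS_N_BYTES(N) ((N) * 2)  /* Size-cost unit from the hunk above.  */

    int main (void)
    {
      /* A 2-byte add costs COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
         so the size table and the speed tables share one scale.  */
      assert (COSTS_N_BYTES (2) == COSTS_N_INSNS (1));
      return 0;
    }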
-
-static stringop_algs ix86_size_memcpy[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
-static stringop_algs ix86_size_memset[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
-
-const
-struct processor_costs ix86_size_cost = {/* costs for tuning for size */
- COSTS_N_BYTES (2), /* cost of an add instruction */
- COSTS_N_BYTES (3), /* cost of a lea instruction */
- COSTS_N_BYTES (2), /* variable shift costs */
- COSTS_N_BYTES (3), /* constant shift costs */
- {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
- COSTS_N_BYTES (3), /* HI */
- COSTS_N_BYTES (3), /* SI */
- COSTS_N_BYTES (3), /* DI */
- COSTS_N_BYTES (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
- COSTS_N_BYTES (3), /* HI */
- COSTS_N_BYTES (3), /* SI */
- COSTS_N_BYTES (3), /* DI */
- COSTS_N_BYTES (5)}, /* other */
- COSTS_N_BYTES (3), /* cost of movsx */
- COSTS_N_BYTES (3), /* cost of movzx */
- 0, /* "large" insn */
- 2, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
- {2, 2, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 2, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 2}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {2, 2, 2}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 3, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {3, 3}, /* cost of storing MMX registers
- in SImode and DImode */
- 3, /* cost of moving SSE register */
- {3, 3, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {3, 3, 3}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 0, /* size of l1 cache */
- 0, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
- COSTS_N_BYTES (2), /* cost of FMUL instruction. */
- COSTS_N_BYTES (2), /* cost of FDIV instruction. */
- COSTS_N_BYTES (2), /* cost of FABS instruction. */
- COSTS_N_BYTES (2), /* cost of FCHS instruction. */
- COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- ix86_size_memcpy,
- ix86_size_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 1, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 1, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* Processor costs (relative to an add) */
-static stringop_algs i386_memcpy[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs i386_memset[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs i386_cost = { /* 386 specific costs */
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (3), /* variable shift costs */
- COSTS_N_INSNS (2), /* constant shift costs */
- {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
- COSTS_N_INSNS (6), /* HI */
- COSTS_N_INSNS (6), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- COSTS_N_INSNS (1), /* cost of multiply per each bit set */
- {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (23), /* HI */
- COSTS_N_INSNS (23), /* SI */
- COSTS_N_INSNS (23), /* DI */
- COSTS_N_INSNS (23)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 15, /* "large" insn */
- 3, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {8, 8, 8}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {8, 8, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 0, /* size of l1 cache */
- 0, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (27), /* cost of FMUL instruction. */
- COSTS_N_INSNS (88), /* cost of FDIV instruction. */
- COSTS_N_INSNS (22), /* cost of FABS instruction. */
- COSTS_N_INSNS (24), /* cost of FCHS instruction. */
- COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- i386_memcpy,
- i386_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs i486_memcpy[2] = {
- {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs i486_memset[2] = {
- {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs i486_cost = { /* 486 specific costs */
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (3), /* variable shift costs */
- COSTS_N_INSNS (2), /* constant shift costs */
- {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
- COSTS_N_INSNS (12), /* HI */
- COSTS_N_INSNS (12), /* SI */
- COSTS_N_INSNS (12), /* DI */
- COSTS_N_INSNS (12)}, /* other */
- 1, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (40), /* HI */
- COSTS_N_INSNS (40), /* SI */
- COSTS_N_INSNS (40), /* DI */
- COSTS_N_INSNS (40)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 15, /* "large" insn */
- 3, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {8, 8, 8}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {8, 8, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 4, /* size of l1 cache. 486 has 8kB cache
- shared for code and data, so 4kB is
- not really precise. */
- 4, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (16), /* cost of FMUL instruction. */
- COSTS_N_INSNS (73), /* cost of FDIV instruction. */
- COSTS_N_INSNS (3), /* cost of FABS instruction. */
- COSTS_N_INSNS (3), /* cost of FCHS instruction. */
- COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- i486_memcpy,
- i486_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs pentium_memcpy[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs pentium_memset[2] = {
- {libcall, {{-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs pentium_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (4), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
- COSTS_N_INSNS (11), /* HI */
- COSTS_N_INSNS (11), /* SI */
- COSTS_N_INSNS (11), /* DI */
- COSTS_N_INSNS (11)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (25), /* HI */
- COSTS_N_INSNS (25), /* SI */
- COSTS_N_INSNS (25), /* DI */
- COSTS_N_INSNS (25)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 8, /* "large" insn */
- 6, /* MOVE_RATIO */
- 6, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 8, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 8, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (3), /* cost of FMUL instruction. */
- COSTS_N_INSNS (39), /* cost of FDIV instruction. */
- COSTS_N_INSNS (1), /* cost of FABS instruction. */
- COSTS_N_INSNS (1), /* cost of FCHS instruction. */
- COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentium_memcpy,
- pentium_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static const
-struct processor_costs lakemont_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
- COSTS_N_INSNS (11), /* HI */
- COSTS_N_INSNS (11), /* SI */
- COSTS_N_INSNS (11), /* DI */
- COSTS_N_INSNS (11)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (25), /* HI */
- COSTS_N_INSNS (25), /* SI */
- COSTS_N_INSNS (25), /* DI */
- COSTS_N_INSNS (25)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 6, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 8, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 8, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (3), /* cost of FMUL instruction. */
- COSTS_N_INSNS (39), /* cost of FDIV instruction. */
- COSTS_N_INSNS (1), /* cost of FABS instruction. */
- COSTS_N_INSNS (1), /* cost of FCHS instruction. */
- COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentium_memcpy,
- pentium_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
- (we ensure the alignment). For small blocks inline loop is still a
- noticeable win, for bigger blocks either rep movsl or rep movsb is
- way to go. Rep movsb has apparently more expensive startup time in CPU,
- but after 4K the difference is down in the noise. */
-static stringop_algs pentiumpro_memcpy[2] = {
- {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
- {8192, rep_prefix_4_byte, false},
- {-1, rep_prefix_1_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs pentiumpro_memset[2] = {
- {rep_prefix_4_byte, {{1024, unrolled_loop, false},
- {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
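The pentiumpro tables just above read as ordered (max_size, algorithm) pairs: the string-operation expander walks the list and takes the first algorithm whose max covers the requested block size, with -1 acting as the catch-all for anything larger. A self-contained sketch of that selection (illustrative names and a simplified entry layout, not GCC's actual decide_alg):

    #include <stdio.h>

    enum alg { LOOP, UNROLLED_LOOP, REP_PREFIX_4_BYTE, REP_PREFIX_1_BYTE };
    struct entry { long max; enum alg alg; };  /* max == -1: no upper bound.  */

    /* pentiumpro_memcpy's 32-bit entries, transcribed from the hunk above.  */
    static const struct entry table[] = {
      { 128,  LOOP },
      { 1024, UNROLLED_LOOP },
      { 8192, REP_PREFIX_4_BYTE },
      { -1,   REP_PREFIX_1_BYTE },
    };

    static enum alg pick (long size)
    {
      for (unsigned i = 0; i < sizeof table / sizeof *table; i++)
        if (table[i].max == -1 || size <= table[i].max)
          return table[i].alg;
      return REP_PREFIX_1_BYTE;
    }

    int main (void)
    {
      /* 64 bytes -> LOOP (0); 4096 bytes -> REP_PREFIX_4_BYTE (2).  */
      printf ("%d %d\n", pick (64), pick (4096));
      return 0;
    }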
-static const
-struct processor_costs pentiumpro_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (4)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (17), /* HI */
- COSTS_N_INSNS (17), /* SI */
- COSTS_N_INSNS (17), /* DI */
- COSTS_N_INSNS (17)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 6, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 2, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {2, 2, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 256, /* size of l2 cache */
- 32, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (5), /* cost of FMUL instruction. */
- COSTS_N_INSNS (56), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentiumpro_memcpy,
- pentiumpro_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs geode_memcpy[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs geode_memset[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs geode_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (2), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (7), /* SI */
- COSTS_N_INSNS (7), /* DI */
- COSTS_N_INSNS (7)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (23), /* HI */
- COSTS_N_INSNS (39), /* SI */
- COSTS_N_INSNS (39), /* DI */
- COSTS_N_INSNS (39)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 4, /* MOVE_RATIO */
- 1, /* cost for loading QImode using movzbl */
- {1, 1, 1}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {1, 1, 1}, /* cost of storing integer registers */
- 1, /* cost of reg,reg fld/fst */
- {1, 1, 1}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 6, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
-
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {2, 2, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 128, /* size of l2 cache. */
- 32, /* size of prefetch block */
- 1, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (11), /* cost of FMUL instruction. */
- COSTS_N_INSNS (47), /* cost of FDIV instruction. */
- COSTS_N_INSNS (1), /* cost of FABS instruction. */
- COSTS_N_INSNS (1), /* cost of FCHS instruction. */
- COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- geode_memcpy,
- geode_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs k6_memcpy[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs k6_memset[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs k6_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (3), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (3), /* DI */
- COSTS_N_INSNS (3)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (18), /* HI */
- COSTS_N_INSNS (18), /* SI */
- COSTS_N_INSNS (18), /* DI */
- COSTS_N_INSNS (18)}, /* other */
- COSTS_N_INSNS (2), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 8, /* "large" insn */
- 4, /* MOVE_RATIO */
- 3, /* cost for loading QImode using movzbl */
- {4, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 3, 2}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {6, 6, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 4}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {2, 2, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 6, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 32, /* size of l2 cache. Some models
- have integrated l2 cache, but
- optimizing for k6 is not important
- enough to worry about that. */
- 32, /* size of prefetch block */
- 1, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (2), /* cost of FMUL instruction. */
- COSTS_N_INSNS (56), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- k6_memcpy,
- k6_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* For some reason, Athlon deals better with REP prefix (relative to loops)
- compared to K8. Alignment becomes important after 8 bytes for memcpy and
- 128 bytes for memset. */
-static stringop_algs athlon_memcpy[2] = {
- {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs athlon_memset[2] = {
- {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs athlon_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
- COSTS_N_INSNS (5), /* HI */
- COSTS_N_INSNS (5), /* SI */
- COSTS_N_INSNS (5), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 6}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 5, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (24), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- athlon_memcpy,
- athlon_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* K8 has optimized REP instruction for medium sized blocks, but for very
- small blocks it is better to use loop. For large blocks, libcall can
- do nontemporary accesses and beat inline considerably. */
-static stringop_algs k8_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs k8_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs k8_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 3, 6}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- k8_memcpy,
- k8_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 5, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 3, /* vec_unalign_load_cost. */
- 3, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall can
- do nontemporary accesses and beat inline considerably. */
-static stringop_algs amdfam10_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs amdfam10_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs amdfam10_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 64, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- amdfam10_memcpy,
- amdfam10_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 2, /* vec_store_cost. */
- 2, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER1 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver1_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver1_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-
-const struct processor_costs bdver1_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver1_memcpy,
- bdver1_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER2 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-
-static stringop_algs bdver2_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver2_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-
-const struct processor_costs bdver2_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver2_memcpy,
- bdver2_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-
- /* BDVER3 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver3_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver3_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs bdver3_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver3_memcpy,
- bdver3_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER4 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver4_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver4_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs bdver4_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set the number of simultaneous prefetches
- to a large constant to reflect this (it is probably not a good idea to
- leave the number of prefetches unlimited either, as their execution also
- takes some time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver4_memcpy,
- bdver4_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
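As background for the stringop tables moved throughout this file: each stringop_algs initializer pairs a size threshold with the algorithm to use up to that size, -1 marks the catch-all final entry, and the two array elements cover 32-bit and 64-bit code respectively. A minimal sketch of how such a table is consulted, assuming the stringop_strategy layout from i386.h (the authoritative lookup is decide_alg in i386.c, which also consults the unknown_size field omitted here):

#include <stdio.h>

/* Sketch of the stringop strategy lookup; layout mirrors i386.h.  */
enum stringop_alg { no_stringop, libcall, rep_prefix_1_byte, rep_prefix_4_byte,
                    rep_prefix_8_byte, loop_1_byte, loop, unrolled_loop };

struct stringop_strategy { int max; enum stringop_alg alg; int noalign; };

/* Return the algorithm for a block of SIZE bytes: the first entry whose
   MAX covers SIZE wins; MAX == -1 is the catch-all final entry.  */
static enum stringop_alg
pick_alg (const struct stringop_strategy *s, long size)
{
  for (;; s++)
    if (s->max == -1 || size <= s->max)
      return s->alg;
}

int
main (void)
{
  /* The 64-bit row of bdver4_memcpy below.  */
  struct stringop_strategy memcpy64[] =
    { {16, loop, 0}, {8192, rep_prefix_8_byte, 0}, {-1, libcall, 0} };
  printf ("%d\n", (int) pick_alg (memcpy64, 4096)); /* rep_prefix_8_byte */
  return 0;
}

With that row, pick_alg returns loop for sizes up to 16, rep_prefix_8_byte up to 8192, and libcall beyond.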
-
-/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
- very small blocks it is better to use a loop. For large blocks, a libcall
- can do nontemporal accesses and beat inline code considerably. */
-static stringop_algs znver1_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs znver1_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs znver1_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction. */
- COSTS_N_INSNS (1), /* cost of a lea instruction. */
- COSTS_N_INSNS (1), /* variable shift costs. */
- COSTS_N_INSNS (1), /* constant shift costs. */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
- COSTS_N_INSNS (3), /* HI. */
- COSTS_N_INSNS (3), /* SI. */
- COSTS_N_INSNS (4), /* DI. */
- COSTS_N_INSNS (4)}, /* other. */
- 0, /* cost of multiply per each bit
- set. */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
- COSTS_N_INSNS (35), /* HI. */
- COSTS_N_INSNS (51), /* SI. */
- COSTS_N_INSNS (83), /* DI. */
- COSTS_N_INSNS (83)}, /* other. */
- COSTS_N_INSNS (1), /* cost of movsx. */
- COSTS_N_INSNS (1), /* cost of movzx. */
- 8, /* "large" insn. */
- 9, /* MOVE_RATIO. */
- 4, /* cost for loading QImode using
- movzbl. */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer
- registers. */
- 2, /* cost of reg,reg fld/fst. */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode. */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode. */
- 2, /* cost of moving MMX register. */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode. */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode. */
- 2, /* cost of moving SSE register. */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode. */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode. */
- 2, /* MMX or SSE register to integer. */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block. */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set the number of simultaneous prefetches
- to a large constant to reflect this (it is probably not a good idea to
- leave the number of prefetches unlimited either, as their execution also
- takes some time). */
- 100, /* number of parallel prefetches. */
- 3, /* Branch cost. */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- /* Zen can execute 4 integer operations per cycle. FP operations take 3
- cycles and it can execute 2 integer additions and 2 multiplications, so
- reassociation may make sense up to a width of 6. SPEC2k6 benchmarks
- suggest that 4 works better than 6, probably due to register pressure.
-
- Integer vector operations are handled by the FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply. This is adjusted
- in ix86_reassociation_width. */
- 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
- znver1_memcpy,
- znver1_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
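A note on the units in these tables: rtl.h defines COSTS_N_INSNS (N) as (N) * 4, so costs are expressed in quarter-insn units, while the plain integers (register move costs) are on a separate scale relative to a reg-reg move of 2. A two-line check:

#include <stdio.h>

#define COSTS_N_INSNS(N) ((N) * 4)   /* As defined in rtl.h.  */

int
main (void)
{
  /* znver1_cost above: an add is 1 insn, a DImode divide 83 insns.  */
  printf ("add = %d, DI divide = %d\n",
          COSTS_N_INSNS (1), COSTS_N_INSNS (83));
  return 0;
}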
-/* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
- very small blocks it is better to use a loop. For large blocks, a libcall
- can do nontemporal accesses and beat inline code considerably. */
-static stringop_algs btver1_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs btver1_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-const struct processor_costs btver1_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- btver1_memcpy,
- btver1_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 2, /* vec_store_cost. */
- 2, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs btver2_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs btver2_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-const struct processor_costs btver2_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 32, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- btver2_memcpy,
- btver2_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 2, /* vec_store_cost. */
- 2, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs pentium4_memcpy[2] = {
- {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs pentium4_memset[2] = {
- {libcall, {{6, loop_1_byte, false}, {48, loop, false},
- {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-
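Pentium 4 is a 32-bit-only tuning, so the 64-bit slot of each pair above is DUMMY_STRINGOP_ALGS rather than a real strategy. Its presumed definition (per the ChangeLog it moves to x86-tune-costs.h) is a single catch-all entry that always falls back to a library call:

/* Presumed placeholder for tunings that never emit 64-bit string ops:
   one catch-all entry selecting a libcall for any size.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}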
-static const
-struct processor_costs pentium4_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (3), /* cost of a lea instruction */
- COSTS_N_INSNS (4), /* variable shift costs */
- COSTS_N_INSNS (4), /* constant shift costs */
- {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
- COSTS_N_INSNS (15), /* HI */
- COSTS_N_INSNS (15), /* SI */
- COSTS_N_INSNS (15), /* DI */
- COSTS_N_INSNS (15)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (56), /* HI */
- COSTS_N_INSNS (56), /* SI */
- COSTS_N_INSNS (56), /* DI */
- COSTS_N_INSNS (56)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 16, /* "large" insn */
- 6, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
- {4, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 3, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 12, /* cost of moving SSE register */
- {12, 12, 12}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 10, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (7), /* cost of FMUL instruction. */
- COSTS_N_INSNS (43), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentium4_memcpy,
- pentium4_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs nocona_memcpy[2] = {
- {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
- {100000, unrolled_loop, false}, {-1, libcall, false}}}};
-
-static stringop_algs nocona_memset[2] = {
- {libcall, {{6, loop_1_byte, false}, {48, loop, false},
- {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {64, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-
-static const
-struct processor_costs nocona_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
- COSTS_N_INSNS (10), /* HI */
- COSTS_N_INSNS (10), /* SI */
- COSTS_N_INSNS (10), /* DI */
- COSTS_N_INSNS (10)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (66), /* HI */
- COSTS_N_INSNS (66), /* SI */
- COSTS_N_INSNS (66), /* DI */
- COSTS_N_INSNS (66)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 16, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 3, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 4}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 6, /* cost of moving MMX register */
- {12, 12}, /* cost of loading MMX registers
- in SImode and DImode */
- {12, 12}, /* cost of storing MMX registers
- in SImode and DImode */
- 6, /* cost of moving SSE register */
- {12, 12, 12}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {12, 12, 12}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 8, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 1024, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 8, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (40), /* cost of FDIV instruction. */
- COSTS_N_INSNS (3), /* cost of FABS instruction. */
- COSTS_N_INSNS (3), /* cost of FCHS instruction. */
- COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- nocona_memcpy,
- nocona_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs atom_memcpy[2] = {
- {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs atom_memset[2] = {
- {libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs atom_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
- atom_memcpy,
- atom_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs slm_memcpy[2] = {
- {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs slm_memset[2] = {
- {libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs slm_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (3), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- slm_memcpy,
- slm_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 4, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs intel_memcpy[2] = {
- {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs intel_memset[2] = {
- {libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs intel_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (3), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- intel_memcpy,
- intel_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 4, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* Generic should produce code tuned for Core-i7 (and newer chips)
- and btver1 (and newer chips). */
-
-static stringop_algs generic_memcpy[2] = {
- {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs generic_memset[2] = {
- {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static const
-struct processor_costs generic_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- /* On all chips taken into consideration lea takes 2 cycles or more. With
- this cost, however, our current implementation of synth_mult uses
- unnecessary temporary registers, causing regressions on several SPECfp
- benchmarks. */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
- value is increased to the perhaps more appropriate value of 5. */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- generic_memcpy,
- generic_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* core_cost should produce code tuned for the Core family of CPUs. */
-static stringop_algs core_memcpy[2] = {
- {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
- {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
- {-1, libcall, false}}}};
-static stringop_algs core_memset[2] = {
- {libcall, {{6, loop_1_byte, true},
- {24, loop, true},
- {8192, rep_prefix_4_byte, true},
- {-1, libcall, false}}},
- {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
- {-1, libcall, false}}}};
-
-static const
-struct processor_costs core_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- /* On all chips taken into consideration lea takes 2 cycles or more. With
- this cost, however, our current implementation of synth_mult uses
- unnecessary temporary registers, causing regressions on several SPECfp
- benchmarks. */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- /* FIXME: perhaps a more appropriate value is 5. */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
- core_memcpy,
- core_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
/* Set by -mtune. */
-const struct processor_costs *ix86_tune_cost = &pentium_cost;
+const struct processor_costs *ix86_tune_cost = NULL;
/* Set by -mtune or -Os. */
-const struct processor_costs *ix86_cost = &pentium_cost;
+const struct processor_costs *ix86_cost = NULL;
/* Processor feature/optimization bitmasks. */
#define m_386 (1U<<PROCESSOR_I386)
/* Check whether x86 address PARTS is a pc-relative address. */
-static bool
-rip_relative_addr_p (struct ix86_address *parts)
+bool
+ix86_rip_relative_addr_p (struct ix86_address *parts)
{
rtx base, index, disp;
else if (disp && !base && !index)
{
len += 4;
- if (!rip_relative_addr_p (&parts))
+ if (!ix86_rip_relative_addr_p (&parts))
len++;
}
else
return 2 + 1;
}
\f
-/* Return the maximum number of instructions a cpu can issue. */
-
-static int
-ix86_issue_rate (void)
-{
- switch (ix86_tune)
- {
- case PROCESSOR_PENTIUM:
- case PROCESSOR_LAKEMONT:
- case PROCESSOR_BONNELL:
- case PROCESSOR_SILVERMONT:
- case PROCESSOR_KNL:
- case PROCESSOR_KNM:
- case PROCESSOR_INTEL:
- case PROCESSOR_K6:
- case PROCESSOR_BTVER2:
- case PROCESSOR_PENTIUM4:
- case PROCESSOR_NOCONA:
- return 2;
-
- case PROCESSOR_PENTIUMPRO:
- case PROCESSOR_ATHLON:
- case PROCESSOR_K8:
- case PROCESSOR_AMDFAM10:
- case PROCESSOR_GENERIC:
- case PROCESSOR_BTVER1:
- return 3;
-
- case PROCESSOR_BDVER1:
- case PROCESSOR_BDVER2:
- case PROCESSOR_BDVER3:
- case PROCESSOR_BDVER4:
- case PROCESSOR_ZNVER1:
- case PROCESSOR_CORE2:
- case PROCESSOR_NEHALEM:
- case PROCESSOR_SANDYBRIDGE:
- case PROCESSOR_HASWELL:
- return 4;
-
- default:
- return 1;
- }
-}
-
-/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
- by DEP_INSN and nothing else set by DEP_INSN. */
-
-static bool
-ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
-{
- rtx set, set2;
-
- /* Simplify the test for uninteresting insns. */
- if (insn_type != TYPE_SETCC
- && insn_type != TYPE_ICMOV
- && insn_type != TYPE_FCMOV
- && insn_type != TYPE_IBR)
- return false;
-
- if ((set = single_set (dep_insn)) != 0)
- {
- set = SET_DEST (set);
- set2 = NULL_RTX;
- }
- else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
- && XVECLEN (PATTERN (dep_insn), 0) == 2
- && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
- && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
- {
- set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
- set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
- }
- else
- return false;
-
- if (!REG_P (set) || REGNO (set) != FLAGS_REG)
- return false;
-
- /* This test is true if the dependent insn reads the flags but
- not any other potentially set register. */
- if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
- return false;
-
- if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
- return false;
-
- return true;
-}
-
-/* Return true iff USE_INSN has a memory address with operands set by
- SET_INSN. */
-
-bool
-ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
-{
- int i;
- extract_insn_cached (use_insn);
- for (i = recog_data.n_operands - 1; i >= 0; --i)
- if (MEM_P (recog_data.operand[i]))
- {
- rtx addr = XEXP (recog_data.operand[i], 0);
- if (modified_in_p (addr, set_insn) != 0)
- {
- /* No AGI stall if SET_INSN is a push or pop and USE_INSN
- has SP based memory (unless index reg is modified in a pop). */
- rtx set = single_set (set_insn);
- if (set
- && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
- || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
- {
- struct ix86_address parts;
- if (ix86_decompose_address (addr, &parts)
- && parts.base == stack_pointer_rtx
- && (parts.index == NULL_RTX
- || MEM_P (SET_DEST (set))
- || !modified_in_p (parts.index, set_insn)))
- return false;
- }
- return true;
- }
- return false;
- }
- return false;
-}
-
-/* Helper function for exact_store_load_dependency.
- Return true if ADDR is found in INSN. */
-static bool
-exact_dependency_1 (rtx addr, rtx insn)
-{
- enum rtx_code code;
- const char *format_ptr;
- int i, j;
-
- code = GET_CODE (insn);
- switch (code)
- {
- case MEM:
- if (rtx_equal_p (addr, insn))
- return true;
- break;
- case REG:
- CASE_CONST_ANY:
- case SYMBOL_REF:
- case CODE_LABEL:
- case PC:
- case CC0:
- case EXPR_LIST:
- return false;
- default:
- break;
- }
-
- format_ptr = GET_RTX_FORMAT (code);
- for (i = 0; i < GET_RTX_LENGTH (code); i++)
- {
- switch (*format_ptr++)
- {
- case 'e':
- if (exact_dependency_1 (addr, XEXP (insn, i)))
- return true;
- break;
- case 'E':
- for (j = 0; j < XVECLEN (insn, i); j++)
- if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
- return true;
- break;
- }
- }
- return false;
-}
-
-/* Return true if there is an exact dependency between a store and a load,
- i.e. the same memory address is used in both. */
-static bool
-exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
-{
- rtx set1, set2;
-
- set1 = single_set (store);
- if (!set1)
- return false;
- if (!MEM_P (SET_DEST (set1)))
- return false;
- set2 = single_set (load);
- if (!set2)
- return false;
- if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
- return true;
- return false;
-}
-
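exact_dependency_1 above is a textbook recursive RTX walk: stop at leaves that cannot contain a MEM, match MEM leaves against the store address, and recurse through 'e' and 'E' operands. A toy version over an invented tree type, just to show the shape of the recursion:

#include <stdbool.h>
#include <string.h>

/* Toy expression node standing in for an RTX: a MEM leaf carries an
   address string, inner nodes carry up to two children.  Invented for
   illustration only.  */
struct node
{
  const char *mem_addr;        /* non-NULL for a MEM leaf */
  const struct node *kid[2];
};

/* Mirror of exact_dependency_1: recursively search for a MEM whose
   address equals ADDR anywhere under N.  */
static bool
addr_found_in (const char *addr, const struct node *n)
{
  if (!n)
    return false;
  if (n->mem_addr && strcmp (n->mem_addr, addr) == 0)
    return true;
  return addr_found_in (addr, n->kid[0]) || addr_found_in (addr, n->kid[1]);
}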
-static int
-ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
- unsigned int)
-{
- enum attr_type insn_type, dep_insn_type;
- enum attr_memory memory;
- rtx set, set2;
- int dep_insn_code_number;
-
- /* Anti and output dependencies have zero cost on all CPUs. */
- if (dep_type != 0)
- return 0;
-
- dep_insn_code_number = recog_memoized (dep_insn);
-
- /* If we can't recognize the insns, we can't really do anything. */
- if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
- return cost;
-
- insn_type = get_attr_type (insn);
- dep_insn_type = get_attr_type (dep_insn);
-
- switch (ix86_tune)
- {
- case PROCESSOR_PENTIUM:
- case PROCESSOR_LAKEMONT:
- /* Address Generation Interlock adds a cycle of latency. */
- if (insn_type == TYPE_LEA)
- {
- rtx addr = PATTERN (insn);
-
- if (GET_CODE (addr) == PARALLEL)
- addr = XVECEXP (addr, 0, 0);
-
- gcc_assert (GET_CODE (addr) == SET);
-
- addr = SET_SRC (addr);
- if (modified_in_p (addr, dep_insn))
- cost += 1;
- }
- else if (ix86_agi_dependent (dep_insn, insn))
- cost += 1;
-
- /* ??? Compares pair with jump/setcc. */
- if (ix86_flags_dependent (insn, dep_insn, insn_type))
- cost = 0;
-
- /* Floating point stores require value to be ready one cycle earlier. */
- if (insn_type == TYPE_FMOV
- && get_attr_memory (insn) == MEMORY_STORE
- && !ix86_agi_dependent (dep_insn, insn))
- cost += 1;
- break;
-
- case PROCESSOR_PENTIUMPRO:
- /* INT->FP conversion is expensive. */
- if (get_attr_fp_int_src (dep_insn))
- cost += 5;
-
- /* There is one cycle extra latency between an FP op and a store. */
- if (insn_type == TYPE_FMOV
- && (set = single_set (dep_insn)) != NULL_RTX
- && (set2 = single_set (insn)) != NULL_RTX
- && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
- && MEM_P (SET_DEST (set2)))
- cost += 1;
-
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- /* Claim moves to take one cycle, as the core can issue one load
- at a time and the next load can start a cycle later. */
- if (dep_insn_type == TYPE_IMOV
- || dep_insn_type == TYPE_FMOV)
- cost = 1;
- else if (cost > 1)
- cost--;
- }
- break;
-
- case PROCESSOR_K6:
- /* The esp dependency is resolved before
- the instruction is really finished. */
- if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
- && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- return 1;
-
- /* INT->FP conversion is expensive. */
- if (get_attr_fp_int_src (dep_insn))
- cost += 5;
-
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- /* Claim moves to take one cycle, as the core can issue one load
- at a time and the next load can start a cycle later. */
- if (dep_insn_type == TYPE_IMOV
- || dep_insn_type == TYPE_FMOV)
- cost = 1;
- else if (cost > 2)
- cost -= 2;
- else
- cost = 1;
- }
- break;
-
- case PROCESSOR_AMDFAM10:
- case PROCESSOR_BDVER1:
- case PROCESSOR_BDVER2:
- case PROCESSOR_BDVER3:
- case PROCESSOR_BDVER4:
- case PROCESSOR_ZNVER1:
- case PROCESSOR_BTVER1:
- case PROCESSOR_BTVER2:
- case PROCESSOR_GENERIC:
- /* The stack engine executes push and pop instructions in parallel. */
- if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
- && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- return 0;
- /* FALLTHRU */
-
- case PROCESSOR_ATHLON:
- case PROCESSOR_K8:
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- enum attr_unit unit = get_attr_unit (insn);
- int loadcost = 3;
-
- /* Because of the difference between the length of integer and
- floating unit pipeline preparation stages, the memory operands
- for floating point are cheaper.
-
- ??? For Athlon the difference is most probably 2. */
- if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
- loadcost = 3;
- else
- loadcost = TARGET_ATHLON ? 2 : 0;
-
- if (cost >= loadcost)
- cost -= loadcost;
- else
- cost = 0;
- }
- break;
-
- case PROCESSOR_CORE2:
- case PROCESSOR_NEHALEM:
- case PROCESSOR_SANDYBRIDGE:
- case PROCESSOR_HASWELL:
- /* The stack engine executes push and pop instructions in parallel. */
- if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
- && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- return 0;
-
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- if (cost >= 4)
- cost -= 4;
- else
- cost = 0;
- }
- break;
-
- case PROCESSOR_SILVERMONT:
- case PROCESSOR_KNL:
- case PROCESSOR_KNM:
- case PROCESSOR_INTEL:
- if (!reload_completed)
- return cost;
-
- /* Increase cost of integer loads. */
- memory = get_attr_memory (dep_insn);
- if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- {
- enum attr_unit unit = get_attr_unit (dep_insn);
- if (unit == UNIT_INTEGER && cost == 1)
- {
- if (memory == MEMORY_LOAD)
- cost = 3;
- else
- {
- /* Increase cost of ld/st for short int types only
- because of store forwarding issue. */
- rtx set = single_set (dep_insn);
- if (set && (GET_MODE (SET_DEST (set)) == QImode
- || GET_MODE (SET_DEST (set)) == HImode))
- {
- /* Increase cost of the store/load pair if an exact
- dependence exists and INSN is a load insn. */
- enum attr_memory insn_memory = get_attr_memory (insn);
- if (insn_memory == MEMORY_LOAD
- && exact_store_load_dependency (dep_insn, insn))
- cost = 3;
- }
- }
- }
- }
-
- default:
- break;
- }
-
- return cost;
-}
-
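Two of the rules in ix86_adjust_cost are worth restating in isolation: anti and output dependencies are free on every CPU, and on tunings with a stack engine a dependent push/pop pair is free as well. A condensed model, with invented enums standing in for the insn attributes:

enum dep  { DEP_TRUE, DEP_ANTI, DEP_OUTPUT };
enum type { T_PUSH, T_POP, T_OTHER };

/* Condensed model of the early exits in ix86_adjust_cost.  */
static int
adjust_cost (enum dep d, enum type insn, enum type dep_insn,
             int cost, int has_stack_engine)
{
  if (d != DEP_TRUE)
    return 0;   /* Anti and output dependencies are free on all CPUs.  */
  if (has_stack_engine
      && (insn == T_PUSH || insn == T_POP)
      && (dep_insn == T_PUSH || dep_insn == T_POP))
    return 0;   /* The stack engine resolves esp updates internally.  */
  return cost;
}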
-/* How many alternative schedules to try. This should be as wide as the
- scheduling freedom in the DFA, but no wider. Making this value too
- large results in extra work for the scheduler. */
-
-static int
-ia32_multipass_dfa_lookahead (void)
-{
- /* Generally, we want haifa-sched:max_issue() to look ahead as far
- as many instructions can be executed on a cycle, i.e.,
- issue_rate. */
- if (reload_completed)
- return ix86_issue_rate ();
- /* Don't use lookahead for pre-reload schedule to save compile time. */
- return 0;
-}
-
-/* Return true if target platform supports macro-fusion. */
-
-static bool
-ix86_macro_fusion_p ()
-{
- return TARGET_FUSE_CMP_AND_BRANCH;
-}
-
-/* Check whether the current microarchitecture supports macro fusion
- for the insn pair "CONDGEN + CONDJMP". Refer to the
- "Intel Architectures Optimization Reference Manual". */
-
-static bool
-ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
-{
- rtx src, dest;
- enum rtx_code ccode;
- rtx compare_set = NULL_RTX, test_if, cond;
- rtx alu_set = NULL_RTX, addr = NULL_RTX;
-
- if (!any_condjump_p (condjmp))
- return false;
-
- unsigned int condreg1, condreg2;
- rtx cc_reg_1;
- ix86_fixed_condition_code_regs (&condreg1, &condreg2);
- cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
- if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
- || !condgen
- || !modified_in_p (cc_reg_1, condgen))
- return false;
-
- if (get_attr_type (condgen) != TYPE_TEST
- && get_attr_type (condgen) != TYPE_ICMP
- && get_attr_type (condgen) != TYPE_INCDEC
- && get_attr_type (condgen) != TYPE_ALU)
- return false;
-
- compare_set = single_set (condgen);
- if (compare_set == NULL_RTX
- && !TARGET_FUSE_ALU_AND_BRANCH)
- return false;
-
- if (compare_set == NULL_RTX)
- {
- int i;
- rtx pat = PATTERN (condgen);
- for (i = 0; i < XVECLEN (pat, 0); i++)
- if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
- {
- rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
- if (GET_CODE (set_src) == COMPARE)
- compare_set = XVECEXP (pat, 0, i);
- else
- alu_set = XVECEXP (pat, 0, i);
- }
- }
- if (compare_set == NULL_RTX)
- return false;
- src = SET_SRC (compare_set);
- if (GET_CODE (src) != COMPARE)
- return false;
-
- /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
- supported. */
- if ((MEM_P (XEXP (src, 0))
- && CONST_INT_P (XEXP (src, 1)))
- || (MEM_P (XEXP (src, 1))
- && CONST_INT_P (XEXP (src, 0))))
- return false;
-
- /* No fusion for RIP-relative address. */
- if (MEM_P (XEXP (src, 0)))
- addr = XEXP (XEXP (src, 0), 0);
- else if (MEM_P (XEXP (src, 1)))
- addr = XEXP (XEXP (src, 1), 0);
-
- if (addr)
- {
- ix86_address parts;
- int ok = ix86_decompose_address (addr, &parts);
- gcc_assert (ok);
-
- if (rip_relative_addr_p (&parts))
- return false;
- }
-
- test_if = SET_SRC (pc_set (condjmp));
- cond = XEXP (test_if, 0);
- ccode = GET_CODE (cond);
- /* Check whether the conditional jump uses the Sign or Overflow flags. */
- if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
- && (ccode == GE
- || ccode == GT
- || ccode == LE
- || ccode == LT))
- return false;
-
- /* Return true for TYPE_TEST and TYPE_ICMP. */
- if (get_attr_type (condgen) == TYPE_TEST
- || get_attr_type (condgen) == TYPE_ICMP)
- return true;
-
- /* The following handles the macro-fusion case for alu + jmp. */
- if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
- return false;
-
- /* No fusion for alu op with memory destination operand. */
- dest = SET_DEST (alu_set);
- if (MEM_P (dest))
- return false;
-
- /* Macro-fusion for inc/dec + unsigned conditional jump is not
- supported. */
- if (get_attr_type (condgen) == TYPE_INCDEC
- && (ccode == GEU
- || ccode == GTU
- || ccode == LEU
- || ccode == LTU))
- return false;
-
- return true;
-}
-
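Stripped of the operand-shape checks (MEM+immediate compares, RIP-relative addresses, memory destinations), the fusion rules above boil down to a small decision table: cmp/test always fuse, other alu ops fuse only with TARGET_FUSE_ALU_AND_BRANCH, inc/dec never fuse with unsigned conditions, and jumps reading the sign/overflow flags need TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS. A boiled-down sketch with invented enums:

#include <stdbool.h>

enum gen  { G_TEST, G_CMP, G_INCDEC, G_ALU };
enum cond { C_EQ, C_SIGNED, C_UNSIGNED };  /* je/jne; jl/jg/...; jb/ja/... */

/* Boiled-down fusion table from ix86_macro_fusion_pair_p; the operand
   checks are omitted.  */
static bool
fuses (enum gen g, enum cond c, bool fuse_soflags, bool fuse_alu)
{
  if (c == C_SIGNED && !fuse_soflags)
    return false;                 /* Jump reads sign/overflow flags.  */
  if (g == G_TEST || g == G_CMP)
    return true;
  if (!fuse_alu)
    return false;
  if (g == G_INCDEC && c == C_UNSIGNED)
    return false;                 /* inc/dec leave CF untouched.  */
  return true;
}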
-/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
- execution. It is applied if
- (1) an IMUL instruction is at the top of the list;
- (2) the ready list contains exactly one producer of an independent IMUL
- instruction.
- Return the index of the IMUL producer if it was found and -1 otherwise. */
-static int
-do_reorder_for_imul (rtx_insn **ready, int n_ready)
-{
- rtx_insn *insn;
- rtx set, insn1, insn2;
- sd_iterator_def sd_it;
- dep_t dep;
- int index = -1;
- int i;
-
- if (!TARGET_BONNELL)
- return index;
-
- /* Check that IMUL instruction is on the top of ready list. */
- insn = ready[n_ready - 1];
- set = single_set (insn);
- if (!set)
- return index;
- if (!(GET_CODE (SET_SRC (set)) == MULT
- && GET_MODE (SET_SRC (set)) == SImode))
- return index;
-
- /* Search for producer of independent IMUL instruction. */
- for (i = n_ready - 2; i >= 0; i--)
- {
- insn = ready[i];
- if (!NONDEBUG_INSN_P (insn))
- continue;
- /* Skip IMUL instruction. */
- insn2 = PATTERN (insn);
- if (GET_CODE (insn2) == PARALLEL)
- insn2 = XVECEXP (insn2, 0, 0);
- if (GET_CODE (insn2) == SET
- && GET_CODE (SET_SRC (insn2)) == MULT
- && GET_MODE (SET_SRC (insn2)) == SImode)
- continue;
-
- FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
- {
- rtx con;
- con = DEP_CON (dep);
- if (!NONDEBUG_INSN_P (con))
- continue;
- insn1 = PATTERN (con);
- if (GET_CODE (insn1) == PARALLEL)
- insn1 = XVECEXP (insn1, 0, 0);
-
- if (GET_CODE (insn1) == SET
- && GET_CODE (SET_SRC (insn1)) == MULT
- && GET_MODE (SET_SRC (insn1)) == SImode)
- {
- sd_iterator_def sd_it1;
- dep_t dep1;
- /* Check if there is no other dependee for IMUL. */
- index = i;
- FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
- {
- rtx pro;
- pro = DEP_PRO (dep1);
- if (!NONDEBUG_INSN_P (pro))
- continue;
- if (pro != insn)
- index = -1;
- }
- if (index >= 0)
- break;
- }
- }
- if (index >= 0)
- break;
- }
- return index;
-}
-
-/* Try to find the best candidate at the top of the ready list when two insns
- have the same priority - the candidate whose dependees were scheduled
- earlier is best. Applied to Silvermont only.
- Return true if the top 2 insns must be interchanged. */
-static bool
-swap_top_of_ready_list (rtx_insn **ready, int n_ready)
-{
- rtx_insn *top = ready[n_ready - 1];
- rtx_insn *next = ready[n_ready - 2];
- rtx set;
- sd_iterator_def sd_it;
- dep_t dep;
- int clock1 = -1;
- int clock2 = -1;
- #define INSN_TICK(INSN) (HID (INSN)->tick)
-
- if (!TARGET_SILVERMONT && !TARGET_INTEL)
- return false;
-
- if (!NONDEBUG_INSN_P (top))
- return false;
- if (!NONJUMP_INSN_P (top))
- return false;
- if (!NONDEBUG_INSN_P (next))
- return false;
- if (!NONJUMP_INSN_P (next))
- return false;
- set = single_set (top);
- if (!set)
- return false;
- set = single_set (next);
- if (!set)
- return false;
-
- if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
- {
- if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
- return false;
- /* Determine the winner more precisely. */
- FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
- {
- rtx pro;
- pro = DEP_PRO (dep);
- if (!NONDEBUG_INSN_P (pro))
- continue;
- if (INSN_TICK (pro) > clock1)
- clock1 = INSN_TICK (pro);
- }
- FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
- {
- rtx pro;
- pro = DEP_PRO (dep);
- if (!NONDEBUG_INSN_P (pro))
- continue;
- if (INSN_TICK (pro) > clock2)
- clock2 = INSN_TICK (pro);
- }
-
- if (clock1 == clock2)
- {
- /* Determine the winner - the load must win. */
- enum attr_memory memory1, memory2;
- memory1 = get_attr_memory (top);
- memory2 = get_attr_memory (next);
- if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
- return true;
- }
- return (bool) (clock2 < clock1);
- }
- return false;
- #undef INSN_TICK
-}
-
-/* Perform possible reordering of the ready list, for Atom/Silvermont only.
- Return the issue rate. */
-static int
-ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
- int *pn_ready, int clock_var)
-{
- int issue_rate = -1;
- int n_ready = *pn_ready;
- int i;
- rtx_insn *insn;
- int index = -1;
-
- /* Set up issue rate. */
- issue_rate = ix86_issue_rate ();
-
- /* Do reordering for BONNELL/SILVERMONT only. */
- if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
- return issue_rate;
-
- /* Nothing to do if ready list contains only 1 instruction. */
- if (n_ready <= 1)
- return issue_rate;
-
- /* Do reordering for the post-reload scheduler only. */
- if (!reload_completed)
- return issue_rate;
-
- if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
- {
- if (sched_verbose > 1)
- fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
- INSN_UID (ready[index]));
-
- /* Put IMUL producer (ready[index]) at the top of ready list. */
- insn = ready[index];
- for (i = index; i < n_ready - 1; i++)
- ready[i] = ready[i + 1];
- ready[n_ready - 1] = insn;
- return issue_rate;
- }
-
- /* Skip selective scheduling since HID is not populated in it. */
- if (clock_var != 0
- && !sel_sched_p ()
- && swap_top_of_ready_list (ready, n_ready))
- {
- if (sched_verbose > 1)
- fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
- INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
- /* Swap 2 top elements of ready list. */
- insn = ready[n_ready - 1];
- ready[n_ready - 1] = ready[n_ready - 2];
- ready[n_ready - 2] = insn;
- }
- return issue_rate;
-}
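Both list manipulations in ix86_sched_reorder are plain array moves; the IMUL case is a rotate-to-top, shown standalone below (the Silvermont case is just a swap of the top two elements):

/* Move READY[INDEX] to the top slot (N_READY - 1), shifting the entries
   above it down by one -- the loop used for the IMUL producer.  */
static void
rotate_to_top (void **ready, int n_ready, int index)
{
  void *insn = ready[index];
  for (int i = index; i < n_ready - 1; i++)
    ready[i] = ready[i + 1];
  ready[n_ready - 1] = insn;
}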
static bool
ix86_class_likely_spilled_p (reg_class_t);
return priority;
}
-/* Model the decoder of Core 2/i7.
- The hooks below, used by multipass scheduling (see haifa-sched.c:max_issue),
- track instruction fetch block boundaries and make sure that long
- (9+ byte) instructions are assigned to decoder D0. */
-
-/* Maximum length of an insn that can be handled by
- a secondary decoder unit. '8' for Core 2/i7. */
-static int core2i7_secondary_decoder_max_insn_size;
-
-/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
- '16' for Core 2/i7. */
-static int core2i7_ifetch_block_size;
-
-/* Maximum number of instructions decoder can handle per cycle.
- '6' for Core 2/i7. */
-static int core2i7_ifetch_block_max_insns;
-
-typedef struct ix86_first_cycle_multipass_data_ *
- ix86_first_cycle_multipass_data_t;
-typedef const struct ix86_first_cycle_multipass_data_ *
- const_ix86_first_cycle_multipass_data_t;
-
-/* A variable to store target state across calls to max_issue within
- one cycle. */
-static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
- *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
-
-/* Initialize DATA. */
-static void
-core2i7_first_cycle_multipass_init (void *_data)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
-
- data->ifetch_block_len = 0;
- data->ifetch_block_n_insns = 0;
- data->ready_try_change = NULL;
- data->ready_try_change_size = 0;
-}
-
-/* Advancing the cycle; reset ifetch block counts. */
-static void
-core2i7_dfa_post_advance_cycle (void)
-{
- ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
-
- gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
-
- data->ifetch_block_len = 0;
- data->ifetch_block_n_insns = 0;
-}
-
-static int min_insn_size (rtx_insn *);
-
-/* Filter out insns from ready_try that the core will not be able to issue
- on current cycle due to decoder. */
-static void
-core2i7_first_cycle_multipass_filter_ready_try
-(const_ix86_first_cycle_multipass_data_t data,
- signed char *ready_try, int n_ready, bool first_cycle_insn_p)
-{
- while (n_ready--)
- {
- rtx_insn *insn;
- int insn_size;
-
- if (ready_try[n_ready])
- continue;
-
- insn = get_ready_element (n_ready);
- insn_size = min_insn_size (insn);
-
- if (/* If this insn is too long for a secondary decoder ... */
- (!first_cycle_insn_p
- && insn_size > core2i7_secondary_decoder_max_insn_size)
- /* ... or it would not fit into the ifetch block ... */
- || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
- /* ... or the decoder is full already ... */
- || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
- /* ... mask the insn out. */
- {
- ready_try[n_ready] = 1;
-
- if (data->ready_try_change)
- bitmap_set_bit (data->ready_try_change, n_ready);
- }
- }
-}
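
A minimal standalone model of the three constraints applied above, using the Core 2/i7 parameters set later in ix86_sched_init_global (8-byte secondary decoders, 16-byte ifetch block, 6 insns per cycle); this is a sketch, not GCC code:

#include <assert.h>
#include <stdbool.h>

/* Decide whether an insn of INSN_SIZE bytes can still be issued this
   cycle given the decoder state.  */
static bool
decoder_can_issue (int block_len, int block_n_insns, int insn_size,
                   bool first_cycle_insn_p)
{
  const int secondary_max = 8;	/* core2i7_secondary_decoder_max_insn_size */
  const int ifetch_size = 16;	/* core2i7_ifetch_block_size */
  const int max_insns = 6;	/* core2i7_ifetch_block_max_insns */

  if (!first_cycle_insn_p && insn_size > secondary_max)
    return false;		/* Only decoder D0 takes 9+ byte insns.  */
  if (block_len + insn_size > ifetch_size)
    return false;		/* Would overflow the ifetch block.  */
  if (block_n_insns + 1 > max_insns)
    return false;		/* Decoder already at its per-cycle limit.  */
  return true;
}

int
main (void)
{
  assert (decoder_can_issue (0, 0, 9, true));	/* D0 takes a long insn.  */
  assert (!decoder_can_issue (0, 0, 9, false));	/* Secondary decoders do not.  */
  assert (!decoder_can_issue (12, 2, 5, true));	/* 12 + 5 > 16 bytes.  */
  assert (!decoder_can_issue (6, 6, 2, true));	/* 6 insns already decoded.  */
  return 0;
}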
-
-/* Prepare for a new round of multipass lookahead scheduling. */
-static void
-core2i7_first_cycle_multipass_begin (void *_data,
- signed char *ready_try, int n_ready,
- bool first_cycle_insn_p)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
- const_ix86_first_cycle_multipass_data_t prev_data
- = ix86_first_cycle_multipass_data;
-
- /* Restore the state from the end of the previous round. */
- data->ifetch_block_len = prev_data->ifetch_block_len;
- data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
-
- /* Filter instructions that cannot be issued on current cycle due to
- decoder restrictions. */
- core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
- first_cycle_insn_p);
-}
-
-/* INSN is being issued in current solution. Account for its impact on
- the decoder model. */
-static void
-core2i7_first_cycle_multipass_issue (void *_data,
- signed char *ready_try, int n_ready,
- rtx_insn *insn, const void *_prev_data)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
- const_ix86_first_cycle_multipass_data_t prev_data
- = (const_ix86_first_cycle_multipass_data_t) _prev_data;
-
- int insn_size = min_insn_size (insn);
-
- data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
- data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
- gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
- && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
-
- /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
- if (!data->ready_try_change)
- {
- data->ready_try_change = sbitmap_alloc (n_ready);
- data->ready_try_change_size = n_ready;
- }
- else if (data->ready_try_change_size < n_ready)
- {
- data->ready_try_change = sbitmap_resize (data->ready_try_change,
- n_ready, 0);
- data->ready_try_change_size = n_ready;
- }
- bitmap_clear (data->ready_try_change);
-
- /* Filter out insns from ready_try that the core will not be able to issue
- on current cycle due to decoder. */
- core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
- false);
-}
-
-/* Revert the effect on ready_try. */
-static void
-core2i7_first_cycle_multipass_backtrack (const void *_data,
- signed char *ready_try,
- int n_ready ATTRIBUTE_UNUSED)
-{
- const_ix86_first_cycle_multipass_data_t data
- = (const_ix86_first_cycle_multipass_data_t) _data;
- unsigned int i = 0;
- sbitmap_iterator sbi;
-
- gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
- EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
- {
- ready_try[i] = 0;
- }
-}
-
-/* Save the result of multipass lookahead scheduling for the next round. */
-static void
-core2i7_first_cycle_multipass_end (const void *_data)
-{
- const_ix86_first_cycle_multipass_data_t data
- = (const_ix86_first_cycle_multipass_data_t) _data;
- ix86_first_cycle_multipass_data_t next_data
- = ix86_first_cycle_multipass_data;
-
- if (data != NULL)
- {
- next_data->ifetch_block_len = data->ifetch_block_len;
- next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
- }
-}
-
-/* Deallocate target data. */
-static void
-core2i7_first_cycle_multipass_fini (void *_data)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
-
- if (data->ready_try_change)
- {
- sbitmap_free (data->ready_try_change);
- data->ready_try_change = NULL;
- data->ready_try_change_size = 0;
- }
-}
-
/* Prepare for scheduling pass. */
static void
ix86_sched_init_global (FILE *, int, int)
to save compile time. */
if (reload_completed)
{
- targetm.sched.dfa_post_advance_cycle
- = core2i7_dfa_post_advance_cycle;
- targetm.sched.first_cycle_multipass_init
- = core2i7_first_cycle_multipass_init;
- targetm.sched.first_cycle_multipass_begin
- = core2i7_first_cycle_multipass_begin;
- targetm.sched.first_cycle_multipass_issue
- = core2i7_first_cycle_multipass_issue;
- targetm.sched.first_cycle_multipass_backtrack
- = core2i7_first_cycle_multipass_backtrack;
- targetm.sched.first_cycle_multipass_end
- = core2i7_first_cycle_multipass_end;
- targetm.sched.first_cycle_multipass_fini
- = core2i7_first_cycle_multipass_fini;
-
- /* Set decoder parameters. */
- core2i7_secondary_decoder_max_insn_size = 8;
- core2i7_ifetch_block_size = 16;
- core2i7_ifetch_block_max_insns = 6;
+ ix86_core2i7_init_hooks ();
break;
}
/* Fall through. */
address sizes. This is enough to eliminate unnecessary padding in
99% of cases. */
-static int
-min_insn_size (rtx_insn *insn)
+int
+ix86_min_insn_size (rtx_insn *insn)
{
int l = 0, len;
njumps--, isjump = true;
else
isjump = false;
- nbytes -= min_insn_size (start);
+ nbytes -= ix86_min_insn_size (start);
}
}
continue;
}
- min_size = min_insn_size (insn);
+ min_size = ix86_min_insn_size (insn);
nbytes += min_size;
if (dump_file)
fprintf (dump_file, "Insn %i estimated to %i bytes\n",
njumps--, isjump = true;
else
isjump = false;
- nbytes -= min_insn_size (start);
+ nbytes -= ix86_min_insn_size (start);
}
gcc_assert (njumps >= 0);
if (dump_file)
if (njumps == 3 && isjump && nbytes < 16)
{
- int padsize = 15 - nbytes + min_insn_size (insn);
+ int padsize = 15 - nbytes + ix86_min_insn_size (insn);
if (dump_file)
fprintf (dump_file, "Padding insn %i by %i bytes!\n",
}
#undef TARGET_SCHED_DISPATCH
-#define TARGET_SCHED_DISPATCH has_dispatch
+#define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
#undef TARGET_SCHED_DISPATCH_DO
-#define TARGET_SCHED_DISPATCH_DO do_dispatch
+#define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
#undef TARGET_SCHED_REORDER
-#define TARGET_SCHED_REORDER ix86_sched_reorder
+#define TARGET_SCHED_REORDER ix86_atom_sched_reorder
#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
ix86_dependencies_evaluation_hook
-/* The size of the dispatch window is the total number of bytes of
- object code allowed in a window. */
-#define DISPATCH_WINDOW_SIZE 16
-
-/* Number of dispatch windows considered for scheduling. */
-#define MAX_DISPATCH_WINDOWS 3
-
-/* Maximum number of instructions in a window. */
-#define MAX_INSN 4
-
-/* Maximum number of immediate operands in a window. */
-#define MAX_IMM 4
-
-/* Maximum number of immediate bits allowed in a window. */
-#define MAX_IMM_SIZE 128
-
-/* Maximum number of 32 bit immediates allowed in a window. */
-#define MAX_IMM_32 4
-
-/* Maximum number of 64 bit immediates allowed in a window. */
-#define MAX_IMM_64 2
-
-/* Maximum total of loads or prefetches allowed in a window. */
-#define MAX_LOAD 2
-
-/* Maximum total of stores allowed in a window. */
-#define MAX_STORE 1
-
-#undef BIG
-#define BIG 100
-
-
-/* Dispatch groups. Instructions that affect the mix in a dispatch window. */
-enum dispatch_group {
- disp_no_group = 0,
- disp_load,
- disp_store,
- disp_load_store,
- disp_prefetch,
- disp_imm,
- disp_imm_32,
- disp_imm_64,
- disp_branch,
- disp_cmp,
- disp_jcc,
- disp_last
-};
-
-/* Number of allowable groups in a dispatch window. It is an array
- indexed by dispatch_group enum. 100 is used as a big number,
- because the number of these kinds of operations does not have any
- effect in the dispatch window, but we need them for other reasons in
- the table. */
-static unsigned int num_allowable_groups[disp_last] = {
- 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
-};
-
-char group_name[disp_last + 1][16] = {
- "disp_no_group", "disp_load", "disp_store", "disp_load_store",
- "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
- "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
-};
-
-/* Instruction path. */
-enum insn_path {
- no_path = 0,
- path_single, /* Single micro op. */
- path_double, /* Double micro op. */
- path_multi, /* Instructions with more than 2 micro ops. */
- last_path
-};
-
-/* sched_insn_info describes one entry of a dispatch window: the
- scheduled instruction together with its group, path, byte length
- and immediate size.
-
- Windows are allocated for each basic block and are linked
- together. */
-typedef struct sched_insn_info_s {
- rtx insn;
- enum dispatch_group group;
- enum insn_path path;
- int byte_len;
- int imm_bytes;
-} sched_insn_info;
-
-/* Linked list of dispatch windows. This is a two-way list of
- dispatch windows of a basic block. It contains information about
- the number of uops in the window and the total number of
- instructions and of bytes in the object code for this dispatch
- window. */
-typedef struct dispatch_windows_s {
- int num_insn; /* Number of insn in the window. */
- int num_uops; /* Number of uops in the window. */
- int window_size; /* Number of bytes in the window. */
- int window_num; /* Window number, either 0 or 1. */
- int num_imm; /* Number of immediates in an insn. */
- int num_imm_32; /* Number of 32 bit immediates in an insn. */
- int num_imm_64; /* Number of 64 bit immediates in an insn. */
- int imm_size; /* Total immediates in the window. */
- int num_loads; /* Total memory loads in the window. */
- int num_stores; /* Total memory stores in the window. */
- int violation; /* Violation exists in window. */
- sched_insn_info *window; /* Pointer to the window. */
- struct dispatch_windows_s *next;
- struct dispatch_windows_s *prev;
-} dispatch_windows;
-
-/* Immediate values used in an insn. */
-typedef struct imm_info_s
- {
- int imm;
- int imm32;
- int imm64;
- } imm_info;
-
-static dispatch_windows *dispatch_window_list;
-static dispatch_windows *dispatch_window_list1;
-
-/* Get dispatch group of insn. */
-
-static enum dispatch_group
-get_mem_group (rtx_insn *insn)
-{
- enum attr_memory memory;
-
- if (INSN_CODE (insn) < 0)
- return disp_no_group;
- memory = get_attr_memory (insn);
- if (memory == MEMORY_STORE)
- return disp_store;
-
- if (memory == MEMORY_LOAD)
- return disp_load;
-
- if (memory == MEMORY_BOTH)
- return disp_load_store;
-
- return disp_no_group;
-}
-
-/* Return true if insn is a compare instruction. */
-
-static bool
-is_cmp (rtx_insn *insn)
-{
- enum attr_type type;
-
- type = get_attr_type (insn);
- return (type == TYPE_TEST
- || type == TYPE_ICMP
- || type == TYPE_FCMP
- || GET_CODE (PATTERN (insn)) == COMPARE);
-}
-
-/* Return true if a dispatch violation was encountered. */
-
-static bool
-dispatch_violation (void)
-{
- if (dispatch_window_list->next)
- return dispatch_window_list->next->violation;
- return dispatch_window_list->violation;
-}
-
-/* Return true if insn is a branch instruction. */
-
-static bool
-is_branch (rtx_insn *insn)
-{
- return (CALL_P (insn) || JUMP_P (insn));
-}
-
-/* Return true if insn is a prefetch instruction. */
-
-static bool
-is_prefetch (rtx_insn *insn)
-{
- return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
-}
-
-/* This function initializes a dispatch window and the list container holding a
- pointer to the window. */
-
-static void
-init_window (int window_num)
-{
- int i;
- dispatch_windows *new_list;
-
- if (window_num == 0)
- new_list = dispatch_window_list;
- else
- new_list = dispatch_window_list1;
-
- new_list->num_insn = 0;
- new_list->num_uops = 0;
- new_list->window_size = 0;
- new_list->next = NULL;
- new_list->prev = NULL;
- new_list->window_num = window_num;
- new_list->num_imm = 0;
- new_list->num_imm_32 = 0;
- new_list->num_imm_64 = 0;
- new_list->imm_size = 0;
- new_list->num_loads = 0;
- new_list->num_stores = 0;
- new_list->violation = false;
-
- for (i = 0; i < MAX_INSN; i++)
- {
- new_list->window[i].insn = NULL;
- new_list->window[i].group = disp_no_group;
- new_list->window[i].path = no_path;
- new_list->window[i].byte_len = 0;
- new_list->window[i].imm_bytes = 0;
- }
- return;
-}
-
-/* This function allocates and initializes a dispatch window and the
- list container holding a pointer to the window. */
-
-static dispatch_windows *
-allocate_window (void)
-{
- dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
- new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
-
- return new_list;
-}
-
-/* This routine initializes the dispatch scheduling information. It
- initiates building dispatch scheduler tables and constructs the
- first dispatch window. */
-
-static void
-init_dispatch_sched (void)
-{
- /* Allocate a dispatch list and a window. */
- dispatch_window_list = allocate_window ();
- dispatch_window_list1 = allocate_window ();
- init_window (0);
- init_window (1);
-}
-
-/* This function returns true if a branch is detected. End of a basic block
- does not have to be a branch, but here we assume only branches end a
- window. */
-
-static bool
-is_end_basic_block (enum dispatch_group group)
-{
- return group == disp_branch;
-}
-
-/* This function is called when the end of a window processing is reached. */
-
-static void
-process_end_window (void)
-{
- gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
- if (dispatch_window_list->next)
- {
- gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
- gcc_assert (dispatch_window_list->window_size
- + dispatch_window_list1->window_size <= 48);
- init_window (1);
- }
- init_window (0);
-}
-
-/* Allocates a new dispatch window and adds it to WINDOW_LIST.
- WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
- for 48 bytes of instructions. Note that these windows are not dispatch
- windows whose sizes are DISPATCH_WINDOW_SIZE. */
-
-static dispatch_windows *
-allocate_next_window (int window_num)
-{
- if (window_num == 0)
- {
- if (dispatch_window_list->next)
- init_window (1);
- init_window (0);
- return dispatch_window_list;
- }
-
- dispatch_window_list->next = dispatch_window_list1;
- dispatch_window_list1->prev = dispatch_window_list;
-
- return dispatch_window_list1;
-}
-
-/* Compute number of immediate operands of an instruction. */
-
-static void
-find_constant (rtx in_rtx, imm_info *imm_values)
-{
- if (INSN_P (in_rtx))
- in_rtx = PATTERN (in_rtx);
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
- if (const_rtx x = *iter)
- switch (GET_CODE (x))
- {
- case CONST:
- case SYMBOL_REF:
- case CONST_INT:
- (imm_values->imm)++;
- if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
- (imm_values->imm32)++;
- else
- (imm_values->imm64)++;
- break;
-
- case CONST_DOUBLE:
- case CONST_WIDE_INT:
- (imm_values->imm)++;
- (imm_values->imm64)++;
- break;
-
- case CODE_LABEL:
- if (LABEL_KIND (x) == LABEL_NORMAL)
- {
- (imm_values->imm)++;
- (imm_values->imm32)++;
- }
- break;
-
- default:
- break;
- }
-}
-
-/* Return the total size of the immediate operands of an instruction along
- with the number of corresponding immediate operands. It initializes its
- parameters to zero before calling FIND_CONSTANT.
- INSN is the input instruction. IMM is the total of immediates.
- IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
- bit immediates. */
-
-static int
-get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
-{
- imm_info imm_values = {0, 0, 0};
-
- find_constant (insn, &imm_values);
- *imm = imm_values.imm;
- *imm32 = imm_values.imm32;
- *imm64 = imm_values.imm64;
- return imm_values.imm32 * 4 + imm_values.imm64 * 8;
-}
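
The returned size is in bytes: each 32-bit immediate contributes 4 and each 64-bit immediate 8. A trivial standalone check of that arithmetic (illustrative only):

#include <assert.h>

int
main (void)
{
  int imm32 = 1, imm64 = 1;
  /* One 32-bit plus one 64-bit immediate: 1*4 + 1*8 = 12 bytes.  */
  assert (imm32 * 4 + imm64 * 8 == 12);
  return 0;
}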
-
-/* This function indicates whether an instruction has any immediate
- operand. */
-
-static bool
-has_immediate (rtx_insn *insn)
-{
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (insn)
- return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
- return false;
-}
-
-/* Return single or double path for instructions. */
-
-static enum insn_path
-get_insn_path (rtx_insn *insn)
-{
- enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
-
- if ((int)path == 0)
- return path_single;
-
- if ((int)path == 1)
- return path_double;
-
- return path_multi;
-}
-
-/* Return insn dispatch group. */
-
-static enum dispatch_group
-get_insn_group (rtx_insn *insn)
-{
- enum dispatch_group group = get_mem_group (insn);
- if (group)
- return group;
-
- if (is_branch (insn))
- return disp_branch;
-
- if (is_cmp (insn))
- return disp_cmp;
-
- if (has_immediate (insn))
- return disp_imm;
-
- if (is_prefetch (insn))
- return disp_prefetch;
-
- return disp_no_group;
-}
-
-/* Count number of GROUP restricted instructions in a dispatch
- window WINDOW_LIST. */
-
-static int
-count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
-{
- enum dispatch_group group = get_insn_group (insn);
- int imm_size;
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (group == disp_no_group)
- return 0;
-
- if (group == disp_imm)
- {
- imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
- if (window_list->imm_size + imm_size > MAX_IMM_SIZE
- || num_imm_operand + window_list->num_imm > MAX_IMM
- || (num_imm32_operand > 0
- && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
- || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
- || (num_imm64_operand > 0
- && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
- || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
- || (window_list->imm_size + imm_size == MAX_IMM_SIZE
- && num_imm64_operand > 0
- && ((window_list->num_imm_64 > 0
- && window_list->num_insn >= 2)
- || window_list->num_insn >= 3)))
- return BIG;
-
- return 1;
- }
-
- if ((group == disp_load_store
- && (window_list->num_loads >= MAX_LOAD
- || window_list->num_stores >= MAX_STORE))
- || ((group == disp_load
- || group == disp_prefetch)
- && window_list->num_loads >= MAX_LOAD)
- || (group == disp_store
- && window_list->num_stores >= MAX_STORE))
- return BIG;
-
- return 1;
-}
-
-/* This function returns true if insn satisfies dispatch rules on the
- last window scheduled. */
-
-static bool
-fits_dispatch_window (rtx_insn *insn)
-{
- dispatch_windows *window_list = dispatch_window_list;
- dispatch_windows *window_list_next = dispatch_window_list->next;
- unsigned int num_restrict;
- enum dispatch_group group = get_insn_group (insn);
- enum insn_path path = get_insn_path (insn);
- int sum;
-
- /* Make disp_cmp and disp_jcc get scheduled at the latest. These
- instructions should be given the lowest priority in the
- scheduling process in the Haifa scheduler to make sure they will be
- scheduled in the same dispatch window as the reference to them. */
- if (group == disp_jcc || group == disp_cmp)
- return false;
-
- /* Check nonrestricted. */
- if (group == disp_no_group || group == disp_branch)
- return true;
-
- /* Get last dispatch window. */
- if (window_list_next)
- window_list = window_list_next;
-
- if (window_list->window_num == 1)
- {
- sum = window_list->prev->window_size + window_list->window_size;
-
- if (sum == 32
- || (min_insn_size (insn) + sum) >= 48)
- /* Window 1 is full. Go for next window. */
- return true;
- }
-
- num_restrict = count_num_restricted (insn, window_list);
-
- if (num_restrict > num_allowable_groups[group])
- return false;
-
- /* See if it fits in the first window. */
- if (window_list->window_num == 0)
- {
- /* The first window should have only single and double path
- uops. */
- if (path == path_double
- && (window_list->num_uops + 2) > MAX_INSN)
- return false;
- else if (path != path_single)
- return false;
- }
- return true;
-}
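
The allowance test above works because count_num_restricted returns BIG (100) for any insn that would break a per-window limit, and that can never pass the num_allowable_groups comparison. A standalone illustration, using the disp_imm allowance of 4 from the table above (a sketch, not GCC code):

#include <assert.h>

#define BIG 100

int
main (void)
{
  const unsigned int allow_imm = 4;	/* num_allowable_groups[disp_imm] */
  unsigned int ok_insn = 1;		/* Counted normally.  */
  unsigned int violating_insn = BIG;	/* A window limit would be broken.  */

  assert (!(ok_insn > allow_imm));	/* Fits the window.  */
  assert (violating_insn > allow_imm);	/* Rejected.  */
  return 0;
}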
-
-/* Add an instruction INSN with NUM_UOPS micro-operations to the
- dispatch window WINDOW_LIST. */
-
-static void
-add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
-{
- int byte_len = min_insn_size (insn);
- int num_insn = window_list->num_insn;
- int imm_size;
- sched_insn_info *window = window_list->window;
- enum dispatch_group group = get_insn_group (insn);
- enum insn_path path = get_insn_path (insn);
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (!window_list->violation && group != disp_cmp
- && !fits_dispatch_window (insn))
- window_list->violation = true;
-
- imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
-
- /* Initialize window with new instruction. */
- window[num_insn].insn = insn;
- window[num_insn].byte_len = byte_len;
- window[num_insn].group = group;
- window[num_insn].path = path;
- window[num_insn].imm_bytes = imm_size;
-
- window_list->window_size += byte_len;
- window_list->num_insn = num_insn + 1;
- window_list->num_uops = window_list->num_uops + num_uops;
- window_list->imm_size += imm_size;
- window_list->num_imm += num_imm_operand;
- window_list->num_imm_32 += num_imm32_operand;
- window_list->num_imm_64 += num_imm64_operand;
-
- if (group == disp_store)
- window_list->num_stores += 1;
- else if (group == disp_load
- || group == disp_prefetch)
- window_list->num_loads += 1;
- else if (group == disp_load_store)
- {
- window_list->num_stores += 1;
- window_list->num_loads += 1;
- }
-}
-
-/* Adds a scheduled instruction, INSN, to the current dispatch window.
- If the total bytes of instructions or the number of instructions in
- the window exceed the allowable limits, it allocates a new window. */
-
-static void
-add_to_dispatch_window (rtx_insn *insn)
-{
- int byte_len;
- dispatch_windows *window_list;
- dispatch_windows *next_list;
- dispatch_windows *window0_list;
- enum insn_path path;
- enum dispatch_group insn_group;
- bool insn_fits;
- int num_insn;
- int num_uops;
- int window_num;
- int insn_num_uops;
- int sum;
-
- if (INSN_CODE (insn) < 0)
- return;
-
- byte_len = min_insn_size (insn);
- window_list = dispatch_window_list;
- next_list = window_list->next;
- path = get_insn_path (insn);
- insn_group = get_insn_group (insn);
-
- /* Get the last dispatch window. */
- if (next_list)
- window_list = dispatch_window_list->next;
-
- if (path == path_single)
- insn_num_uops = 1;
- else if (path == path_double)
- insn_num_uops = 2;
- else
- insn_num_uops = (int) path;
-
- /* If current window is full, get a new window.
- Window number zero is full if MAX_INSN uops are scheduled in it.
- Window number one is full if window zero's bytes plus window
- one's bytes is 32, or if the bytes of the new instruction added
- to the total make it greater than 48, or if it already has MAX_INSN
- instructions in it. */
- num_insn = window_list->num_insn;
- num_uops = window_list->num_uops;
- window_num = window_list->window_num;
- insn_fits = fits_dispatch_window (insn);
-
- if (num_insn >= MAX_INSN
- || num_uops + insn_num_uops > MAX_INSN
- || !(insn_fits))
- {
- window_num = ~window_num & 1;
- window_list = allocate_next_window (window_num);
- }
-
- if (window_num == 0)
- {
- add_insn_window (insn, window_list, insn_num_uops);
- if (window_list->num_insn >= MAX_INSN
- && insn_group == disp_branch)
- {
- process_end_window ();
- return;
- }
- }
- else if (window_num == 1)
- {
- window0_list = window_list->prev;
- sum = window0_list->window_size + window_list->window_size;
- if (sum == 32
- || (byte_len + sum) >= 48)
- {
- process_end_window ();
- window_list = dispatch_window_list;
- }
-
- add_insn_window (insn, window_list, insn_num_uops);
- }
- else
- gcc_unreachable ();
-
- if (is_end_basic_block (insn_group))
- {
- /* End of basic block is reached; do end-basic-block processing. */
- process_end_window ();
- return;
- }
-}
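
A standalone restatement of the byte budget applied both here and in fits_dispatch_window: the window pair is treated as full at exactly 32 bytes, or as soon as the incoming insn would push the pair to 48 bytes or more (illustrative sketch, not GCC code):

#include <assert.h>
#include <stdbool.h>

static bool
window_pair_full (int w0_bytes, int w1_bytes, int insn_bytes)
{
  int sum = w0_bytes + w1_bytes;
  return sum == 32 || insn_bytes + sum >= 48;
}

int
main (void)
{
  assert (window_pair_full (16, 16, 1));	/* Pair holds exactly 32 bytes.  */
  assert (window_pair_full (16, 25, 7));	/* 7 + 41 >= 48.  */
  assert (!window_pair_full (16, 14, 3));	/* 30 + 3 still leaves room.  */
  return 0;
}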
-
-/* Print the dispatch window, WINDOW_NUM, to FILE. */
-
-DEBUG_FUNCTION static void
-debug_dispatch_window_file (FILE *file, int window_num)
-{
- dispatch_windows *list;
- int i;
-
- if (window_num == 0)
- list = dispatch_window_list;
- else
- list = dispatch_window_list1;
-
- fprintf (file, "Window #%d:\n", list->window_num);
- fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
- list->num_insn, list->num_uops, list->window_size);
- fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
- list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
-
- fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
- list->num_stores);
- fprintf (file, " insn info:\n");
-
- for (i = 0; i < MAX_INSN; i++)
- {
- if (!list->window[i].insn)
- break;
- fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
- i, group_name[list->window[i].group],
- i, (void *)list->window[i].insn,
- i, list->window[i].path,
- i, list->window[i].byte_len,
- i, list->window[i].imm_bytes);
- }
-}
-
-/* Print to stdout a dispatch window. */
-
-DEBUG_FUNCTION void
-debug_dispatch_window (int window_num)
-{
- debug_dispatch_window_file (stdout, window_num);
-}
-
-/* Print INSN dispatch information to FILE. */
-
-DEBUG_FUNCTION static void
-debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
-{
- int byte_len;
- enum insn_path path;
- enum dispatch_group group;
- int imm_size;
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (INSN_CODE (insn) < 0)
- return;
-
- byte_len = min_insn_size (insn);
- path = get_insn_path (insn);
- group = get_insn_group (insn);
- imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
-
- fprintf (file, " insn info:\n");
- fprintf (file, " group = %s, path = %d, byte_len = %d\n",
- group_name[group], path, byte_len);
- fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
- num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
-}
-
-/* Print to STDERR the status of the ready list with respect to
- dispatch windows. */
-
-DEBUG_FUNCTION void
-debug_ready_dispatch (void)
-{
- int i;
- int no_ready = number_in_ready ();
-
- fprintf (stdout, "Number of ready: %d\n", no_ready);
-
- for (i = 0; i < no_ready; i++)
- debug_insn_dispatch_info_file (stdout, get_ready_element (i));
-}
-
-/* This routine is the driver of the dispatch scheduler. */
-
-static void
-do_dispatch (rtx_insn *insn, int mode)
-{
- if (mode == DISPATCH_INIT)
- init_dispatch_sched ();
- else if (mode == ADD_TO_DISPATCH_WINDOW)
- add_to_dispatch_window (insn);
-}
-
-/* Return TRUE if Dispatch Scheduling is supported. */
-
-static bool
-has_dispatch (rtx_insn *insn, int action)
-{
- /* The current implementation of the dispatch scheduler models Bulldozer only. */
- if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
- || TARGET_BDVER4) && flag_dispatch_scheduler)
- switch (action)
- {
- default:
- return false;
-
- case IS_DISPATCH_ON:
- return true;
-
- case IS_CMP:
- return is_cmp (insn);
-
- case DISPATCH_VIOLATION:
- return dispatch_violation ();
-
- case FITS_DISPATCH_WINDOW:
- return fits_dispatch_window (insn);
- }
-
- return false;
-}
/* Implementation of reassociation_width target hook used by
reassoc phase to identify parallelism level in reassociated
$(COMPILE) $<
$(POSTCOMPILE)
+x86-tune-sched.o: $(srcdir)/config/i386/x86-tune-sched.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+x86-tune-sched-bd.o: $(srcdir)/config/i386/x86-tune-sched-bd.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+x86-tune-sched-atom.o: $(srcdir)/config/i386/x86-tune-sched-atom.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+x86-tune-sched-core.o: $(srcdir)/config/i386/x86-tune-sched-core.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
i386.o: i386-builtin-types.inc
i386-builtin-types.inc: s-i386-bt ; @true
--- /dev/null
+
+/* Processor costs (relative to an add) */
+/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
+#define COSTS_N_BYTES(N) ((N) * 2)
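
A quick sanity check of the scale: with COSTS_N_INSNS (N) assumed to be (N) * 4 as the comment above states, a 2-byte addition costs COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so byte costs and insn costs are directly comparable. A standalone sketch under that assumption:

#include <assert.h>

#define COSTS_N_INSNS(N) ((N) * 4)	/* Assumed, per the comment above.  */
#define COSTS_N_BYTES(N) ((N) * 2)

int
main (void)
{
  /* A 2-byte add scores the same as a 1-insn add.  */
  assert (COSTS_N_BYTES (2) == COSTS_N_INSNS (1));
  return 0;
}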
+
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
+
+static stringop_algs ix86_size_memcpy[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
+static stringop_algs ix86_size_memset[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
+
+const
+struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+ COSTS_N_BYTES (2), /* cost of an add instruction */
+ COSTS_N_BYTES (3), /* cost of a lea instruction */
+ COSTS_N_BYTES (2), /* variable shift costs */
+ COSTS_N_BYTES (3), /* constant shift costs */
+ {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
+ COSTS_N_BYTES (3), /* HI */
+ COSTS_N_BYTES (3), /* SI */
+ COSTS_N_BYTES (3), /* DI */
+ COSTS_N_BYTES (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
+ COSTS_N_BYTES (3), /* HI */
+ COSTS_N_BYTES (3), /* SI */
+ COSTS_N_BYTES (3), /* DI */
+ COSTS_N_BYTES (5)}, /* other */
+ COSTS_N_BYTES (3), /* cost of movsx */
+ COSTS_N_BYTES (3), /* cost of movzx */
+ 0, /* "large" insn */
+ 2, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {2, 2, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 2, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 2}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {2, 2, 2}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 3, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {3, 3}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 3, /* cost of moving SSE register */
+ {3, 3, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {3, 3, 3}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 0, /* size of l1 cache */
+ 0, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
+ COSTS_N_BYTES (2), /* cost of FMUL instruction. */
+ COSTS_N_BYTES (2), /* cost of FDIV instruction. */
+ COSTS_N_BYTES (2), /* cost of FABS instruction. */
+ COSTS_N_BYTES (2), /* cost of FCHS instruction. */
+ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ ix86_size_memcpy,
+ ix86_size_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 1, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 1, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* Processor costs (relative to an add) */
+static stringop_algs i386_memcpy[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs i386_memset[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs i386_cost = { /* 386 specific costs */
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (3), /* variable shift costs */
+ COSTS_N_INSNS (2), /* constant shift costs */
+ {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (6), /* HI */
+ COSTS_N_INSNS (6), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ COSTS_N_INSNS (1), /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (23), /* HI */
+ COSTS_N_INSNS (23), /* SI */
+ COSTS_N_INSNS (23), /* DI */
+ COSTS_N_INSNS (23)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 15, /* "large" insn */
+ 3, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {8, 8, 8}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {8, 8, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 0, /* size of l1 cache */
+ 0, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (27), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (88), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (22), /* cost of FABS instruction. */
+ COSTS_N_INSNS (24), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ i386_memcpy,
+ i386_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs i486_memcpy[2] = {
+ {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs i486_memset[2] = {
+ {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs i486_cost = { /* 486 specific costs */
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (3), /* variable shift costs */
+ COSTS_N_INSNS (2), /* constant shift costs */
+ {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (12), /* HI */
+ COSTS_N_INSNS (12), /* SI */
+ COSTS_N_INSNS (12), /* DI */
+ COSTS_N_INSNS (12)}, /* other */
+ 1, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (40), /* HI */
+ COSTS_N_INSNS (40), /* SI */
+ COSTS_N_INSNS (40), /* DI */
+ COSTS_N_INSNS (40)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 15, /* "large" insn */
+ 3, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {8, 8, 8}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {8, 8, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 4, /* size of l1 cache. 486 has 8kB cache
+ shared for code and data, so 4kB is
+ not really precise. */
+ 4, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (16), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (73), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (3), /* cost of FABS instruction. */
+ COSTS_N_INSNS (3), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ i486_memcpy,
+ i486_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs pentium_memcpy[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs pentium_memset[2] = {
+ {libcall, {{-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs pentium_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (4), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (11), /* HI */
+ COSTS_N_INSNS (11), /* SI */
+ COSTS_N_INSNS (11), /* DI */
+ COSTS_N_INSNS (11)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (25), /* HI */
+ COSTS_N_INSNS (25), /* SI */
+ COSTS_N_INSNS (25), /* DI */
+ COSTS_N_INSNS (25)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 8, /* "large" insn */
+ 6, /* MOVE_RATIO */
+ 6, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 8, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 8, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (3), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (39), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentium_memcpy,
+ pentium_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static const
+struct processor_costs lakemont_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (11), /* HI */
+ COSTS_N_INSNS (11), /* SI */
+ COSTS_N_INSNS (11), /* DI */
+ COSTS_N_INSNS (11)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (25), /* HI */
+ COSTS_N_INSNS (25), /* SI */
+ COSTS_N_INSNS (25), /* DI */
+ COSTS_N_INSNS (25)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 6, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 8, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 8, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (3), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (39), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentium_memcpy,
+ pentium_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
+ (we ensure the alignment). For small blocks the inline loop is still a
+ noticeable win; for bigger blocks either rep movsl or rep movsb is the
+ way to go. Rep movsb apparently has a more expensive startup time in the
+ CPU, but after 4K the difference is down in the noise. */
+static stringop_algs pentiumpro_memcpy[2] = {
+ {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
+ {8192, rep_prefix_4_byte, false},
+ {-1, rep_prefix_1_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs pentiumpro_memset[2] = {
+ {rep_prefix_4_byte, {{1024, unrolled_loop, false},
+ {8192, rep_prefix_4_byte, false},
+ {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
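
How these stringop_algs tables read, as a sketch under the assumption that entries apply in order with max == -1 as the catch-all: for pentiumpro_memcpy, blocks up to 128 bytes use an inline loop, up to 1024 an unrolled loop, up to 8192 rep movsl, and anything larger rep movsb. A standalone model of that lookup (not the GCC implementation):

#include <stdio.h>

struct entry { long max; const char *alg; };

/* Return the first algorithm whose MAX covers SIZE; -1 matches anything.  */
static const char *
pick_alg (const struct entry *e, long size)
{
  for (;; e++)
    if (e->max == -1 || size <= e->max)
      return e->alg;
}

int
main (void)
{
  static const struct entry pentiumpro_like[] = {
    {128, "loop"}, {1024, "unrolled_loop"},
    {8192, "rep_prefix_4_byte"}, {-1, "rep_prefix_1_byte"}};
  printf ("%s\n", pick_alg (pentiumpro_like, 512));	/* unrolled_loop */
  return 0;
}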
+static const
+struct processor_costs pentiumpro_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (4)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (17), /* HI */
+ COSTS_N_INSNS (17), /* SI */
+ COSTS_N_INSNS (17), /* DI */
+ COSTS_N_INSNS (17)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 6, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 2, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {2, 2, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 256, /* size of l2 cache */
+ 32, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (56), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentiumpro_memcpy,
+ pentiumpro_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs geode_memcpy[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs geode_memset[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs geode_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (2), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (7), /* SI */
+ COSTS_N_INSNS (7), /* DI */
+ COSTS_N_INSNS (7)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (23), /* HI */
+ COSTS_N_INSNS (39), /* SI */
+ COSTS_N_INSNS (39), /* DI */
+ COSTS_N_INSNS (39)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 4, /* MOVE_RATIO */
+ 1, /* cost for loading QImode using movzbl */
+ {1, 1, 1}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {1, 1, 1}, /* cost of storing integer registers */
+ 1, /* cost of reg,reg fld/fst */
+ {1, 1, 1}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 6, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {2, 2, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 128, /* size of l2 cache. */
+ 32, /* size of prefetch block */
+ 1, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (11), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (47), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ geode_memcpy,
+ geode_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs k6_memcpy[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs k6_memset[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs k6_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (3), /* DI */
+ COSTS_N_INSNS (3)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (18), /* HI */
+ COSTS_N_INSNS (18), /* SI */
+ COSTS_N_INSNS (18), /* DI */
+ COSTS_N_INSNS (18)}, /* other */
+ COSTS_N_INSNS (2), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 8, /* "large" insn */
+ 4, /* MOVE_RATIO */
+ 3, /* cost for loading QImode using movzbl */
+ {4, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 3, 2}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {6, 6, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 4}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {2, 2, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 6, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 32, /* size of l2 cache. Some models
+ have integrated l2 cache, but
+ optimizing for k6 is not important
+ enough to worry about that. */
+ 32, /* size of prefetch block */
+ 1, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (2), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (56), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ k6_memcpy,
+ k6_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* For some reason, Athlon deals better with REP prefix (relative to loops)
+ compared to K8. Alignment becomes important after 8 bytes for memcpy and
+ 128 bytes for memset. */
+static stringop_algs athlon_memcpy[2] = {
+ {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs athlon_memset[2] = {
+ {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs athlon_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (5), /* HI */
+ COSTS_N_INSNS (5), /* SI */
+ COSTS_N_INSNS (5), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 6}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 5, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (24), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ athlon_memcpy,
+ athlon_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* K8 has optimized REP instructions for medium sized blocks, but for very
+ small blocks it is better to use a loop. For large blocks, a libcall can
+ do nontemporal accesses and beat inline code considerably. */
+static stringop_algs k8_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs k8_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs k8_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 3, 6}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ k8_memcpy,
+ k8_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 5, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 3, /* vec_unalign_load_cost. */
+ 3, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
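
An aside on units, since every latency above goes through COSTS_N_INSNS:
that macro converts an instruction count into the middle-end's relative
cost units. A worked illustration, assuming the rtl.h definition of four
cost units per simple instruction:

/* Assumed from rtl.h: one "simple" instruction costs 4 units.  */
#define COSTS_N_INSNS(N) ((N) * 4)

/* So, for k8_cost above:
     COSTS_N_INSNS (1)  ==   4   add, shifts, movsx/movzx
     COSTS_N_INSNS (42) == 168   SImode divide/mod
   Bare integers (MOVE_RATIO, cache sizes, branch cost) are not
   scaled and are compared in their own units.  */
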
+
+/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall can
+   do nontemporal accesses and beat inline considerably. */
+static stringop_algs amdfam10_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs amdfam10_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs amdfam10_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 64, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ amdfam10_memcpy,
+ amdfam10_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* BDVER1 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs bdver1_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver1_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+
+const struct processor_costs bdver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver1_memcpy,
+ bdver1_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+/* BDVER2 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+
+static stringop_algs bdver2_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver2_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+
+const struct processor_costs bdver2_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver2_memcpy,
+ bdver2_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+
+/* BDVER3 has optimized REP instruction for medium sized blocks, but for
+   very small blocks it is better to use loop.  For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs bdver3_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver3_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs bdver3_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver3_memcpy,
+ bdver3_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+/* BDVER4 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs bdver4_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver4_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs bdver4_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver4_memcpy,
+ bdver4_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+
+/* ZNVER1 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs znver1_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs znver1_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs znver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction. */
+ COSTS_N_INSNS (1), /* cost of a lea instruction. */
+ COSTS_N_INSNS (1), /* variable shift costs. */
+ COSTS_N_INSNS (1), /* constant shift costs. */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
+ COSTS_N_INSNS (3), /* HI. */
+ COSTS_N_INSNS (3), /* SI. */
+ COSTS_N_INSNS (4), /* DI. */
+ COSTS_N_INSNS (4)}, /* other. */
+ 0, /* cost of multiply per each bit
+ set. */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
+ COSTS_N_INSNS (35), /* HI. */
+ COSTS_N_INSNS (51), /* SI. */
+ COSTS_N_INSNS (83), /* DI. */
+ COSTS_N_INSNS (83)}, /* other. */
+ COSTS_N_INSNS (1), /* cost of movsx. */
+ COSTS_N_INSNS (1), /* cost of movzx. */
+ 8, /* "large" insn. */
+ 9, /* MOVE_RATIO. */
+ 4, /* cost for loading QImode using
+ movzbl. */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer
+ registers. */
+ 2, /* cost of reg,reg fld/fst. */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode. */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode. */
+ 2, /* cost of moving MMX register. */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode. */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode. */
+ 2, /* cost of moving SSE register. */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode. */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode. */
+ 2, /* MMX or SSE register to integer. */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block. */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches. */
+ 3, /* Branch cost. */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+  /* Zen can execute 4 integer operations per cycle.  FP operations take 3
+     cycles and it can execute 2 integer additions and 2 multiplications,
+     thus reassociation may make sense up to a width of 6.  SPEC2k6
+     benchmarks suggest that 4 works better than 6, probably due to
+     register pressure.
+
+     Integer vector operations are handled by the FP unit and execute 3
+     vector plus/minus operations per cycle but only one multiply.  This is
+     adjusted in ix86_reassociation_width.  */
+ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ znver1_memcpy,
+ znver1_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
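
The four reassoc values above feed the target's reassociation-width hook.
A hedged sketch of how such a hook could consult them (field names assumed
to mirror struct processor_costs; the real ix86_reassociation_width also
caps vector integer multiplies, as the comment in the table notes):

/* Sketch only: map an operation class to the tuned reassociation
   width recorded in the cost table.  */
static int
reassoc_width_sketch (const struct processor_costs *cost,
                      bool is_fp, bool is_vector)
{
  if (is_vector)
    return is_fp ? cost->reassoc_vec_fp : cost->reassoc_vec_int;
  return is_fp ? cost->reassoc_fp : cost->reassoc_int;
}
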
+
+/* BTVER1 has optimized REP instruction for medium sized blocks, but for
+   very small blocks it is better to use loop.  For large blocks, libcall can
+   do nontemporal accesses and beat inline considerably. */
+static stringop_algs btver1_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs btver1_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+const struct processor_costs btver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ btver1_memcpy,
+ btver1_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs btver2_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs btver2_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+const struct processor_costs btver2_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 32, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ btver2_memcpy,
+ btver2_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs pentium4_memcpy[2] = {
+ {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs pentium4_memset[2] = {
+ {libcall, {{6, loop_1_byte, false}, {48, loop, false},
+ {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
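
DUMMY_STRINGOP_ALGS fills the 64-bit slot for tunings that only matter in
32-bit mode, such as Pentium 4 here. The macro itself is moved into this
header elsewhere in the patch; based on its prior definition in i386.c it
amounts to the following (treat the exact form as an assumption):

/* Always fall back to a library call, whatever the block size.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
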
+
+static const
+struct processor_costs pentium4_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (3), /* cost of a lea instruction */
+ COSTS_N_INSNS (4), /* variable shift costs */
+ COSTS_N_INSNS (4), /* constant shift costs */
+ {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (15), /* HI */
+ COSTS_N_INSNS (15), /* SI */
+ COSTS_N_INSNS (15), /* DI */
+ COSTS_N_INSNS (15)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (56), /* HI */
+ COSTS_N_INSNS (56), /* SI */
+ COSTS_N_INSNS (56), /* DI */
+ COSTS_N_INSNS (56)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 16, /* "large" insn */
+ 6, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {4, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 3, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 12, /* cost of moving SSE register */
+ {12, 12, 12}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 10, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (7), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (43), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentium4_memcpy,
+ pentium4_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs nocona_memcpy[2] = {
+ {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
+ {100000, unrolled_loop, false}, {-1, libcall, false}}}};
+
+static stringop_algs nocona_memset[2] = {
+ {libcall, {{6, loop_1_byte, false}, {48, loop, false},
+ {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {64, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+
+static const
+struct processor_costs nocona_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (10), /* HI */
+ COSTS_N_INSNS (10), /* SI */
+ COSTS_N_INSNS (10), /* DI */
+ COSTS_N_INSNS (10)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (66), /* HI */
+ COSTS_N_INSNS (66), /* SI */
+ COSTS_N_INSNS (66), /* DI */
+ COSTS_N_INSNS (66)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 16, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 3, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 4}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 6, /* cost of moving MMX register */
+ {12, 12}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {12, 12}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 6, /* cost of moving SSE register */
+ {12, 12, 12}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {12, 12, 12}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 8, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 1024, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 8, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (40), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (3), /* cost of FABS instruction. */
+ COSTS_N_INSNS (3), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ nocona_memcpy,
+ nocona_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs atom_memcpy[2] = {
+ {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs atom_memset[2] = {
+ {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs atom_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ atom_memcpy,
+ atom_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs slm_memcpy[2] = {
+ {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs slm_memset[2] = {
+ {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs slm_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ slm_memcpy,
+ slm_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 4, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs intel_memcpy[2] = {
+ {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs intel_memset[2] = {
+ {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs intel_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ intel_memcpy,
+ intel_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 4, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* Generic should produce code tuned for Core-i7 (and newer chips)
+ and btver1 (and newer chips). */
+
+static stringop_algs generic_memcpy[2] = {
+ {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+ {-1, libcall, false}}},
+ {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs generic_memset[2] = {
+ {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+ {-1, libcall, false}}},
+ {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static const
+struct processor_costs generic_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+  /* On all chips taken into consideration, lea is 2 cycles or more.  With
+     this cost, however, our current implementation of synth_mult results in
+     the use of unnecessary temporary registers, causing regressions on
+     several SPECfp benchmarks.  */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
+     value is increased to the perhaps more appropriate value of 5.  */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ generic_memcpy,
+ generic_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* core_cost should produce code tuned for the Core family of CPUs.  */
+static stringop_algs core_memcpy[2] = {
+ {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+ {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+ {-1, libcall, false}}}};
+static stringop_algs core_memset[2] = {
+ {libcall, {{6, loop_1_byte, true},
+ {24, loop, true},
+ {8192, rep_prefix_4_byte, true},
+ {-1, libcall, false}}},
+ {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+ {-1, libcall, false}}}};
+
+static const
+struct processor_costs core_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+  /* On all chips taken into consideration, lea is 2 cycles or more.  With
+     this cost, however, our current implementation of synth_mult results in
+     the use of unnecessary temporary registers, causing regressions on
+     several SPECfp benchmarks.  */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ /* FIXME perhaps more appropriate value is 5. */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ core_memcpy,
+ core_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
--- /dev/null
+/* Scheduler hooks for IA-32 which implement Atom+ specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
+   execution.  It is applied if
+   (1) an IMUL instruction is at the top of the list;
+   (2) there exists exactly one producer of an independent IMUL instruction
+       in the ready list.
+   Return the index of the IMUL producer if it was found and -1 otherwise.  */
+static int
+do_reorder_for_imul (rtx_insn **ready, int n_ready)
+{
+ rtx_insn *insn;
+ rtx set, insn1, insn2;
+ sd_iterator_def sd_it;
+ dep_t dep;
+ int index = -1;
+ int i;
+
+ if (!TARGET_BONNELL)
+ return index;
+
+ /* Check that IMUL instruction is on the top of ready list. */
+ insn = ready[n_ready - 1];
+ set = single_set (insn);
+ if (!set)
+ return index;
+ if (!(GET_CODE (SET_SRC (set)) == MULT
+ && GET_MODE (SET_SRC (set)) == SImode))
+ return index;
+
+ /* Search for producer of independent IMUL instruction. */
+ for (i = n_ready - 2; i >= 0; i--)
+ {
+ insn = ready[i];
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ /* Skip IMUL instruction. */
+ insn2 = PATTERN (insn);
+ if (GET_CODE (insn2) == PARALLEL)
+ insn2 = XVECEXP (insn2, 0, 0);
+ if (GET_CODE (insn2) == SET
+ && GET_CODE (SET_SRC (insn2)) == MULT
+ && GET_MODE (SET_SRC (insn2)) == SImode)
+ continue;
+
+ FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+ {
+ rtx con;
+ con = DEP_CON (dep);
+ if (!NONDEBUG_INSN_P (con))
+ continue;
+ insn1 = PATTERN (con);
+ if (GET_CODE (insn1) == PARALLEL)
+ insn1 = XVECEXP (insn1, 0, 0);
+
+ if (GET_CODE (insn1) == SET
+ && GET_CODE (SET_SRC (insn1)) == MULT
+ && GET_MODE (SET_SRC (insn1)) == SImode)
+ {
+ sd_iterator_def sd_it1;
+ dep_t dep1;
+	      /* Check that there is no other producer for this IMUL.  */
+ index = i;
+ FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep1);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (pro != insn)
+ index = -1;
+ }
+ if (index >= 0)
+ break;
+ }
+ }
+ if (index >= 0)
+ break;
+ }
+ return index;
+}
+
+/* Try to find the best candidate for the top of the ready list if two insns
+   have the same priority - the candidate is best if the insns it depends on
+   were scheduled earlier.  Applied for Silvermont only.
+   Return true if the top 2 insns must be interchanged.  */
+static bool
+swap_top_of_ready_list (rtx_insn **ready, int n_ready)
+{
+ rtx_insn *top = ready[n_ready - 1];
+ rtx_insn *next = ready[n_ready - 2];
+ rtx set;
+ sd_iterator_def sd_it;
+ dep_t dep;
+ int clock1 = -1;
+ int clock2 = -1;
+ #define INSN_TICK(INSN) (HID (INSN)->tick)
+
+ if (!TARGET_SILVERMONT && !TARGET_INTEL)
+ return false;
+
+ if (!NONDEBUG_INSN_P (top))
+ return false;
+ if (!NONJUMP_INSN_P (top))
+ return false;
+ if (!NONDEBUG_INSN_P (next))
+ return false;
+ if (!NONJUMP_INSN_P (next))
+ return false;
+ set = single_set (top);
+ if (!set)
+ return false;
+ set = single_set (next);
+ if (!set)
+ return false;
+
+ if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
+ {
+ if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
+ return false;
+      /* Determine the winner more precisely.  */
+ FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (INSN_TICK (pro) > clock1)
+ clock1 = INSN_TICK (pro);
+ }
+ FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (INSN_TICK (pro) > clock2)
+ clock2 = INSN_TICK (pro);
+ }
+
+ if (clock1 == clock2)
+ {
+ /* Determine the winner - the load must win. */
+ enum attr_memory memory1, memory2;
+ memory1 = get_attr_memory (top);
+ memory2 = get_attr_memory (next);
+ if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
+ return true;
+ }
+ return (bool) (clock2 < clock1);
+ }
+ return false;
+ #undef INSN_TICK
+}
+
+/* Perform possible reordering of the ready list for Atom/Silvermont only.
+ Return the issue rate. */
+int
+ix86_atom_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
+ int *pn_ready, int clock_var)
+{
+ int issue_rate = -1;
+ int n_ready = *pn_ready;
+ int i;
+ rtx_insn *insn;
+ int index = -1;
+
+ /* Set up issue rate. */
+ issue_rate = ix86_issue_rate ();
+
+ /* Do reordering for BONNELL/SILVERMONT/INTEL tuning only. */
+ if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
+ return issue_rate;
+
+ /* Nothing to do if ready list contains only 1 instruction. */
+ if (n_ready <= 1)
+ return issue_rate;
+
+ /* Do reordering for the post-reload scheduler only. */
+ if (!reload_completed)
+ return issue_rate;
+
+ if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
+ {
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
+ INSN_UID (ready[index]));
+
+ /* Put IMUL producer (ready[index]) at the top of ready list. */
+ insn = ready[index];
+ for (i = index; i < n_ready - 1; i++)
+ ready[i] = ready[i + 1];
+ ready[n_ready - 1] = insn;
+ return issue_rate;
+ }
+
+ /* Skip selective scheduling since HID is not populated in it. */
+ if (clock_var != 0
+ && !sel_sched_p ()
+ && swap_top_of_ready_list (ready, n_ready))
+ {
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
+ INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
+ /* Swap 2 top elements of ready list. */
+ insn = ready[n_ready - 1];
+ ready[n_ready - 1] = ready[n_ready - 2];
+ ready[n_ready - 2] = insn;
+ }
+ return issue_rate;
+}
--- /dev/null
+/* Scheduler hooks for IA-32 which implement bdver1-4 specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+/* The size of the dispatch window is the total number of bytes of
+ object code allowed in a window. */
+#define DISPATCH_WINDOW_SIZE 16
+
+/* Number of dispatch windows considered for scheduling. */
+#define MAX_DISPATCH_WINDOWS 3
+
+/* Maximum number of instructions in a window. */
+#define MAX_INSN 4
+
+/* Maximum number of immediate operands in a window. */
+#define MAX_IMM 4
+
+/* Maximum number of immediate bits allowed in a window. */
+#define MAX_IMM_SIZE 128
+
+/* Maximum number of 32 bit immediates allowed in a window. */
+#define MAX_IMM_32 4
+
+/* Maximum number of 64 bit immediates allowed in a window. */
+#define MAX_IMM_64 2
+
+/* Maximum total of loads or prefetches allowed in a window. */
+#define MAX_LOAD 2
+
+/* Maximum total of stores allowed in a window. */
+#define MAX_STORE 1
+
+#undef BIG
+#define BIG 100
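+
+/* MAX_DISPATCH_WINDOWS * DISPATCH_WINDOW_SIZE == 48 is the byte
+ budget that process_end_window and allocate_next_window verify
+ below. */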
+
+
+/* Dispatch groups. Instructions that affect the mix in a dispatch window. */
+enum dispatch_group {
+ disp_no_group = 0,
+ disp_load,
+ disp_store,
+ disp_load_store,
+ disp_prefetch,
+ disp_imm,
+ disp_imm_32,
+ disp_imm_64,
+ disp_branch,
+ disp_cmp,
+ disp_jcc,
+ disp_last
+};
+
+/* Number of instructions allowed per group in a dispatch window. The
+ array is indexed by the dispatch_group enum. 100 is used as a big
+ number because the count of these kinds of operations has no
+ effect on the dispatch window, but we need entries for them in
+ the table. */
+static unsigned int num_allowable_groups[disp_last] = {
+ 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
+};
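+
+/* I.e. each window may hold at most 2 loads, 1 store, 1 load+store,
+ 2 prefetches, 1 branch and 4 immediate operands, of which at most
+ 4 may be 32-bit and 2 64-bit; compares and conditional jumps are
+ not limited by this table. */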
+
+char group_name[disp_last + 1][16] = {
+ "disp_no_group", "disp_load", "disp_store", "disp_load_store",
+ "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
+ "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
+};
+
+/* Instruction path. */
+enum insn_path {
+ no_path = 0,
+ path_single, /* Single micro op. */
+ path_double, /* Double micro op. */
+ path_multi, /* Instructions with more than 2 micro ops. */
+ last_path
+};
+
+/* sched_insn_info describes one instruction scheduled into a dispatch
+ window: the insn itself together with its dispatch group, decode
+ path, length and immediate-operand size.
+
+ Windows are allocated for each basic block and are linked
+ together. */
+typedef struct sched_insn_info_s {
+ rtx insn;
+ enum dispatch_group group;
+ enum insn_path path;
+ int byte_len;
+ int imm_bytes;
+} sched_insn_info;
+
+/* Linked list of dispatch windows. This is a two way list of
+ dispatch windows of a basic block. It contains information about
+ the number of uops in the window and the total number of
+ instructions and of bytes in the object code for this dispatch
+ window. */
+typedef struct dispatch_windows_s {
+ int num_insn; /* Number of insn in the window. */
+ int num_uops; /* Number of uops in the window. */
+ int window_size; /* Number of bytes in the window. */
+ int window_num; /* Window number, 0 or 1. */
+ int num_imm; /* Number of immediates in the window. */
+ int num_imm_32; /* Number of 32 bit immediates in the window. */
+ int num_imm_64; /* Number of 64 bit immediates in the window. */
+ int imm_size; /* Total immediate bytes in the window. */
+ int num_loads; /* Total memory loads in the window. */
+ int num_stores; /* Total memory stores in the window. */
+ int violation; /* Violation exists in window. */
+ sched_insn_info *window; /* Pointer to the window. */
+ struct dispatch_windows_s *next;
+ struct dispatch_windows_s *prev;
+} dispatch_windows;
+
+/* Immediate values used in an insn. */
+typedef struct imm_info_s
+ {
+ int imm;
+ int imm32;
+ int imm64;
+ } imm_info;
+
+static dispatch_windows *dispatch_window_list;
+static dispatch_windows *dispatch_window_list1;
+
+/* Get dispatch group of insn. */
+
+static enum dispatch_group
+get_mem_group (rtx_insn *insn)
+{
+ enum attr_memory memory;
+
+ if (INSN_CODE (insn) < 0)
+ return disp_no_group;
+ memory = get_attr_memory (insn);
+ if (memory == MEMORY_STORE)
+ return disp_store;
+
+ if (memory == MEMORY_LOAD)
+ return disp_load;
+
+ if (memory == MEMORY_BOTH)
+ return disp_load_store;
+
+ return disp_no_group;
+}
+
+/* Return true if insn is a compare instruction. */
+
+static bool
+is_cmp (rtx_insn *insn)
+{
+ enum attr_type type;
+
+ type = get_attr_type (insn);
+ return (type == TYPE_TEST
+ || type == TYPE_ICMP
+ || type == TYPE_FCMP
+ || GET_CODE (PATTERN (insn)) == COMPARE);
+}
+
+/* Return true if a dispatch violation was encountered. */
+
+static bool
+dispatch_violation (void)
+{
+ if (dispatch_window_list->next)
+ return dispatch_window_list->next->violation;
+ return dispatch_window_list->violation;
+}
+
+/* Return true if insn is a branch instruction. */
+
+static bool
+is_branch (rtx_insn *insn)
+{
+ return (CALL_P (insn) || JUMP_P (insn));
+}
+
+/* Return true if insn is a prefetch instruction. */
+
+static bool
+is_prefetch (rtx_insn *insn)
+{
+ return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
+}
+
+/* This function initializes a dispatch window and the list container holding a
+ pointer to the window. */
+
+static void
+init_window (int window_num)
+{
+ int i;
+ dispatch_windows *new_list;
+
+ if (window_num == 0)
+ new_list = dispatch_window_list;
+ else
+ new_list = dispatch_window_list1;
+
+ new_list->num_insn = 0;
+ new_list->num_uops = 0;
+ new_list->window_size = 0;
+ new_list->next = NULL;
+ new_list->prev = NULL;
+ new_list->window_num = window_num;
+ new_list->num_imm = 0;
+ new_list->num_imm_32 = 0;
+ new_list->num_imm_64 = 0;
+ new_list->imm_size = 0;
+ new_list->num_loads = 0;
+ new_list->num_stores = 0;
+ new_list->violation = false;
+
+ for (i = 0; i < MAX_INSN; i++)
+ {
+ new_list->window[i].insn = NULL;
+ new_list->window[i].group = disp_no_group;
+ new_list->window[i].path = no_path;
+ new_list->window[i].byte_len = 0;
+ new_list->window[i].imm_bytes = 0;
+ }
+ return;
+}
+
+/* This function allocates and initializes a dispatch window and the
+ list container holding a pointer to the window. */
+
+static dispatch_windows *
+allocate_window (void)
+{
+ dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
+ new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
+
+ return new_list;
+}
+
+/* This routine initializes the dispatch scheduling information. It
+ initiates building dispatch scheduler tables and constructs the
+ first dispatch window. */
+
+static void
+init_dispatch_sched (void)
+{
+ /* Allocate a dispatch list and a window. */
+ dispatch_window_list = allocate_window ();
+ dispatch_window_list1 = allocate_window ();
+ init_window (0);
+ init_window (1);
+}
+
+/* This function returns true if a branch is detected. End of a basic block
+ does not have to be a branch, but here we assume only branches end a
+ window. */
+
+static bool
+is_end_basic_block (enum dispatch_group group)
+{
+ return group == disp_branch;
+}
+
+/* This function is called when the end of a window processing is reached. */
+
+static void
+process_end_window (void)
+{
+ gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
+ if (dispatch_window_list->next)
+ {
+ gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
+ gcc_assert (dispatch_window_list->window_size
+ + dispatch_window_list1->window_size <= 48);
+ init_window (1);
+ }
+ init_window (0);
+}
+
+/* Allocates a new dispatch window and adds it to WINDOW_LIST.
+ WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
+ for 48 bytes of instructions. Note that these windows are not dispatch
+ windows whose sizes are DISPATCH_WINDOW_SIZE. */
+
+static dispatch_windows *
+allocate_next_window (int window_num)
+{
+ if (window_num == 0)
+ {
+ if (dispatch_window_list->next)
+ init_window (1);
+ init_window (0);
+ return dispatch_window_list;
+ }
+
+ dispatch_window_list->next = dispatch_window_list1;
+ dispatch_window_list1->prev = dispatch_window_list;
+
+ return dispatch_window_list1;
+}
+
+/* Compute number of immediate operands of an instruction. */
+
+static void
+find_constant (rtx in_rtx, imm_info *imm_values)
+{
+ if (INSN_P (in_rtx))
+ in_rtx = PATTERN (in_rtx);
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
+ if (const_rtx x = *iter)
+ switch (GET_CODE (x))
+ {
+ case CONST:
+ case SYMBOL_REF:
+ case CONST_INT:
+ (imm_values->imm)++;
+ if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
+ (imm_values->imm32)++;
+ else
+ (imm_values->imm64)++;
+ break;
+
+ case CONST_DOUBLE:
+ case CONST_WIDE_INT:
+ (imm_values->imm)++;
+ (imm_values->imm64)++;
+ break;
+
+ case CODE_LABEL:
+ if (LABEL_KIND (x) == LABEL_NORMAL)
+ {
+ (imm_values->imm)++;
+ (imm_values->imm32)++;
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+/* Return the total size of the immediate operands of an instruction along
+ with the number of corresponding immediate operands. It initializes its
+ output parameters to zero before calling FIND_CONSTANT.
+ INSN is the input instruction. IMM is the total number of immediates.
+ IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
+ bit immediates. */
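+/* E.g. "movl $42, %eax" yields *IMM == *IMM32 == 1, *IMM64 == 0 and
+ a return value of 4 bytes. */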
+
+static int
+get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
+{
+ imm_info imm_values = {0, 0, 0};
+
+ find_constant (insn, &imm_values);
+ *imm = imm_values.imm;
+ *imm32 = imm_values.imm32;
+ *imm64 = imm_values.imm64;
+ return imm_values.imm32 * 4 + imm_values.imm64 * 8;
+}
+
+/* Return true if an operand of INSN is an immediate. */
+
+static bool
+has_immediate (rtx_insn *insn)
+{
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (insn)
+ return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+ return false;
+}
+
+/* Return the decode path (single, double or multi uop) for INSN. */
+
+static enum insn_path
+get_insn_path (rtx_insn *insn)
+{
+ enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
+
+ if ((int)path == 0)
+ return path_single;
+
+ if ((int)path == 1)
+ return path_double;
+
+ return path_multi;
+}
+
+/* Return insn dispatch group. */
+
+static enum dispatch_group
+get_insn_group (rtx_insn *insn)
+{
+ enum dispatch_group group = get_mem_group (insn);
+ if (group)
+ return group;
+
+ if (is_branch (insn))
+ return disp_branch;
+
+ if (is_cmp (insn))
+ return disp_cmp;
+
+ if (has_immediate (insn))
+ return disp_imm;
+
+ if (is_prefetch (insn))
+ return disp_prefetch;
+
+ return disp_no_group;
+}
+
+/* Count how INSN contributes to the restricted groups of dispatch
+ window WINDOW_LIST: return 0 if INSN is not restricted, BIG if
+ adding it would violate the window's limits, and 1 otherwise. */
+
+static int
+count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
+{
+ enum dispatch_group group = get_insn_group (insn);
+ int imm_size;
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (group == disp_no_group)
+ return 0;
+
+ if (group == disp_imm)
+ {
+ imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+ if (window_list->imm_size + imm_size > MAX_IMM_SIZE
+ || num_imm_operand + window_list->num_imm > MAX_IMM
+ || (num_imm32_operand > 0
+ && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
+ || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
+ || (num_imm64_operand > 0
+ && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
+ || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
+ || (window_list->imm_size + imm_size == MAX_IMM_SIZE
+ && num_imm64_operand > 0
+ && ((window_list->num_imm_64 > 0
+ && window_list->num_insn >= 2)
+ || window_list->num_insn >= 3)))
+ return BIG;
+
+ return 1;
+ }
+
+ if ((group == disp_load_store
+ && (window_list->num_loads >= MAX_LOAD
+ || window_list->num_stores >= MAX_STORE))
+ || ((group == disp_load
+ || group == disp_prefetch)
+ && window_list->num_loads >= MAX_LOAD)
+ || (group == disp_store
+ && window_list->num_stores >= MAX_STORE))
+ return BIG;
+
+ return 1;
+}
+
+/* This function returns true if insn satisfies dispatch rules on the
+ last window scheduled. */
+
+static bool
+fits_dispatch_window (rtx_insn *insn)
+{
+ dispatch_windows *window_list = dispatch_window_list;
+ dispatch_windows *window_list_next = dispatch_window_list->next;
+ unsigned int num_restrict;
+ enum dispatch_group group = get_insn_group (insn);
+ enum insn_path path = get_insn_path (insn);
+ int sum;
+
+ /* Make disp_cmp and disp_jcc insns be scheduled as late as possible.
+ These instructions should be given the lowest priority in the
+ Haifa scheduler to make sure they are scheduled in the same
+ dispatch window as the insns that reference them. */
+ if (group == disp_jcc || group == disp_cmp)
+ return false;
+
+ /* Check nonrestricted. */
+ if (group == disp_no_group || group == disp_branch)
+ return true;
+
+ /* Get last dispatch window. */
+ if (window_list_next)
+ window_list = window_list_next;
+
+ if (window_list->window_num == 1)
+ {
+ sum = window_list->prev->window_size + window_list->window_size;
+
+ if (sum == 32
+ || (ix86_min_insn_size (insn) + sum) >= 48)
+ /* Window 1 is full. Go for next window. */
+ return true;
+ }
+
+ num_restrict = count_num_restricted (insn, window_list);
+
+ if (num_restrict > num_allowable_groups[group])
+ return false;
+
+ /* See if it fits in the first window. */
+ if (window_list->window_num == 0)
+ {
+ /* The first window should have only single and double path
+ uops. */
+ if (path == path_double
+ && (window_list->num_uops + 2) > MAX_INSN)
+ return false;
+ else if (path != path_single)
+ return false;
+ }
+ return true;
+}
+
+/* Add an instruction INSN with NUM_UOPS micro-operations to the
+ dispatch window WINDOW_LIST. */
+
+static void
+add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
+{
+ int byte_len = ix86_min_insn_size (insn);
+ int num_insn = window_list->num_insn;
+ int imm_size;
+ sched_insn_info *window = window_list->window;
+ enum dispatch_group group = get_insn_group (insn);
+ enum insn_path path = get_insn_path (insn);
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (!window_list->violation && group != disp_cmp
+ && !fits_dispatch_window (insn))
+ window_list->violation = true;
+
+ imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+
+ /* Initialize window with new instruction. */
+ window[num_insn].insn = insn;
+ window[num_insn].byte_len = byte_len;
+ window[num_insn].group = group;
+ window[num_insn].path = path;
+ window[num_insn].imm_bytes = imm_size;
+
+ window_list->window_size += byte_len;
+ window_list->num_insn = num_insn + 1;
+ window_list->num_uops = window_list->num_uops + num_uops;
+ window_list->imm_size += imm_size;
+ window_list->num_imm += num_imm_operand;
+ window_list->num_imm_32 += num_imm32_operand;
+ window_list->num_imm_64 += num_imm64_operand;
+
+ if (group == disp_store)
+ window_list->num_stores += 1;
+ else if (group == disp_load
+ || group == disp_prefetch)
+ window_list->num_loads += 1;
+ else if (group == disp_load_store)
+ {
+ window_list->num_stores += 1;
+ window_list->num_loads += 1;
+ }
+}
+
+/* Adds a scheduled instruction, INSN, to the current dispatch window.
+ If the total bytes of instructions or the number of instructions in
+ the window exceed the allowable limits, it allocates a new window. */
+
+static void
+add_to_dispatch_window (rtx_insn *insn)
+{
+ int byte_len;
+ dispatch_windows *window_list;
+ dispatch_windows *next_list;
+ dispatch_windows *window0_list;
+ enum insn_path path;
+ enum dispatch_group insn_group;
+ bool insn_fits;
+ int num_insn;
+ int num_uops;
+ int window_num;
+ int insn_num_uops;
+ int sum;
+
+ if (INSN_CODE (insn) < 0)
+ return;
+
+ byte_len = ix86_min_insn_size (insn);
+ window_list = dispatch_window_list;
+ next_list = window_list->next;
+ path = get_insn_path (insn);
+ insn_group = get_insn_group (insn);
+
+ /* Get the last dispatch window. */
+ if (next_list)
+ window_list = dispatch_window_list->next;
+
+ if (path == path_single)
+ insn_num_uops = 1;
+ else if (path == path_double)
+ insn_num_uops = 2;
+ else
+ insn_num_uops = (int) path;
+
+ /* If the current window is full, get a new window.
+ Window number zero is full if MAX_INSN uops are scheduled in it.
+ Window number one is full if window zero's bytes plus window
+ one's bytes is 32, or if adding the bytes of the new instruction
+ to the total makes it greater than 48, or if it already has MAX_INSN
+ instructions in it. */
+ num_insn = window_list->num_insn;
+ num_uops = window_list->num_uops;
+ window_num = window_list->window_num;
+ insn_fits = fits_dispatch_window (insn);
+
+ if (num_insn >= MAX_INSN
+ || num_uops + insn_num_uops > MAX_INSN
+ || !(insn_fits))
+ {
+ window_num = ~window_num & 1;
+ window_list = allocate_next_window (window_num);
+ }
+
+ if (window_num == 0)
+ {
+ add_insn_window (insn, window_list, insn_num_uops);
+ if (window_list->num_insn >= MAX_INSN
+ && insn_group == disp_branch)
+ {
+ process_end_window ();
+ return;
+ }
+ }
+ else if (window_num == 1)
+ {
+ window0_list = window_list->prev;
+ sum = window0_list->window_size + window_list->window_size;
+ if (sum == 32
+ || (byte_len + sum) >= 48)
+ {
+ process_end_window ();
+ window_list = dispatch_window_list;
+ }
+
+ add_insn_window (insn, window_list, insn_num_uops);
+ }
+ else
+ gcc_unreachable ();
+
+ if (is_end_basic_block (insn_group))
+ {
+ /* End of basic block is reached; do end-basic-block processing. */
+ process_end_window ();
+ return;
+ }
+}
+
+/* Print the dispatch window, WINDOW_NUM, to FILE. */
+
+DEBUG_FUNCTION static void
+debug_dispatch_window_file (FILE *file, int window_num)
+{
+ dispatch_windows *list;
+ int i;
+
+ if (window_num == 0)
+ list = dispatch_window_list;
+ else
+ list = dispatch_window_list1;
+
+ fprintf (file, "Window #%d:\n", list->window_num);
+ fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
+ list->num_insn, list->num_uops, list->window_size);
+ fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+ list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
+
+ fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
+ list->num_stores);
+ fprintf (file, " insn info:\n");
+
+ for (i = 0; i < MAX_INSN; i++)
+ {
+ if (!list->window[i].insn)
+ break;
+ fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
+ i, group_name[list->window[i].group],
+ i, (void *)list->window[i].insn,
+ i, list->window[i].path,
+ i, list->window[i].byte_len,
+ i, list->window[i].imm_bytes);
+ }
+}
+
+/* Print to stdout a dispatch window. */
+
+DEBUG_FUNCTION void
+debug_dispatch_window (int window_num)
+{
+ debug_dispatch_window_file (stdout, window_num);
+}
+
+/* Print INSN dispatch information to FILE. */
+
+DEBUG_FUNCTION static void
+debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
+{
+ int byte_len;
+ enum insn_path path;
+ enum dispatch_group group;
+ int imm_size;
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (INSN_CODE (insn) < 0)
+ return;
+
+ byte_len = ix86_min_insn_size (insn);
+ path = get_insn_path (insn);
+ group = get_insn_group (insn);
+ imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+
+ fprintf (file, " insn info:\n");
+ fprintf (file, " group = %s, path = %d, byte_len = %d\n",
+ group_name[group], path, byte_len);
+ fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+ num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
+}
+
+/* Print to stdout the status of the ready list with respect to
+ dispatch windows. */
+
+DEBUG_FUNCTION void
+debug_ready_dispatch (void)
+{
+ int i;
+ int no_ready = number_in_ready ();
+
+ fprintf (stdout, "Number of ready: %d\n", no_ready);
+
+ for (i = 0; i < no_ready; i++)
+ debug_insn_dispatch_info_file (stdout, get_ready_element (i));
+}
+
+/* This routine is the driver of the dispatch scheduler. */
+
+void
+ix86_bd_do_dispatch (rtx_insn *insn, int mode)
+{
+ if (mode == DISPATCH_INIT)
+ init_dispatch_sched ();
+ else if (mode == ADD_TO_DISPATCH_WINDOW)
+ add_to_dispatch_window (insn);
+}
+
+/* Return TRUE if Dispatch Scheduling is supported. */
+
+bool
+ix86_bd_has_dispatch (rtx_insn *insn, int action)
+{
+ /* The current implementation of the dispatch scheduler models Bulldozer only. */
+ if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
+ || TARGET_BDVER4) && flag_dispatch_scheduler)
+ switch (action)
+ {
+ default:
+ return false;
+
+ case IS_DISPATCH_ON:
+ return true;
+
+ case IS_CMP:
+ return is_cmp (insn);
+
+ case DISPATCH_VIOLATION:
+ return dispatch_violation ();
+
+ case FITS_DISPATCH_WINDOW:
+ return fits_dispatch_window (insn);
+ }
+
+ return false;
+}
--- /dev/null
+/* Scheduler hooks for IA-32 which implement Core 2/i7 specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+
+/* Model decoder of Core 2/i7.
+ Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
+ track the instruction fetch block boundaries and make sure that long
+ (9+ bytes) instructions are assigned to D0. */
+
+/* Maximum length of an insn that can be handled by
+ a secondary decoder unit. '8' for Core 2/i7. */
+static int core2i7_secondary_decoder_max_insn_size;
+
+/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
+ '16' for Core 2/i7. */
+static int core2i7_ifetch_block_size;
+
+/* Maximum number of instructions decoder can handle per cycle.
+ '6' for Core 2/i7. */
+static int core2i7_ifetch_block_max_insns;
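+
+/* E.g. a 9-byte insn decoded on D0 followed by a 7-byte insn exactly
+ fills a 16-byte ifetch block; any further insn is filtered out of
+ the ready list for that cycle. */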
+
+typedef struct ix86_first_cycle_multipass_data_ *
+ ix86_first_cycle_multipass_data_t;
+typedef const struct ix86_first_cycle_multipass_data_ *
+ const_ix86_first_cycle_multipass_data_t;
+
+/* A variable to store target state across calls to max_issue within
+ one cycle. */
+static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
+ *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
+
+/* Initialize DATA. */
+static void
+core2i7_first_cycle_multipass_init (void *_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+
+ data->ifetch_block_len = 0;
+ data->ifetch_block_n_insns = 0;
+ data->ready_try_change = NULL;
+ data->ready_try_change_size = 0;
+}
+
+/* Advancing the cycle; reset ifetch block counts. */
+static void
+core2i7_dfa_post_advance_cycle (void)
+{
+ ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
+
+ gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+ data->ifetch_block_len = 0;
+ data->ifetch_block_n_insns = 0;
+}
+
+/* Filter out insns from ready_try that the core will not be able to issue
+ on current cycle due to decoder. */
+static void
+core2i7_first_cycle_multipass_filter_ready_try
+(const_ix86_first_cycle_multipass_data_t data,
+ signed char *ready_try, int n_ready, bool first_cycle_insn_p)
+{
+ while (n_ready--)
+ {
+ rtx_insn *insn;
+ int insn_size;
+
+ if (ready_try[n_ready])
+ continue;
+
+ insn = get_ready_element (n_ready);
+ insn_size = ix86_min_insn_size (insn);
+
+ if (/* If this is too long an insn for a secondary decoder ... */
+ (!first_cycle_insn_p
+ && insn_size > core2i7_secondary_decoder_max_insn_size)
+ /* ... or it would not fit into the ifetch block ... */
+ || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
+ /* ... or the decoder is full already ... */
+ || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
+ /* ... mask the insn out. */
+ {
+ ready_try[n_ready] = 1;
+
+ if (data->ready_try_change)
+ bitmap_set_bit (data->ready_try_change, n_ready);
+ }
+ }
+}
+
+/* Prepare for a new round of multipass lookahead scheduling. */
+static void
+core2i7_first_cycle_multipass_begin (void *_data,
+ signed char *ready_try, int n_ready,
+ bool first_cycle_insn_p)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+ const_ix86_first_cycle_multipass_data_t prev_data
+ = ix86_first_cycle_multipass_data;
+
+ /* Restore the state from the end of the previous round. */
+ data->ifetch_block_len = prev_data->ifetch_block_len;
+ data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
+
+ /* Filter instructions that cannot be issued on current cycle due to
+ decoder restrictions. */
+ core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+ first_cycle_insn_p);
+}
+
+/* INSN is being issued in current solution. Account for its impact on
+ the decoder model. */
+static void
+core2i7_first_cycle_multipass_issue (void *_data,
+ signed char *ready_try, int n_ready,
+ rtx_insn *insn, const void *_prev_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+ const_ix86_first_cycle_multipass_data_t prev_data
+ = (const_ix86_first_cycle_multipass_data_t) _prev_data;
+
+ int insn_size = ix86_min_insn_size (insn);
+
+ data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
+ data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
+ gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
+ && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+ /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
+ if (!data->ready_try_change)
+ {
+ data->ready_try_change = sbitmap_alloc (n_ready);
+ data->ready_try_change_size = n_ready;
+ }
+ else if (data->ready_try_change_size < n_ready)
+ {
+ data->ready_try_change = sbitmap_resize (data->ready_try_change,
+ n_ready, 0);
+ data->ready_try_change_size = n_ready;
+ }
+ bitmap_clear (data->ready_try_change);
+
+ /* Filter out insns from ready_try that the core will not be able to issue
+ on current cycle due to decoder. */
+ core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+ false);
+}
+
+/* Revert the effect on ready_try. */
+static void
+core2i7_first_cycle_multipass_backtrack (const void *_data,
+ signed char *ready_try,
+ int n_ready ATTRIBUTE_UNUSED)
+{
+ const_ix86_first_cycle_multipass_data_t data
+ = (const_ix86_first_cycle_multipass_data_t) _data;
+ unsigned int i = 0;
+ sbitmap_iterator sbi;
+
+ gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
+ EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
+ {
+ ready_try[i] = 0;
+ }
+}
+
+/* Save the result of multipass lookahead scheduling for the next round. */
+static void
+core2i7_first_cycle_multipass_end (const void *_data)
+{
+ const_ix86_first_cycle_multipass_data_t data
+ = (const_ix86_first_cycle_multipass_data_t) _data;
+ ix86_first_cycle_multipass_data_t next_data
+ = ix86_first_cycle_multipass_data;
+
+ if (data != NULL)
+ {
+ next_data->ifetch_block_len = data->ifetch_block_len;
+ next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
+ }
+}
+
+/* Deallocate target data. */
+static void
+core2i7_first_cycle_multipass_fini (void *_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+
+ if (data->ready_try_change)
+ {
+ sbitmap_free (data->ready_try_change);
+ data->ready_try_change = NULL;
+ data->ready_try_change_size = 0;
+ }
+}
+
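+/* Install the Core 2/i7 decoder model hooks and set the decoder
+ parameters. */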
+void
+ix86_core2i7_init_hooks (void)
+{
+ targetm.sched.dfa_post_advance_cycle
+ = core2i7_dfa_post_advance_cycle;
+ targetm.sched.first_cycle_multipass_init
+ = core2i7_first_cycle_multipass_init;
+ targetm.sched.first_cycle_multipass_begin
+ = core2i7_first_cycle_multipass_begin;
+ targetm.sched.first_cycle_multipass_issue
+ = core2i7_first_cycle_multipass_issue;
+ targetm.sched.first_cycle_multipass_backtrack
+ = core2i7_first_cycle_multipass_backtrack;
+ targetm.sched.first_cycle_multipass_end
+ = core2i7_first_cycle_multipass_end;
+ targetm.sched.first_cycle_multipass_fini
+ = core2i7_first_cycle_multipass_fini;
+
+ /* Set decoder parameters. */
+ core2i7_secondary_decoder_max_insn_size = 8;
+ core2i7_ifetch_block_size = 16;
+ core2i7_ifetch_block_max_insns = 6;
+}
--- /dev/null
+/* Scheduler hooks for IA-32 which implement CPU specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+
+/* Return the maximum number of instructions a cpu can issue. */
+
+int
+ix86_issue_rate (void)
+{
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM:
+ case PROCESSOR_LAKEMONT:
+ case PROCESSOR_BONNELL:
+ case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
+ case PROCESSOR_KNM:
+ case PROCESSOR_INTEL:
+ case PROCESSOR_K6:
+ case PROCESSOR_BTVER2:
+ case PROCESSOR_PENTIUM4:
+ case PROCESSOR_NOCONA:
+ return 2;
+
+ case PROCESSOR_PENTIUMPRO:
+ case PROCESSOR_ATHLON:
+ case PROCESSOR_K8:
+ case PROCESSOR_AMDFAM10:
+ case PROCESSOR_GENERIC:
+ case PROCESSOR_BTVER1:
+ return 3;
+
+ case PROCESSOR_BDVER1:
+ case PROCESSOR_BDVER2:
+ case PROCESSOR_BDVER3:
+ case PROCESSOR_BDVER4:
+ case PROCESSOR_ZNVER1:
+ case PROCESSOR_CORE2:
+ case PROCESSOR_NEHALEM:
+ case PROCESSOR_SANDYBRIDGE:
+ case PROCESSOR_HASWELL:
+ return 4;
+
+ default:
+ return 1;
+ }
+}
+
+/* Return true iff USE_INSN has a memory address with operands set by
+ SET_INSN. */
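+/* E.g. "addl $4, %eax" followed by "movl (%eax), %edx" is such a
+ pair: the load's address register is set by the ADD, so address
+ generation must wait for it. */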
+
+bool
+ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
+{
+ int i;
+ extract_insn_cached (use_insn);
+ for (i = recog_data.n_operands - 1; i >= 0; --i)
+ if (MEM_P (recog_data.operand[i]))
+ {
+ rtx addr = XEXP (recog_data.operand[i], 0);
+ if (modified_in_p (addr, set_insn) != 0)
+ {
+ /* No AGI stall if SET_INSN is a push or pop and USE_INSN
+ has SP based memory (unless index reg is modified in a pop). */
+ rtx set = single_set (set_insn);
+ if (set
+ && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
+ || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
+ {
+ struct ix86_address parts;
+ if (ix86_decompose_address (addr, &parts)
+ && parts.base == stack_pointer_rtx
+ && (parts.index == NULL_RTX
+ || MEM_P (SET_DEST (set))
+ || !modified_in_p (parts.index, set_insn)))
+ return false;
+ }
+ return true;
+ }
+ return false;
+ }
+ return false;
+}
+
+/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
+ by DEP_INSN and nothing else set by DEP_INSN. */
+
+static bool
+ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
+{
+ rtx set, set2;
+
+ /* Simplify the test for uninteresting insns. */
+ if (insn_type != TYPE_SETCC
+ && insn_type != TYPE_ICMOV
+ && insn_type != TYPE_FCMOV
+ && insn_type != TYPE_IBR)
+ return false;
+
+ if ((set = single_set (dep_insn)) != 0)
+ {
+ set = SET_DEST (set);
+ set2 = NULL_RTX;
+ }
+ else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
+ && XVECLEN (PATTERN (dep_insn), 0) == 2
+ && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
+ && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
+ {
+ set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
+ set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
+ }
+ else
+ return false;
+
+ if (!REG_P (set) || REGNO (set) != FLAGS_REG)
+ return false;
+
+ /* This test is true if the dependent insn reads the flags but
+ not any other potentially set register. */
+ if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
+ return false;
+
+ if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
+ return false;
+
+ return true;
+}
+
+/* Helper function for exact_store_load_dependency.
+ Return true if ADDR is found in INSN. */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+ enum rtx_code code;
+ const char *format_ptr;
+ int i, j;
+
+ code = GET_CODE (insn);
+ switch (code)
+ {
+ case MEM:
+ if (rtx_equal_p (addr, insn))
+ return true;
+ break;
+ case REG:
+ CASE_CONST_ANY:
+ case SYMBOL_REF:
+ case CODE_LABEL:
+ case PC:
+ case CC0:
+ case EXPR_LIST:
+ return false;
+ default:
+ break;
+ }
+
+ format_ptr = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); i++)
+ {
+ switch (*format_ptr++)
+ {
+ case 'e':
+ if (exact_dependency_1 (addr, XEXP (insn, i)))
+ return true;
+ break;
+ case 'E':
+ for (j = 0; j < XVECLEN (insn, i); j++)
+ if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+ return true;
+ break;
+ }
+ }
+ return false;
+}
+
+/* Return true if there exists exact dependency for store & load, i.e.
+ the same memory address is used in them. */
+static bool
+exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
+{
+ rtx set1, set2;
+
+ set1 = single_set (store);
+ if (!set1)
+ return false;
+ if (!MEM_P (SET_DEST (set1)))
+ return false;
+ set2 = single_set (load);
+ if (!set2)
+ return false;
+ if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+ return true;
+ return false;
+}
+
+
+/* This function corrects the value of COST (latency) based on the relationship
+ between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
+ DW. It should return the new value.
+
+ On x86 CPUs this is most commonly used to model the fact that values of
+ registers used to compute the address of a memory operand need to be ready
+ earlier than values of registers used in the actual operation. */
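+/* E.g. on Core 2/Nehalem/Sandybridge/Haswell the code below discounts
+ by up to 4 cycles a true dependence feeding a load insn whenever
+ the producer is not needed to compute the load's address. */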
+
+int
+ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
+ unsigned int)
+{
+ enum attr_type insn_type, dep_insn_type;
+ enum attr_memory memory;
+ rtx set, set2;
+ int dep_insn_code_number;
+
+ /* Anti and output dependencies have zero cost on all CPUs. */
+ if (dep_type != 0)
+ return 0;
+
+ dep_insn_code_number = recog_memoized (dep_insn);
+
+ /* If we can't recognize the insns, we can't really do anything. */
+ if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
+ return cost;
+
+ insn_type = get_attr_type (insn);
+ dep_insn_type = get_attr_type (dep_insn);
+
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM:
+ case PROCESSOR_LAKEMONT:
+ /* Address Generation Interlock adds a cycle of latency. */
+ if (insn_type == TYPE_LEA)
+ {
+ rtx addr = PATTERN (insn);
+
+ if (GET_CODE (addr) == PARALLEL)
+ addr = XVECEXP (addr, 0, 0);
+
+ gcc_assert (GET_CODE (addr) == SET);
+
+ addr = SET_SRC (addr);
+ if (modified_in_p (addr, dep_insn))
+ cost += 1;
+ }
+ else if (ix86_agi_dependent (dep_insn, insn))
+ cost += 1;
+
+ /* ??? Compares pair with jump/setcc. */
+ if (ix86_flags_dependent (insn, dep_insn, insn_type))
+ cost = 0;
+
+ /* Floating point stores require value to be ready one cycle earlier. */
+ if (insn_type == TYPE_FMOV
+ && get_attr_memory (insn) == MEMORY_STORE
+ && !ix86_agi_dependent (dep_insn, insn))
+ cost += 1;
+ break;
+
+ case PROCESSOR_PENTIUMPRO:
+ /* INT->FP conversion is expensive. */
+ if (get_attr_fp_int_src (dep_insn))
+ cost += 5;
+
+ /* There is one cycle extra latency between an FP op and a store. */
+ if (insn_type == TYPE_FMOV
+ && (set = single_set (dep_insn)) != NULL_RTX
+ && (set2 = single_set (insn)) != NULL_RTX
+ && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
+ && MEM_P (SET_DEST (set2)))
+ cost += 1;
+
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ /* Claim moves to take one cycle, as the core can issue one load
+ at a time and the next load can start a cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV)
+ cost = 1;
+ else if (cost > 1)
+ cost--;
+ }
+ break;
+
+ case PROCESSOR_K6:
+ /* The esp dependency is resolved before
+ the instruction is really finished. */
+ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+ && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+ return 1;
+
+ /* INT->FP conversion is expensive. */
+ if (get_attr_fp_int_src (dep_insn))
+ cost += 5;
+
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ /* Claim moves to take one cycle, as the core can issue one load
+ at a time and the next load can start a cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV)
+ cost = 1;
+ else if (cost > 2)
+ cost -= 2;
+ else
+ cost = 1;
+ }
+ break;
+
+ case PROCESSOR_AMDFAM10:
+ case PROCESSOR_BDVER1:
+ case PROCESSOR_BDVER2:
+ case PROCESSOR_BDVER3:
+ case PROCESSOR_BDVER4:
+ case PROCESSOR_ZNVER1:
+ case PROCESSOR_BTVER1:
+ case PROCESSOR_BTVER2:
+ case PROCESSOR_GENERIC:
+ /* The stack engine allows executing push&pop instructions in parallel. */
+ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+ && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+ return 0;
+ /* FALLTHRU */
+
+ case PROCESSOR_ATHLON:
+ case PROCESSOR_K8:
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ enum attr_unit unit = get_attr_unit (insn);
+ int loadcost = 3;
+
+ /* Because of the difference between the length of integer and
+ floating unit pipeline preparation stages, the memory operands
+ for floating point are cheaper.
+
+ ??? For Athlon the difference is most probably 2. */
+ if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
+ loadcost = 3;
+ else
+ loadcost = TARGET_ATHLON ? 2 : 0;
+
+ if (cost >= loadcost)
+ cost -= loadcost;
+ else
+ cost = 0;
+ }
+ break;
+
+ case PROCESSOR_CORE2:
+ case PROCESSOR_NEHALEM:
+ case PROCESSOR_SANDYBRIDGE:
+ case PROCESSOR_HASWELL:
+ /* The stack engine allows executing push&pop instructions in parallel. */
+ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+ && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+ return 0;
+
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ if (cost >= 4)
+ cost -= 4;
+ else
+ cost = 0;
+ }
+ break;
+
+ case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
+ case PROCESSOR_KNM:
+ case PROCESSOR_INTEL:
+ if (!reload_completed)
+ return cost;
+
+ /* Increase cost of integer loads. */
+ memory = get_attr_memory (dep_insn);
+ if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ {
+ enum attr_unit unit = get_attr_unit (dep_insn);
+ if (unit == UNIT_INTEGER && cost == 1)
+ {
+ if (memory == MEMORY_LOAD)
+ cost = 3;
+ else
+ {
+ /* Increase cost of ld/st for short int types only
+ because of store forwarding issue. */
+ rtx set = single_set (dep_insn);
+ if (set && (GET_MODE (SET_DEST (set)) == QImode
+ || GET_MODE (SET_DEST (set)) == HImode))
+ {
+ /* Increase cost of store/load insn if exact
+ dependence exists and it is load insn. */
+ enum attr_memory insn_memory = get_attr_memory (insn);
+ if (insn_memory == MEMORY_LOAD
+ && exact_store_load_dependency (dep_insn, insn))
+ cost = 3;
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return cost;
+}
+
+/* How many alternative schedules to try. This should be as wide as the
+ scheduling freedom in the DFA, but no wider. Making this value too
+ large results in extra work for the scheduler. */
+
+int
+ia32_multipass_dfa_lookahead (void)
+{
+ /* Generally, we want haifa-sched:max_issue() to look ahead as far
+ as many instructions can be executed on a cycle, i.e.,
+ issue_rate. */
+ if (reload_completed)
+ return ix86_issue_rate ();
+ /* Don't use lookahead for pre-reload schedule to save compile time. */
+ return 0;
+}
+
+/* Return true if target platform supports macro-fusion. */
+
+bool
+ix86_macro_fusion_p ()
+{
+ return TARGET_FUSE_CMP_AND_BRANCH;
+}
+
+/* Check whether the current microarchitecture supports macro fusion
+ for the insn pair "CONDGEN + CONDJMP". Refer to the
+ "Intel Architectures Optimization Reference Manual". */
+
+bool
+ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
+{
+ rtx src, dest;
+ enum rtx_code ccode;
+ rtx compare_set = NULL_RTX, test_if, cond;
+ rtx alu_set = NULL_RTX, addr = NULL_RTX;
+
+ if (!any_condjump_p (condjmp))
+ return false;
+
+ unsigned int condreg1, condreg2;
+ rtx cc_reg_1;
+ targetm.fixed_condition_code_regs (&condreg1, &condreg2);
+ cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+ if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+ || !condgen
+ || !modified_in_p (cc_reg_1, condgen))
+ return false;
+
+ if (get_attr_type (condgen) != TYPE_TEST
+ && get_attr_type (condgen) != TYPE_ICMP
+ && get_attr_type (condgen) != TYPE_INCDEC
+ && get_attr_type (condgen) != TYPE_ALU)
+ return false;
+
+ compare_set = single_set (condgen);
+ if (compare_set == NULL_RTX
+ && !TARGET_FUSE_ALU_AND_BRANCH)
+ return false;
+
+ if (compare_set == NULL_RTX)
+ {
+ int i;
+ rtx pat = PATTERN (condgen);
+ for (i = 0; i < XVECLEN (pat, 0); i++)
+ if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
+ {
+ rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
+ if (GET_CODE (set_src) == COMPARE)
+ compare_set = XVECEXP (pat, 0, i);
+ else
+ alu_set = XVECEXP (pat, 0, i);
+ }
+ }
+ if (compare_set == NULL_RTX)
+ return false;
+ src = SET_SRC (compare_set);
+ if (GET_CODE (src) != COMPARE)
+ return false;
+
+ /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+ supported. */
+ if ((MEM_P (XEXP (src, 0))
+ && CONST_INT_P (XEXP (src, 1)))
+ || (MEM_P (XEXP (src, 1))
+ && CONST_INT_P (XEXP (src, 0))))
+ return false;
+
+ /* No fusion for RIP-relative address. */
+ if (MEM_P (XEXP (src, 0)))
+ addr = XEXP (XEXP (src, 0), 0);
+ else if (MEM_P (XEXP (src, 1)))
+ addr = XEXP (XEXP (src, 1), 0);
+
+ if (addr)
+ {
+ ix86_address parts;
+ int ok = ix86_decompose_address (addr, &parts);
+ gcc_assert (ok);
+
+ if (ix86_rip_relative_addr_p (&parts))
+ return false;
+ }
+
+ test_if = SET_SRC (pc_set (condjmp));
+ cond = XEXP (test_if, 0);
+ ccode = GET_CODE (cond);
+ /* Check whether the conditional jump uses Sign or Overflow Flags. */
+ if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
+ && (ccode == GE
+ || ccode == GT
+ || ccode == LE
+ || ccode == LT))
+ return false;
+
+ /* Return true for TYPE_TEST and TYPE_ICMP. */
+ if (get_attr_type (condgen) == TYPE_TEST
+ || get_attr_type (condgen) == TYPE_ICMP)
+ return true;
+
+ /* The following handles the macro-fusion case of alu + jmp. */
+ if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
+ return false;
+
+ /* No fusion for alu op with memory destination operand. */
+ dest = SET_DEST (alu_set);
+ if (MEM_P (dest))
+ return false;
+
+ /* Macro-fusion for inc/dec + unsigned conditional jump is not
+ supported. */
+ if (get_attr_type (condgen) == TYPE_INCDEC
+ && (ccode == GEU
+ || ccode == GTU
+ || ccode == LEU
+ || ccode == LTU))
+ return false;
+
+ return true;
+}
+