From 64766e8dc78b92fc906e21429b1befd2b248f96e Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Wed, 11 Oct 2017 17:17:23 +0200
Subject: [PATCH] config.gcc (i386, x86_64): Add extra objects.

	* config.gcc (i386, x86_64): Add extra objects.
	* i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
	(ix86_min_insn_size): Declare.
	(ix86_issue_rate): Declare.
	(ix86_adjust_cost): Declare.
	(ia32_multipass_dfa_lookahead): Declare.
	(ix86_macro_fusion_p): Declare.
	(ix86_macro_fusion_pair_p): Declare.
	(ix86_bd_has_dispatch): Declare.
	(ix86_bd_do_dispatch): Declare.
	(ix86_core2i7_init_hooks): Declare.
	(ix86_atom_sched_reorder): Declare.
	* i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
	(COSTS_N_BYTES): Move to x86-tune-costs.h.
	(DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
	(rip_relative_addr_p): Rename to ...
	(ix86_rip_relative_addr_p): ... this one; export.
	(memory_address_length): Update.
	(ix86_issue_rate): Move to x86-tune-sched.c.
	(ix86_flags_dependent): Move to x86-tune-sched.c.
	(ix86_agi_dependent): Move to x86-tune-sched.c.
	(exact_dependency_1): Move to x86-tune-sched.c.
	(exact_store_load_dependency): Move to x86-tune-sched.c.
	(ix86_adjust_cost): Move to x86-tune-sched.c.
	(ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
	(ix86_macro_fusion_p): Move to x86-tune-sched.c.
	(ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
	(do_reorder_for_imul): Move to x86-tune-sched-atom.c.
	(swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
	(ix86_sched_reorder): Move to x86-tune-sched-atom.c.
	(core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
	(core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
	(min_insn_size): Rename to ...
	(ix86_min_insn_size): ... this one; export.
	(core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
	(ix86_sched_init_global): Break up logic to ix86_core2i7_init_hooks.
	(ix86_avoid_jump_mispredicts): Update.
	(TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
	(TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
	(TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
	(DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
	(MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
	(MAX_INSN): Move to x86-tune-sched-bd.c.
	(MAX_IMM): Move to x86-tune-sched-bd.c.
	(MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
	(MAX_IMM_32): Move to x86-tune-sched-bd.c.
	(MAX_IMM_64): Move to x86-tune-sched-bd.c.
	(MAX_LOAD): Move to x86-tune-sched-bd.c.
	(MAX_STORE): Move to x86-tune-sched-bd.c.
	(BIG): Move to x86-tune-sched-bd.c.
	(enum dispatch_group): Move to x86-tune-sched-bd.c.
	(enum insn_path): Move to x86-tune-sched-bd.c.
	(get_mem_group): Move to x86-tune-sched-bd.c.
	(is_cmp): Move to x86-tune-sched-bd.c.
	(dispatch_violation): Move to x86-tune-sched-bd.c.
	(is_branch): Move to x86-tune-sched-bd.c.
	(is_prefetch): Move to x86-tune-sched-bd.c.
	(init_window): Move to x86-tune-sched-bd.c.
	(allocate_window): Move to x86-tune-sched-bd.c.
	(init_dispatch_sched): Move to x86-tune-sched-bd.c.
	(is_end_basic_block): Move to x86-tune-sched-bd.c.
	(process_end_window): Move to x86-tune-sched-bd.c.
	(allocate_next_window): Move to x86-tune-sched-bd.c.
	(find_constant): Move to x86-tune-sched-bd.c.
	(get_num_immediates): Move to x86-tune-sched-bd.c.
	(has_immediate): Move to x86-tune-sched-bd.c.
	(get_insn_path): Move to x86-tune-sched-bd.c.
	(get_insn_group): Move to x86-tune-sched-bd.c.
	(count_num_restricted): Move to x86-tune-sched-bd.c.
	(fits_dispatch_window): Move to x86-tune-sched-bd.c.
	(add_insn_window): Move to x86-tune-sched-bd.c.
	(add_to_dispatch_window): Move to x86-tune-sched-bd.c.
	(debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
	(debug_dispatch_window): Move to x86-tune-sched-bd.c.
	(debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
	(debug_ready_dispatch): Move to x86-tune-sched-bd.c.
	(do_dispatch): Move to x86-tune-sched-bd.c.
	(has_dispatch): Move to x86-tune-sched-bd.c.
	* i386/t-i386: Add new object files.
	* i386/x86-tune-costs.h: New file.
	* i386/x86-tune-sched-atom.c: New file.
	* i386/x86-tune-sched-bd.c: New file.
	* i386/x86-tune-sched-core.c: New file.
	* i386/x86-tune-sched.c: New file.

From-SVN: r253646
---
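A note on the two cost scales used by the tables this patch moves into
x86-tune-costs.h: most tables are in COSTS_N_INSNS units (tuning for speed),
while ix86_size_cost is in COSTS_N_BYTES units (tuning for size).  The
comment kept with the tables assumes COSTS_N_INSNS (N) expands to (N) * 4
and that an addition is 2 bytes, so the two scales agree on the baseline
add.  A minimal standalone sketch of the arithmetic (the two macro bodies
follow rtl.h and the moved header; the program itself is illustrative only,
not part of the patch):

  #include <stdio.h>

  #define COSTS_N_INSNS(N) ((N) * 4)  /* speed scale: N one-cycle insns */
  #define COSTS_N_BYTES(N) ((N) * 2)  /* size scale: N bytes, add = 2 bytes */

  int
  main (void)
  {
    printf ("add, speed scale: %d\n", COSTS_N_INSNS (1)); /* 4 */
    printf ("add, size scale:  %d\n", COSTS_N_BYTES (2)); /* 4 */
    printf ("lea, size scale:  %d\n", COSTS_N_BYTES (3)); /* 6 (3 bytes) */
    return 0;
  }

Both scales yield 4 for a plain add, so a cost of 4 means "one add"
regardless of which table is active.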
 gcc/ChangeLog                         |   91 +
 gcc/config.gcc                        |    2 +
 gcc/config/i386/i386-protos.h         |   16 +
 gcc/config/i386/i386.c                | 3884 +------
 gcc/config/i386/t-i386                |   16 +
 gcc/config/i386/x86-tune-costs.h      | 2083 +++++++++++++
 gcc/config/i386/x86-tune-sched-atom.c |  244 ++
 gcc/config/i386/x86-tune-sched-bd.c   |  822 ++++++
 gcc/config/i386/x86-tune-sched-core.c |  255 ++
 gcc/config/i386/x86-tune-sched.c      |  599 ++++
 10 files changed, 4145 insertions(+), 3867 deletions(-)
 create mode 100644 gcc/config/i386/x86-tune-costs.h
 create mode 100644 gcc/config/i386/x86-tune-sched-atom.c
 create mode 100644 gcc/config/i386/x86-tune-sched-bd.c
 create mode 100644 gcc/config/i386/x86-tune-sched-core.c
 create mode 100644 gcc/config/i386/x86-tune-sched.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3d888e3b645..8d09e103801 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,94 @@
+2017-10-11  Jan Hubicka
+
+	* config.gcc (i386, x86_64): Add extra objects.
+	* i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
+	(ix86_min_insn_size): Declare.
+	(ix86_issue_rate): Declare.
+	(ix86_adjust_cost): Declare.
+	(ia32_multipass_dfa_lookahead): Declare.
+	(ix86_macro_fusion_p): Declare.
+	(ix86_macro_fusion_pair_p): Declare.
+	(ix86_bd_has_dispatch): Declare.
+	(ix86_bd_do_dispatch): Declare.
+	(ix86_core2i7_init_hooks): Declare.
+	(ix86_atom_sched_reorder): Declare.
+	* i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
+	(COSTS_N_BYTES): Move to x86-tune-costs.h.
+	(DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
+	(rip_relative_addr_p): Rename to ...
+	(ix86_rip_relative_addr_p): ... this one; export.
+	(memory_address_length): Update.
+	(ix86_issue_rate): Move to x86-tune-sched.c.
+	(ix86_flags_dependent): Move to x86-tune-sched.c.
+	(ix86_agi_dependent): Move to x86-tune-sched.c.
+	(exact_dependency_1): Move to x86-tune-sched.c.
+	(exact_store_load_dependency): Move to x86-tune-sched.c.
+	(ix86_adjust_cost): Move to x86-tune-sched.c.
+	(ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
+	(ix86_macro_fusion_p): Move to x86-tune-sched.c.
+	(ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
+	(do_reorder_for_imul): Move to x86-tune-sched-atom.c.
+	(swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
+	(ix86_sched_reorder): Move to x86-tune-sched-atom.c.
+	(core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
+	(core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
+	(min_insn_size): Rename to ...
+	(ix86_min_insn_size): ... this one; export.
+	(core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
+	(core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
+	(core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
+	(core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
+	(core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
+	(ix86_sched_init_global): Break up logic to ix86_core2i7_init_hooks.
+	(ix86_avoid_jump_mispredicts): Update.
+	(TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
+	(TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
+	(TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
+	(DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
+	(MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
+	(MAX_INSN): Move to x86-tune-sched-bd.c.
+	(MAX_IMM): Move to x86-tune-sched-bd.c.
+	(MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
+	(MAX_IMM_32): Move to x86-tune-sched-bd.c.
+	(MAX_IMM_64): Move to x86-tune-sched-bd.c.
+	(MAX_LOAD): Move to x86-tune-sched-bd.c.
+	(MAX_STORE): Move to x86-tune-sched-bd.c.
+	(BIG): Move to x86-tune-sched-bd.c.
+	(enum dispatch_group): Move to x86-tune-sched-bd.c.
+	(enum insn_path): Move to x86-tune-sched-bd.c.
+	(get_mem_group): Move to x86-tune-sched-bd.c.
+	(is_cmp): Move to x86-tune-sched-bd.c.
+	(dispatch_violation): Move to x86-tune-sched-bd.c.
+	(is_branch): Move to x86-tune-sched-bd.c.
+	(is_prefetch): Move to x86-tune-sched-bd.c.
+	(init_window): Move to x86-tune-sched-bd.c.
+	(allocate_window): Move to x86-tune-sched-bd.c.
+	(init_dispatch_sched): Move to x86-tune-sched-bd.c.
+	(is_end_basic_block): Move to x86-tune-sched-bd.c.
+	(process_end_window): Move to x86-tune-sched-bd.c.
+	(allocate_next_window): Move to x86-tune-sched-bd.c.
+	(find_constant): Move to x86-tune-sched-bd.c.
+	(get_num_immediates): Move to x86-tune-sched-bd.c.
+	(has_immediate): Move to x86-tune-sched-bd.c.
+	(get_insn_path): Move to x86-tune-sched-bd.c.
+	(get_insn_group): Move to x86-tune-sched-bd.c.
+	(count_num_restricted): Move to x86-tune-sched-bd.c.
+	(fits_dispatch_window): Move to x86-tune-sched-bd.c.
+	(add_insn_window): Move to x86-tune-sched-bd.c.
+	(add_to_dispatch_window): Move to x86-tune-sched-bd.c.
+	(debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
+	(debug_dispatch_window): Move to x86-tune-sched-bd.c.
+	(debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
+	(debug_ready_dispatch): Move to x86-tune-sched-bd.c.
+	(do_dispatch): Move to x86-tune-sched-bd.c.
+	(has_dispatch): Move to x86-tune-sched-bd.c.
+	* i386/t-i386: Add new object files.
+	* i386/x86-tune-costs.h: New file.
+	* i386/x86-tune-sched-atom.c: New file.
+	* i386/x86-tune-sched-bd.c: New file.
+	* i386/x86-tune-sched-core.c: New file.
+	* i386/x86-tune-sched.c: New file.
+
 2017-10-11  Liu Hao
 
 	* pretty-print.c [_WIN32] (colorize_init): Remove.  Use
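The declarations added in the i386-protos.h hunk further below exist so
that i386.c can keep installing these functions in the target vector while
their bodies move to the new x86-tune-sched*.c files.  For orientation,
this is the usual wiring pattern in i386.c (a sketch only; the actual
#define list in i386.c is not changed by this patch and is not shown in
this excerpt):

  /* In i386.c: point the scheduler hooks at the now-external functions
     declared in i386-protos.h.  */
  #undef TARGET_SCHED_ISSUE_RATE
  #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
  #undef TARGET_SCHED_ADJUST_COST
  #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
  #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
  #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
    ia32_multipass_dfa_lookahead
  #undef TARGET_SCHED_MACRO_FUSION_P
  #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
  #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
  #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p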
diff --git a/gcc/config.gcc b/gcc/config.gcc
index c52bebc220a..22702396a9f 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -360,6 +360,7 @@ i[34567]86-*-*)
 	cpu_type=i386
 	c_target_objs="i386-c.o"
 	cxx_target_objs="i386-c.o"
+	extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
 	extra_options="${extra_options} fused-madd.opt"
 	extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
 		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
@@ -384,6 +385,7 @@ x86_64-*-*)
 	c_target_objs="i386-c.o"
 	cxx_target_objs="i386-c.o"
 	extra_options="${extra_options} fused-madd.opt"
+	extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
 	extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
 		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index fbe9f271434..ab3d8f95c5d 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -27,6 +27,7 @@ extern bool ix86_handle_option (struct gcc_options *opts,
 extern bool ix86_target_stack_probe (void);
 extern bool ix86_can_use_return_insn_p (void);
 extern void ix86_setup_frame_addresses (void);
+extern bool ix86_rip_relative_addr_p (struct ix86_address *parts);
 
 extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int);
 extern void ix86_expand_prologue (void);
@@ -314,6 +315,21 @@ extern enum attr_cpu ix86_schedule;
 extern const char * ix86_output_call_insn (rtx_insn *insn, rtx call_op);
 extern bool ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
 						machine_mode mode);
+extern int ix86_min_insn_size (rtx_insn *);
+
+extern int ix86_issue_rate (void);
+extern int ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn,
+			     int cost, unsigned int);
+extern int ia32_multipass_dfa_lookahead (void);
+extern bool ix86_macro_fusion_p (void);
+extern bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp);
+
+extern bool ix86_bd_has_dispatch (rtx_insn *insn, int action);
+extern void ix86_bd_do_dispatch (rtx_insn *insn, int mode);
+
+extern void ix86_core2i7_init_hooks (void);
+
+extern int ix86_atom_sched_reorder (FILE *, int, rtx_insn **, int *, int);
 
 #ifdef RTX_CODE
 /* Target data for multipass lookahead scheduling.
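Most of the i386.c hunk below removes the per-CPU cost tables.  Each table
carries a pair of stringop_algs entries (one for memcpy, one for memset;
index 0 is used for 32-bit code, index 1 for 64-bit code) whose
{max, alg, noalign} triples pick a block-operation expansion strategy by
size, with max == -1 terminating the list.  A self-contained sketch of how
such a table is read (simplified types; the real ones live in i386.h, and
this lookup loop is illustrative, not GCC's code):

  #include <stdio.h>

  enum alg { alg_libcall, alg_rep_4_byte, alg_loop };

  struct strategy { int max; enum alg alg; };

  /* Shape of pentium_memcpy[0] below: rep movsl up to 256 bytes,
     then a library call for anything larger (max == -1).  */
  static const struct strategy pentium_like[] = {
    { 256, alg_rep_4_byte },
    { -1, alg_libcall },
  };

  static enum alg
  pick_alg (const struct strategy *s, int nbytes)
  {
    for (;; s++)
      if (s->max == -1 || nbytes <= s->max)
        return s->alg;
  }

  int
  main (void)
  {
    printf ("%d\n", pick_alg (pentium_like, 100));  /* alg_rep_4_byte */
    printf ("%d\n", pick_alg (pentium_like, 4096)); /* alg_libcall */
    return 0;
  }

DUMMY_STRINGOP_ALGS in the removed code fills the 64-bit slot for CPUs that
have no 64-bit mode; it simply defers everything to a library call.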
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7abda92bf01..63058a8d19c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -92,6 +92,8 @@ along with GCC; see the file COPYING3.  If not see
 /* This file should be included last.  */
 #include "target-def.h"
 
+#include "x86-tune-costs.h"
+
 static rtx legitimize_dllimport_symbol (rtx, bool);
 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
 static rtx legitimize_pe_coff_symbol (rtx, bool);
@@ -111,2094 +113,12 @@ static bool ix86_function_naked (const_tree);
    : (mode) == DImode ? 3 \
    : 4)
 
-/* Processor costs (relative to an add) */
-/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
-#define COSTS_N_BYTES(N) ((N) * 2)
-
-#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
-
-static stringop_algs ix86_size_memcpy[2] = {
-  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
-  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
-static stringop_algs ix86_size_memset[2] = {
-  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
-  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
-
-const
-struct processor_costs ix86_size_cost = {/* costs for tuning for size */
-  COSTS_N_BYTES (2),  /* cost of an add instruction */
-  COSTS_N_BYTES (3),  /* cost of a lea instruction */
-  COSTS_N_BYTES (2),  /* variable shift costs */
-  COSTS_N_BYTES (3),  /* constant shift costs */
-  {COSTS_N_BYTES (3),  /* cost of starting multiply for QI */
-   COSTS_N_BYTES (3),  /* HI */
-   COSTS_N_BYTES (3),  /* SI */
-   COSTS_N_BYTES (3),  /* DI */
-   COSTS_N_BYTES (5)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_BYTES (3),  /* cost of a divide/mod for QI */
-   COSTS_N_BYTES (3),  /* HI */
-   COSTS_N_BYTES (3),  /* SI */
-   COSTS_N_BYTES (3),  /* DI */
-   COSTS_N_BYTES (5)},  /* other */
-  COSTS_N_BYTES (3),  /* cost of movsx */
-  COSTS_N_BYTES (3),  /* cost of movzx */
-  0,  /* "large" insn */
-  2,  /* MOVE_RATIO */
-  2,  /* cost for loading QImode using movzbl */
-  {2, 2, 2},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 2, 2},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {2, 2, 2},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {2, 2, 2},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  3,  /* cost of moving MMX register */
-  {3, 3},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {3, 3},  /* cost of storing MMX registers
-     in SImode and DImode */
-  3,  /* cost of moving SSE register */
-  {3, 3, 3},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {3, 3, 3},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  0,  /* size of l1 cache */
-  0,  /* size of l2 cache */
-  0,  /* size of prefetch block */
-  0,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_BYTES (2),  /* cost of FADD and FSUB insns. */
-  COSTS_N_BYTES (2),  /* cost of FMUL instruction. */
-  COSTS_N_BYTES (2),  /* cost of FDIV instruction. */
-  COSTS_N_BYTES (2),  /* cost of FABS instruction. */
-  COSTS_N_BYTES (2),  /* cost of FCHS instruction. */
-  COSTS_N_BYTES (2),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  ix86_size_memcpy,
-  ix86_size_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  1,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  1,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-/* Processor costs (relative to an add) */
-static stringop_algs i386_memcpy[2] = {
-  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs i386_memset[2] = {
-  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
-  DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs i386_cost = {  /* 386 specific costs */
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (3),  /* variable shift costs */
-  COSTS_N_INSNS (2),  /* constant shift costs */
-  {COSTS_N_INSNS (6),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (6),  /* HI */
-   COSTS_N_INSNS (6),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (23),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (23),  /* HI */
-   COSTS_N_INSNS (23),  /* SI */
-   COSTS_N_INSNS (23),  /* DI */
-   COSTS_N_INSNS (23)},  /* other */
-  COSTS_N_INSNS (3),  /* cost of movsx */
-  COSTS_N_INSNS (2),  /* cost of movzx */
-  15,  /* "large" insn */
-  3,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {2, 4, 2},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 4, 2},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {8, 8, 8},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {8, 8, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {4, 8},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 8},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 8, 16},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 8, 16},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  0,  /* size of l1 cache */
-  0,  /* size of l2 cache */
-  0,  /* size of prefetch block */
-  0,  /* number of parallel prefetches */
-  1,  /* Branch cost */
-  COSTS_N_INSNS (23),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (27),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (88),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (22),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (24),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (122),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  i386_memcpy,
-  i386_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs i486_memcpy[2] = {
-  {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs i486_memset[2] = {
-  {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
-  DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs i486_cost = {  /* 486 specific costs */
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (3),  /* variable shift costs */
-  COSTS_N_INSNS (2),  /* constant shift costs */
-  {COSTS_N_INSNS (12),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (12),  /* HI */
-   COSTS_N_INSNS (12),  /* SI */
-   COSTS_N_INSNS (12),  /* DI */
-   COSTS_N_INSNS (12)},  /* other */
-  1,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (40),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (40),  /* HI */
-   COSTS_N_INSNS (40),  /* SI */
-   COSTS_N_INSNS (40),  /* DI */
-   COSTS_N_INSNS (40)},  /* other */
-  COSTS_N_INSNS (3),  /* cost of movsx */
-  COSTS_N_INSNS (2),  /* cost of movzx */
-  15,  /* "large" insn */
-  3,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {2, 4, 2},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 4, 2},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {8, 8, 8},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {8, 8, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {4, 8},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 8},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 8, 16},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 8, 16},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  4,  /* size of l1 cache.  486 has 8kB cache
-     shared for code and data, so 4kB is
-     not really precise. */
-  4,  /* size of l2 cache */
-  0,  /* size of prefetch block */
-  0,  /* number of parallel prefetches */
-  1,  /* Branch cost */
-  COSTS_N_INSNS (8),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (16),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (73),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (3),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (3),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (83),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  i486_memcpy,
-  i486_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs pentium_memcpy[2] = {
-  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs pentium_memset[2] = {
-  {libcall, {{-1, rep_prefix_4_byte, false}}},
-  DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs pentium_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (4),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (11),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (11),  /* HI */
-   COSTS_N_INSNS (11),  /* SI */
-   COSTS_N_INSNS (11),  /* DI */
-   COSTS_N_INSNS (11)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (25),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (25),  /* HI */
-   COSTS_N_INSNS (25),  /* SI */
-   COSTS_N_INSNS (25),  /* DI */
-   COSTS_N_INSNS (25)},  /* other */
-  COSTS_N_INSNS (3),  /* cost of movsx */
-  COSTS_N_INSNS (2),  /* cost of movzx */
-  8,  /* "large" insn */
-  6,  /* MOVE_RATIO */
-  6,  /* cost for loading QImode using movzbl */
-  {2, 4, 2},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 4, 2},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {2, 2, 6},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 6},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  8,  /* cost of moving MMX register */
-  {8, 8},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {8, 8},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 8, 16},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 8, 16},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  8,  /* size of l1 cache. */
-  8,  /* size of l2 cache */
-  0,  /* size of prefetch block */
-  0,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (3),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (3),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (39),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (1),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (1),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (70),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  pentium_memcpy,
-  pentium_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-static const
-struct processor_costs lakemont_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1) + 1,  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (11),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (11),  /* HI */
-   COSTS_N_INSNS (11),  /* SI */
-   COSTS_N_INSNS (11),  /* DI */
-   COSTS_N_INSNS (11)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (25),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (25),  /* HI */
-   COSTS_N_INSNS (25),  /* SI */
-   COSTS_N_INSNS (25),  /* DI */
-   COSTS_N_INSNS (25)},  /* other */
-  COSTS_N_INSNS (3),  /* cost of movsx */
-  COSTS_N_INSNS (2),  /* cost of movzx */
-  8,  /* "large" insn */
-  17,  /* MOVE_RATIO */
-  6,  /* cost for loading QImode using movzbl */
-  {2, 4, 2},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 4, 2},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {2, 2, 6},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 6},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  8,  /* cost of moving MMX register */
-  {8, 8},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {8, 8},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 8, 16},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 8, 16},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  8,  /* size of l1 cache. */
-  8,  /* size of l2 cache */
-  0,  /* size of prefetch block */
-  0,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (3),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (3),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (39),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (1),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (1),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (70),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  pentium_memcpy,
-  pentium_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
-   (we ensure the alignment).  For small blocks inline loop is still a
-   noticeable win, for bigger blocks either rep movsl or rep movsb is
-   way to go.  Rep movsb has apparently more expensive startup time in CPU,
-   but after 4K the difference is down in the noise.  */
-static stringop_algs pentiumpro_memcpy[2] = {
-  {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
-		       {8192, rep_prefix_4_byte, false},
-		       {-1, rep_prefix_1_byte, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs pentiumpro_memset[2] = {
-  {rep_prefix_4_byte, {{1024, unrolled_loop, false},
-		       {8192, rep_prefix_4_byte, false},
-		       {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs pentiumpro_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (4),  /* DI */
-   COSTS_N_INSNS (4)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (17),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (17),  /* HI */
-   COSTS_N_INSNS (17),  /* SI */
-   COSTS_N_INSNS (17),  /* DI */
-   COSTS_N_INSNS (17)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  6,  /* MOVE_RATIO */
-  2,  /* cost for loading QImode using movzbl */
-  {4, 4, 4},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 2, 2},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {2, 2, 6},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 6},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {2, 2},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {2, 2},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {2, 2, 8},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {2, 2, 8},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  8,  /* size of l1 cache. */
-  256,  /* size of l2 cache */
-  32,  /* size of prefetch block */
-  6,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (3),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (5),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (56),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (56),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  pentiumpro_memcpy,
-  pentiumpro_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs geode_memcpy[2] = {
-  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs geode_memset[2] = {
-  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs geode_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (2),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (7),  /* SI */
-   COSTS_N_INSNS (7),  /* DI */
-   COSTS_N_INSNS (7)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (15),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (23),  /* HI */
-   COSTS_N_INSNS (39),  /* SI */
-   COSTS_N_INSNS (39),  /* DI */
-   COSTS_N_INSNS (39)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  4,  /* MOVE_RATIO */
-  1,  /* cost for loading QImode using movzbl */
-  {1, 1, 1},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {1, 1, 1},  /* cost of storing integer registers */
-  1,  /* cost of reg,reg fld/fst */
-  {1, 1, 1},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 6, 6},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-
-  2,  /* cost of moving MMX register */
-  {2, 2},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {2, 2},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {2, 2, 8},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {2, 2, 8},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  64,  /* size of l1 cache. */
-  128,  /* size of l2 cache. */
-  32,  /* size of prefetch block */
-  1,  /* number of parallel prefetches */
-  1,  /* Branch cost */
-  COSTS_N_INSNS (6),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (11),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (47),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (1),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (1),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (54),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  geode_memcpy,
-  geode_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs k6_memcpy[2] = {
-  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs k6_memset[2] = {
-  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs k6_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (2),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (3),  /* HI */
-   COSTS_N_INSNS (3),  /* SI */
-   COSTS_N_INSNS (3),  /* DI */
-   COSTS_N_INSNS (3)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (18),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (18),  /* HI */
-   COSTS_N_INSNS (18),  /* SI */
-   COSTS_N_INSNS (18),  /* DI */
-   COSTS_N_INSNS (18)},  /* other */
-  COSTS_N_INSNS (2),  /* cost of movsx */
-  COSTS_N_INSNS (2),  /* cost of movzx */
-  8,  /* "large" insn */
-  4,  /* MOVE_RATIO */
-  3,  /* cost for loading QImode using movzbl */
-  {4, 5, 4},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {2, 3, 2},  /* cost of storing integer registers */
-  4,  /* cost of reg,reg fld/fst */
-  {6, 6, 6},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 4},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {2, 2},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {2, 2},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {2, 2, 8},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {2, 2, 8},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  6,  /* MMX or SSE register to integer */
-  32,  /* size of l1 cache. */
-  32,  /* size of l2 cache.  Some models
-     have integrated l2 cache, but
-     optimizing for k6 is not important
-     enough to worry about that. */
-  32,  /* size of prefetch block */
-  1,  /* number of parallel prefetches */
-  1,  /* Branch cost */
-  COSTS_N_INSNS (2),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (2),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (56),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (56),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  k6_memcpy,
-  k6_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-/* For some reason, Athlon deals better with REP prefix (relative to loops)
-   compared to K8. Alignment becomes important after 8 bytes for memcpy and
-   128 bytes for memset.  */
-static stringop_algs athlon_memcpy[2] = {
-  {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static stringop_algs athlon_memset[2] = {
-  {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs athlon_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (2),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (5),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (5),  /* HI */
-   COSTS_N_INSNS (5),  /* SI */
-   COSTS_N_INSNS (5),  /* DI */
-   COSTS_N_INSNS (5)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (18),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (26),  /* HI */
-   COSTS_N_INSNS (42),  /* SI */
-   COSTS_N_INSNS (74),  /* DI */
-   COSTS_N_INSNS (74)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  9,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {3, 4, 3},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {3, 4, 3},  /* cost of storing integer registers */
-  4,  /* cost of reg,reg fld/fst */
-  {4, 4, 12},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {6, 6, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {4, 4},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 4},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 4, 6},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 4, 5},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  5,  /* MMX or SSE register to integer */
-  64,  /* size of l1 cache. */
-  256,  /* size of l2 cache. */
-  64,  /* size of prefetch block */
-  6,  /* number of parallel prefetches */
-  5,  /* Branch cost */
-  COSTS_N_INSNS (4),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (4),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (24),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (35),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  athlon_memcpy,
-  athlon_memset,
-  1,  /* scalar_stmt_cost. */
-  1,  /* scalar load_cost. */
-  1,  /* scalar_store_cost. */
-  1,  /* vec_stmt_cost. */
-  1,  /* vec_to_scalar_cost. */
-  1,  /* scalar_to_vec_cost. */
-  1,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  1,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-/* K8 has optimized REP instruction for medium sized blocks, but for very
-   small blocks it is better to use loop. For large blocks, libcall can
-   do nontemporary accesses and beat inline considerably.  */
-static stringop_algs k8_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-	     {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-static stringop_algs k8_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false},
-	     {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs k8_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (2),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (3),  /* SI */
-   COSTS_N_INSNS (4),  /* DI */
-   COSTS_N_INSNS (5)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (18),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (26),  /* HI */
-   COSTS_N_INSNS (42),  /* SI */
-   COSTS_N_INSNS (74),  /* DI */
-   COSTS_N_INSNS (74)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  9,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {3, 4, 3},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {3, 4, 3},  /* cost of storing integer registers */
-  4,  /* cost of reg,reg fld/fst */
-  {4, 4, 12},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {6, 6, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {3, 3},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 4},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 3, 6},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 4, 5},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  5,  /* MMX or SSE register to integer */
-  64,  /* size of l1 cache. */
-  512,  /* size of l2 cache. */
-  64,  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,  /* number of parallel prefetches */
-  3,  /* Branch cost */
-  COSTS_N_INSNS (4),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (4),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (19),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (35),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  k8_memcpy,
-  k8_memset,
-  4,  /* scalar_stmt_cost. */
-  2,  /* scalar load_cost. */
-  2,  /* scalar_store_cost. */
-  5,  /* vec_stmt_cost. */
-  0,  /* vec_to_scalar_cost. */
-  2,  /* scalar_to_vec_cost. */
-  2,  /* vec_align_load_cost. */
-  3,  /* vec_unalign_load_cost. */
-  3,  /* vec_store_cost. */
-  3,  /* cond_taken_branch_cost. */
-  2,  /* cond_not_taken_branch_cost. */
-};
-
-/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
-   very small blocks it is better to use loop. For large blocks, libcall can
-   do nontemporary accesses and beat inline considerably.  */
-static stringop_algs amdfam10_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-	     {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-static stringop_algs amdfam10_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-struct processor_costs amdfam10_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (2),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (3),  /* SI */
-   COSTS_N_INSNS (4),  /* DI */
-   COSTS_N_INSNS (5)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),  /* HI */
-   COSTS_N_INSNS (51),  /* SI */
-   COSTS_N_INSNS (83),  /* DI */
-   COSTS_N_INSNS (83)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  9,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {3, 4, 3},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {3, 4, 3},  /* cost of storing integer registers */
-  4,  /* cost of reg,reg fld/fst */
-  {4, 4, 12},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {6, 6, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {3, 3},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 4},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 4, 3},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 4, 5},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  3,  /* MMX or SSE register to integer */
-  /* On K8:
-      MOVD reg64, xmmreg Double FSTORE 4
-      MOVD reg32, xmmreg Double FSTORE 4
-     On AMDFAM10:
-      MOVD reg64, xmmreg Double FADD 3
-			       1/1  1/1
-      MOVD reg32, xmmreg Double FADD 3
-			       1/1  1/1 */
-  64,  /* size of l1 cache. */
-  512,  /* size of l2 cache. */
-  64,  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (4),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (4),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (19),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (35),  /* cost of FSQRT instruction. */
-  1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  amdfam10_memcpy,
-  amdfam10_memset,
-  4,  /* scalar_stmt_cost. */
-  2,  /* scalar load_cost. */
-  2,  /* scalar_store_cost. */
-  6,  /* vec_stmt_cost. */
-  0,  /* vec_to_scalar_cost. */
-  2,  /* scalar_to_vec_cost. */
-  2,  /* vec_align_load_cost. */
-  2,  /* vec_unalign_load_cost. */
-  2,  /* vec_store_cost. */
-  2,  /* cond_taken_branch_cost. */
-  1,  /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER1 has optimized REP instruction for medium sized blocks, but for
-   very small blocks it is better to use loop. For large blocks, libcall
-   can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver1_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-	     {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-static stringop_algs bdver1_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-
-const struct processor_costs bdver1_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),  /* HI */
-   COSTS_N_INSNS (51),  /* SI */
-   COSTS_N_INSNS (83),  /* DI */
-   COSTS_N_INSNS (83)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  9,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {5, 5, 4},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {4, 4, 4},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {5, 5, 12},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {4, 4},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 4},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 4, 4},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 4, 4},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  2,  /* MMX or SSE register to integer */
-  /* On K8:
-      MOVD reg64, xmmreg Double FSTORE 4
-      MOVD reg32, xmmreg Double FSTORE 4
-     On AMDFAM10:
-      MOVD reg64, xmmreg Double FADD 3
-			       1/1  1/1
-      MOVD reg32, xmmreg Double FADD 3
-			       1/1  1/1 */
-  16,  /* size of l1 cache. */
-  2048,  /* size of l2 cache. */
-  64,  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (6),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (6),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (42),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (52),  /* cost of FSQRT instruction. */
-  1, 2, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  bdver1_memcpy,
-  bdver1_memset,
-  6,  /* scalar_stmt_cost. */
-  4,  /* scalar load_cost. */
-  4,  /* scalar_store_cost. */
-  6,  /* vec_stmt_cost. */
-  0,  /* vec_to_scalar_cost. */
-  2,  /* scalar_to_vec_cost. */
-  4,  /* vec_align_load_cost. */
-  4,  /* vec_unalign_load_cost. */
-  4,  /* vec_store_cost. */
-  4,  /* cond_taken_branch_cost. */
-  2,  /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER2 has optimized REP instruction for medium sized blocks, but for
-   very small blocks it is better to use loop. For large blocks, libcall
-   can do nontemporary accesses and beat inline considerably.  */
-
-static stringop_algs bdver2_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-	     {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-static stringop_algs bdver2_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-
-const struct processor_costs bdver2_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),  /* HI */
-   COSTS_N_INSNS (51),  /* SI */
-   COSTS_N_INSNS (83),  /* DI */
-   COSTS_N_INSNS (83)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  9,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {5, 5, 4},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {4, 4, 4},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {5, 5, 12},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {4, 4},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 4},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 4, 4},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 4, 4},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  2,  /* MMX or SSE register to integer */
-  /* On K8:
-      MOVD reg64, xmmreg Double FSTORE 4
-      MOVD reg32, xmmreg Double FSTORE 4
-     On AMDFAM10:
-      MOVD reg64, xmmreg Double FADD 3
-			       1/1  1/1
-      MOVD reg32, xmmreg Double FADD 3
-			       1/1  1/1 */
-  16,  /* size of l1 cache. */
-  2048,  /* size of l2 cache. */
-  64,  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (6),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (6),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (42),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (52),  /* cost of FSQRT instruction. */
-  1, 2, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  bdver2_memcpy,
-  bdver2_memset,
-  6,  /* scalar_stmt_cost. */
-  4,  /* scalar load_cost. */
-  4,  /* scalar_store_cost. */
-  6,  /* vec_stmt_cost. */
-  0,  /* vec_to_scalar_cost. */
-  2,  /* scalar_to_vec_cost. */
-  4,  /* vec_align_load_cost. */
-  4,  /* vec_unalign_load_cost. */
-  4,  /* vec_store_cost. */
-  4,  /* cond_taken_branch_cost. */
-  2,  /* cond_not_taken_branch_cost. */
-};
-
-
- /* BDVER3 has optimized REP instruction for medium sized blocks, but for
-    very small blocks it is better to use loop. For large blocks, libcall
-    can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver3_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-	     {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-static stringop_algs bdver3_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-	     {-1, libcall, false}}}};
-struct processor_costs bdver3_cost = {
-  COSTS_N_INSNS (1),  /* cost of an add instruction */
-  COSTS_N_INSNS (1),  /* cost of a lea instruction */
-  COSTS_N_INSNS (1),  /* variable shift costs */
-  COSTS_N_INSNS (1),  /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)},  /* other */
-  0,  /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),  /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),  /* HI */
-   COSTS_N_INSNS (51),  /* SI */
-   COSTS_N_INSNS (83),  /* DI */
-   COSTS_N_INSNS (83)},  /* other */
-  COSTS_N_INSNS (1),  /* cost of movsx */
-  COSTS_N_INSNS (1),  /* cost of movzx */
-  8,  /* "large" insn */
-  9,  /* MOVE_RATIO */
-  4,  /* cost for loading QImode using movzbl */
-  {5, 5, 4},  /* cost of loading integer registers
-     in QImode, HImode and SImode.
-     Relative to reg-reg move (2). */
-  {4, 4, 4},  /* cost of storing integer registers */
-  2,  /* cost of reg,reg fld/fst */
-  {5, 5, 12},  /* cost of loading fp registers
-     in SFmode, DFmode and XFmode */
-  {4, 4, 8},  /* cost of storing fp registers
-     in SFmode, DFmode and XFmode */
-  2,  /* cost of moving MMX register */
-  {4, 4},  /* cost of loading MMX registers
-     in SImode and DImode */
-  {4, 4},  /* cost of storing MMX registers
-     in SImode and DImode */
-  2,  /* cost of moving SSE register */
-  {4, 4, 4},  /* cost of loading SSE registers
-     in SImode, DImode and TImode */
-  {4, 4, 4},  /* cost of storing SSE registers
-     in SImode, DImode and TImode */
-  2,  /* MMX or SSE register to integer */
-  16,  /* size of l1 cache. */
-  2048,  /* size of l2 cache. */
-  64,  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,  /* number of parallel prefetches */
-  2,  /* Branch cost */
-  COSTS_N_INSNS (6),  /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (6),  /* cost of FMUL instruction. */
-  COSTS_N_INSNS (42),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),  /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),  /* cost of FCHS instruction. */
-  COSTS_N_INSNS (52),  /* cost of FSQRT instruction. */
-  1, 2, 1, 1,  /* reassoc int, fp, vec_int, vec_fp. */
-  bdver3_memcpy,
-  bdver3_memset,
-  6,  /* scalar_stmt_cost. */
-  4,  /* scalar load_cost. */
-  4,  /* scalar_store_cost. */
-  6,  /* vec_stmt_cost. */
-  0,  /* vec_to_scalar_cost. */
-  2,  /* scalar_to_vec_cost. */
-  4,  /* vec_align_load_cost. */
-  4,  /* vec_unalign_load_cost. */
-  4,  /* vec_store_cost. */
-  4,  /* cond_taken_branch_cost. */
-  2,  /* cond_not_taken_branch_cost. */
-};
*/ - COSTS_N_INSNS (2), /* cost of FCHS instruction. */ - COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - bdver3_memcpy, - bdver3_memset, - 6, /* scalar_stmt_cost. */ - 4, /* scalar load_cost. */ - 4, /* scalar_store_cost. */ - 6, /* vec_stmt_cost. */ - 0, /* vec_to_scalar_cost. */ - 2, /* scalar_to_vec_cost. */ - 4, /* vec_align_load_cost. */ - 4, /* vec_unalign_load_cost. */ - 4, /* vec_store_cost. */ - 4, /* cond_taken_branch_cost. */ - 2, /* cond_not_taken_branch_cost. */ -}; - -/* BDVER4 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall - can do nontemporary accesses and beat inline considerably. */ -static stringop_algs bdver4_memcpy[2] = { - {libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -static stringop_algs bdver4_memset[2] = { - {libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -struct processor_costs bdver4_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (1), /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ - COSTS_N_INSNS (4), /* HI */ - COSTS_N_INSNS (4), /* SI */ - COSTS_N_INSNS (6), /* DI */ - COSTS_N_INSNS (6)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (35), /* HI */ - COSTS_N_INSNS (51), /* SI */ - COSTS_N_INSNS (83), /* DI */ - COSTS_N_INSNS (83)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {5, 5, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {5, 5, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 4}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 4}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 4}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 2, /* MMX or SSE register to integer */ - 16, /* size of l1 cache. */ - 2048, /* size of l2 cache. */ - 64, /* size of prefetch block */ - /* New AMD processors never drop prefetches; if they cannot be performed - immediately, they are queued. We set number of simultaneous prefetches - to a large constant to reflect this (it probably is not a good idea not - to limit number of prefetches at all, as their execution also takes some - time). */ - 100, /* number of parallel prefetches */ - 2, /* Branch cost */ - COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (6), /* cost of FMUL instruction. */ - COSTS_N_INSNS (42), /* cost of FDIV instruction. */ - COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ - COSTS_N_INSNS (2), /* cost of FCHS instruction. */ - COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - bdver4_memcpy, - bdver4_memset, - 6, /* scalar_stmt_cost. */ - 4, /* scalar load_cost. */ - 4, /* scalar_store_cost. */ - 6, /* vec_stmt_cost. */ - 0, /* vec_to_scalar_cost. */ - 2, /* scalar_to_vec_cost. */ - 4, /* vec_align_load_cost. */ - 4, /* vec_unalign_load_cost. */ - 4, /* vec_store_cost. */ - 4, /* cond_taken_branch_cost. */ - 2, /* cond_not_taken_branch_cost. */ -}; - - -/* ZNVER1 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall - can do nontemporary accesses and beat inline considerably. */ -static stringop_algs znver1_memcpy[2] = { - {libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -static stringop_algs znver1_memset[2] = { - {libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -struct processor_costs znver1_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction. */ - COSTS_N_INSNS (1), /* cost of a lea instruction. */ - COSTS_N_INSNS (1), /* variable shift costs. */ - COSTS_N_INSNS (1), /* constant shift costs. */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ - COSTS_N_INSNS (3), /* HI. */ - COSTS_N_INSNS (3), /* SI. */ - COSTS_N_INSNS (4), /* DI. */ - COSTS_N_INSNS (4)}, /* other. */ - 0, /* cost of multiply per each bit - set. */ - {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */ - COSTS_N_INSNS (35), /* HI. */ - COSTS_N_INSNS (51), /* SI. */ - COSTS_N_INSNS (83), /* DI. */ - COSTS_N_INSNS (83)}, /* other. */ - COSTS_N_INSNS (1), /* cost of movsx. */ - COSTS_N_INSNS (1), /* cost of movzx. */ - 8, /* "large" insn. */ - 9, /* MOVE_RATIO. */ - 4, /* cost for loading QImode using - movzbl. */ - {5, 5, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). */ - {4, 4, 4}, /* cost of storing integer - registers. */ - 2, /* cost of reg,reg fld/fst. */ - {5, 5, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode. */ - {4, 4, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode. */ - 2, /* cost of moving MMX register. */ - {4, 4}, /* cost of loading MMX registers - in SImode and DImode. */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode. */ - 2, /* cost of moving SSE register. */ - {4, 4, 4}, /* cost of loading SSE registers - in SImode, DImode and TImode. */ - {4, 4, 4}, /* cost of storing SSE registers - in SImode, DImode and TImode. */ - 2, /* MMX or SSE register to integer. */ - 32, /* size of l1 cache. */ - 512, /* size of l2 cache. */ - 64, /* size of prefetch block. */ - /* New AMD processors never drop prefetches; if they cannot be performed - immediately, they are queued. We set number of simultaneous prefetches - to a large constant to reflect this (it probably is not a good idea not - to limit number of prefetches at all, as their execution also takes some - time). */ - 100, /* number of parallel prefetches. */ - 3, /* Branch cost. */ - COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (6), /* cost of FMUL instruction. */ - COSTS_N_INSNS (42), /* cost of FDIV instruction. 
*/ - COSTS_N_INSNS (2), /* cost of FABS instruction. */ - COSTS_N_INSNS (2), /* cost of FCHS instruction. */ - COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles - and it can execute 2 integer additions and 2 multiplications thus - reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests - that 4 works better than 6 probably due to register pressure. - - Integer vector operations are taken by FP unit and execute 3 vector - plus/minus operations per cycle but only one multiply. This is adjusted - in ix86_reassociation_width. */ - 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ - znver1_memcpy, - znver1_memset, - 6, /* scalar_stmt_cost. */ - 4, /* scalar load_cost. */ - 4, /* scalar_store_cost. */ - 6, /* vec_stmt_cost. */ - 0, /* vec_to_scalar_cost. */ - 2, /* scalar_to_vec_cost. */ - 4, /* vec_align_load_cost. */ - 4, /* vec_unalign_load_cost. */ - 4, /* vec_store_cost. */ - 4, /* cond_taken_branch_cost. */ - 2, /* cond_not_taken_branch_cost. */ -}; - - /* BTVER1 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall can - do nontemporary accesses and beat inline considerably. */ -static stringop_algs btver1_memcpy[2] = { - {libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -static stringop_algs btver1_memset[2] = { - {libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -const struct processor_costs btver1_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (2), /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (4), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (5)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (35), /* HI */ - COSTS_N_INSNS (51), /* SI */ - COSTS_N_INSNS (83), /* DI */ - COSTS_N_INSNS (83)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {3, 4, 3}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
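The znver1 reassoc line above (4, 4, 3, 6) feeds TARGET_SCHED_REASSOCIATION_WIDTH: it tells the reassociation pass how many independent chains are worth exposing per operation class. A sketch of the selection, assuming the processor_costs field names (the real ix86_reassociation_width additionally narrows the vector-multiply case mentioned in the comment):

static int
width_for (const struct processor_costs *c, machine_mode mode)
{
  /* Field order matches the table: int, fp, vec_int, vec_fp.  */
  if (VECTOR_MODE_P (mode))
    return FLOAT_MODE_P (mode) ? c->reassoc_vec_fp : c->reassoc_vec_int;
  return FLOAT_MODE_P (mode) ? c->reassoc_fp : c->reassoc_int;
}

With width 1 a sum such as a + b + c + d stays a serial chain; with width 4 it can be rebracketed as (a + b) + (c + d) so the two inner additions issue in parallel.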
*/ - {3, 4, 3}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 3}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ - 32, /* size of l1 cache. */ - 512, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 100, /* number of parallel prefetches */ - 2, /* Branch cost */ - COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (4), /* cost of FMUL instruction. */ - COSTS_N_INSNS (19), /* cost of FDIV instruction. */ - COSTS_N_INSNS (2), /* cost of FABS instruction. */ - COSTS_N_INSNS (2), /* cost of FCHS instruction. */ - COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - btver1_memcpy, - btver1_memset, - 4, /* scalar_stmt_cost. */ - 2, /* scalar load_cost. */ - 2, /* scalar_store_cost. */ - 6, /* vec_stmt_cost. */ - 0, /* vec_to_scalar_cost. */ - 2, /* scalar_to_vec_cost. */ - 2, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 2, /* vec_store_cost. */ - 2, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -static stringop_algs btver2_memcpy[2] = { - {libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -static stringop_algs btver2_memset[2] = { - {libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -const struct processor_costs btver2_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (2), /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (4), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (5)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (35), /* HI */ - COSTS_N_INSNS (51), /* SI */ - COSTS_N_INSNS (83), /* DI */ - COSTS_N_INSNS (83)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 9, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {3, 4, 3}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {3, 4, 3}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {4, 4, 3}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {4, 4, 5}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 3, /* MMX or SSE register to integer */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ - 32, /* size of l1 cache. */ - 2048, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 100, /* number of parallel prefetches */ - 2, /* Branch cost */ - COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (4), /* cost of FMUL instruction. */ - COSTS_N_INSNS (19), /* cost of FDIV instruction. */ - COSTS_N_INSNS (2), /* cost of FABS instruction. */ - COSTS_N_INSNS (2), /* cost of FCHS instruction. */ - COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - btver2_memcpy, - btver2_memset, - 4, /* scalar_stmt_cost. */ - 2, /* scalar load_cost. */ - 2, /* scalar_store_cost. */ - 6, /* vec_stmt_cost. */ - 0, /* vec_to_scalar_cost. */ - 2, /* scalar_to_vec_cost. */ - 2, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 2, /* vec_store_cost. */ - 2, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -static stringop_algs pentium4_memcpy[2] = { - {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, - DUMMY_STRINGOP_ALGS}; -static stringop_algs pentium4_memset[2] = { - {libcall, {{6, loop_1_byte, false}, {48, loop, false}, - {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}; - -static const -struct processor_costs pentium4_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (3), /* cost of a lea instruction */ - COSTS_N_INSNS (4), /* variable shift costs */ - COSTS_N_INSNS (4), /* constant shift costs */ - {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ - COSTS_N_INSNS (15), /* HI */ - COSTS_N_INSNS (15), /* SI */ - COSTS_N_INSNS (15), /* DI */ - COSTS_N_INSNS (15)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (56), /* HI */ - COSTS_N_INSNS (56), /* SI */ - COSTS_N_INSNS (56), /* DI */ - COSTS_N_INSNS (56)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 16, /* "large" insn */ - 6, /* MOVE_RATIO */ - 2, /* cost for loading QImode using movzbl */ - {4, 5, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {2, 3, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 6}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 6}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {2, 2}, /* cost of loading MMX registers - in SImode and DImode */ - {2, 2}, /* cost of storing MMX registers - in SImode and DImode */ - 12, /* cost of moving SSE register */ - {12, 12, 12}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {2, 2, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 10, /* MMX or SSE register to integer */ - 8, /* size of l1 cache. */ - 256, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ - 2, /* Branch cost */ - COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (7), /* cost of FMUL instruction. */ - COSTS_N_INSNS (43), /* cost of FDIV instruction. */ - COSTS_N_INSNS (2), /* cost of FABS instruction. */ - COSTS_N_INSNS (2), /* cost of FCHS instruction. */ - COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ - 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - pentium4_memcpy, - pentium4_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -static stringop_algs nocona_memcpy[2] = { - {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, - {100000, unrolled_loop, false}, {-1, libcall, false}}}}; - -static stringop_algs nocona_memset[2] = { - {libcall, {{6, loop_1_byte, false}, {48, loop, false}, - {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {64, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; - -static const -struct processor_costs nocona_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (1), /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ - COSTS_N_INSNS (10), /* HI */ - COSTS_N_INSNS (10), /* SI */ - COSTS_N_INSNS (10), /* DI */ - COSTS_N_INSNS (10)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (66), /* HI */ - COSTS_N_INSNS (66), /* SI */ - COSTS_N_INSNS (66), /* DI */ - COSTS_N_INSNS (66)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 16, /* "large" insn */ - 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 3, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 4}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 6, /* cost of moving MMX register */ - {12, 12}, /* cost of loading MMX registers - in SImode and DImode */ - {12, 12}, /* cost of storing MMX registers - in SImode and DImode */ - 6, /* cost of moving SSE register */ - {12, 12, 12}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {12, 12, 12}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 8, /* MMX or SSE register to integer */ - 8, /* size of l1 cache. */ - 1024, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 8, /* number of parallel prefetches */ - 1, /* Branch cost */ - COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (8), /* cost of FMUL instruction. */ - COSTS_N_INSNS (40), /* cost of FDIV instruction. */ - COSTS_N_INSNS (3), /* cost of FABS instruction. */ - COSTS_N_INSNS (3), /* cost of FCHS instruction. */ - COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ - 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - nocona_memcpy, - nocona_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -static stringop_algs atom_memcpy[2] = { - {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; -static stringop_algs atom_memset[2] = { - {libcall, {{8, loop, false}, {15, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {32, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; -static const -struct processor_costs atom_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (4), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (2)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (26), /* HI */ - COSTS_N_INSNS (42), /* SI */ - COSTS_N_INSNS (74), /* DI */ - COSTS_N_INSNS (74)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
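All of these entries are expressed in the rtl.h cost unit, which is what makes sub-instruction costs such as atom's lea representable:

/* rtl.h scales instruction counts by 4, so the unit is a quarter of
   an add:  #define COSTS_N_INSNS(N) ((N) * 4)
   atom_cost's lea entry, COSTS_N_INSNS (1) + 1, is therefore 5 units,
   i.e. 1.25 add-equivalents: lea is mildly discouraged without being
   priced like two instructions.  */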
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ - 32, /* size of l1 cache. */ - 256, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ - 3, /* Branch cost */ - COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (8), /* cost of FMUL instruction. */ - COSTS_N_INSNS (20), /* cost of FDIV instruction. */ - COSTS_N_INSNS (8), /* cost of FABS instruction. */ - COSTS_N_INSNS (8), /* cost of FCHS instruction. */ - COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ - atom_memcpy, - atom_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -static stringop_algs slm_memcpy[2] = { - {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; -static stringop_algs slm_memset[2] = { - {libcall, {{8, loop, false}, {15, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {32, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; -static const -struct processor_costs slm_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (3), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (2)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (26), /* HI */ - COSTS_N_INSNS (42), /* SI */ - COSTS_N_INSNS (74), /* DI */ - COSTS_N_INSNS (74)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ - 32, /* size of l1 cache. */ - 256, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ - 3, /* Branch cost */ - COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (8), /* cost of FMUL instruction. */ - COSTS_N_INSNS (20), /* cost of FDIV instruction. */ - COSTS_N_INSNS (8), /* cost of FABS instruction. */ - COSTS_N_INSNS (8), /* cost of FCHS instruction. */ - COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - slm_memcpy, - slm_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 4, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -static stringop_algs intel_memcpy[2] = { - {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; -static stringop_algs intel_memset[2] = { - {libcall, {{8, loop, false}, {15, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {32, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; -static const -struct processor_costs intel_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (3), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (2)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (26), /* HI */ - COSTS_N_INSNS (42), /* SI */ - COSTS_N_INSNS (74), /* DI */ - COSTS_N_INSNS (74)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ - 32, /* size of l1 cache. */ - 256, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ - 3, /* Branch cost */ - COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (8), /* cost of FMUL instruction. */ - COSTS_N_INSNS (20), /* cost of FDIV instruction. */ - COSTS_N_INSNS (8), /* cost of FABS instruction. */ - COSTS_N_INSNS (8), /* cost of FCHS instruction. */ - COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - intel_memcpy, - intel_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 4, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -/* Generic should produce code tuned for Core-i7 (and newer chips) - and btver1 (and newer chips). */ - -static stringop_algs generic_memcpy[2] = { - {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, - {-1, libcall, false}}}, - {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -static stringop_algs generic_memset[2] = { - {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, - {-1, libcall, false}}}, - {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}; -static const -struct processor_costs generic_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - /* On all chips taken into consideration lea is 2 cycles and more. With - this cost however our current implementation of synth_mult results in - use of unnecessary temporary registers causing regression on several - SPECfp benchmarks. */ - COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (4), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (2)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (26), /* HI */ - COSTS_N_INSNS (42), /* SI */ - COSTS_N_INSNS (74), /* DI */ - COSTS_N_INSNS (74)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ - 32, /* size of l1 cache. */ - 512, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ - /* Benchmarks shows large regressions on K8 sixtrack benchmark when this - value is increased to perhaps more appropriate value of 5. */ - 3, /* Branch cost */ - COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (8), /* cost of FMUL instruction. */ - COSTS_N_INSNS (20), /* cost of FDIV instruction. */ - COSTS_N_INSNS (8), /* cost of FABS instruction. */ - COSTS_N_INSNS (8), /* cost of FCHS instruction. */ - COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ - generic_memcpy, - generic_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - -/* core_cost should produce code tuned for Core familly of CPUs. */ -static stringop_algs core_memcpy[2] = { - {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, - {-1, libcall, false}}}}; -static stringop_algs core_memset[2] = { - {libcall, {{6, loop_1_byte, true}, - {24, loop, true}, - {8192, rep_prefix_4_byte, true}, - {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, - {-1, libcall, false}}}}; - -static const -struct processor_costs core_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction */ - /* On all chips taken into consideration lea is 2 cycles and more. With - this cost however our current implementation of synth_mult results in - use of unnecessary temporary registers causing regression on several - SPECfp benchmarks. */ - COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ - COSTS_N_INSNS (1), /* variable shift costs */ - COSTS_N_INSNS (1), /* constant shift costs */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ - COSTS_N_INSNS (4), /* HI */ - COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (2)}, /* other */ - 0, /* cost of multiply per each bit set */ - {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ - COSTS_N_INSNS (26), /* HI */ - COSTS_N_INSNS (42), /* SI */ - COSTS_N_INSNS (74), /* DI */ - COSTS_N_INSNS (74)}, /* other */ - COSTS_N_INSNS (1), /* cost of movsx */ - COSTS_N_INSNS (1), /* cost of movzx */ - 8, /* "large" insn */ - 17, /* MOVE_RATIO */ - 4, /* cost for loading QImode using movzbl */ - {4, 4, 4}, /* cost of loading integer registers - in QImode, HImode and SImode. - Relative to reg-reg move (2). 
*/ - {4, 4, 4}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, /* cost of moving SSE register */ - {8, 8, 8}, /* cost of loading SSE registers - in SImode, DImode and TImode */ - {8, 8, 8}, /* cost of storing SSE registers - in SImode, DImode and TImode */ - 5, /* MMX or SSE register to integer */ - 64, /* size of l1 cache. */ - 512, /* size of l2 cache. */ - 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ - /* FIXME perhaps more appropriate value is 5. */ - 3, /* Branch cost */ - COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ - COSTS_N_INSNS (8), /* cost of FMUL instruction. */ - COSTS_N_INSNS (20), /* cost of FDIV instruction. */ - COSTS_N_INSNS (8), /* cost of FABS instruction. */ - COSTS_N_INSNS (8), /* cost of FCHS instruction. */ - COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ - core_memcpy, - core_memset, - 1, /* scalar_stmt_cost. */ - 1, /* scalar load_cost. */ - 1, /* scalar_store_cost. */ - 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ - 1, /* scalar_to_vec_cost. */ - 1, /* vec_align_load_cost. */ - 2, /* vec_unalign_load_cost. */ - 1, /* vec_store_cost. */ - 3, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ -}; - /* Set by -mtune. */ -const struct processor_costs *ix86_tune_cost = &pentium_cost; +const struct processor_costs *ix86_tune_cost = NULL; /* Set by -mtune or -Os. */ -const struct processor_costs *ix86_cost = &pentium_cost; +const struct processor_costs *ix86_cost = NULL; /* Processor feature/optimization bitmasks. */ #define m_386 (1U<= 0; --i) - if (MEM_P (recog_data.operand[i])) - { - rtx addr = XEXP (recog_data.operand[i], 0); - if (modified_in_p (addr, set_insn) != 0) - { - /* No AGI stall if SET_INSN is a push or pop and USE_INSN - has SP based memory (unless index reg is modified in a pop). */ - rtx set = single_set (set_insn); - if (set - && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))) - || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set))))) - { - struct ix86_address parts; - if (ix86_decompose_address (addr, &parts) - && parts.base == stack_pointer_rtx - && (parts.index == NULL_RTX - || MEM_P (SET_DEST (set)) - || !modified_in_p (parts.index, set_insn))) - return false; - } - return true; - } - return false; - } - return false; -} - -/* Helper function for exact_store_load_dependency. - Return true if addr is found in insn. 
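Concretely, the Pentium/Lakemont AGI check earlier in this hunk models a one-cycle address-generation interlock, with the push/pop carve-out decided via ix86_decompose_address:

/* Hypothetical example of the stall being priced:
     add  $4, %ebx
     mov  (%ebx), %eax   ; +1 cycle: the address needs the %ebx value
                         ; written by the immediately preceding insn.
   After a push or pop, however, %esp-relative addresses are exempt
   (unless a pop modifies the index register): the stack-pointer
   update is resolved early, so no AGI penalty is charged.  */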
*/ -static bool -exact_dependency_1 (rtx addr, rtx insn) -{ - enum rtx_code code; - const char *format_ptr; - int i, j; - - code = GET_CODE (insn); - switch (code) - { - case MEM: - if (rtx_equal_p (addr, insn)) - return true; - break; - case REG: - CASE_CONST_ANY: - case SYMBOL_REF: - case CODE_LABEL: - case PC: - case CC0: - case EXPR_LIST: - return false; - default: - break; - } - - format_ptr = GET_RTX_FORMAT (code); - for (i = 0; i < GET_RTX_LENGTH (code); i++) - { - switch (*format_ptr++) - { - case 'e': - if (exact_dependency_1 (addr, XEXP (insn, i))) - return true; - break; - case 'E': - for (j = 0; j < XVECLEN (insn, i); j++) - if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) - return true; - break; - } - } - return false; -} - -/* Return true if there exists exact dependency for store & load, i.e. - the same memory address is used in them. */ -static bool -exact_store_load_dependency (rtx_insn *store, rtx_insn *load) -{ - rtx set1, set2; - - set1 = single_set (store); - if (!set1) - return false; - if (!MEM_P (SET_DEST (set1))) - return false; - set2 = single_set (load); - if (!set2) - return false; - if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) - return true; - return false; -} - -static int -ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, - unsigned int) -{ - enum attr_type insn_type, dep_insn_type; - enum attr_memory memory; - rtx set, set2; - int dep_insn_code_number; - - /* Anti and output dependencies have zero cost on all CPUs. */ - if (dep_type != 0) - return 0; - - dep_insn_code_number = recog_memoized (dep_insn); - - /* If we can't recognize the insns, we can't really do anything. */ - if (dep_insn_code_number < 0 || recog_memoized (insn) < 0) - return cost; - - insn_type = get_attr_type (insn); - dep_insn_type = get_attr_type (dep_insn); - - switch (ix86_tune) - { - case PROCESSOR_PENTIUM: - case PROCESSOR_LAKEMONT: - /* Address Generation Interlock adds a cycle of latency. */ - if (insn_type == TYPE_LEA) - { - rtx addr = PATTERN (insn); - - if (GET_CODE (addr) == PARALLEL) - addr = XVECEXP (addr, 0, 0); - - gcc_assert (GET_CODE (addr) == SET); - - addr = SET_SRC (addr); - if (modified_in_p (addr, dep_insn)) - cost += 1; - } - else if (ix86_agi_dependent (dep_insn, insn)) - cost += 1; - - /* ??? Compares pair with jump/setcc. */ - if (ix86_flags_dependent (insn, dep_insn, insn_type)) - cost = 0; - - /* Floating point stores require value to be ready one cycle earlier. */ - if (insn_type == TYPE_FMOV - && get_attr_memory (insn) == MEMORY_STORE - && !ix86_agi_dependent (dep_insn, insn)) - cost += 1; - break; - - case PROCESSOR_PENTIUMPRO: - /* INT->FP conversion is expensive. */ - if (get_attr_fp_int_src (dep_insn)) - cost += 5; - - /* There is one cycle extra latency between an FP op and a store. */ - if (insn_type == TYPE_FMOV - && (set = single_set (dep_insn)) != NULL_RTX - && (set2 = single_set (insn)) != NULL_RTX - && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) - && MEM_P (SET_DEST (set2))) - cost += 1; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - /* Claim moves to take one cycle, as core can issue one load - at time and the next load can start cycle later. 
*/ - if (dep_insn_type == TYPE_IMOV - || dep_insn_type == TYPE_FMOV) - cost = 1; - else if (cost > 1) - cost--; - } - break; - - case PROCESSOR_K6: - /* The esp dependency is resolved before - the instruction is really finished. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 1; - - /* INT->FP conversion is expensive. */ - if (get_attr_fp_int_src (dep_insn)) - cost += 5; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - /* Claim moves to take one cycle, as core can issue one load - at time and the next load can start cycle later. */ - if (dep_insn_type == TYPE_IMOV - || dep_insn_type == TYPE_FMOV) - cost = 1; - else if (cost > 2) - cost -= 2; - else - cost = 1; - } - break; - - case PROCESSOR_AMDFAM10: - case PROCESSOR_BDVER1: - case PROCESSOR_BDVER2: - case PROCESSOR_BDVER3: - case PROCESSOR_BDVER4: - case PROCESSOR_ZNVER1: - case PROCESSOR_BTVER1: - case PROCESSOR_BTVER2: - case PROCESSOR_GENERIC: - /* Stack engine allows to execute push&pop instructions in parall. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 0; - /* FALLTHRU */ - - case PROCESSOR_ATHLON: - case PROCESSOR_K8: - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - enum attr_unit unit = get_attr_unit (insn); - int loadcost = 3; - - /* Because of the difference between the length of integer and - floating unit pipeline preparation stages, the memory operands - for floating point are cheaper. - - ??? For Athlon it the difference is most probably 2. */ - if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) - loadcost = 3; - else - loadcost = TARGET_ATHLON ? 2 : 0; - - if (cost >= loadcost) - cost -= loadcost; - else - cost = 0; - } - break; - - case PROCESSOR_CORE2: - case PROCESSOR_NEHALEM: - case PROCESSOR_SANDYBRIDGE: - case PROCESSOR_HASWELL: - /* Stack engine allows to execute push&pop instructions in parall. */ - if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) - && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) - return 0; - - memory = get_attr_memory (insn); - - /* Show ability of reorder buffer to hide latency of load by executing - in parallel with previous instruction in case - previous instruction is not needed to compute the address. */ - if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (dep_insn, insn)) - { - if (cost >= 4) - cost -= 4; - else - cost = 0; - } - break; - - case PROCESSOR_SILVERMONT: - case PROCESSOR_KNL: - case PROCESSOR_KNM: - case PROCESSOR_INTEL: - if (!reload_completed) - return cost; - - /* Increase cost of integer loads. 
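The Silvermont/KNL/Intel case continued below raises the cost to 3 when a narrow store feeds a load; the hazard being modeled is store-to-load forwarding failing on sub-word accesses:

/* Shape of the dependence the cost bump targets (illustrative only):
     mov  %al, -4(%rsp)    ; QImode store
     mov  -4(%rsp), %ebx   ; load hitting the same slot shortly after
   exact_store_load_dependency recognizes the matching address, and
   the 3-cycle edge keeps the scheduler from pairing the two insns.  */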
*/ - memory = get_attr_memory (dep_insn); - if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) - { - enum attr_unit unit = get_attr_unit (dep_insn); - if (unit == UNIT_INTEGER && cost == 1) - { - if (memory == MEMORY_LOAD) - cost = 3; - else - { - /* Increase cost of ld/st for short int types only - because of store forwarding issue. */ - rtx set = single_set (dep_insn); - if (set && (GET_MODE (SET_DEST (set)) == QImode - || GET_MODE (SET_DEST (set)) == HImode)) - { - /* Increase cost of store/load insn if exact - dependence exists and it is load insn. */ - enum attr_memory insn_memory = get_attr_memory (insn); - if (insn_memory == MEMORY_LOAD - && exact_store_load_dependency (dep_insn, insn)) - cost = 3; - } - } - } - } - - default: - break; - } - - return cost; -} - -/* How many alternative schedules to try. This should be as wide as the - scheduling freedom in the DFA, but no wider. Making this value too - large results extra work for the scheduler. */ - -static int -ia32_multipass_dfa_lookahead (void) -{ - /* Generally, we want haifa-sched:max_issue() to look ahead as far - as many instructions can be executed on a cycle, i.e., - issue_rate. */ - if (reload_completed) - return ix86_issue_rate (); - /* Don't use lookahead for pre-reload schedule to save compile time. */ - return 0; -} - -/* Return true if target platform supports macro-fusion. */ - -static bool -ix86_macro_fusion_p () -{ - return TARGET_FUSE_CMP_AND_BRANCH; -} - -/* Check whether current microarchitecture support macro fusion - for insn pair "CONDGEN + CONDJMP". Refer to - "Intel Architectures Optimization Reference Manual". */ - -static bool -ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) -{ - rtx src, dest; - enum rtx_code ccode; - rtx compare_set = NULL_RTX, test_if, cond; - rtx alu_set = NULL_RTX, addr = NULL_RTX; - - if (!any_condjump_p (condjmp)) - return false; - - unsigned int condreg1, condreg2; - rtx cc_reg_1; - ix86_fixed_condition_code_regs (&condreg1, &condreg2); - cc_reg_1 = gen_rtx_REG (CCmode, condreg1); - if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp)) - || !condgen - || !modified_in_p (cc_reg_1, condgen)) - return false; - - if (get_attr_type (condgen) != TYPE_TEST - && get_attr_type (condgen) != TYPE_ICMP - && get_attr_type (condgen) != TYPE_INCDEC - && get_attr_type (condgen) != TYPE_ALU) - return false; - - compare_set = single_set (condgen); - if (compare_set == NULL_RTX - && !TARGET_FUSE_ALU_AND_BRANCH) - return false; - - if (compare_set == NULL_RTX) - { - int i; - rtx pat = PATTERN (condgen); - for (i = 0; i < XVECLEN (pat, 0); i++) - if (GET_CODE (XVECEXP (pat, 0, i)) == SET) - { - rtx set_src = SET_SRC (XVECEXP (pat, 0, i)); - if (GET_CODE (set_src) == COMPARE) - compare_set = XVECEXP (pat, 0, i); - else - alu_set = XVECEXP (pat, 0, i); - } - } - if (compare_set == NULL_RTX) - return false; - src = SET_SRC (compare_set); - if (GET_CODE (src) != COMPARE) - return false; - - /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not - supported. */ - if ((MEM_P (XEXP (src, 0)) - && CONST_INT_P (XEXP (src, 1))) - || (MEM_P (XEXP (src, 1)) - && CONST_INT_P (XEXP (src, 0)))) - return false; - - /* No fusion for RIP-relative address. 
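In assembly terms the predicate so far admits flag-setter/jump pairs of the first kind below and rejects the others (the signed-condition case is checked just after this):

/* Fuses into a single uop on Core i7 and later:
     test %eax, %eax
     jne  .L1
   Rejected by the checks above:
     cmpl $1, (%rax)        ; cmp/test with MEM and IMM operands
     cmpl %ecx, foo(%rip)   ; RIP-relative memory operand
   and, on !TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS targets, jl/jg/jle/jge
   after the flag setter, since those conditions read SF and OF.  */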
*/ - if (MEM_P (XEXP (src, 0))) - addr = XEXP (XEXP (src, 0), 0); - else if (MEM_P (XEXP (src, 1))) - addr = XEXP (XEXP (src, 1), 0); - - if (addr) { - ix86_address parts; - int ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - - if (rip_relative_addr_p (&parts)) - return false; - } - - test_if = SET_SRC (pc_set (condjmp)); - cond = XEXP (test_if, 0); - ccode = GET_CODE (cond); - /* Check whether conditional jump use Sign or Overflow Flags. */ - if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS - && (ccode == GE - || ccode == GT - || ccode == LE - || ccode == LT)) - return false; - - /* Return true for TYPE_TEST and TYPE_ICMP. */ - if (get_attr_type (condgen) == TYPE_TEST - || get_attr_type (condgen) == TYPE_ICMP) - return true; - - /* The following is the case that macro-fusion for alu + jmp. */ - if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set) - return false; - - /* No fusion for alu op with memory destination operand. */ - dest = SET_DEST (alu_set); - if (MEM_P (dest)) - return false; - - /* Macro-fusion for inc/dec + unsigned conditional jump is not - supported. */ - if (get_attr_type (condgen) == TYPE_INCDEC - && (ccode == GEU - || ccode == GTU - || ccode == LEU - || ccode == LTU)) - return false; - - return true; -} - -/* Try to reorder ready list to take advantage of Atom pipelined IMUL - execution. It is applied if - (1) IMUL instruction is on the top of list; - (2) There exists the only producer of independent IMUL instruction in - ready list. - Return index of IMUL producer if it was found and -1 otherwise. */ -static int -do_reorder_for_imul (rtx_insn **ready, int n_ready) -{ - rtx_insn *insn; - rtx set, insn1, insn2; - sd_iterator_def sd_it; - dep_t dep; - int index = -1; - int i; - - if (!TARGET_BONNELL) - return index; - - /* Check that IMUL instruction is on the top of ready list. */ - insn = ready[n_ready - 1]; - set = single_set (insn); - if (!set) - return index; - if (!(GET_CODE (SET_SRC (set)) == MULT - && GET_MODE (SET_SRC (set)) == SImode)) - return index; - - /* Search for producer of independent IMUL instruction. */ - for (i = n_ready - 2; i >= 0; i--) - { - insn = ready[i]; - if (!NONDEBUG_INSN_P (insn)) - continue; - /* Skip IMUL instruction. */ - insn2 = PATTERN (insn); - if (GET_CODE (insn2) == PARALLEL) - insn2 = XVECEXP (insn2, 0, 0); - if (GET_CODE (insn2) == SET - && GET_CODE (SET_SRC (insn2)) == MULT - && GET_MODE (SET_SRC (insn2)) == SImode) - continue; - - FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep) - { - rtx con; - con = DEP_CON (dep); - if (!NONDEBUG_INSN_P (con)) - continue; - insn1 = PATTERN (con); - if (GET_CODE (insn1) == PARALLEL) - insn1 = XVECEXP (insn1, 0, 0); - - if (GET_CODE (insn1) == SET - && GET_CODE (SET_SRC (insn1)) == MULT - && GET_MODE (SET_SRC (insn1)) == SImode) - { - sd_iterator_def sd_it1; - dep_t dep1; - /* Check if there is no other dependee for IMUL. */ - index = i; - FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1) - { - rtx pro; - pro = DEP_PRO (dep1); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (pro != insn) - index = -1; - } - if (index >= 0) - break; - } - } - if (index >= 0) - break; - } - return index; -} - -/* Try to find the best candidate on the top of ready list if two insns - have the same priority - candidate is best if its dependees were - scheduled earlier. Applied for Silvermont only. - Return true if top 2 insns must be interchanged. 
*/ -static bool -swap_top_of_ready_list (rtx_insn **ready, int n_ready) -{ - rtx_insn *top = ready[n_ready - 1]; - rtx_insn *next = ready[n_ready - 2]; - rtx set; - sd_iterator_def sd_it; - dep_t dep; - int clock1 = -1; - int clock2 = -1; - #define INSN_TICK(INSN) (HID (INSN)->tick) - - if (!TARGET_SILVERMONT && !TARGET_INTEL) - return false; - - if (!NONDEBUG_INSN_P (top)) - return false; - if (!NONJUMP_INSN_P (top)) - return false; - if (!NONDEBUG_INSN_P (next)) - return false; - if (!NONJUMP_INSN_P (next)) - return false; - set = single_set (top); - if (!set) - return false; - set = single_set (next); - if (!set) - return false; - - if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next)) - { - if (INSN_PRIORITY (top) != INSN_PRIORITY (next)) - return false; - /* Determine winner more precise. */ - FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep) - { - rtx pro; - pro = DEP_PRO (dep); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (INSN_TICK (pro) > clock1) - clock1 = INSN_TICK (pro); - } - FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep) - { - rtx pro; - pro = DEP_PRO (dep); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (INSN_TICK (pro) > clock2) - clock2 = INSN_TICK (pro); - } - - if (clock1 == clock2) - { - /* Determine winner - load must win. */ - enum attr_memory memory1, memory2; - memory1 = get_attr_memory (top); - memory2 = get_attr_memory (next); - if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD) - return true; - } - return (bool) (clock2 < clock1); - } - return false; - #undef INSN_TICK -} - -/* Perform possible reodering of ready list for Atom/Silvermont only. - Return issue rate. */ -static int -ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready, - int *pn_ready, int clock_var) -{ - int issue_rate = -1; - int n_ready = *pn_ready; - int i; - rtx_insn *insn; - int index = -1; - - /* Set up issue rate. */ - issue_rate = ix86_issue_rate (); - - /* Do reodering for BONNELL/SILVERMONT only. */ - if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL) - return issue_rate; - - /* Nothing to do if ready list contains only 1 instruction. */ - if (n_ready <= 1) - return issue_rate; - - /* Do reodering for post-reload scheduler only. */ - if (!reload_completed) - return issue_rate; - - if ((index = do_reorder_for_imul (ready, n_ready)) >= 0) - { - if (sched_verbose > 1) - fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n", - INSN_UID (ready[index])); - - /* Put IMUL producer (ready[index]) at the top of ready list. */ - insn = ready[index]; - for (i = index; i < n_ready - 1; i++) - ready[i] = ready[i + 1]; - ready[n_ready - 1] = insn; - return issue_rate; - } - - /* Skip selective scheduling since HID is not populated in it. */ - if (clock_var != 0 - && !sel_sched_p () - && swap_top_of_ready_list (ready, n_ready)) - { - if (sched_verbose > 1) - fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n", - INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2])); - /* Swap 2 top elements of ready list. */ - insn = ready[n_ready - 1]; - ready[n_ready - 1] = ready[n_ready - 2]; - ready[n_ready - 2] = insn; - } - return issue_rate; -} static bool ix86_class_likely_spilled_p (reg_class_t); @@ -31330,204 +28483,6 @@ ix86_adjust_priority (rtx_insn *insn, int priority) return priority; } -/* Model decoder of Core 2/i7. - Below hooks for multipass scheduling (see haifa-sched.c:max_issue) - track the instruction fetch block boundaries and make sure that long - (9+ bytes) instructions are assigned to D0. 
*/ - -/* Maximum length of an insn that can be handled by - a secondary decoder unit. '8' for Core 2/i7. */ -static int core2i7_secondary_decoder_max_insn_size; - -/* Ifetch block size, i.e., number of bytes decoder reads per cycle. - '16' for Core 2/i7. */ -static int core2i7_ifetch_block_size; - -/* Maximum number of instructions decoder can handle per cycle. - '6' for Core 2/i7. */ -static int core2i7_ifetch_block_max_insns; - -typedef struct ix86_first_cycle_multipass_data_ * - ix86_first_cycle_multipass_data_t; -typedef const struct ix86_first_cycle_multipass_data_ * - const_ix86_first_cycle_multipass_data_t; - -/* A variable to store target state across calls to max_issue within - one cycle. */ -static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data, - *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data; - -/* Initialize DATA. */ -static void -core2i7_first_cycle_multipass_init (void *_data) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - - data->ifetch_block_len = 0; - data->ifetch_block_n_insns = 0; - data->ready_try_change = NULL; - data->ready_try_change_size = 0; -} - -/* Advancing the cycle; reset ifetch block counts. */ -static void -core2i7_dfa_post_advance_cycle (void) -{ - ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data; - - gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns); - - data->ifetch_block_len = 0; - data->ifetch_block_n_insns = 0; -} - -static int min_insn_size (rtx_insn *); - -/* Filter out insns from ready_try that the core will not be able to issue - on current cycle due to decoder. */ -static void -core2i7_first_cycle_multipass_filter_ready_try -(const_ix86_first_cycle_multipass_data_t data, - signed char *ready_try, int n_ready, bool first_cycle_insn_p) -{ - while (n_ready--) - { - rtx_insn *insn; - int insn_size; - - if (ready_try[n_ready]) - continue; - - insn = get_ready_element (n_ready); - insn_size = min_insn_size (insn); - - if (/* If this is a too long an insn for a secondary decoder ... */ - (!first_cycle_insn_p - && insn_size > core2i7_secondary_decoder_max_insn_size) - /* ... or it would not fit into the ifetch block ... */ - || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size - /* ... or the decoder is full already ... */ - || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns) - /* ... mask the insn out. */ - { - ready_try[n_ready] = 1; - - if (data->ready_try_change) - bitmap_set_bit (data->ready_try_change, n_ready); - } - } -} - -/* Prepare for a new round of multipass lookahead scheduling. */ -static void -core2i7_first_cycle_multipass_begin (void *_data, - signed char *ready_try, int n_ready, - bool first_cycle_insn_p) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - const_ix86_first_cycle_multipass_data_t prev_data - = ix86_first_cycle_multipass_data; - - /* Restore the state from the end of the previous round. */ - data->ifetch_block_len = prev_data->ifetch_block_len; - data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns; - - /* Filter instructions that cannot be issued on current cycle due to - decoder restrictions. */ - core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready, - first_cycle_insn_p); -} - -/* INSN is being issued in current solution. Account for its impact on - the decoder model. 
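The three decoder parameters documented above ('8', '16', '6') make the filter's condition easy to restate; a distilled predicate (core2i7_can_issue_p is a hypothetical name):

static bool
core2i7_can_issue_p (int block_len, int block_n_insns,
                     int insn_size, bool first_cycle_insn_p)
{
  return (/* 9+ byte insns can only go to the first (complex) decoder.  */
          (first_cycle_insn_p || insn_size <= 8)
          /* The insn must fit in the current 16-byte ifetch block...  */
          && block_len + insn_size <= 16
          /* ...and one of the 6 per-cycle decode slots must be free.  */
          && block_n_insns + 1 <= 6);
}

Insns failing any test get their ready_try slot masked, and the change is recorded in the bitmap so backtracking can undo it.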
*/ -static void -core2i7_first_cycle_multipass_issue (void *_data, - signed char *ready_try, int n_ready, - rtx_insn *insn, const void *_prev_data) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - const_ix86_first_cycle_multipass_data_t prev_data - = (const_ix86_first_cycle_multipass_data_t) _prev_data; - - int insn_size = min_insn_size (insn); - - data->ifetch_block_len = prev_data->ifetch_block_len + insn_size; - data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1; - gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size - && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns); - - /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */ - if (!data->ready_try_change) - { - data->ready_try_change = sbitmap_alloc (n_ready); - data->ready_try_change_size = n_ready; - } - else if (data->ready_try_change_size < n_ready) - { - data->ready_try_change = sbitmap_resize (data->ready_try_change, - n_ready, 0); - data->ready_try_change_size = n_ready; - } - bitmap_clear (data->ready_try_change); - - /* Filter out insns from ready_try that the core will not be able to issue - on current cycle due to decoder. */ - core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready, - false); -} - -/* Revert the effect on ready_try. */ -static void -core2i7_first_cycle_multipass_backtrack (const void *_data, - signed char *ready_try, - int n_ready ATTRIBUTE_UNUSED) -{ - const_ix86_first_cycle_multipass_data_t data - = (const_ix86_first_cycle_multipass_data_t) _data; - unsigned int i = 0; - sbitmap_iterator sbi; - - gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready); - EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi) - { - ready_try[i] = 0; - } -} - -/* Save the result of multipass lookahead scheduling for the next round. */ -static void -core2i7_first_cycle_multipass_end (const void *_data) -{ - const_ix86_first_cycle_multipass_data_t data - = (const_ix86_first_cycle_multipass_data_t) _data; - ix86_first_cycle_multipass_data_t next_data - = ix86_first_cycle_multipass_data; - - if (data != NULL) - { - next_data->ifetch_block_len = data->ifetch_block_len; - next_data->ifetch_block_n_insns = data->ifetch_block_n_insns; - } -} - -/* Deallocate target data. */ -static void -core2i7_first_cycle_multipass_fini (void *_data) -{ - ix86_first_cycle_multipass_data_t data - = (ix86_first_cycle_multipass_data_t) _data; - - if (data->ready_try_change) - { - sbitmap_free (data->ready_try_change); - data->ready_try_change = NULL; - data->ready_try_change_size = 0; - } -} - /* Prepare for scheduling pass. */ static void ix86_sched_init_global (FILE *, int, int) @@ -31545,25 +28500,7 @@ ix86_sched_init_global (FILE *, int, int) to save compile time. */ if (reload_completed) { - targetm.sched.dfa_post_advance_cycle - = core2i7_dfa_post_advance_cycle; - targetm.sched.first_cycle_multipass_init - = core2i7_first_cycle_multipass_init; - targetm.sched.first_cycle_multipass_begin - = core2i7_first_cycle_multipass_begin; - targetm.sched.first_cycle_multipass_issue - = core2i7_first_cycle_multipass_issue; - targetm.sched.first_cycle_multipass_backtrack - = core2i7_first_cycle_multipass_backtrack; - targetm.sched.first_cycle_multipass_end - = core2i7_first_cycle_multipass_end; - targetm.sched.first_cycle_multipass_fini - = core2i7_first_cycle_multipass_fini; - - /* Set decoder parameters. 
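
The issue/backtrack pair above implements speculative filtering: every ready_try slot masked during one max_issue probe is recorded in a bitmap so that backtracking can clear exactly those slots again. A compressed sketch of the idea, with a 64-bit mask standing in for GCC's sbitmap (so it assumes at most 64 ready insns, where the real code resizes):

#include <stdint.h>

static uint64_t ready_try_change;  /* Bit I set: slot I was masked.  */

/* Mask slot I out of the ready list and remember having done so.  */
static void
mask_ready_slot (signed char *ready_try, int i)
{
  ready_try[i] = 1;
  ready_try_change |= (uint64_t) 1 << i;
}

/* Revert the effect on ready_try, as the backtrack hook does.  */
static void
unmask_ready_slots (signed char *ready_try, int n_ready)
{
  for (int i = 0; i < n_ready; i++)
    if (ready_try_change & ((uint64_t) 1 << i))
      ready_try[i] = 0;
  ready_try_change = 0;
}
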
*/ - core2i7_secondary_decoder_max_insn_size = 8; - core2i7_ifetch_block_size = 16; - core2i7_ifetch_block_max_insns = 6; + ix86_core2i7_init_hooks (); break; } /* Fall through. */ @@ -43128,8 +40065,8 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) address sizes. This is enough to eliminate unnecessary padding in 99% of cases. */ -static int -min_insn_size (rtx_insn *insn) +int +ix86_min_insn_size (rtx_insn *insn) { int l = 0, len; @@ -43238,13 +40175,13 @@ ix86_avoid_jump_mispredicts (void) njumps--, isjump = true; else isjump = false; - nbytes -= min_insn_size (start); + nbytes -= ix86_min_insn_size (start); } } continue; } - min_size = min_insn_size (insn); + min_size = ix86_min_insn_size (insn); nbytes += min_size; if (dump_file) fprintf (dump_file, "Insn %i estimated to %i bytes\n", @@ -43263,7 +40200,7 @@ ix86_avoid_jump_mispredicts (void) njumps--, isjump = true; else isjump = false; - nbytes -= min_insn_size (start); + nbytes -= ix86_min_insn_size (start); } gcc_assert (njumps >= 0); if (dump_file) @@ -43272,7 +40209,7 @@ ix86_avoid_jump_mispredicts (void) if (njumps == 3 && isjump && nbytes < 16) { - int padsize = 15 - nbytes + min_insn_size (insn); + int padsize = 15 - nbytes + ix86_min_insn_size (insn); if (dump_file) fprintf (dump_file, "Padding insn %i by %i bytes!\n", @@ -51035,806 +47972,19 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree) } #undef TARGET_SCHED_DISPATCH -#define TARGET_SCHED_DISPATCH has_dispatch +#define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch #undef TARGET_SCHED_DISPATCH_DO -#define TARGET_SCHED_DISPATCH_DO do_dispatch +#define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch #undef TARGET_SCHED_REASSOCIATION_WIDTH #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width #undef TARGET_SCHED_REORDER -#define TARGET_SCHED_REORDER ix86_sched_reorder +#define TARGET_SCHED_REORDER ix86_atom_sched_reorder #undef TARGET_SCHED_ADJUST_PRIORITY #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \ ix86_dependencies_evaluation_hook -/* The size of the dispatch window is the total number of bytes of - object code allowed in a window. */ -#define DISPATCH_WINDOW_SIZE 16 - -/* Number of dispatch windows considered for scheduling. */ -#define MAX_DISPATCH_WINDOWS 3 - -/* Maximum number of instructions in a window. */ -#define MAX_INSN 4 - -/* Maximum number of immediate operands in a window. */ -#define MAX_IMM 4 - -/* Maximum number of immediate bits allowed in a window. */ -#define MAX_IMM_SIZE 128 - -/* Maximum number of 32 bit immediates allowed in a window. */ -#define MAX_IMM_32 4 - -/* Maximum number of 64 bit immediates allowed in a window. */ -#define MAX_IMM_64 2 - -/* Maximum total of loads or prefetches allowed in a window. */ -#define MAX_LOAD 2 - -/* Maximum total of stores allowed in a window. */ -#define MAX_STORE 1 - -#undef BIG -#define BIG 100 - - -/* Dispatch groups. Istructions that affect the mix in a dispatch window. */ -enum dispatch_group { - disp_no_group = 0, - disp_load, - disp_store, - disp_load_store, - disp_prefetch, - disp_imm, - disp_imm_32, - disp_imm_64, - disp_branch, - disp_cmp, - disp_jcc, - disp_last -}; - -/* Number of allowable groups in a dispatch window. It is an array - indexed by dispatch_group enum. 100 is used as a big number, - because the number of these kind of operations does not have any - effect in dispatch window, but we need them for other reasons in - the table. 
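
Read together with the macros above, the table that follows caps each group per window. Restated with the enum order made explicit (values copied from num_allowable_groups; the interpretations in the comments follow from count_num_restricted, and BIG = 100 marks groups whose count never limits a window by itself):

static const int window_group_limit[] = {
  0,    /* disp_no_group */
  2,    /* disp_load       (MAX_LOAD)  */
  1,    /* disp_store      (MAX_STORE) */
  1,    /* disp_load_store */
  2,    /* disp_prefetch   (counted together with loads) */
  4,    /* disp_imm        (MAX_IMM)    */
  4,    /* disp_imm_32     (MAX_IMM_32) */
  2,    /* disp_imm_64     (MAX_IMM_64) */
  1,    /* disp_branch     (a branch closes the window)  */
  100,  /* disp_cmp  (BIG) */
  100,  /* disp_jcc  (BIG) */
};
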
*/ -static unsigned int num_allowable_groups[disp_last] = { - 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG -}; - -char group_name[disp_last + 1][16] = { - "disp_no_group", "disp_load", "disp_store", "disp_load_store", - "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64", - "disp_branch", "disp_cmp", "disp_jcc", "disp_last" -}; - -/* Instruction path. */ -enum insn_path { - no_path = 0, - path_single, /* Single micro op. */ - path_double, /* Double micro op. */ - path_multi, /* Instructions with more than 2 micro op.. */ - last_path -}; - -/* sched_insn_info defines a window to the instructions scheduled in - the basic block. It contains a pointer to the insn_info table and - the instruction scheduled. - - Windows are allocated for each basic block and are linked - together. */ -typedef struct sched_insn_info_s { - rtx insn; - enum dispatch_group group; - enum insn_path path; - int byte_len; - int imm_bytes; -} sched_insn_info; - -/* Linked list of dispatch windows. This is a two way list of - dispatch windows of a basic block. It contains information about - the number of uops in the window and the total number of - instructions and of bytes in the object code for this dispatch - window. */ -typedef struct dispatch_windows_s { - int num_insn; /* Number of insn in the window. */ - int num_uops; /* Number of uops in the window. */ - int window_size; /* Number of bytes in the window. */ - int window_num; /* Window number between 0 or 1. */ - int num_imm; /* Number of immediates in an insn. */ - int num_imm_32; /* Number of 32 bit immediates in an insn. */ - int num_imm_64; /* Number of 64 bit immediates in an insn. */ - int imm_size; /* Total immediates in the window. */ - int num_loads; /* Total memory loads in the window. */ - int num_stores; /* Total memory stores in the window. */ - int violation; /* Violation exists in window. */ - sched_insn_info *window; /* Pointer to the window. */ - struct dispatch_windows_s *next; - struct dispatch_windows_s *prev; -} dispatch_windows; - -/* Immediate valuse used in an insn. */ -typedef struct imm_info_s - { - int imm; - int imm32; - int imm64; - } imm_info; - -static dispatch_windows *dispatch_window_list; -static dispatch_windows *dispatch_window_list1; - -/* Get dispatch group of insn. */ - -static enum dispatch_group -get_mem_group (rtx_insn *insn) -{ - enum attr_memory memory; - - if (INSN_CODE (insn) < 0) - return disp_no_group; - memory = get_attr_memory (insn); - if (memory == MEMORY_STORE) - return disp_store; - - if (memory == MEMORY_LOAD) - return disp_load; - - if (memory == MEMORY_BOTH) - return disp_load_store; - - return disp_no_group; -} - -/* Return true if insn is a compare instruction. */ - -static bool -is_cmp (rtx_insn *insn) -{ - enum attr_type type; - - type = get_attr_type (insn); - return (type == TYPE_TEST - || type == TYPE_ICMP - || type == TYPE_FCMP - || GET_CODE (PATTERN (insn)) == COMPARE); -} - -/* Return true if a dispatch violation encountered. */ - -static bool -dispatch_violation (void) -{ - if (dispatch_window_list->next) - return dispatch_window_list->next->violation; - return dispatch_window_list->violation; -} - -/* Return true if insn is a branch instruction. */ - -static bool -is_branch (rtx_insn *insn) -{ - return (CALL_P (insn) || JUMP_P (insn)); -} - -/* Return true if insn is a prefetch instruction. 
*/ - -static bool -is_prefetch (rtx_insn *insn) -{ - return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH; -} - -/* This function initializes a dispatch window and the list container holding a - pointer to the window. */ - -static void -init_window (int window_num) -{ - int i; - dispatch_windows *new_list; - - if (window_num == 0) - new_list = dispatch_window_list; - else - new_list = dispatch_window_list1; - - new_list->num_insn = 0; - new_list->num_uops = 0; - new_list->window_size = 0; - new_list->next = NULL; - new_list->prev = NULL; - new_list->window_num = window_num; - new_list->num_imm = 0; - new_list->num_imm_32 = 0; - new_list->num_imm_64 = 0; - new_list->imm_size = 0; - new_list->num_loads = 0; - new_list->num_stores = 0; - new_list->violation = false; - - for (i = 0; i < MAX_INSN; i++) - { - new_list->window[i].insn = NULL; - new_list->window[i].group = disp_no_group; - new_list->window[i].path = no_path; - new_list->window[i].byte_len = 0; - new_list->window[i].imm_bytes = 0; - } - return; -} - -/* This function allocates and initializes a dispatch window and the - list container holding a pointer to the window. */ - -static dispatch_windows * -allocate_window (void) -{ - dispatch_windows *new_list = XNEW (struct dispatch_windows_s); - new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1); - - return new_list; -} - -/* This routine initializes the dispatch scheduling information. It - initiates building dispatch scheduler tables and constructs the - first dispatch window. */ - -static void -init_dispatch_sched (void) -{ - /* Allocate a dispatch list and a window. */ - dispatch_window_list = allocate_window (); - dispatch_window_list1 = allocate_window (); - init_window (0); - init_window (1); -} - -/* This function returns true if a branch is detected. End of a basic block - does not have to be a branch, but here we assume only branches end a - window. */ - -static bool -is_end_basic_block (enum dispatch_group group) -{ - return group == disp_branch; -} - -/* This function is called when the end of a window processing is reached. */ - -static void -process_end_window (void) -{ - gcc_assert (dispatch_window_list->num_insn <= MAX_INSN); - if (dispatch_window_list->next) - { - gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN); - gcc_assert (dispatch_window_list->window_size - + dispatch_window_list1->window_size <= 48); - init_window (1); - } - init_window (0); -} - -/* Allocates a new dispatch window and adds it to WINDOW_LIST. - WINDOW_NUM is either 0 or 1. A maximum of two windows are generated - for 48 bytes of instructions. Note that these windows are not dispatch - windows that their sizes are DISPATCH_WINDOW_SIZE. */ - -static dispatch_windows * -allocate_next_window (int window_num) -{ - if (window_num == 0) - { - if (dispatch_window_list->next) - init_window (1); - init_window (0); - return dispatch_window_list; - } - - dispatch_window_list->next = dispatch_window_list1; - dispatch_window_list1->prev = dispatch_window_list; - - return dispatch_window_list1; -} - -/* Compute number of immediate operands of an instruction. 
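
Despite its name, allocate_next_window above never really allocates: the two windows are created once by init_dispatch_sched and the dispatcher just flips between them, with process_end_window asserting that the pair never exceeds 48 bytes. The flip itself, isolated:

/* 0 -> 1, 1 -> 0: select the other of the two preallocated windows.  */
static int
other_window (int window_num)
{
  return ~window_num & 1;
}
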
*/ - -static void -find_constant (rtx in_rtx, imm_info *imm_values) -{ - if (INSN_P (in_rtx)) - in_rtx = PATTERN (in_rtx); - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, in_rtx, ALL) - if (const_rtx x = *iter) - switch (GET_CODE (x)) - { - case CONST: - case SYMBOL_REF: - case CONST_INT: - (imm_values->imm)++; - if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode)) - (imm_values->imm32)++; - else - (imm_values->imm64)++; - break; - - case CONST_DOUBLE: - case CONST_WIDE_INT: - (imm_values->imm)++; - (imm_values->imm64)++; - break; - - case CODE_LABEL: - if (LABEL_KIND (x) == LABEL_NORMAL) - { - (imm_values->imm)++; - (imm_values->imm32)++; - } - break; - - default: - break; - } -} - -/* Return total size of immediate operands of an instruction along with number - of corresponding immediate-operands. It initializes its parameters to zero - befor calling FIND_CONSTANT. - INSN is the input instruction. IMM is the total of immediates. - IMM32 is the number of 32 bit immediates. IMM64 is the number of 64 - bit immediates. */ - -static int -get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64) -{ - imm_info imm_values = {0, 0, 0}; - - find_constant (insn, &imm_values); - *imm = imm_values.imm; - *imm32 = imm_values.imm32; - *imm64 = imm_values.imm64; - return imm_values.imm32 * 4 + imm_values.imm64 * 8; -} - -/* This function indicates if an operand of an instruction is an - immediate. */ - -static bool -has_immediate (rtx_insn *insn) -{ - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (insn) - return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - return false; -} - -/* Return single or double path for instructions. */ - -static enum insn_path -get_insn_path (rtx_insn *insn) -{ - enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn); - - if ((int)path == 0) - return path_single; - - if ((int)path == 1) - return path_double; - - return path_multi; -} - -/* Return insn dispatch group. */ - -static enum dispatch_group -get_insn_group (rtx_insn *insn) -{ - enum dispatch_group group = get_mem_group (insn); - if (group) - return group; - - if (is_branch (insn)) - return disp_branch; - - if (is_cmp (insn)) - return disp_cmp; - - if (has_immediate (insn)) - return disp_imm; - - if (is_prefetch (insn)) - return disp_prefetch; - - return disp_no_group; -} - -/* Count number of GROUP restricted instructions in a dispatch - window WINDOW_LIST. 
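
The byte accounting above is simple: each 32-bit immediate contributes 4 bytes and each 64-bit one 8, so, for example, one of each gives get_num_immediates a total of 12. As a checked one-liner (sketch):

/* Total immediate bytes, as returned by get_num_immediates.  */
static int
imm_total_bytes (int n_imm32, int n_imm64)
{
  return n_imm32 * 4 + n_imm64 * 8;
}
/* imm_total_bytes (1, 1) == 12; imm_total_bytes (4, 0) == 16.  */
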
*/ - -static int -count_num_restricted (rtx_insn *insn, dispatch_windows *window_list) -{ - enum dispatch_group group = get_insn_group (insn); - int imm_size; - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (group == disp_no_group) - return 0; - - if (group == disp_imm) - { - imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - if (window_list->imm_size + imm_size > MAX_IMM_SIZE - || num_imm_operand + window_list->num_imm > MAX_IMM - || (num_imm32_operand > 0 - && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32 - || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32)) - || (num_imm64_operand > 0 - && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64 - || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32)) - || (window_list->imm_size + imm_size == MAX_IMM_SIZE - && num_imm64_operand > 0 - && ((window_list->num_imm_64 > 0 - && window_list->num_insn >= 2) - || window_list->num_insn >= 3))) - return BIG; - - return 1; - } - - if ((group == disp_load_store - && (window_list->num_loads >= MAX_LOAD - || window_list->num_stores >= MAX_STORE)) - || ((group == disp_load - || group == disp_prefetch) - && window_list->num_loads >= MAX_LOAD) - || (group == disp_store - && window_list->num_stores >= MAX_STORE)) - return BIG; - - return 1; -} - -/* This function returns true if insn satisfies dispatch rules on the - last window scheduled. */ - -static bool -fits_dispatch_window (rtx_insn *insn) -{ - dispatch_windows *window_list = dispatch_window_list; - dispatch_windows *window_list_next = dispatch_window_list->next; - unsigned int num_restrict; - enum dispatch_group group = get_insn_group (insn); - enum insn_path path = get_insn_path (insn); - int sum; - - /* Make disp_cmp and disp_jcc get scheduled at the latest. These - instructions should be given the lowest priority in the - scheduling process in Haifa scheduler to make sure they will be - scheduled in the same dispatch window as the reference to them. */ - if (group == disp_jcc || group == disp_cmp) - return false; - - /* Check nonrestricted. */ - if (group == disp_no_group || group == disp_branch) - return true; - - /* Get last dispatch window. */ - if (window_list_next) - window_list = window_list_next; - - if (window_list->window_num == 1) - { - sum = window_list->prev->window_size + window_list->window_size; - - if (sum == 32 - || (min_insn_size (insn) + sum) >= 48) - /* Window 1 is full. Go for next window. */ - return true; - } - - num_restrict = count_num_restricted (insn, window_list); - - if (num_restrict > num_allowable_groups[group]) - return false; - - /* See if it fits in the first window. */ - if (window_list->window_num == 0) - { - /* The first widow should have only single and double path - uops. */ - if (path == path_double - && (window_list->num_uops + 2) > MAX_INSN) - return false; - else if (path != path_single) - return false; - } - return true; -} - -/* Add an instruction INSN with NUM_UOPS micro-operations to the - dispatch window WINDOW_LIST. 
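
One non-obvious rule in count_num_restricted above: 64-bit immediates consume two slots of the 32-bit immediate budget. For instance, a hypothetical window already holding one 64-bit immediate rejects an insn bringing three 32-bit ones, since 1 * 2 + 3 = 5 exceeds MAX_IMM_32 = 4:

/* Nonzero if INSN_IMM32 new 32-bit immediates still fit a window that
   already holds WIN_IMM64 64-bit immediates (each counting double).  */
static int
imm32_budget_ok (int win_imm64, int insn_imm32)
{
  return win_imm64 * 2 + insn_imm32 <= 4;  /* MAX_IMM_32 */
}
/* imm32_budget_ok (1, 3) == 0: count_num_restricted returns BIG and
   the insn is deferred to a fresh window.  */
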
*/ - -static void -add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops) -{ - int byte_len = min_insn_size (insn); - int num_insn = window_list->num_insn; - int imm_size; - sched_insn_info *window = window_list->window; - enum dispatch_group group = get_insn_group (insn); - enum insn_path path = get_insn_path (insn); - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (!window_list->violation && group != disp_cmp - && !fits_dispatch_window (insn)) - window_list->violation = true; - - imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - - /* Initialize window with new instruction. */ - window[num_insn].insn = insn; - window[num_insn].byte_len = byte_len; - window[num_insn].group = group; - window[num_insn].path = path; - window[num_insn].imm_bytes = imm_size; - - window_list->window_size += byte_len; - window_list->num_insn = num_insn + 1; - window_list->num_uops = window_list->num_uops + num_uops; - window_list->imm_size += imm_size; - window_list->num_imm += num_imm_operand; - window_list->num_imm_32 += num_imm32_operand; - window_list->num_imm_64 += num_imm64_operand; - - if (group == disp_store) - window_list->num_stores += 1; - else if (group == disp_load - || group == disp_prefetch) - window_list->num_loads += 1; - else if (group == disp_load_store) - { - window_list->num_stores += 1; - window_list->num_loads += 1; - } -} - -/* Adds a scheduled instruction, INSN, to the current dispatch window. - If the total bytes of instructions or the number of instructions in - the window exceed allowable, it allocates a new window. */ - -static void -add_to_dispatch_window (rtx_insn *insn) -{ - int byte_len; - dispatch_windows *window_list; - dispatch_windows *next_list; - dispatch_windows *window0_list; - enum insn_path path; - enum dispatch_group insn_group; - bool insn_fits; - int num_insn; - int num_uops; - int window_num; - int insn_num_uops; - int sum; - - if (INSN_CODE (insn) < 0) - return; - - byte_len = min_insn_size (insn); - window_list = dispatch_window_list; - next_list = window_list->next; - path = get_insn_path (insn); - insn_group = get_insn_group (insn); - - /* Get the last dispatch window. */ - if (next_list) - window_list = dispatch_window_list->next; - - if (path == path_single) - insn_num_uops = 1; - else if (path == path_double) - insn_num_uops = 2; - else - insn_num_uops = (int) path; - - /* If current window is full, get a new window. - Window number zero is full, if MAX_INSN uops are scheduled in it. - Window number one is full, if window zero's bytes plus window - one's bytes is 32, or if the bytes of the new instruction added - to the total makes it greater than 48, or it has already MAX_INSN - instructions in it. 
*/ - num_insn = window_list->num_insn; - num_uops = window_list->num_uops; - window_num = window_list->window_num; - insn_fits = fits_dispatch_window (insn); - - if (num_insn >= MAX_INSN - || num_uops + insn_num_uops > MAX_INSN - || !(insn_fits)) - { - window_num = ~window_num & 1; - window_list = allocate_next_window (window_num); - } - - if (window_num == 0) - { - add_insn_window (insn, window_list, insn_num_uops); - if (window_list->num_insn >= MAX_INSN - && insn_group == disp_branch) - { - process_end_window (); - return; - } - } - else if (window_num == 1) - { - window0_list = window_list->prev; - sum = window0_list->window_size + window_list->window_size; - if (sum == 32 - || (byte_len + sum) >= 48) - { - process_end_window (); - window_list = dispatch_window_list; - } - - add_insn_window (insn, window_list, insn_num_uops); - } - else - gcc_unreachable (); - - if (is_end_basic_block (insn_group)) - { - /* End of basic block is reached do end-basic-block process. */ - process_end_window (); - return; - } -} - -/* Print the dispatch window, WINDOW_NUM, to FILE. */ - -DEBUG_FUNCTION static void -debug_dispatch_window_file (FILE *file, int window_num) -{ - dispatch_windows *list; - int i; - - if (window_num == 0) - list = dispatch_window_list; - else - list = dispatch_window_list1; - - fprintf (file, "Window #%d:\n", list->window_num); - fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n", - list->num_insn, list->num_uops, list->window_size); - fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n", - list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size); - - fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads, - list->num_stores); - fprintf (file, " insn info:\n"); - - for (i = 0; i < MAX_INSN; i++) - { - if (!list->window[i].insn) - break; - fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n", - i, group_name[list->window[i].group], - i, (void *)list->window[i].insn, - i, list->window[i].path, - i, list->window[i].byte_len, - i, list->window[i].imm_bytes); - } -} - -/* Print to stdout a dispatch window. */ - -DEBUG_FUNCTION void -debug_dispatch_window (int window_num) -{ - debug_dispatch_window_file (stdout, window_num); -} - -/* Print INSN dispatch information to FILE. */ - -DEBUG_FUNCTION static void -debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn) -{ - int byte_len; - enum insn_path path; - enum dispatch_group group; - int imm_size; - int num_imm_operand; - int num_imm32_operand; - int num_imm64_operand; - - if (INSN_CODE (insn) < 0) - return; - - byte_len = min_insn_size (insn); - path = get_insn_path (insn); - group = get_insn_group (insn); - imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand, - &num_imm64_operand); - - fprintf (file, " insn info:\n"); - fprintf (file, " group = %s, path = %d, byte_len = %d\n", - group_name[group], path, byte_len); - fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n", - num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size); -} - -/* Print to STDERR the status of the ready list with respect to - dispatch windows. */ - -DEBUG_FUNCTION void -debug_ready_dispatch (void) -{ - int i; - int no_ready = number_in_ready (); - - fprintf (stdout, "Number of ready: %d\n", no_ready); - - for (i = 0; i < no_ready; i++) - debug_insn_dispatch_info_file (stdout, get_ready_element (i)); -} - -/* This routine is the driver of the dispatch scheduler. 
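
The size test applied twice above (in fits_dispatch_window and add_to_dispatch_window) reads as: window 1 closes when the pair already spans exactly 32 bytes, or when the incoming insn would push it to 48 bytes or beyond. Restated:

/* Nonzero if window 1 must be flushed before an INSN_BYTES-byte insn,
   given SUM_BYTES across both windows.  Mirrors the 32/48 test.  */
static int
window1_must_flush (int sum_bytes, int insn_bytes)
{
  return sum_bytes == 32 || sum_bytes + insn_bytes >= 48;
}
/* window1_must_flush (32, 1) and window1_must_flush (40, 8) hold;
   window1_must_flush (40, 7) does not.  */
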
*/ - -static void -do_dispatch (rtx_insn *insn, int mode) -{ - if (mode == DISPATCH_INIT) - init_dispatch_sched (); - else if (mode == ADD_TO_DISPATCH_WINDOW) - add_to_dispatch_window (insn); -} - -/* Return TRUE if Dispatch Scheduling is supported. */ - -static bool -has_dispatch (rtx_insn *insn, int action) -{ - /* Current implementation of dispatch scheduler models buldozer only. */ - if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 - || TARGET_BDVER4) && flag_dispatch_scheduler) - switch (action) - { - default: - return false; - - case IS_DISPATCH_ON: - return true; - - case IS_CMP: - return is_cmp (insn); - - case DISPATCH_VIOLATION: - return dispatch_violation (); - - case FITS_DISPATCH_WINDOW: - return fits_dispatch_window (insn); - } - - return false; -} /* Implementation of reassociation_width target hook used by reassoc phase to identify parallelism level in reassociated diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 index 0a8524bfbe2..8411a9680ff 100644 --- a/gcc/config/i386/t-i386 +++ b/gcc/config/i386/t-i386 @@ -24,6 +24,22 @@ i386-c.o: $(srcdir)/config/i386/i386-c.c $(COMPILE) $< $(POSTCOMPILE) +x86-tune-sched.o: $(srcdir)/config/i386/x86-tune-sched.c + $(COMPILE) $< + $(POSTCOMPILE) + +x86-tune-sched-bd.o: $(srcdir)/config/i386/x86-tune-sched-bd.c + $(COMPILE) $< + $(POSTCOMPILE) + +x86-tune-sched-atom.o: $(srcdir)/config/i386/x86-tune-sched-atom.c + $(COMPILE) $< + $(POSTCOMPILE) + +x86-tune-sched-core.o: $(srcdir)/config/i386/x86-tune-sched-core.c + $(COMPILE) $< + $(POSTCOMPILE) + i386.o: i386-builtin-types.inc i386-builtin-types.inc: s-i386-bt ; @true diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h new file mode 100644 index 00000000000..d27072c0901 --- /dev/null +++ b/gcc/config/i386/x86-tune-costs.h @@ -0,0 +1,2083 @@ + +/* Processor costs (relative to an add) */ +/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ +#define COSTS_N_BYTES(N) ((N) * 2) + +#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} + +static stringop_algs ix86_size_memcpy[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; +static stringop_algs ix86_size_memset[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; + +const +struct processor_costs ix86_size_cost = {/* costs for tuning for size */ + COSTS_N_BYTES (2), /* cost of an add instruction */ + COSTS_N_BYTES (3), /* cost of a lea instruction */ + COSTS_N_BYTES (2), /* variable shift costs */ + COSTS_N_BYTES (3), /* constant shift costs */ + {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ + COSTS_N_BYTES (3), /* HI */ + COSTS_N_BYTES (3), /* SI */ + COSTS_N_BYTES (3), /* DI */ + COSTS_N_BYTES (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ + COSTS_N_BYTES (3), /* HI */ + COSTS_N_BYTES (3), /* SI */ + COSTS_N_BYTES (3), /* DI */ + COSTS_N_BYTES (5)}, /* other */ + COSTS_N_BYTES (3), /* cost of movsx */ + COSTS_N_BYTES (3), /* cost of movzx */ + 0, /* "large" insn */ + 2, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {2, 2, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
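
The two macros at the top of the new header put size tuning and speed tuning on one scale: with COSTS_N_INSNS (N) assumed to expand to (N) * 4 and an add taking 2 bytes, COSTS_N_BYTES measures in bytes what the speed tables measure in insns. A minimal check of that calibration (TOY_ names are illustrative):

#include <assert.h>

#define TOY_COSTS_N_INSNS(N) ((N) * 4)  /* Assumed expansion, per the
                                           comment in the header.  */
#define TOY_COSTS_N_BYTES(N) ((N) * 2)

int
main (void)
{
  /* A 2-byte add costs 4 on the byte scale, the same as one insn on
     the speed scale, so the two scales line up at the unit.  */
  assert (TOY_COSTS_N_BYTES (2) == TOY_COSTS_N_INSNS (1));
  return 0;
}
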
*/ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 2}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {2, 2, 2}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 3, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {3, 3}, /* cost of storing MMX registers + in SImode and DImode */ + 3, /* cost of moving SSE register */ + {3, 3, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {3, 3, 3}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 0, /* size of l1 cache */ + 0, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ + COSTS_N_BYTES (2), /* cost of FMUL instruction. */ + COSTS_N_BYTES (2), /* cost of FDIV instruction. */ + COSTS_N_BYTES (2), /* cost of FABS instruction. */ + COSTS_N_BYTES (2), /* cost of FCHS instruction. */ + COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + ix86_size_memcpy, + ix86_size_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 1, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 1, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* Processor costs (relative to an add) */ +static stringop_algs i386_memcpy[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs i386_memset[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + DUMMY_STRINGOP_ALGS}; + +static const +struct processor_costs i386_cost = { /* 386 specific costs */ + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (3), /* variable shift costs */ + COSTS_N_INSNS (2), /* constant shift costs */ + {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ + COSTS_N_INSNS (6), /* HI */ + COSTS_N_INSNS (6), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + COSTS_N_INSNS (1), /* cost of multiply per each bit set */ + {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (23), /* HI */ + COSTS_N_INSNS (23), /* SI */ + COSTS_N_INSNS (23), /* DI */ + COSTS_N_INSNS (23)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 15, /* "large" insn */ + 3, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 0, /* size of l1 cache */ + 0, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (27), /* cost of FMUL instruction. */ + COSTS_N_INSNS (88), /* cost of FDIV instruction. */ + COSTS_N_INSNS (22), /* cost of FABS instruction. */ + COSTS_N_INSNS (24), /* cost of FCHS instruction. */ + COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + i386_memcpy, + i386_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs i486_memcpy[2] = { + {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs i486_memset[2] = { + {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; + +static const +struct processor_costs i486_cost = { /* 486 specific costs */ + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (3), /* variable shift costs */ + COSTS_N_INSNS (2), /* constant shift costs */ + {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ + COSTS_N_INSNS (12), /* HI */ + COSTS_N_INSNS (12), /* SI */ + COSTS_N_INSNS (12), /* DI */ + COSTS_N_INSNS (12)}, /* other */ + 1, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (40), /* HI */ + COSTS_N_INSNS (40), /* SI */ + COSTS_N_INSNS (40), /* DI */ + COSTS_N_INSNS (40)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 15, /* "large" insn */ + 3, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 4, /* size of l1 cache. 
486 has 8kB cache + shared for code and data, so 4kB is + not really precise. */ + 4, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (16), /* cost of FMUL instruction. */ + COSTS_N_INSNS (73), /* cost of FDIV instruction. */ + COSTS_N_INSNS (3), /* cost of FABS instruction. */ + COSTS_N_INSNS (3), /* cost of FCHS instruction. */ + COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + i486_memcpy, + i486_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs pentium_memcpy[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs pentium_memset[2] = { + {libcall, {{-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; + +static const +struct processor_costs pentium_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (4), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ + COSTS_N_INSNS (11), /* HI */ + COSTS_N_INSNS (11), /* SI */ + COSTS_N_INSNS (11), /* DI */ + COSTS_N_INSNS (11)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (25), /* HI */ + COSTS_N_INSNS (25), /* SI */ + COSTS_N_INSNS (25), /* DI */ + COSTS_N_INSNS (25)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 8, /* "large" insn */ + 6, /* MOVE_RATIO */ + 6, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 8, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 8, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (39), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + pentium_memcpy, + pentium_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. 
*/ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static const +struct processor_costs lakemont_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ + COSTS_N_INSNS (11), /* HI */ + COSTS_N_INSNS (11), /* SI */ + COSTS_N_INSNS (11), /* DI */ + COSTS_N_INSNS (11)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (25), /* HI */ + COSTS_N_INSNS (25), /* SI */ + COSTS_N_INSNS (25), /* DI */ + COSTS_N_INSNS (25)}, /* other */ + COSTS_N_INSNS (3), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 6, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 8, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 8, 16}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 8, 16}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 8, /* size of l2 cache */ + 0, /* size of prefetch block */ + 0, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ + COSTS_N_INSNS (39), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + pentium_memcpy, + pentium_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes + (we ensure the alignment). For small blocks inline loop is still a + noticeable win, for bigger blocks either rep movsl or rep movsb is + way to go. Rep movsb has apparently more expensive startup time in CPU, + but after 4K the difference is down in the noise. 
*/ +static stringop_algs pentiumpro_memcpy[2] = { + {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, + {8192, rep_prefix_4_byte, false}, + {-1, rep_prefix_1_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs pentiumpro_memset[2] = { + {rep_prefix_4_byte, {{1024, unrolled_loop, false}, + {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static const +struct processor_costs pentiumpro_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (4)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (17), /* HI */ + COSTS_N_INSNS (17), /* SI */ + COSTS_N_INSNS (17), /* DI */ + COSTS_N_INSNS (17)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 6, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {2, 2, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 256, /* size of l2 cache */ + 32, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (5), /* cost of FMUL instruction. */ + COSTS_N_INSNS (56), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + pentiumpro_memcpy, + pentiumpro_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
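
Each stringop_algs row pairs an algorithm for unknown block sizes with a size-bucketed list whose max field of -1 acts as the catch-all. Read off pentiumpro_memcpy above, the known-size selection amounts to the following (an illustrative selector, not GCC's decide_alg; it assumes the max fields are inclusive upper bounds):

enum toy_alg { TOY_LOOP, TOY_UNROLLED_LOOP, TOY_REP_4, TOY_REP_1 };

/* Known-size memcpy strategy on PentiumPro, per pentiumpro_memcpy.  */
static enum toy_alg
pentiumpro_pick_memcpy (long size)
{
  if (size <= 128)   return TOY_LOOP;
  if (size <= 1024)  return TOY_UNROLLED_LOOP;
  if (size <= 8192)  return TOY_REP_4;   /* rep movsl  */
  return TOY_REP_1;                      /* rep movsb: {-1, ...}  */
}
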
*/ +}; + +static stringop_algs geode_memcpy[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs geode_memset[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static const +struct processor_costs geode_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (2), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (7), /* SI */ + COSTS_N_INSNS (7), /* DI */ + COSTS_N_INSNS (7)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (23), /* HI */ + COSTS_N_INSNS (39), /* SI */ + COSTS_N_INSNS (39), /* DI */ + COSTS_N_INSNS (39)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 4, /* MOVE_RATIO */ + 1, /* cost for loading QImode using movzbl */ + {1, 1, 1}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {1, 1, 1}, /* cost of storing integer registers */ + 1, /* cost of reg,reg fld/fst */ + {1, 1, 1}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 6, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {2, 2, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 128, /* size of l2 cache. */ + 32, /* size of prefetch block */ + 1, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (11), /* cost of FMUL instruction. */ + COSTS_N_INSNS (47), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + geode_memcpy, + geode_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +static stringop_algs k6_memcpy[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs k6_memset[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static const +struct processor_costs k6_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (3), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (18), /* HI */ + COSTS_N_INSNS (18), /* SI */ + COSTS_N_INSNS (18), /* DI */ + COSTS_N_INSNS (18)}, /* other */ + COSTS_N_INSNS (2), /* cost of movsx */ + COSTS_N_INSNS (2), /* cost of movzx */ + 8, /* "large" insn */ + 4, /* MOVE_RATIO */ + 3, /* cost for loading QImode using movzbl */ + {4, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 3, 2}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {2, 2, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 6, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 32, /* size of l2 cache. Some models + have integrated l2 cache, but + optimizing for k6 is not important + enough to worry about that. */ + 32, /* size of prefetch block */ + 1, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (2), /* cost of FMUL instruction. */ + COSTS_N_INSNS (56), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + k6_memcpy, + k6_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* For some reason, Athlon deals better with REP prefix (relative to loops) + compared to K8. Alignment becomes important after 8 bytes for memcpy and + 128 bytes for memset. 
*/ +static stringop_algs athlon_memcpy[2] = { + {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs athlon_memset[2] = { + {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static const +struct processor_costs athlon_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ + COSTS_N_INSNS (5), /* HI */ + COSTS_N_INSNS (5), /* SI */ + COSTS_N_INSNS (5), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (24), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + athlon_memcpy, + athlon_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* K8 has optimized REP instruction for medium sized blocks, but for very + small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. 
*/ +static stringop_algs k8_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs k8_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static const +struct processor_costs k8_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 3, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + k8_memcpy, + k8_memset, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 5, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 3, /* vec_unalign_load_cost. */ + 3, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ +}; + +/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. 
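
The 64-bit row of k8_memcpy above shows the pattern the comment describes: a plain loop only up to 16 bytes, rep movsq up to 8 KiB, then a libcall, whose nontemporal accesses win for large blocks. In the same illustrative selector style as before (and under the same inclusive-bound assumption):

/* Known-size 64-bit memcpy strategy on K8, per k8_memcpy[1].  */
static int
k8_pick_memcpy64 (long size)
{
  if (size <= 16)    return 0;  /* inline loop  */
  if (size <= 8192)  return 1;  /* rep movsq (rep_prefix_8_byte)  */
  return 2;                     /* memcpy libcall  */
}
/* A known 4 KiB copy thus compiles to rep movsq; a 64 KiB one calls
   memcpy.  */
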
*/ +static stringop_algs amdfam10_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs amdfam10_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + amdfam10_memcpy, + amdfam10_memset, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. 
*/ +}; + +/* BDVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ +static stringop_algs bdver1_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver1_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; + +const struct processor_costs bdver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {5, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {5, 5, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 4}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 16, /* size of l1 cache. */ + 2048, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (6), /* cost of FMUL instruction. */ + COSTS_N_INSNS (42), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + bdver1_memcpy, + bdver1_memset, + 6, /* scalar_stmt_cost. */ + 4, /* scalar load_cost. */ + 4, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. 
*/ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 4, /* vec_align_load_cost. */ + 4, /* vec_unalign_load_cost. */ + 4, /* vec_store_cost. */ + 4, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ +}; + +/* BDVER2 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ + +static stringop_algs bdver2_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver2_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; + +const struct processor_costs bdver2_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {5, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {5, 5, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 4}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 16, /* size of l1 cache. */ + 2048, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (6), /* cost of FMUL instruction. */ + COSTS_N_INSNS (42), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. 
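The "cost of moving SSE register" and "MMX or SSE register to integer" entries, together with the MOVD latency notes repeated in these AMD tables, feed the register-move costing. A hedged sketch of how such fields are typically consulted follows; the enum, struct, and function are illustrative stand-ins, not GCC's actual ix86_register_move_cost, which distinguishes many more register classes.

/* Illustrative sketch: pick a move cost from per-CPU table fields
   depending on the register classes involved.  */
enum reg_class_kind { INT_CLASS, SSE_CLASS };

struct move_costs {
  int int_to_int;   /* reg-reg move baseline, 2 in the tables here   */
  int sse_to_sse;   /* "cost of moving SSE register"                 */
  int sse_to_int;   /* "MMX or SSE register to integer"              */
};

static int
register_move_cost_sketch (const struct move_costs *c,
                           enum reg_class_kind from, enum reg_class_kind to)
{
  if (from == to)
    return from == INT_CLASS ? c->int_to_int : c->sse_to_sse;
  return c->sse_to_int;   /* cross-domain move, e.g. MOVD  */
}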
*/ + COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + bdver2_memcpy, + bdver2_memset, + 6, /* scalar_stmt_cost. */ + 4, /* scalar load_cost. */ + 4, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 4, /* vec_align_load_cost. */ + 4, /* vec_unalign_load_cost. */ + 4, /* vec_store_cost. */ + 4, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ +}; + + + /* BDVER3 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ +static stringop_algs bdver3_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver3_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +struct processor_costs bdver3_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {5, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {5, 5, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 4}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + 16, /* size of l1 cache. */ + 2048, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (6), /* cost of FMUL instruction. */ + COSTS_N_INSNS (42), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + bdver3_memcpy, + bdver3_memset, + 6, /* scalar_stmt_cost. */ + 4, /* scalar load_cost. */ + 4, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 4, /* vec_align_load_cost. */ + 4, /* vec_unalign_load_cost. */ + 4, /* vec_store_cost. */ + 4, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ +}; + +/* BDVER4 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ +static stringop_algs bdver4_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver4_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +struct processor_costs bdver4_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (4), /* SI */ + COSTS_N_INSNS (6), /* DI */ + COSTS_N_INSNS (6)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {5, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {5, 5, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 4}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + 16, /* size of l1 cache. */ + 2048, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (6), /* cost of FMUL instruction. */ + COSTS_N_INSNS (42), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + bdver4_memcpy, + bdver4_memset, + 6, /* scalar_stmt_cost. */ + 4, /* scalar load_cost. */ + 4, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 4, /* vec_align_load_cost. */ + 4, /* vec_unalign_load_cost. */ + 4, /* vec_store_cost. */ + 4, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ +}; + + +/* ZNVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ +static stringop_algs znver1_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs znver1_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +struct processor_costs znver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (3), /* SI. */ + COSTS_N_INSNS (4), /* DI. */ + COSTS_N_INSNS (4)}, /* other. */ + 0, /* cost of multiply per each bit + set. */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (35), /* HI. */ + COSTS_N_INSNS (51), /* SI. */ + COSTS_N_INSNS (83), /* DI. */ + COSTS_N_INSNS (83)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 9, /* MOVE_RATIO. */ + 4, /* cost for loading QImode using + movzbl. */ + {5, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer + registers. */ + 2, /* cost of reg,reg fld/fst. */ + {5, 5, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {4, 4, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode. */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, /* cost of moving SSE register. */ + {4, 4, 4}, /* cost of loading SSE registers + in SImode, DImode and TImode. */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode. */ + 2, /* MMX or SSE register to integer. */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block. */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches. */ + 3, /* Branch cost. */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (6), /* cost of FMUL instruction. */ + COSTS_N_INSNS (42), /* cost of FDIV instruction. 
*/
+  COSTS_N_INSNS (2),  /* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),  /* cost of FCHS instruction.  */
+  COSTS_N_INSNS (52),  /* cost of FSQRT instruction.  */
+  /* Zen can execute 4 integer operations per cycle.  FP operations take 3
+     cycles, and it can execute 2 integer additions and 2 multiplications,
+     thus reassociation may make sense up to a width of 6.  SPEC2k6
+     benchmarks suggest that 4 works better than 6, probably due to
+     register pressure.
+
+     Integer vector operations are handled by the FP unit and execute 3
+     vector plus/minus operations per cycle but only one multiply.  This is
+     adjusted in ix86_reassociation_width.  */
+  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  znver1_memcpy,
+  znver1_memset,
+  6,  /* scalar_stmt_cost.  */
+  4,  /* scalar load_cost.  */
+  4,  /* scalar_store_cost.  */
+  6,  /* vec_stmt_cost.  */
+  0,  /* vec_to_scalar_cost.  */
+  2,  /* scalar_to_vec_cost.  */
+  4,  /* vec_align_load_cost.  */
+  4,  /* vec_unalign_load_cost.  */
+  4,  /* vec_store_cost.  */
+  4,  /* cond_taken_branch_cost.  */
+  2,  /* cond_not_taken_branch_cost.  */
+};
+
+  /* BTVER1 has optimized REP instruction for medium sized blocks, but for
+     very small blocks it is better to use loop.  For large blocks, libcall can
+     do nontemporary accesses and beat inline considerably.  */
+static stringop_algs btver1_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs btver1_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+const struct processor_costs btver1_cost = {
+  COSTS_N_INSNS (1),  /* cost of an add instruction */
+  COSTS_N_INSNS (2),  /* cost of a lea instruction */
+  COSTS_N_INSNS (1),  /* variable shift costs */
+  COSTS_N_INSNS (1),  /* constant shift costs */
+  {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),  /* HI */
+   COSTS_N_INSNS (3),  /* SI */
+   COSTS_N_INSNS (4),  /* DI */
+   COSTS_N_INSNS (5)},  /* other */
+  0,  /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),  /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),  /* HI */
+   COSTS_N_INSNS (51),  /* SI */
+   COSTS_N_INSNS (83),  /* DI */
+   COSTS_N_INSNS (83)},  /* other */
+  COSTS_N_INSNS (1),  /* cost of movsx */
+  COSTS_N_INSNS (1),  /* cost of movzx */
+  8,  /* "large" insn */
+  9,  /* MOVE_RATIO */
+  4,  /* cost for loading QImode using movzbl */
+  {3, 4, 3},  /* cost of loading integer registers
+                 in QImode, HImode and SImode.
+                 Relative to reg-reg move (2).
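The four reassoc fields feed the reassociation-width target hook, and the Zen comment above explains why znver1 picks {4, 4, 3, 6} and notes the vector widths are adjusted in ix86_reassociation_width. A minimal sketch of the general shape of such a hook follows; the enum and struct are illustrative stand-ins, not GCC's actual hook signature.

/* Sketch in the spirit of ix86_reassociation_width: report how many
   independent chains the reassociation pass may form, read from the
   table's four reassoc fields.  */
enum op_kind { OP_INT, OP_FP, OP_VEC_INT, OP_VEC_FP };

struct reassoc_widths { int integer, fp, vec_int, vec_fp; };

static int
reassociation_width_sketch (const struct reassoc_widths *w, enum op_kind k)
{
  switch (k)
    {
    case OP_INT:     return w->integer;
    case OP_FP:      return w->fp;
    case OP_VEC_INT: return w->vec_int;  /* Zen: 3 plus/minus pipes  */
    default:         return w->vec_fp;
    }
}

/* znver1 above: the comment explains why int stays at 4 despite the
   wider theoretical limit.  */
static const struct reassoc_widths znver1_widths = { 4, 4, 3, 6 };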
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + btver1_memcpy, + btver1_memset, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs btver2_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs btver2_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +const struct processor_costs btver2_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 32, /* size of l1 cache. */ + 2048, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + btver2_memcpy, + btver2_memset, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs pentium4_memcpy[2] = { + {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs pentium4_memset[2] = { + {libcall, {{6, loop_1_byte, false}, {48, loop, false}, + {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; + +static const +struct processor_costs pentium4_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (3), /* cost of a lea instruction */ + COSTS_N_INSNS (4), /* variable shift costs */ + COSTS_N_INSNS (4), /* constant shift costs */ + {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ + COSTS_N_INSNS (15), /* HI */ + COSTS_N_INSNS (15), /* SI */ + COSTS_N_INSNS (15), /* DI */ + COSTS_N_INSNS (15)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (56), /* HI */ + COSTS_N_INSNS (56), /* SI */ + COSTS_N_INSNS (56), /* DI */ + COSTS_N_INSNS (56)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 16, /* "large" insn */ + 6, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {4, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
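pentium4_memcpy above fills its second slot with DUMMY_STRINGOP_ALGS. Each *_memcpy/_memset array holds one strategy table for 32-bit code (index 0) and one for 64-bit code (index 1); tunings that never drive 64-bit string expansion leave the second slot as a dummy. A sketch of that indexing convention, with target_64bit as an illustrative stand-in for GCC's real TARGET_64BIT flag:

struct strategy_table { const char *tag; };

static const struct strategy_table p4_like[2] = {
  { "32-bit table" },
  { "dummy" }               /* DUMMY_STRINGOP_ALGS stand-in  */
};

/* Illustrative stand-in for GCC's TARGET_64BIT.  */
static int target_64bit;

static const struct strategy_table *
table_for_mode (const struct strategy_table table[2])
{
  return &table[target_64bit ? 1 : 0];
}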
*/ + {2, 3, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 12, /* cost of moving SSE register */ + {12, 12, 12}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {2, 2, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 10, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (7), /* cost of FMUL instruction. */ + COSTS_N_INSNS (43), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + pentium4_memcpy, + pentium4_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs nocona_memcpy[2] = { + {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, + {100000, unrolled_loop, false}, {-1, libcall, false}}}}; + +static stringop_algs nocona_memset[2] = { + {libcall, {{6, loop_1_byte, false}, {48, loop, false}, + {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {64, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; + +static const +struct processor_costs nocona_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ + COSTS_N_INSNS (10), /* HI */ + COSTS_N_INSNS (10), /* SI */ + COSTS_N_INSNS (10), /* DI */ + COSTS_N_INSNS (10)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (66), /* HI */ + COSTS_N_INSNS (66), /* SI */ + COSTS_N_INSNS (66), /* DI */ + COSTS_N_INSNS (66)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 16, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {4, 4, 4}, /* cost of storing integer registers */ + 3, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 6, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers + in SImode and DImode */ + {12, 12}, /* cost of storing MMX registers + in SImode and DImode */ + 6, /* cost of moving SSE register */ + {12, 12, 12}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {12, 12, 12}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 8, /* MMX or SSE register to integer */ + 8, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 8, /* number of parallel prefetches */ + 1, /* Branch cost */ + COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (40), /* cost of FDIV instruction. */ + COSTS_N_INSNS (3), /* cost of FABS instruction. */ + COSTS_N_INSNS (3), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + nocona_memcpy, + nocona_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs atom_memcpy[2] = { + {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static stringop_algs atom_memset[2] = { + {libcall, {{8, loop, false}, {15, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {32, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static const +struct processor_costs atom_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + atom_memcpy, + atom_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs slm_memcpy[2] = { + {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static stringop_algs slm_memset[2] = { + {libcall, {{8, loop, false}, {15, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {32, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static const +struct processor_costs slm_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (3), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + slm_memcpy, + slm_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 4, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +static stringop_algs intel_memcpy[2] = { + {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static stringop_algs intel_memset[2] = { + {libcall, {{8, loop, false}, {15, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {32, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static const +struct processor_costs intel_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (3), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + intel_memcpy, + intel_memset, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 4, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + +/* Generic should produce code tuned for Core-i7 (and newer chips) + and btver1 (and newer chips). */ + +static stringop_algs generic_memcpy[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs generic_memset[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static const +struct processor_costs generic_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + /* On all chips taken into consideration lea is 2 cycles and more. With + this cost however our current implementation of synth_mult results in + use of unnecessary temporary registers causing regression on several + SPECfp benchmarks. */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
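The comment inside generic_cost above explains why the table understates lea: pricing it honestly at two cycles pushes synth_mult toward multi-insn sequences that burn temporary registers. Encoding the cost as COSTS_N_INSNS (1) + 1 keeps lea strictly dearer than a plain add yet strictly cheaper than any two-instruction replacement, as the arithmetic below shows (COSTS_N_INSNS per GCC's rtl.h):

#define COSTS_N_INSNS(N) ((N) * 4)

static const int add_cost  = COSTS_N_INSNS (1);      /* 4 */
static const int lea_cost  = COSTS_N_INSNS (1) + 1;  /* 5 */
static const int two_insns = COSTS_N_INSNS (2);      /* 8 */
/* add_cost < lea_cost < two_insns: a single lea still beats any
   add+shift pair, but a plain add is preferred when it suffices.  */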
*/
+  {4, 4, 4},  /* cost of storing integer registers */
+  4,  /* cost of reg,reg fld/fst */
+  {12, 12, 12},  /* cost of loading fp registers
+                    in SFmode, DFmode and XFmode */
+  {6, 6, 8},  /* cost of storing fp registers
+                 in SFmode, DFmode and XFmode */
+  2,  /* cost of moving MMX register */
+  {8, 8},  /* cost of loading MMX registers
+              in SImode and DImode */
+  {8, 8},  /* cost of storing MMX registers
+              in SImode and DImode */
+  2,  /* cost of moving SSE register */
+  {8, 8, 8},  /* cost of loading SSE registers
+                 in SImode, DImode and TImode */
+  {8, 8, 8},  /* cost of storing SSE registers
+                 in SImode, DImode and TImode */
+  5,  /* MMX or SSE register to integer */
+  32,  /* size of l1 cache.  */
+  512,  /* size of l2 cache.  */
+  64,  /* size of prefetch block */
+  6,  /* number of parallel prefetches */
+  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
+     value is increased to the perhaps more appropriate value of 5.  */
+  3,  /* Branch cost */
+  COSTS_N_INSNS (8),  /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),  /* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),  /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),  /* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),  /* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),  /* cost of FSQRT instruction.  */
+  1, 2, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  */
+  generic_memcpy,
+  generic_memset,
+  1,  /* scalar_stmt_cost.  */
+  1,  /* scalar load_cost.  */
+  1,  /* scalar_store_cost.  */
+  1,  /* vec_stmt_cost.  */
+  1,  /* vec_to_scalar_cost.  */
+  1,  /* scalar_to_vec_cost.  */
+  1,  /* vec_align_load_cost.  */
+  2,  /* vec_unalign_load_cost.  */
+  1,  /* vec_store_cost.  */
+  3,  /* cond_taken_branch_cost.  */
+  1,  /* cond_not_taken_branch_cost.  */
+};
+
+/* core_cost should produce code tuned for Core family of CPUs.  */
+static stringop_algs core_memcpy[2] = {
+  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+  {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+             {-1, libcall, false}}}};
+static stringop_algs core_memset[2] = {
+  {libcall, {{6, loop_1_byte, true},
+             {24, loop, true},
+             {8192, rep_prefix_4_byte, true},
+             {-1, libcall, false}}},
+  {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+             {-1, libcall, false}}}};
+
+static const
+struct processor_costs core_cost = {
+  COSTS_N_INSNS (1),  /* cost of an add instruction */
+  /* On all chips taken into consideration lea is 2 cycles and more.  With
+     this cost however our current implementation of synth_mult results in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,  /* cost of a lea instruction */
+  COSTS_N_INSNS (1),  /* variable shift costs */
+  COSTS_N_INSNS (1),  /* constant shift costs */
+  {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),  /* HI */
+   COSTS_N_INSNS (3),  /* SI */
+   COSTS_N_INSNS (4),  /* DI */
+   COSTS_N_INSNS (2)},  /* other */
+  0,  /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),  /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),  /* HI */
+   COSTS_N_INSNS (42),  /* SI */
+   COSTS_N_INSNS (74),  /* DI */
+   COSTS_N_INSNS (74)},  /* other */
+  COSTS_N_INSNS (1),  /* cost of movsx */
+  COSTS_N_INSNS (1),  /* cost of movzx */
+  8,  /* "large" insn */
+  17,  /* MOVE_RATIO */
+  4,  /* cost for loading QImode using movzbl */
+  {4, 4, 4},  /* cost of loading integer registers
+                 in QImode, HImode and SImode.
+                 Relative to reg-reg move (2).
*/
+  {4, 4, 4},  /* cost of storing integer registers */
+  4,  /* cost of reg,reg fld/fst */
+  {12, 12, 12},  /* cost of loading fp registers
+                    in SFmode, DFmode and XFmode */
+  {6, 6, 8},  /* cost of storing fp registers
+                 in SFmode, DFmode and XFmode */
+  2,  /* cost of moving MMX register */
+  {8, 8},  /* cost of loading MMX registers
+              in SImode and DImode */
+  {8, 8},  /* cost of storing MMX registers
+              in SImode and DImode */
+  2,  /* cost of moving SSE register */
+  {8, 8, 8},  /* cost of loading SSE registers
+                 in SImode, DImode and TImode */
+  {8, 8, 8},  /* cost of storing SSE registers
+                 in SImode, DImode and TImode */
+  5,  /* MMX or SSE register to integer */
+  64,  /* size of l1 cache.  */
+  512,  /* size of l2 cache.  */
+  64,  /* size of prefetch block */
+  6,  /* number of parallel prefetches */
+  /* FIXME perhaps more appropriate value is 5.  */
+  3,  /* Branch cost */
+  COSTS_N_INSNS (8),  /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),  /* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),  /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),  /* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),  /* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),  /* cost of FSQRT instruction.  */
+  1, 4, 2, 2,  /* reassoc int, fp, vec_int, vec_fp.  */
+  core_memcpy,
+  core_memset,
+  1,  /* scalar_stmt_cost.  */
+  1,  /* scalar load_cost.  */
+  1,  /* scalar_store_cost.  */
+  1,  /* vec_stmt_cost.  */
+  1,  /* vec_to_scalar_cost.  */
+  1,  /* scalar_to_vec_cost.  */
+  1,  /* vec_align_load_cost.  */
+  2,  /* vec_unalign_load_cost.  */
+  1,  /* vec_store_cost.  */
+  3,  /* cond_taken_branch_cost.  */
+  1,  /* cond_not_taken_branch_cost.  */
+};
+
diff --git a/gcc/config/i386/x86-tune-sched-atom.c b/gcc/config/i386/x86-tune-sched-atom.c
new file mode 100644
index 00000000000..86942c0703d
--- /dev/null
+++ b/gcc/config/i386/x86-tune-sched-atom.c
@@ -0,0 +1,244 @@
+/* Scheduler hooks for IA-32 which implement atom+ specific logic.
+   Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
+   execution.  It is applied if
+   (1) an IMUL instruction is at the top of the list;
+   (2) the ready list contains exactly one producer of an independent IMUL
+       instruction.
+   Return the index of the IMUL producer if it was found, and -1 otherwise.  */
+static int
+do_reorder_for_imul (rtx_insn **ready, int n_ready)
+{
+  rtx_insn *insn;
+  rtx set, insn1, insn2;
+  sd_iterator_def sd_it;
+  dep_t dep;
+  int index = -1;
+  int i;
+
+  if (!TARGET_BONNELL)
+    return index;
+
+  /* Check that an IMUL instruction is at the top of the ready list.
*/
+  insn = ready[n_ready - 1];
+  set = single_set (insn);
+  if (!set)
+    return index;
+  if (!(GET_CODE (SET_SRC (set)) == MULT
+      && GET_MODE (SET_SRC (set)) == SImode))
+    return index;
+
+  /* Search for a producer of an independent IMUL instruction.  */
+  for (i = n_ready - 2; i >= 0; i--)
+    {
+      insn = ready[i];
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+      /* Skip IMUL instruction.  */
+      insn2 = PATTERN (insn);
+      if (GET_CODE (insn2) == PARALLEL)
+	insn2 = XVECEXP (insn2, 0, 0);
+      if (GET_CODE (insn2) == SET
+	  && GET_CODE (SET_SRC (insn2)) == MULT
+	  && GET_MODE (SET_SRC (insn2)) == SImode)
+	continue;
+
+      FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+	{
+	  rtx con;
+	  con = DEP_CON (dep);
+	  if (!NONDEBUG_INSN_P (con))
+	    continue;
+	  insn1 = PATTERN (con);
+	  if (GET_CODE (insn1) == PARALLEL)
+	    insn1 = XVECEXP (insn1, 0, 0);
+
+	  if (GET_CODE (insn1) == SET
+	      && GET_CODE (SET_SRC (insn1)) == MULT
+	      && GET_MODE (SET_SRC (insn1)) == SImode)
+	    {
+	      sd_iterator_def sd_it1;
+	      dep_t dep1;
+	      /* Check that the IMUL has no producer other than INSN.  */
+	      index = i;
+	      FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+		{
+		  rtx pro;
+		  pro = DEP_PRO (dep1);
+		  if (!NONDEBUG_INSN_P (pro))
+		    continue;
+		  if (pro != insn)
+		    index = -1;
+		}
+	      if (index >= 0)
+		break;
+	    }
+	}
+      if (index >= 0)
+	break;
+    }
+  return index;
+}
+
+/* Try to find the best candidate for the top of the ready list when two
+   insns have the same priority - the better candidate is the one whose
+   producers were scheduled earlier.  Applied for Silvermont only.
+   Return true if the top 2 insns must be interchanged.  */
+static bool
+swap_top_of_ready_list (rtx_insn **ready, int n_ready)
+{
+  rtx_insn *top = ready[n_ready - 1];
+  rtx_insn *next = ready[n_ready - 2];
+  rtx set;
+  sd_iterator_def sd_it;
+  dep_t dep;
+  int clock1 = -1;
+  int clock2 = -1;
+  #define INSN_TICK(INSN) (HID (INSN)->tick)
+
+  if (!TARGET_SILVERMONT && !TARGET_INTEL)
+    return false;
+
+  if (!NONDEBUG_INSN_P (top))
+    return false;
+  if (!NONJUMP_INSN_P (top))
+    return false;
+  if (!NONDEBUG_INSN_P (next))
+    return false;
+  if (!NONJUMP_INSN_P (next))
+    return false;
+  set = single_set (top);
+  if (!set)
+    return false;
+  set = single_set (next);
+  if (!set)
+    return false;
+
+  if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
+    {
+      if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
+	return false;
+      /* Determine the winner more precisely.  */
+      FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
+	{
+	  rtx pro;
+	  pro = DEP_PRO (dep);
+	  if (!NONDEBUG_INSN_P (pro))
+	    continue;
+	  if (INSN_TICK (pro) > clock1)
+	    clock1 = INSN_TICK (pro);
+	}
+      FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
+	{
+	  rtx pro;
+	  pro = DEP_PRO (dep);
+	  if (!NONDEBUG_INSN_P (pro))
+	    continue;
+	  if (INSN_TICK (pro) > clock2)
+	    clock2 = INSN_TICK (pro);
+	}
+
+      if (clock1 == clock2)
+	{
+	  /* Determine the winner - a load must win.  */
+	  enum attr_memory memory1, memory2;
+	  memory1 = get_attr_memory (top);
+	  memory2 = get_attr_memory (next);
+	  if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
+	    return true;
+	}
+      return (bool) (clock2 < clock1);
+    }
+  return false;
+  #undef INSN_TICK
+}
+
+/* Perform possible reordering of the ready list for Atom/Silvermont only.
+   Return the issue rate.  */
+int
+ix86_atom_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
+			 int *pn_ready, int clock_var)
+{
+  int issue_rate = -1;
+  int n_ready = *pn_ready;
+  int i;
+  rtx_insn *insn;
+  int index = -1;
+
+  /* Set up issue rate.  */
+  issue_rate = ix86_issue_rate ();
+
+  /* Do reordering for BONNELL/SILVERMONT only.
*/
+  if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
+    return issue_rate;
+
+  /* Nothing to do if the ready list contains only 1 instruction.  */
+  if (n_ready <= 1)
+    return issue_rate;
+
+  /* Do reordering for the post-reload scheduler only.  */
+  if (!reload_completed)
+    return issue_rate;
+
+  if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
+    {
+      if (sched_verbose > 1)
+	fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
+		 INSN_UID (ready[index]));
+
+      /* Put the IMUL producer (ready[index]) at the top of the ready list.  */
+      insn = ready[index];
+      for (i = index; i < n_ready - 1; i++)
+	ready[i] = ready[i + 1];
+      ready[n_ready - 1] = insn;
+      return issue_rate;
+    }
+
+  /* Skip selective scheduling since HID is not populated in it.  */
+  if (clock_var != 0
+      && !sel_sched_p ()
+      && swap_top_of_ready_list (ready, n_ready))
+    {
+      if (sched_verbose > 1)
+	fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
+		 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
+      /* Swap the 2 top elements of the ready list.  */
+      insn = ready[n_ready - 1];
+      ready[n_ready - 1] = ready[n_ready - 2];
+      ready[n_ready - 2] = insn;
+    }
+  return issue_rate;
+}
diff --git a/gcc/config/i386/x86-tune-sched-bd.c b/gcc/config/i386/x86-tune-sched-bd.c
new file mode 100644
index 00000000000..c862fc156e2
--- /dev/null
+++ b/gcc/config/i386/x86-tune-sched-bd.c
@@ -0,0 +1,822 @@
+/* Scheduler hooks for IA-32 which implement bdver1-4 specific logic.
+   Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+/* The size of the dispatch window is the total number of bytes of
+   object code allowed in a window.  */
+#define DISPATCH_WINDOW_SIZE 16
+
+/* Number of dispatch windows considered for scheduling.  */
+#define MAX_DISPATCH_WINDOWS 3
+
+/* Maximum number of instructions in a window.  */
+#define MAX_INSN 4
+
+/* Maximum number of immediate operands in a window.  */
+#define MAX_IMM 4
+
+/* Maximum number of immediate bits allowed in a window.  */
+#define MAX_IMM_SIZE 128
+
+/* Maximum number of 32 bit immediates allowed in a window.  */
+#define MAX_IMM_32 4
+
+/* Maximum number of 64 bit immediates allowed in a window.  */
+#define MAX_IMM_64 2
+
+/* Maximum total of loads or prefetches allowed in a window.  */
+#define MAX_LOAD 2
+
+/* Maximum total of stores allowed in a window.  */
+#define MAX_STORE 1
+
+#undef BIG
+#define BIG 100
+
+
+/* Dispatch groups.  Instructions that affect the mix in a dispatch window.
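The limits above bound what one bdver dispatch window may hold; the real admission test is the fits_dispatch_window/count_num_restricted logic further down in this file, which also accounts for uop paths and group mixing. As a rough orientation only, a toy predicate over those limits in plain C:

/* Toy predicate: does one more insn fit in a window, given the running
   tallies?  Constants mirror the limits defined above.  */
struct window_tally
{
  int insns, bytes, loads, stores, imm_bits;
};

static int
window_has_room (const struct window_tally *w, int insn_bytes,
                 int is_load, int is_store, int imm_bits)
{
  return w->insns + 1 <= 4                  /* MAX_INSN              */
         && w->bytes + insn_bytes <= 16     /* DISPATCH_WINDOW_SIZE  */
         && w->loads + is_load <= 2         /* MAX_LOAD              */
         && w->stores + is_store <= 1       /* MAX_STORE             */
         && w->imm_bits + imm_bits <= 128;  /* MAX_IMM_SIZE          */
}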
+
+/* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
+enum dispatch_group {
+  disp_no_group = 0,
+  disp_load,
+  disp_store,
+  disp_load_store,
+  disp_prefetch,
+  disp_imm,
+  disp_imm_32,
+  disp_imm_64,
+  disp_branch,
+  disp_cmp,
+  disp_jcc,
+  disp_last
+};
+
+/* Number of allowable groups in a dispatch window.  It is an array
+   indexed by the dispatch_group enum.  100 is used as a big number,
+   because the number of these kinds of operations does not have any
+   effect in a dispatch window, but we need them for other reasons in
+   the table.  */
+static unsigned int num_allowable_groups[disp_last] = {
+  0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
+};
+
+char group_name[disp_last + 1][16] = {
+  "disp_no_group", "disp_load", "disp_store", "disp_load_store",
+  "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
+  "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
+};
+
+/* Instruction path.  */
+enum insn_path {
+  no_path = 0,
+  path_single, /* Single micro op.  */
+  path_double, /* Double micro op.  */
+  path_multi,  /* Instructions with more than 2 micro ops.  */
+  last_path
+};
+
+/* sched_insn_info defines a window to the instructions scheduled in
+   the basic block.  It contains a pointer to the insn_info table and
+   the instruction scheduled.
+
+   Windows are allocated for each basic block and are linked
+   together.  */
+typedef struct sched_insn_info_s {
+  rtx insn;
+  enum dispatch_group group;
+  enum insn_path path;
+  int byte_len;
+  int imm_bytes;
+} sched_insn_info;
+
+/* Linked list of dispatch windows.  This is a two-way list of
+   dispatch windows of a basic block.  It contains information about
+   the number of uops in the window and the total number of
+   instructions and of bytes in the object code for this dispatch
+   window.  */
+typedef struct dispatch_windows_s {
+  int num_insn;            /* Number of insns in the window.  */
+  int num_uops;            /* Number of uops in the window.  */
+  int window_size;         /* Number of bytes in the window.  */
+  int window_num;          /* Window number, either 0 or 1.  */
+  int num_imm;             /* Number of immediates in an insn.  */
+  int num_imm_32;          /* Number of 32 bit immediates in an insn.  */
+  int num_imm_64;          /* Number of 64 bit immediates in an insn.  */
+  int imm_size;            /* Total immediates in the window.  */
+  int num_loads;           /* Total memory loads in the window.  */
+  int num_stores;          /* Total memory stores in the window.  */
+  int violation;           /* Violation exists in window.  */
+  sched_insn_info *window; /* Pointer to the window.  */
+  struct dispatch_windows_s *next;
+  struct dispatch_windows_s *prev;
+} dispatch_windows;
+
+/* Immediate values used in an insn.  */
+typedef struct imm_info_s
+  {
+    int imm;
+    int imm32;
+    int imm64;
+  } imm_info;
+
+static dispatch_windows *dispatch_window_list;
+static dispatch_windows *dispatch_window_list1;
+
+/* Get dispatch group of insn.  */
+
+static enum dispatch_group
+get_mem_group (rtx_insn *insn)
+{
+  enum attr_memory memory;
+
+  if (INSN_CODE (insn) < 0)
+    return disp_no_group;
+  memory = get_attr_memory (insn);
+  if (memory == MEMORY_STORE)
+    return disp_store;
+
+  if (memory == MEMORY_LOAD)
+    return disp_load;
+
+  if (memory == MEMORY_BOTH)
+    return disp_load_store;
+
+  return disp_no_group;
+}
+
+/* Return true if insn is a compare instruction.  */
+
+static bool
+is_cmp (rtx_insn *insn)
+{
+  enum attr_type type;
+
+  type = get_attr_type (insn);
+  return (type == TYPE_TEST
+	  || type == TYPE_ICMP
+	  || type == TYPE_FCMP
+	  || GET_CODE (PATTERN (insn)) == COMPARE);
+}
+
+/* Return true if a dispatch violation was encountered.  */
+
+static bool
+dispatch_violation (void)
+{
+  if (dispatch_window_list->next)
+    return dispatch_window_list->next->violation;
+  return dispatch_window_list->violation;
+}
+
+/* Return true if insn is a branch instruction.  */
+
+static bool
+is_branch (rtx_insn *insn)
+{
+  return (CALL_P (insn) || JUMP_P (insn));
+}
+
+/* Return true if insn is a prefetch instruction.  */
+
+static bool
+is_prefetch (rtx_insn *insn)
+{
+  return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
+}
+
+/* This function initializes a dispatch window and the list container holding a
+   pointer to the window.  */
+
+static void
+init_window (int window_num)
+{
+  int i;
+  dispatch_windows *new_list;
+
+  if (window_num == 0)
+    new_list = dispatch_window_list;
+  else
+    new_list = dispatch_window_list1;
+
+  new_list->num_insn = 0;
+  new_list->num_uops = 0;
+  new_list->window_size = 0;
+  new_list->next = NULL;
+  new_list->prev = NULL;
+  new_list->window_num = window_num;
+  new_list->num_imm = 0;
+  new_list->num_imm_32 = 0;
+  new_list->num_imm_64 = 0;
+  new_list->imm_size = 0;
+  new_list->num_loads = 0;
+  new_list->num_stores = 0;
+  new_list->violation = false;
+
+  for (i = 0; i < MAX_INSN; i++)
+    {
+      new_list->window[i].insn = NULL;
+      new_list->window[i].group = disp_no_group;
+      new_list->window[i].path = no_path;
+      new_list->window[i].byte_len = 0;
+      new_list->window[i].imm_bytes = 0;
+    }
+  return;
+}
+
+/* This function allocates and initializes a dispatch window and the
+   list container holding a pointer to the window.  */
+
+static dispatch_windows *
+allocate_window (void)
+{
+  dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
+  new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
+
+  return new_list;
+}
+
+/* This routine initializes the dispatch scheduling information.  It
+   initiates building dispatch scheduler tables and constructs the
+   first dispatch window.  */
+
+static void
+init_dispatch_sched (void)
+{
+  /* Allocate a dispatch list and a window.  */
+  dispatch_window_list = allocate_window ();
+  dispatch_window_list1 = allocate_window ();
+  init_window (0);
+  init_window (1);
+}
+
+/* This function returns true if a branch is detected.  End of a basic block
+   does not have to be a branch, but here we assume only branches end a
+   window.  */
+
+static bool
+is_end_basic_block (enum dispatch_group group)
+{
+  return group == disp_branch;
+}
+
+/* This function is called when the end of a window processing is reached.  */
+
+static void
+process_end_window (void)
+{
+  gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
+  if (dispatch_window_list->next)
+    {
+      gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
+      gcc_assert (dispatch_window_list->window_size
+		  + dispatch_window_list1->window_size <= 48);
+      init_window (1);
+    }
+  init_window (0);
+}
+
+/* Allocates a new dispatch window and adds it to WINDOW_LIST.
+   WINDOW_NUM is either 0 or 1.  A maximum of two windows are generated
+   for 48 bytes of instructions.  Note that these windows are not dispatch
+   windows whose sizes are DISPATCH_WINDOW_SIZE.  */
+
+static dispatch_windows *
+allocate_next_window (int window_num)
+{
+  if (window_num == 0)
+    {
+      if (dispatch_window_list->next)
+	init_window (1);
+      init_window (0);
+      return dispatch_window_list;
+    }
+
+  dispatch_window_list->next = dispatch_window_list1;
+  dispatch_window_list1->prev = dispatch_window_list;
+
+  return dispatch_window_list1;
+}
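
The asserts in process_end_window encode the pairing invariant: windows 0 and
1 together never exceed 48 bytes.  A minimal standalone sketch (not GCC code)
of the fullness test that fits_dispatch_window and add_to_dispatch_window
apply further below:

    /* Sketch: a window pair is full when it reaches exactly 32 bytes or
       when the incoming insn would push the pair past 48 bytes.  */
    #include <stdbool.h>
    #include <stdio.h>

    static bool
    window_pair_full_p (int window0_bytes, int window1_bytes, int insn_bytes)
    {
      int sum = window0_bytes + window1_bytes;
      /* Mirrors the "sum == 32 || byte_len + sum >= 48" tests below.  */
      return sum == 32 || insn_bytes + sum >= 48;
    }

    int
    main (void)
    {
      /* A 7-byte insn on top of 16 + 25 bytes reaches the 48 byte limit.  */
      printf ("%d\n", window_pair_full_p (16, 25, 7));   /* Prints 1.  */
      printf ("%d\n", window_pair_full_p (16, 10, 7));   /* Prints 0.  */
      return 0;
    }
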
+
+/* Compute number of immediate operands of an instruction.  */
+
+static void
+find_constant (rtx in_rtx, imm_info *imm_values)
+{
+  if (INSN_P (in_rtx))
+    in_rtx = PATTERN (in_rtx);
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
+    if (const_rtx x = *iter)
+      switch (GET_CODE (x))
+	{
+	case CONST:
+	case SYMBOL_REF:
+	case CONST_INT:
+	  (imm_values->imm)++;
+	  if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
+	    (imm_values->imm32)++;
+	  else
+	    (imm_values->imm64)++;
+	  break;
+
+	case CONST_DOUBLE:
+	case CONST_WIDE_INT:
+	  (imm_values->imm)++;
+	  (imm_values->imm64)++;
+	  break;
+
+	case CODE_LABEL:
+	  if (LABEL_KIND (x) == LABEL_NORMAL)
+	    {
+	      (imm_values->imm)++;
+	      (imm_values->imm32)++;
+	    }
+	  break;
+
+	default:
+	  break;
+	}
+}
+
+/* Return total size of immediate operands of an instruction along with number
+   of corresponding immediate-operands.  It initializes its parameters to zero
+   before calling FIND_CONSTANT.
+   INSN is the input instruction.  IMM is the total of immediates.
+   IMM32 is the number of 32 bit immediates.  IMM64 is the number of 64
+   bit immediates.  */
+
+static int
+get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
+{
+  imm_info imm_values = {0, 0, 0};
+
+  find_constant (insn, &imm_values);
+  *imm = imm_values.imm;
+  *imm32 = imm_values.imm32;
+  *imm64 = imm_values.imm64;
+  return imm_values.imm32 * 4 + imm_values.imm64 * 8;
+}
+
+/* This function indicates if an operand of an instruction is an
+   immediate.  */
+
+static bool
+has_immediate (rtx_insn *insn)
+{
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (insn)
+    return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+			       &num_imm64_operand);
+  return false;
+}
+
+/* Return single or double path for instructions.  */
+
+static enum insn_path
+get_insn_path (rtx_insn *insn)
+{
+  enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
+
+  if ((int) path == 0)
+    return path_single;
+
+  if ((int) path == 1)
+    return path_double;
+
+  return path_multi;
+}
+
+/* Return insn dispatch group.  */
+
+static enum dispatch_group
+get_insn_group (rtx_insn *insn)
+{
+  enum dispatch_group group = get_mem_group (insn);
+  if (group)
+    return group;
+
+  if (is_branch (insn))
+    return disp_branch;
+
+  if (is_cmp (insn))
+    return disp_cmp;
+
+  if (has_immediate (insn))
+    return disp_imm;
+
+  if (is_prefetch (insn))
+    return disp_prefetch;
+
+  return disp_no_group;
+}
+
+/* Count number of GROUP restricted instructions in a dispatch
+   window WINDOW_LIST.  */
+
+static int
+count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
+{
+  enum dispatch_group group = get_insn_group (insn);
+  int imm_size;
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (group == disp_no_group)
+    return 0;
+
+  if (group == disp_imm)
+    {
+      imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+				     &num_imm64_operand);
+      if (window_list->imm_size + imm_size > MAX_IMM_SIZE
+	  || num_imm_operand + window_list->num_imm > MAX_IMM
+	  || (num_imm32_operand > 0
+	      && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
+		  || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
+	  || (num_imm64_operand > 0
+	      && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
+		  || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
+	  || (window_list->imm_size + imm_size == MAX_IMM_SIZE
+	      && num_imm64_operand > 0
+	      && ((window_list->num_imm_64 > 0
+		   && window_list->num_insn >= 2)
+		  || window_list->num_insn >= 3)))
+	return BIG;
+
+      return 1;
+    }
+
+  if ((group == disp_load_store
+       && (window_list->num_loads >= MAX_LOAD
+	   || window_list->num_stores >= MAX_STORE))
+      || ((group == disp_load
+	   || group == disp_prefetch)
+	  && window_list->num_loads >= MAX_LOAD)
+      || (group == disp_store
+	  && window_list->num_stores >= MAX_STORE))
+    return BIG;
+
+  return 1;
+}
+
+/* This function returns true if insn satisfies dispatch rules on the
+   last window scheduled.  */
+
+static bool
+fits_dispatch_window (rtx_insn *insn)
+{
+  dispatch_windows *window_list = dispatch_window_list;
+  dispatch_windows *window_list_next = dispatch_window_list->next;
+  unsigned int num_restrict;
+  enum dispatch_group group = get_insn_group (insn);
+  enum insn_path path = get_insn_path (insn);
+  int sum;
+
+  /* Make disp_cmp and disp_jcc get scheduled at the latest.  These
+     instructions should be given the lowest priority in the
+     scheduling process in the Haifa scheduler to make sure they will be
+     scheduled in the same dispatch window as the reference to them.  */
+  if (group == disp_jcc || group == disp_cmp)
+    return false;
+
+  /* Check nonrestricted.  */
+  if (group == disp_no_group || group == disp_branch)
+    return true;
+
+  /* Get last dispatch window.  */
+  if (window_list_next)
+    window_list = window_list_next;
+
+  if (window_list->window_num == 1)
+    {
+      sum = window_list->prev->window_size + window_list->window_size;
+
+      if (sum == 32
+	  || (ix86_min_insn_size (insn) + sum) >= 48)
+	/* Window 1 is full.  Go for next window.  */
+	return true;
+    }
+
+  num_restrict = count_num_restricted (insn, window_list);
+
+  if (num_restrict > num_allowable_groups[group])
+    return false;
+
+  /* See if it fits in the first window.  */
+  if (window_list->window_num == 0)
+    {
+      /* The first window should have only single and double path
+	 uops.  */
+      if (path == path_double
+	  && (window_list->num_uops + 2) > MAX_INSN)
+	return false;
+      else if (path != path_single)
+	return false;
+    }
+  return true;
+}
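
count_num_restricted signals "does not fit" by returning BIG (100), which
always exceeds the per-group allowance in num_allowable_groups.  A standalone
sketch (not GCC code) of the memory-group half of that rule, under the same
MAX_LOAD/MAX_STORE limits:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_LOAD 2    /* Loads or prefetches allowed per window.  */
    #define MAX_STORE 1   /* Stores allowed per window.  */
    #define BIG 100       /* "Does not fit"; always > any allowance.  */

    /* Return 1 when a memory insn still fits a window already holding
       NUM_LOADS loads and NUM_STORES stores, BIG when a limit is hit --
       the same convention count_num_restricted uses.  */
    static int
    memory_restriction (bool is_load, bool is_store, int num_loads,
                        int num_stores)
    {
      if (is_load && num_loads >= MAX_LOAD)
        return BIG;
      if (is_store && num_stores >= MAX_STORE)
        return BIG;
      return 1;
    }

    int
    main (void)
    {
      /* A third load in a window that already has two does not fit.  */
      printf ("%d\n", memory_restriction (true, false, 2, 0));  /* 100 */
      printf ("%d\n", memory_restriction (true, false, 1, 0));  /* 1 */
      return 0;
    }
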
+
+/* Add an instruction INSN with NUM_UOPS micro-operations to the
+   dispatch window WINDOW_LIST.  */
+
+static void
+add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
+{
+  int byte_len = ix86_min_insn_size (insn);
+  int num_insn = window_list->num_insn;
+  int imm_size;
+  sched_insn_info *window = window_list->window;
+  enum dispatch_group group = get_insn_group (insn);
+  enum insn_path path = get_insn_path (insn);
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (!window_list->violation && group != disp_cmp
+      && !fits_dispatch_window (insn))
+    window_list->violation = true;
+
+  imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+				 &num_imm64_operand);
+
+  /* Initialize window with new instruction.  */
+  window[num_insn].insn = insn;
+  window[num_insn].byte_len = byte_len;
+  window[num_insn].group = group;
+  window[num_insn].path = path;
+  window[num_insn].imm_bytes = imm_size;
+
+  window_list->window_size += byte_len;
+  window_list->num_insn = num_insn + 1;
+  window_list->num_uops = window_list->num_uops + num_uops;
+  window_list->imm_size += imm_size;
+  window_list->num_imm += num_imm_operand;
+  window_list->num_imm_32 += num_imm32_operand;
+  window_list->num_imm_64 += num_imm64_operand;
+
+  if (group == disp_store)
+    window_list->num_stores += 1;
+  else if (group == disp_load
+	   || group == disp_prefetch)
+    window_list->num_loads += 1;
+  else if (group == disp_load_store)
+    {
+      window_list->num_stores += 1;
+      window_list->num_loads += 1;
+    }
+}
+
+/* Adds a scheduled instruction, INSN, to the current dispatch window.
+   If the total bytes of instructions or the number of instructions in
+   the window exceed the allowable limits, it allocates a new window.  */
+
+static void
+add_to_dispatch_window (rtx_insn *insn)
+{
+  int byte_len;
+  dispatch_windows *window_list;
+  dispatch_windows *next_list;
+  dispatch_windows *window0_list;
+  enum insn_path path;
+  enum dispatch_group insn_group;
+  bool insn_fits;
+  int num_insn;
+  int num_uops;
+  int window_num;
+  int insn_num_uops;
+  int sum;
+
+  if (INSN_CODE (insn) < 0)
+    return;
+
+  byte_len = ix86_min_insn_size (insn);
+  window_list = dispatch_window_list;
+  next_list = window_list->next;
+  path = get_insn_path (insn);
+  insn_group = get_insn_group (insn);
+
+  /* Get the last dispatch window.  */
+  if (next_list)
+    window_list = dispatch_window_list->next;
+
+  if (path == path_single)
+    insn_num_uops = 1;
+  else if (path == path_double)
+    insn_num_uops = 2;
+  else
+    insn_num_uops = (int) path;
+
+  /* If the current window is full, get a new window.
+     Window number zero is full if MAX_INSN uops are scheduled in it.
+     Window number one is full if window zero's bytes plus window
+     one's bytes is 32, or if the bytes of the new instruction added
+     to the total makes it greater than 48, or if it already has MAX_INSN
+     instructions in it.  */
+  num_insn = window_list->num_insn;
+  num_uops = window_list->num_uops;
+  window_num = window_list->window_num;
+  insn_fits = fits_dispatch_window (insn);
+
+  if (num_insn >= MAX_INSN
+      || num_uops + insn_num_uops > MAX_INSN
+      || !(insn_fits))
+    {
+      window_num = ~window_num & 1;
+      window_list = allocate_next_window (window_num);
+    }
+
+  if (window_num == 0)
+    {
+      add_insn_window (insn, window_list, insn_num_uops);
+      if (window_list->num_insn >= MAX_INSN
+	  && insn_group == disp_branch)
+	{
+	  process_end_window ();
+	  return;
+	}
+    }
+  else if (window_num == 1)
+    {
+      window0_list = window_list->prev;
+      sum = window0_list->window_size + window_list->window_size;
+      if (sum == 32
+	  || (byte_len + sum) >= 48)
+	{
+	  process_end_window ();
+	  window_list = dispatch_window_list;
+	}
+
+      add_insn_window (insn, window_list, insn_num_uops);
+    }
+  else
+    gcc_unreachable ();
+
+  if (is_end_basic_block (insn_group))
+    {
+      /* End of the basic block is reached; do end-of-basic-block processing.  */
+      process_end_window ();
+      return;
+    }
+}
+
+/* Print the dispatch window, WINDOW_NUM, to FILE.  */
+
+DEBUG_FUNCTION static void
+debug_dispatch_window_file (FILE *file, int window_num)
+{
+  dispatch_windows *list;
+  int i;
+
+  if (window_num == 0)
+    list = dispatch_window_list;
+  else
+    list = dispatch_window_list1;
+
+  fprintf (file, "Window #%d:\n", list->window_num);
+  fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
+	   list->num_insn, list->num_uops, list->window_size);
+  fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+	   list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
+
+  fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
+	   list->num_stores);
+  fprintf (file, " insn info:\n");
+
+  for (i = 0; i < MAX_INSN; i++)
+    {
+      if (!list->window[i].insn)
+	break;
+      fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
+	       i, group_name[list->window[i].group],
+	       i, (void *) list->window[i].insn,
+	       i, list->window[i].path,
+	       i, list->window[i].byte_len,
+	       i, list->window[i].imm_bytes);
+    }
+}
+
+/* Print to stdout a dispatch window.  */
+
+DEBUG_FUNCTION void
+debug_dispatch_window (int window_num)
+{
+  debug_dispatch_window_file (stdout, window_num);
+}
+
+/* Print INSN dispatch information to FILE.  */
+
+DEBUG_FUNCTION static void
+debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
+{
+  int byte_len;
+  enum insn_path path;
+  enum dispatch_group group;
+  int imm_size;
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (INSN_CODE (insn) < 0)
+    return;
+
+  byte_len = ix86_min_insn_size (insn);
+  path = get_insn_path (insn);
+  group = get_insn_group (insn);
+  imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+				 &num_imm64_operand);
+
+  fprintf (file, " insn info:\n");
+  fprintf (file, " group = %s, path = %d, byte_len = %d\n",
+	   group_name[group], path, byte_len);
+  fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+	   num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
+}
+
+/* Print to stdout the status of the ready list with respect to
+   dispatch windows.  */
+
+DEBUG_FUNCTION void
+debug_ready_dispatch (void)
+{
+  int i;
+  int no_ready = number_in_ready ();
+
+  fprintf (stdout, "Number of ready: %d\n", no_ready);
+
+  for (i = 0; i < no_ready; i++)
+    debug_insn_dispatch_info_file (stdout, get_ready_element (i));
+}
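
The two exported entry points that follow are installed as the scheduler's
dispatch hooks elsewhere in this patch (TARGET_SCHED_DISPATCH and
TARGET_SCHED_DISPATCH_DO).  A standalone mock (not GCC code; the enum values
and mock_* names are stand-ins, not the real GCC definitions) of the call
protocol a scheduler follows:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins for the real GCC modes/actions.  */
    enum { DISPATCH_INIT, ADD_TO_DISPATCH_WINDOW };
    enum { IS_DISPATCH_ON, DISPATCH_VIOLATION };

    static bool
    mock_has_dispatch (int insn, int action)
    {
      (void) insn;
      return action == IS_DISPATCH_ON;   /* "Dispatch scheduling is on."  */
    }

    static void
    mock_do_dispatch (int insn, int mode)
    {
      printf ("mode %d insn %d\n", mode, insn);
    }

    int
    main (void)
    {
      int insns[] = {1, 2, 3};
      /* Scheduler side: initialize once, then feed each scheduled insn
         into the window bookkeeping.  */
      if (mock_has_dispatch (0, IS_DISPATCH_ON))
        {
          mock_do_dispatch (0, DISPATCH_INIT);
          for (unsigned i = 0; i < sizeof insns / sizeof *insns; i++)
            mock_do_dispatch (insns[i], ADD_TO_DISPATCH_WINDOW);
        }
      return 0;
    }
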
+
+/* This routine is the driver of the dispatch scheduler.  */
+
+void
+ix86_bd_do_dispatch (rtx_insn *insn, int mode)
+{
+  if (mode == DISPATCH_INIT)
+    init_dispatch_sched ();
+  else if (mode == ADD_TO_DISPATCH_WINDOW)
+    add_to_dispatch_window (insn);
+}
+
+/* Return TRUE if Dispatch Scheduling is supported.  */
+
+bool
+ix86_bd_has_dispatch (rtx_insn *insn, int action)
+{
+  /* Current implementation of the dispatch scheduler models Bulldozer only.  */
+  if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
+       || TARGET_BDVER4) && flag_dispatch_scheduler)
+    switch (action)
+      {
+      default:
+	return false;
+
+      case IS_DISPATCH_ON:
+	return true;
+
+      case IS_CMP:
+	return is_cmp (insn);
+
+      case DISPATCH_VIOLATION:
+	return dispatch_violation ();
+
+      case FITS_DISPATCH_WINDOW:
+	return fits_dispatch_window (insn);
+      }
+
+  return false;
+}
diff --git a/gcc/config/i386/x86-tune-sched-core.c b/gcc/config/i386/x86-tune-sched-core.c
new file mode 100644
index 00000000000..67b14a708e8
--- /dev/null
+++ b/gcc/config/i386/x86-tune-sched-core.c
@@ -0,0 +1,255 @@
+/* Scheduler hooks for IA-32 which implement Core 2/i7 specific logic.
+   Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+
+/* Model decoder of Core 2/i7.
+   Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
+   track the instruction fetch block boundaries and make sure that long
+   (9+ bytes) instructions are assigned to D0.  */
+
+/* Maximum length of an insn that can be handled by
+   a secondary decoder unit.  '8' for Core 2/i7.  */
+static int core2i7_secondary_decoder_max_insn_size;
+
+/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
+   '16' for Core 2/i7.  */
+static int core2i7_ifetch_block_size;
+
+/* Maximum number of instructions decoder can handle per cycle.
+   '6' for Core 2/i7.  */
+static int core2i7_ifetch_block_max_insns;
+
+typedef struct ix86_first_cycle_multipass_data_ *
+  ix86_first_cycle_multipass_data_t;
+typedef const struct ix86_first_cycle_multipass_data_ *
+  const_ix86_first_cycle_multipass_data_t;
+
+/* A variable to store target state across calls to max_issue within
+   one cycle.  */
+static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
+  *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
+
+/* Initialize DATA.  */
+static void
+core2i7_first_cycle_multipass_init (void *_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+
+  data->ifetch_block_len = 0;
+  data->ifetch_block_n_insns = 0;
+  data->ready_try_change = NULL;
+  data->ready_try_change_size = 0;
+}
+
+/* Advancing the cycle; reset ifetch block counts.  */
+static void
+core2i7_dfa_post_advance_cycle (void)
+{
+  ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
+
+  gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+  data->ifetch_block_len = 0;
+  data->ifetch_block_n_insns = 0;
+}
+
+/* Filter out insns from ready_try that the core will not be able to issue
+   on the current cycle due to decoder restrictions.  */
+static void
+core2i7_first_cycle_multipass_filter_ready_try
+(const_ix86_first_cycle_multipass_data_t data,
+ signed char *ready_try, int n_ready, bool first_cycle_insn_p)
+{
+  while (n_ready--)
+    {
+      rtx_insn *insn;
+      int insn_size;
+
+      if (ready_try[n_ready])
+	continue;
+
+      insn = get_ready_element (n_ready);
+      insn_size = ix86_min_insn_size (insn);
+
+      if (/* If this insn is too long for a secondary decoder ...  */
+	  (!first_cycle_insn_p
+	   && insn_size > core2i7_secondary_decoder_max_insn_size)
+	  /* ... or it would not fit into the ifetch block ...  */
+	  || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
+	  /* ... or the decoder is full already ...  */
+	  || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
+	/* ... mask the insn out.  */
+	{
+	  ready_try[n_ready] = 1;
+
+	  if (data->ready_try_change)
+	    bitmap_set_bit (data->ready_try_change, n_ready);
+	}
+    }
+}
+
+/* Prepare for a new round of multipass lookahead scheduling.  */
+static void
+core2i7_first_cycle_multipass_begin (void *_data,
+				     signed char *ready_try, int n_ready,
+				     bool first_cycle_insn_p)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+  const_ix86_first_cycle_multipass_data_t prev_data
+    = ix86_first_cycle_multipass_data;
+
+  /* Restore the state from the end of the previous round.  */
+  data->ifetch_block_len = prev_data->ifetch_block_len;
+  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
+
+  /* Filter instructions that cannot be issued on the current cycle due to
+     decoder restrictions.  */
+  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+						  first_cycle_insn_p);
+}
+
+/* INSN is being issued in the current solution.  Account for its impact on
+   the decoder model.  */
+static void
+core2i7_first_cycle_multipass_issue (void *_data,
+				     signed char *ready_try, int n_ready,
+				     rtx_insn *insn, const void *_prev_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+  const_ix86_first_cycle_multipass_data_t prev_data
+    = (const_ix86_first_cycle_multipass_data_t) _prev_data;
+
+  int insn_size = ix86_min_insn_size (insn);
+
+  data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
+  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
+  gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
+	      && data->ifetch_block_n_insns
+		 <= core2i7_ifetch_block_max_insns);
+
+  /* Allocate or resize the bitmap for storing INSN's effect on ready_try.  */
+  if (!data->ready_try_change)
+    {
+      data->ready_try_change = sbitmap_alloc (n_ready);
+      data->ready_try_change_size = n_ready;
+    }
+  else if (data->ready_try_change_size < n_ready)
+    {
+      data->ready_try_change = sbitmap_resize (data->ready_try_change,
+					       n_ready, 0);
+      data->ready_try_change_size = n_ready;
+    }
+  bitmap_clear (data->ready_try_change);
+
+  /* Filter out insns from ready_try that the core will not be able to issue
+     on the current cycle due to decoder restrictions.  */
+  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+						  false);
+}
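
A standalone sketch (not GCC code) of the three-way filter implemented by
core2i7_first_cycle_multipass_filter_ready_try above, using the same decoder
parameters that ix86_core2i7_init_hooks sets below:

    #include <stdbool.h>
    #include <stdio.h>

    #define SECONDARY_MAX_SIZE 8   /* Longest insn decoders D1/D2 accept.  */
    #define IFETCH_BLOCK_SIZE 16   /* Bytes fetched per cycle.  */
    #define IFETCH_MAX_INSNS 6     /* Insns decoded per cycle.  */

    static bool
    decoder_accepts_p (int block_len, int block_n_insns,
                       int insn_size, bool first_insn_p)
    {
      if (!first_insn_p && insn_size > SECONDARY_MAX_SIZE)
        return false;                              /* Needs decoder D0.  */
      if (block_len + insn_size > IFETCH_BLOCK_SIZE)
        return false;                              /* Fetch block full.  */
      if (block_n_insns + 1 > IFETCH_MAX_INSNS)
        return false;                              /* Decoders busy.  */
      return true;
    }

    int
    main (void)
    {
      /* A 9-byte insn is fine first in the cycle, but not second.  */
      printf ("%d %d\n", decoder_accepts_p (0, 0, 9, true),
              decoder_accepts_p (9, 1, 9, false));   /* Prints: 1 0  */
      return 0;
    }
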
+
+/* Revert the effect on ready_try.  */
+static void
+core2i7_first_cycle_multipass_backtrack (const void *_data,
+					 signed char *ready_try,
+					 int n_ready ATTRIBUTE_UNUSED)
+{
+  const_ix86_first_cycle_multipass_data_t data
+    = (const_ix86_first_cycle_multipass_data_t) _data;
+  unsigned int i = 0;
+  sbitmap_iterator sbi;
+
+  gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
+  EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
+    {
+      ready_try[i] = 0;
+    }
+}
+
+/* Save the result of multipass lookahead scheduling for the next round.  */
+static void
+core2i7_first_cycle_multipass_end (const void *_data)
+{
+  const_ix86_first_cycle_multipass_data_t data
+    = (const_ix86_first_cycle_multipass_data_t) _data;
+  ix86_first_cycle_multipass_data_t next_data
+    = ix86_first_cycle_multipass_data;
+
+  if (data != NULL)
+    {
+      next_data->ifetch_block_len = data->ifetch_block_len;
+      next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
+    }
+}
+
+/* Deallocate target data.  */
+static void
+core2i7_first_cycle_multipass_fini (void *_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+
+  if (data->ready_try_change)
+    {
+      sbitmap_free (data->ready_try_change);
+      data->ready_try_change = NULL;
+      data->ready_try_change_size = 0;
+    }
+}
+
+void
+ix86_core2i7_init_hooks (void)
+{
+  targetm.sched.dfa_post_advance_cycle
+    = core2i7_dfa_post_advance_cycle;
+  targetm.sched.first_cycle_multipass_init
+    = core2i7_first_cycle_multipass_init;
+  targetm.sched.first_cycle_multipass_begin
+    = core2i7_first_cycle_multipass_begin;
+  targetm.sched.first_cycle_multipass_issue
+    = core2i7_first_cycle_multipass_issue;
+  targetm.sched.first_cycle_multipass_backtrack
+    = core2i7_first_cycle_multipass_backtrack;
+  targetm.sched.first_cycle_multipass_end
+    = core2i7_first_cycle_multipass_end;
+  targetm.sched.first_cycle_multipass_fini
+    = core2i7_first_cycle_multipass_fini;
+
+  /* Set decoder parameters.  */
+  core2i7_secondary_decoder_max_insn_size = 8;
+  core2i7_ifetch_block_size = 16;
+  core2i7_ifetch_block_max_insns = 6;
+}
diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c
new file mode 100644
index 00000000000..51fa77c389a
--- /dev/null
+++ b/gcc/config/i386/x86-tune-sched.c
@@ -0,0 +1,599 @@
+/* Scheduler hooks for IA-32 which implement CPU specific logic.
+   Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+
+/* Return the maximum number of instructions a cpu can issue.  */
+
+int
+ix86_issue_rate (void)
+{
+  switch (ix86_tune)
+    {
+    case PROCESSOR_PENTIUM:
+    case PROCESSOR_LAKEMONT:
+    case PROCESSOR_BONNELL:
+    case PROCESSOR_SILVERMONT:
+    case PROCESSOR_KNL:
+    case PROCESSOR_KNM:
+    case PROCESSOR_INTEL:
+    case PROCESSOR_K6:
+    case PROCESSOR_BTVER2:
+    case PROCESSOR_PENTIUM4:
+    case PROCESSOR_NOCONA:
+      return 2;
+
+    case PROCESSOR_PENTIUMPRO:
+    case PROCESSOR_ATHLON:
+    case PROCESSOR_K8:
+    case PROCESSOR_AMDFAM10:
+    case PROCESSOR_GENERIC:
+    case PROCESSOR_BTVER1:
+      return 3;
+
+    case PROCESSOR_BDVER1:
+    case PROCESSOR_BDVER2:
+    case PROCESSOR_BDVER3:
+    case PROCESSOR_BDVER4:
+    case PROCESSOR_ZNVER1:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_NEHALEM:
+    case PROCESSOR_SANDYBRIDGE:
+    case PROCESSOR_HASWELL:
+      return 4;
+
+    default:
+      return 1;
+    }
+}
+
+/* Return true iff USE_INSN has a memory address with operands set by
+   SET_INSN.  */
+
+bool
+ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
+{
+  int i;
+  extract_insn_cached (use_insn);
+  for (i = recog_data.n_operands - 1; i >= 0; --i)
+    if (MEM_P (recog_data.operand[i]))
+      {
+	rtx addr = XEXP (recog_data.operand[i], 0);
+	if (modified_in_p (addr, set_insn) != 0)
+	  {
+	    /* No AGI stall if SET_INSN is a push or pop and USE_INSN
+	       has SP based memory (unless index reg is modified in a pop).  */
+	    rtx set = single_set (set_insn);
+	    if (set
+		&& (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
+		    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
+	      {
+		struct ix86_address parts;
+		if (ix86_decompose_address (addr, &parts)
+		    && parts.base == stack_pointer_rtx
+		    && (parts.index == NULL_RTX
+			|| MEM_P (SET_DEST (set))
+			|| !modified_in_p (parts.index, set_insn)))
+		  return false;
+	      }
+	    return true;
+	  }
+	return false;
+      }
+  return false;
+}
+
+/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
+   by DEP_INSN and nothing set by DEP_INSN.  */
+
+static bool
+ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn,
+		      enum attr_type insn_type)
+{
+  rtx set, set2;
+
+  /* Simplify the test for uninteresting insns.  */
+  if (insn_type != TYPE_SETCC
+      && insn_type != TYPE_ICMOV
+      && insn_type != TYPE_FCMOV
+      && insn_type != TYPE_IBR)
+    return false;
+
+  if ((set = single_set (dep_insn)) != 0)
+    {
+      set = SET_DEST (set);
+      set2 = NULL_RTX;
+    }
+  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
+	   && XVECLEN (PATTERN (dep_insn), 0) == 2
+	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
+	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
+    {
+      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
+      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
+    }
+  else
+    return false;
+
+  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
+    return false;
+
+  /* This test is true if the dependent insn reads the flags but
+     not any other potentially set register.  */
+  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
+    return false;
+
+  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
+    return false;
+
+  return true;
+}
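
An AGI (address generation interlock) stall arises when the address of a
memory operand in one insn depends on a register that the previous insn
writes.  A minimal standalone sketch (not GCC code; registers are modeled
as bitmasks rather than RTL operands):

    #include <stdbool.h>
    #include <stdio.h>

    /* True when the address registers of USE overlap what SET writes.  */
    static bool
    agi_dependent_p (unsigned set_writes, unsigned use_addr_regs)
    {
      return (set_writes & use_addr_regs) != 0;
    }

    int
    main (void)
    {
      unsigned EAX = 1u << 0, EBX = 1u << 1;
      /* "mov eax, ..." followed by a use of [eax+ebx] stalls;
         with an address of [ebx] alone it does not.  */
      printf ("%d %d\n", agi_dependent_p (EAX, EAX | EBX),
              agi_dependent_p (EAX, EBX));   /* Prints: 1 0  */
      return 0;
    }
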
+
+/* Helper function for exact_store_load_dependency.
+   Return true if ADDR is found in INSN.  */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+  enum rtx_code code;
+  const char *format_ptr;
+  int i, j;
+
+  code = GET_CODE (insn);
+  switch (code)
+    {
+    case MEM:
+      if (rtx_equal_p (addr, insn))
+	return true;
+      break;
+    case REG:
+    CASE_CONST_ANY:
+    case SYMBOL_REF:
+    case CODE_LABEL:
+    case PC:
+    case CC0:
+    case EXPR_LIST:
+      return false;
+    default:
+      break;
+    }
+
+  format_ptr = GET_RTX_FORMAT (code);
+  for (i = 0; i < GET_RTX_LENGTH (code); i++)
+    {
+      switch (*format_ptr++)
+	{
+	case 'e':
+	  if (exact_dependency_1 (addr, XEXP (insn, i)))
+	    return true;
+	  break;
+	case 'E':
+	  for (j = 0; j < XVECLEN (insn, i); j++)
+	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+	      return true;
+	  break;
+	}
+    }
+  return false;
+}
+
+/* Return true if there exists an exact dependency between a store and a
+   load, i.e. the same memory address is used in both.  */
+static bool
+exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
+{
+  rtx set1, set2;
+
+  set1 = single_set (store);
+  if (!set1)
+    return false;
+  if (!MEM_P (SET_DEST (set1)))
+    return false;
+  set2 = single_set (load);
+  if (!set2)
+    return false;
+  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+    return true;
+  return false;
+}
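
exact_dependency_1 above is a plain recursive walk over every subexpression
of an RTL pattern.  The same shape on a toy expression tree (standalone
sketch, not GCC's rtx representation):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Toy expression node: ADDR is the leaf payload (0 marks a non-leaf
       operator node); OPS holds up to two subexpressions.  */
    struct expr
    {
      int addr;
      struct expr *ops[2];
    };

    /* Return true if some subexpression of E carries ADDR, mirroring
       the structure of exact_dependency_1.  */
    static bool
    mentions_addr_p (const struct expr *e, int addr)
    {
      if (e == NULL)
        return false;
      if (e->addr == addr)
        return true;
      return mentions_addr_p (e->ops[0], addr)
             || mentions_addr_p (e->ops[1], addr);
    }

    int
    main (void)
    {
      struct expr leaf = { 42, { NULL, NULL } };
      struct expr plus = { 0, { &leaf, NULL } };
      printf ("%d %d\n", mentions_addr_p (&plus, 42),
              mentions_addr_p (&plus, 7));   /* Prints: 1 0  */
      return 0;
    }
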
+
+/* This function corrects the value of COST (latency) based on the relationship
+   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
+   DW.  It should return the new value.
+
+   On x86 CPUs this is most commonly used to model the fact that values of
+   registers used to compute the address of a memory operand need to be ready
+   earlier than values of registers used in the actual operation.  */
+
+int
+ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
+		  unsigned int)
+{
+  enum attr_type insn_type, dep_insn_type;
+  enum attr_memory memory;
+  rtx set, set2;
+  int dep_insn_code_number;
+
+  /* Anti and output dependencies have zero cost on all CPUs.  */
+  if (dep_type != 0)
+    return 0;
+
+  dep_insn_code_number = recog_memoized (dep_insn);
+
+  /* If we can't recognize the insns, we can't really do anything.  */
+  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
+    return cost;
+
+  insn_type = get_attr_type (insn);
+  dep_insn_type = get_attr_type (dep_insn);
+
+  switch (ix86_tune)
+    {
+    case PROCESSOR_PENTIUM:
+    case PROCESSOR_LAKEMONT:
+      /* Address Generation Interlock adds a cycle of latency.  */
+      if (insn_type == TYPE_LEA)
+	{
+	  rtx addr = PATTERN (insn);
+
+	  if (GET_CODE (addr) == PARALLEL)
+	    addr = XVECEXP (addr, 0, 0);
+
+	  gcc_assert (GET_CODE (addr) == SET);
+
+	  addr = SET_SRC (addr);
+	  if (modified_in_p (addr, dep_insn))
+	    cost += 1;
+	}
+      else if (ix86_agi_dependent (dep_insn, insn))
+	cost += 1;
+
+      /* ??? Compares pair with jump/setcc.  */
+      if (ix86_flags_dependent (insn, dep_insn, insn_type))
+	cost = 0;
+
+      /* Floating point stores require the value to be ready one cycle
+	 earlier.  */
+      if (insn_type == TYPE_FMOV
+	  && get_attr_memory (insn) == MEMORY_STORE
+	  && !ix86_agi_dependent (dep_insn, insn))
+	cost += 1;
+      break;
+
+    case PROCESSOR_PENTIUMPRO:
+      /* INT->FP conversion is expensive.  */
+      if (get_attr_fp_int_src (dep_insn))
+	cost += 5;
+
+      /* There is one cycle extra latency between an FP op and a store.  */
+      if (insn_type == TYPE_FMOV
+	  && (set = single_set (dep_insn)) != NULL_RTX
+	  && (set2 = single_set (insn)) != NULL_RTX
+	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
+	  && MEM_P (SET_DEST (set2)))
+	cost += 1;
+
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with the previous instruction when the previous
+	 instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  /* Claim moves to take one cycle, as the core can issue one load
+	     at a time and the next load can start a cycle later.  */
+	  if (dep_insn_type == TYPE_IMOV
+	      || dep_insn_type == TYPE_FMOV)
+	    cost = 1;
+	  else if (cost > 1)
+	    cost--;
+	}
+      break;
+
+    case PROCESSOR_K6:
+      /* The esp dependency is resolved before
+	 the instruction is really finished.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+	return 1;
+
+      /* INT->FP conversion is expensive.  */
+      if (get_attr_fp_int_src (dep_insn))
+	cost += 5;
+
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with the previous instruction when the previous
+	 instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  /* Claim moves to take one cycle, as the core can issue one load
+	     at a time and the next load can start a cycle later.  */
+	  if (dep_insn_type == TYPE_IMOV
+	      || dep_insn_type == TYPE_FMOV)
+	    cost = 1;
+	  else if (cost > 2)
+	    cost -= 2;
+	  else
+	    cost = 1;
+	}
+      break;
+
+    case PROCESSOR_AMDFAM10:
+    case PROCESSOR_BDVER1:
+    case PROCESSOR_BDVER2:
+    case PROCESSOR_BDVER3:
+    case PROCESSOR_BDVER4:
+    case PROCESSOR_ZNVER1:
+    case PROCESSOR_BTVER1:
+    case PROCESSOR_BTVER2:
+    case PROCESSOR_GENERIC:
+      /* The stack engine allows push&pop instructions to execute
+	 in parallel.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+	return 0;
+      /* FALLTHRU */
+
+    case PROCESSOR_ATHLON:
+    case PROCESSOR_K8:
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with the previous instruction when the previous
+	 instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  enum attr_unit unit = get_attr_unit (insn);
+	  int loadcost = 3;
+
+	  /* Because of the difference between the length of integer and
+	     floating unit pipeline preparation stages, the memory operands
+	     for floating point are cheaper.
+
+	     ??? For Athlon the difference is most probably 2.  */
+	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
+	    loadcost = 3;
+	  else
+	    loadcost = TARGET_ATHLON ? 2 : 0;
+
+	  if (cost >= loadcost)
+	    cost -= loadcost;
+	  else
+	    cost = 0;
+	}
+      break;
+
+    case PROCESSOR_CORE2:
+    case PROCESSOR_NEHALEM:
+    case PROCESSOR_SANDYBRIDGE:
+    case PROCESSOR_HASWELL:
+      /* The stack engine allows push&pop instructions to execute
+	 in parallel.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+	return 0;
+
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with the previous instruction when the previous
+	 instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  if (cost >= 4)
+	    cost -= 4;
+	  else
+	    cost = 0;
+	}
+      break;
+
+    case PROCESSOR_SILVERMONT:
+    case PROCESSOR_KNL:
+    case PROCESSOR_KNM:
+    case PROCESSOR_INTEL:
+      if (!reload_completed)
+	return cost;
+
+      /* Increase cost of integer loads.  */
+      memory = get_attr_memory (dep_insn);
+      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	{
+	  enum attr_unit unit = get_attr_unit (dep_insn);
+	  if (unit == UNIT_INTEGER && cost == 1)
+	    {
+	      if (memory == MEMORY_LOAD)
+		cost = 3;
+	      else
+		{
+		  /* Increase cost of ld/st for short int types only
+		     because of the store forwarding issue.  */
+		  rtx set = single_set (dep_insn);
+		  if (set && (GET_MODE (SET_DEST (set)) == QImode
+			      || GET_MODE (SET_DEST (set)) == HImode))
+		    {
+		      /* Increase cost of store/load insn if exact
+			 dependence exists and it is a load insn.  */
+		      enum attr_memory insn_memory = get_attr_memory (insn);
+		      if (insn_memory == MEMORY_LOAD
+			  && exact_store_load_dependency (dep_insn, insn))
+			cost = 3;
+		    }
+		}
+	    }
+	}
+      break;
+
+    default:
+      break;
+    }
+
+  return cost;
+}
+
+/* How many alternative schedules to try.  This should be as wide as the
+   scheduling freedom in the DFA, but no wider.  Making this value too
+   large results in extra work for the scheduler.  */
+
+int
+ia32_multipass_dfa_lookahead (void)
+{
+  /* Generally, we want haifa-sched:max_issue() to look ahead as far
+     as many instructions can be executed on a cycle, i.e.,
+     issue_rate.  */
+  if (reload_completed)
+    return ix86_issue_rate ();
+  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
+  return 0;
+}
+
+/* Return true if the target platform supports macro-fusion.  */
+
+bool
+ix86_macro_fusion_p ()
+{
+  return TARGET_FUSE_CMP_AND_BRANCH;
+}
+
+/* Check whether the current microarchitecture supports macro fusion
+   for the insn pair "CONDGEN + CONDJMP".  Refer to
+   "Intel Architectures Optimization Reference Manual".  */
+
+bool
+ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
+{
+  rtx src, dest;
+  enum rtx_code ccode;
+  rtx compare_set = NULL_RTX, test_if, cond;
+  rtx alu_set = NULL_RTX, addr = NULL_RTX;
+
+  if (!any_condjump_p (condjmp))
+    return false;
+
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+      || !condgen
+      || !modified_in_p (cc_reg_1, condgen))
+    return false;
+
+  if (get_attr_type (condgen) != TYPE_TEST
+      && get_attr_type (condgen) != TYPE_ICMP
+      && get_attr_type (condgen) != TYPE_INCDEC
+      && get_attr_type (condgen) != TYPE_ALU)
+    return false;
+
+  compare_set = single_set (condgen);
+  if (compare_set == NULL_RTX
+      && !TARGET_FUSE_ALU_AND_BRANCH)
+    return false;
+
+  if (compare_set == NULL_RTX)
+    {
+      int i;
+      rtx pat = PATTERN (condgen);
+      for (i = 0; i < XVECLEN (pat, 0); i++)
+	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
+	  {
+	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
+	    if (GET_CODE (set_src) == COMPARE)
+	      compare_set = XVECEXP (pat, 0, i);
+	    else
+	      alu_set = XVECEXP (pat, 0, i);
+	  }
+    }
+  if (compare_set == NULL_RTX)
+    return false;
+  src = SET_SRC (compare_set);
+  if (GET_CODE (src) != COMPARE)
+    return false;
+
+  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+     supported.  */
+  if ((MEM_P (XEXP (src, 0))
+       && CONST_INT_P (XEXP (src, 1)))
+      || (MEM_P (XEXP (src, 1))
+	  && CONST_INT_P (XEXP (src, 0))))
+    return false;
+
+  /* No fusion for RIP-relative address.  */
+  if (MEM_P (XEXP (src, 0)))
+    addr = XEXP (XEXP (src, 0), 0);
+  else if (MEM_P (XEXP (src, 1)))
+    addr = XEXP (XEXP (src, 1), 0);
+
+  if (addr)
+    {
+      ix86_address parts;
+      int ok = ix86_decompose_address (addr, &parts);
+      gcc_assert (ok);
+
+      if (ix86_rip_relative_addr_p (&parts))
+	return false;
+    }
+
+  test_if = SET_SRC (pc_set (condjmp));
+  cond = XEXP (test_if, 0);
+  ccode = GET_CODE (cond);
+  /* Check whether the conditional jump uses Sign or Overflow Flags.  */
+  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
+      && (ccode == GE
+	  || ccode == GT
+	  || ccode == LE
+	  || ccode == LT))
+    return false;
+
+  /* Return true for TYPE_TEST and TYPE_ICMP.  */
+  if (get_attr_type (condgen) == TYPE_TEST
+      || get_attr_type (condgen) == TYPE_ICMP)
+    return true;
+
+  /* Handle the case of macro-fusion for alu + jmp.  */
+  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
+    return false;
+
+  /* No fusion for an alu op with a memory destination operand.  */
+  dest = SET_DEST (alu_set);
+  if (MEM_P (dest))
+    return false;
+
+  /* Macro-fusion for inc/dec + unsigned conditional jump is not
+     supported.  */
+  if (get_attr_type (condgen) == TYPE_INCDEC
+      && (ccode == GEU
+	  || ccode == GTU
+	  || ccode == LEU
+	  || ccode == LTU))
+    return false;
+
+  return true;
+}
+
-- 
2.30.2