* config.gcc (i386, x86_64): Add extra objects.
* i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
(ix86_min_insn_size): Declare.
(ix86_issue_rate): Declare.
(ix86_adjust_cost): Declare.
(ia32_multipass_dfa_lookahead): Declare.
(ix86_macro_fusion_p): Declare.
(ix86_macro_fusion_pair_p): Declare.
(ix86_bd_has_dispatch): Declare.
(ix86_bd_do_dispatch): Declare.
(ix86_core2i7_init_hooks): Declare.
(ix86_atom_sched_reorder): Declare.
* i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
(COSTS_N_BYTES): Move to x86-tune-costs.h.
(DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
(rip_relative_addr_p): Rename to ...
(ix86_rip_relative_addr_p): ... this one; export.
(memory_address_length): Update.
(ix86_issue_rate): Move to x86-tune-sched.c.
(ix86_flags_dependent): Move to x86-tune-sched.c.
(ix86_agi_dependent): Move to x86-tune-sched.c.
(exact_dependency_1): Move to x86-tune-sched.c.
(exact_store_load_dependency): Move to x86-tune-sched.c.
(ix86_adjust_cost): Move to x86-tune-sched.c.
(ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
(ix86_macro_fusion_p): Move to x86-tune-sched.c.
(ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
(do_reorder_for_imul): Move to x86-tune-sched-atom.c.
(swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
(ix86_sched_reorder): Move to x86-tune-sched-atom.c.
(core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
(core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
(min_insn_size): Rename to ...
(ix86_min_insn_size): ... this one; export.
(core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
(core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
(ix86_sched_init_global): Break out logic into ix86_core2i7_init_hooks.
(ix86_avoid_jump_mispredicts): Update.
(TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
(TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
(TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
(DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
(MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
(MAX_INSN): Move to x86-tune-sched-bd.c.
(MAX_IMM): Move to x86-tune-sched-bd.c.
(MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
(MAX_IMM_32): Move to x86-tune-sched-bd.c.
(MAX_IMM_64): Move to x86-tune-sched-bd.c.
(MAX_LOAD): Move to x86-tune-sched-bd.c.
(MAX_STORE): Move to x86-tune-sched-bd.c.
(BIG): Move to x86-tune-sched-bd.c.
(enum dispatch_group): Move to x86-tune-sched-bd.c.
(enum insn_path): Move to x86-tune-sched-bd.c.
(get_mem_group): Move to x86-tune-sched-bd.c.
(is_cmp): Move to x86-tune-sched-bd.c.
(dispatch_violation): Move to x86-tune-sched-bd.c.
(is_branch): Move to x86-tune-sched-bd.c.
(is_prefetch): Move to x86-tune-sched-bd.c.
(init_window): Move to x86-tune-sched-bd.c.
(allocate_window): Move to x86-tune-sched-bd.c.
(init_dispatch_sched): Move to x86-tune-sched-bd.c.
(is_end_basic_block): Move to x86-tune-sched-bd.c.
(process_end_window): Move to x86-tune-sched-bd.c.
(allocate_next_window): Move to x86-tune-sched-bd.c.
(find_constant): Move to x86-tune-sched-bd.c.
(get_num_immediates): Move to x86-tune-sched-bd.c.
(has_immediate): Move to x86-tune-sched-bd.c.
(get_insn_path): Move to x86-tune-sched-bd.c.
(get_insn_group): Move to x86-tune-sched-bd.c.
(count_num_restricted): Move to x86-tune-sched-bd.c.
(fits_dispatch_window): Move to x86-tune-sched-bd.c.
(add_insn_window): Move to x86-tune-sched-bd.c.
(add_to_dispatch_window): Move to x86-tune-sched-bd.c.
(debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
(debug_dispatch_window): Move to x86-tune-sched-bd.c.
(debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
(debug_ready_dispatch): Move to x86-tune-sched-bd.c.
(do_dispatch): Move to x86-tune-sched-bd.c.
(has_dispatch): Move to x86-tune-sched-bd.c.
* i386/t-i386: Add new object files.
* i386/x86-tune-costs.h: New file.
* i386/x86-tune-sched-atom.c: New file.
* i386/x86-tune-sched-bd.c: New file.
* i386/x86-tune-sched-core.c: New file.
* i386/x86-tune-sched.c: New file.
From-SVN: r253646
+2017-10-11  Jan Hubicka  <hubicka@ucw.cz>
+
+ * config.gcc (i386, x86_64): Add extra objects.
+ * i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
+ (ix86_min_insn_size): Declare.
+ (ix86_issue_rate): Declare.
+ (ix86_adjust_cost): Declare.
+ (ia32_multipass_dfa_lookahead): Declare.
+ (ix86_macro_fusion_p): Declare.
+ (ix86_macro_fusion_pair_p): Declare.
+ (ix86_bd_has_dispatch): Declare.
+ (ix86_bd_do_dispatch): Declare.
+ (ix86_core2i7_init_hooks): Declare.
+ (ix86_atom_sched_reorder): Declare.
+ * i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
+ (COSTS_N_BYTES): Move to x86-tune-costs.h.
+ (DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
+ (rip_relative_addr_p): Rename to ...
+ (ix86_rip_relative_addr_p): ... this one; export.
+ (memory_address_length): Update.
+ (ix86_issue_rate): Move to x86-tune-sched.c.
+ (ix86_flags_dependent): Move to x86-tune-sched.c.
+ (ix86_agi_dependent): Move to x86-tune-sched.c.
+ (exact_dependency_1): Move to x86-tune-sched.c.
+ (exact_store_load_dependency): Move to x86-tune-sched.c.
+ (ix86_adjust_cost): Move to x86-tune-sched.c.
+ (ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
+ (ix86_macro_fusion_p): Move to x86-tune-sched.c.
+ (ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
+ (do_reorder_for_imul): Move to x86-tune-sched-atom.c.
+ (swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
+ (ix86_sched_reorder): Move to x86-tune-sched-atom.c.
+ (core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
+ (core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
+ (min_insn_size): Rename to ...
+ (ix86_min_insn_size): ... this one; export.
+ (core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
+ (core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
+ (ix86_sched_init_global): Break out logic into ix86_core2i7_init_hooks.
+ (ix86_avoid_jump_mispredicts): Update.
+ (TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
+ (TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
+ (TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
+ (DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
+ (MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
+ (MAX_INSN): Move to x86-tune-sched-bd.c.
+ (MAX_IMM): Move to x86-tune-sched-bd.c.
+ (MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
+ (MAX_IMM_32): Move to x86-tune-sched-bd.c.
+ (MAX_IMM_64): Move to x86-tune-sched-bd.c.
+ (MAX_LOAD): Move to x86-tune-sched-bd.c.
+ (MAX_STORE): Move to x86-tune-sched-bd.c.
+ (BIG): Move to x86-tune-sched-bd.c.
+ (enum dispatch_group): Move to x86-tune-sched-bd.c.
+ (enum insn_path): Move to x86-tune-sched-bd.c.
+ (get_mem_group): Move to x86-tune-sched-bd.c.
+ (is_cmp): Move to x86-tune-sched-bd.c.
+ (dispatch_violation): Move to x86-tune-sched-bd.c.
+ (is_branch): Move to x86-tune-sched-bd.c.
+ (is_prefetch): Move to x86-tune-sched-bd.c.
+ (init_window): Move to x86-tune-sched-bd.c.
+ (allocate_window): Move to x86-tune-sched-bd.c.
+ (init_dispatch_sched): Move to x86-tune-sched-bd.c.
+ (is_end_basic_block): Move to x86-tune-sched-bd.c.
+ (process_end_window): Move to x86-tune-sched-bd.c.
+ (allocate_next_window): Move to x86-tune-sched-bd.c.
+ (find_constant): Move to x86-tune-sched-bd.c.
+ (get_num_immediates): Move to x86-tune-sched-bd.c.
+ (has_immediate): Move to x86-tune-sched-bd.c.
+ (get_insn_path): Move to x86-tune-sched-bd.c.
+ (get_insn_group): Move to x86-tune-sched-bd.c.
+ (count_num_restricted): Move to x86-tune-sched-bd.c.
+ (fits_dispatch_window): Move to x86-tune-sched-bd.c.
+ (add_insn_window): Move to x86-tune-sched-bd.c.
+ (add_to_dispatch_window): Move to x86-tune-sched-bd.c.
+ (debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
+ (debug_dispatch_window): Move to x86-tune-sched-bd.c.
+ (debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
+ (debug_ready_dispatch): Move to x86-tune-sched-bd.c.
+ (do_dispatch): Move to x86-tune-sched-bd.c.
+ (has_dispatch): Move to x86-tune-sched-bd.c.
+ * i386/t-i386: Add new object files.
+ * i386/x86-tune-costs.h: New file.
+ * i386/x86-tune-sched-atom.c: New file.
+ * i386/x86-tune-sched-bd.c: New file.
+ * i386/x86-tune-sched-core.c: New file.
+ * i386/x86-tune-sched.c: New file.
+
2017-10-11  Liu Hao  <lh_mouse@126.com>

* pretty-print.c [_WIN32] (colorize_init): Remove. Use
cpu_type=i386
c_target_objs="i386-c.o"
cxx_target_objs="i386-c.o"
+ extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
extra_options="${extra_options} fused-madd.opt"
extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
c_target_objs="i386-c.o"
cxx_target_objs="i386-c.o"
extra_options="${extra_options} fused-madd.opt"
+ extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
extern bool ix86_target_stack_probe (void);
extern bool ix86_can_use_return_insn_p (void);
extern void ix86_setup_frame_addresses (void);
+extern bool ix86_rip_relative_addr_p (struct ix86_address *parts);
extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int);
extern void ix86_expand_prologue (void);
extern const char * ix86_output_call_insn (rtx_insn *insn, rtx call_op);
extern bool ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
machine_mode mode);
+extern int ix86_min_insn_size (rtx_insn *);
+
+extern int ix86_issue_rate (void);
+extern int ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn,
+ int cost, unsigned int);
+extern int ia32_multipass_dfa_lookahead (void);
+extern bool ix86_macro_fusion_p (void);
+extern bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp);
+
+extern bool ix86_bd_has_dispatch (rtx_insn *insn, int action);
+extern void ix86_bd_do_dispatch (rtx_insn *insn, int mode);
+
+extern void ix86_core2i7_init_hooks (void);
+
+extern int ix86_atom_sched_reorder (FILE *, int, rtx_insn **, int *, int);
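The block of declarations above is the new cross-file interface: i386.c continues to reference these functions (for example from its target-hook definitions), while the implementations move into the x86-tune-sched*.c files. As a rough, self-contained sketch of the pattern those files follow, a switch over the active tuning target; the enum and the return values below are stand-ins for illustration, not GCC's processor_type or the committed numbers:

    #include <stdio.h>

    /* Hypothetical stand-in for GCC's processor_type enumeration.  */
    enum tune_target { TUNE_GENERIC, TUNE_CORE, TUNE_ATOM };
    static enum tune_target tune_model = TUNE_CORE;

    /* Shape of ix86_issue_rate after the split: one switch in
       x86-tune-sched.c returning the issue width of the active CPU.
       The values are illustrative placeholders.  */
    static int issue_rate (void)
    {
      switch (tune_model)
        {
        case TUNE_CORE:
          return 4;
        case TUNE_ATOM:
          return 2;
        default:
          return 3;
        }
    }

    int main (void)
    {
      printf ("issue rate: %d\n", issue_rate ());
      return 0;
    }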
#ifdef RTX_CODE
/* Target data for multipass lookahead scheduling.
/* This file should be included last. */
#include "target-def.h"
+#include "x86-tune-costs.h"
+
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
static rtx legitimize_pe_coff_symbol (rtx, bool);
: (mode) == DImode ? 3 \
: 4)
-/* Processor costs (relative to an add) */
-/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
-#define COSTS_N_BYTES(N) ((N) * 2)
-
-#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
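The two macros above fix the unit conversion for the size-tuning table that follows: the comment states that COSTS_N_INSNS (N) expands to (N) * 4 and that an addition is 2 bytes, so COSTS_N_BYTES scales by 2 and a 2-byte add scores exactly COSTS_N_INSNS (1). A minimal standalone check of that arithmetic, using the same definitions:

    #include <assert.h>

    #define COSTS_N_INSNS(N) ((N) * 4)  /* GCC's speed-cost unit.  */
    #define COSTS_N_BYTES(N) ((N) * 2)  /* Size-cost unit from the hunk above.  */

    int main (void)
    {
      /* A 2-byte add costs COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
         so the size table and the speed tables share one scale.  */
      assert (COSTS_N_BYTES (2) == COSTS_N_INSNS (1));
      return 0;
    }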
-
-static stringop_algs ix86_size_memcpy[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
-static stringop_algs ix86_size_memset[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
-
-const
-struct processor_costs ix86_size_cost = {/* costs for tuning for size */
- COSTS_N_BYTES (2), /* cost of an add instruction */
- COSTS_N_BYTES (3), /* cost of a lea instruction */
- COSTS_N_BYTES (2), /* variable shift costs */
- COSTS_N_BYTES (3), /* constant shift costs */
- {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
- COSTS_N_BYTES (3), /* HI */
- COSTS_N_BYTES (3), /* SI */
- COSTS_N_BYTES (3), /* DI */
- COSTS_N_BYTES (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
- COSTS_N_BYTES (3), /* HI */
- COSTS_N_BYTES (3), /* SI */
- COSTS_N_BYTES (3), /* DI */
- COSTS_N_BYTES (5)}, /* other */
- COSTS_N_BYTES (3), /* cost of movsx */
- COSTS_N_BYTES (3), /* cost of movzx */
- 0, /* "large" insn */
- 2, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
- {2, 2, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 2, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 2}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {2, 2, 2}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 3, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {3, 3}, /* cost of storing MMX registers
- in SImode and DImode */
- 3, /* cost of moving SSE register */
- {3, 3, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {3, 3, 3}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 0, /* size of l1 cache */
- 0, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
- COSTS_N_BYTES (2), /* cost of FMUL instruction. */
- COSTS_N_BYTES (2), /* cost of FDIV instruction. */
- COSTS_N_BYTES (2), /* cost of FABS instruction. */
- COSTS_N_BYTES (2), /* cost of FCHS instruction. */
- COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- ix86_size_memcpy,
- ix86_size_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 1, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 1, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* Processor costs (relative to an add) */
-static stringop_algs i386_memcpy[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs i386_memset[2] = {
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs i386_cost = { /* 386 specific costs */
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (3), /* variable shift costs */
- COSTS_N_INSNS (2), /* constant shift costs */
- {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
- COSTS_N_INSNS (6), /* HI */
- COSTS_N_INSNS (6), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- COSTS_N_INSNS (1), /* cost of multiply per each bit set */
- {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (23), /* HI */
- COSTS_N_INSNS (23), /* SI */
- COSTS_N_INSNS (23), /* DI */
- COSTS_N_INSNS (23)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 15, /* "large" insn */
- 3, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {8, 8, 8}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {8, 8, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 0, /* size of l1 cache */
- 0, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (27), /* cost of FMUL instruction. */
- COSTS_N_INSNS (88), /* cost of FDIV instruction. */
- COSTS_N_INSNS (22), /* cost of FABS instruction. */
- COSTS_N_INSNS (24), /* cost of FCHS instruction. */
- COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- i386_memcpy,
- i386_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs i486_memcpy[2] = {
- {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs i486_memset[2] = {
- {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs i486_cost = { /* 486 specific costs */
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (3), /* variable shift costs */
- COSTS_N_INSNS (2), /* constant shift costs */
- {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
- COSTS_N_INSNS (12), /* HI */
- COSTS_N_INSNS (12), /* SI */
- COSTS_N_INSNS (12), /* DI */
- COSTS_N_INSNS (12)}, /* other */
- 1, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (40), /* HI */
- COSTS_N_INSNS (40), /* SI */
- COSTS_N_INSNS (40), /* DI */
- COSTS_N_INSNS (40)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 15, /* "large" insn */
- 3, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {8, 8, 8}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {8, 8, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 4, /* size of l1 cache. 486 has 8kB cache
- shared for code and data, so 4kB is
- not really precise. */
- 4, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (16), /* cost of FMUL instruction. */
- COSTS_N_INSNS (73), /* cost of FDIV instruction. */
- COSTS_N_INSNS (3), /* cost of FABS instruction. */
- COSTS_N_INSNS (3), /* cost of FCHS instruction. */
- COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- i486_memcpy,
- i486_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs pentium_memcpy[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs pentium_memset[2] = {
- {libcall, {{-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-
-static const
-struct processor_costs pentium_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (4), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
- COSTS_N_INSNS (11), /* HI */
- COSTS_N_INSNS (11), /* SI */
- COSTS_N_INSNS (11), /* DI */
- COSTS_N_INSNS (11)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (25), /* HI */
- COSTS_N_INSNS (25), /* SI */
- COSTS_N_INSNS (25), /* DI */
- COSTS_N_INSNS (25)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 8, /* "large" insn */
- 6, /* MOVE_RATIO */
- 6, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 8, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 8, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (3), /* cost of FMUL instruction. */
- COSTS_N_INSNS (39), /* cost of FDIV instruction. */
- COSTS_N_INSNS (1), /* cost of FABS instruction. */
- COSTS_N_INSNS (1), /* cost of FCHS instruction. */
- COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentium_memcpy,
- pentium_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static const
-struct processor_costs lakemont_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
- COSTS_N_INSNS (11), /* HI */
- COSTS_N_INSNS (11), /* SI */
- COSTS_N_INSNS (11), /* DI */
- COSTS_N_INSNS (11)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (25), /* HI */
- COSTS_N_INSNS (25), /* SI */
- COSTS_N_INSNS (25), /* DI */
- COSTS_N_INSNS (25)}, /* other */
- COSTS_N_INSNS (3), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 6, /* cost for loading QImode using movzbl */
- {2, 4, 2}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 4, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 8, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 8, 16}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 8, 16}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 8, /* size of l2 cache */
- 0, /* size of prefetch block */
- 0, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (3), /* cost of FMUL instruction. */
- COSTS_N_INSNS (39), /* cost of FDIV instruction. */
- COSTS_N_INSNS (1), /* cost of FABS instruction. */
- COSTS_N_INSNS (1), /* cost of FCHS instruction. */
- COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentium_memcpy,
- pentium_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
- (we ensure the alignment). For small blocks inline loop is still a
- noticeable win, for bigger blocks either rep movsl or rep movsb is
- way to go. Rep movsb has apparently more expensive startup time in CPU,
- but after 4K the difference is down in the noise. */
-static stringop_algs pentiumpro_memcpy[2] = {
- {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
- {8192, rep_prefix_4_byte, false},
- {-1, rep_prefix_1_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs pentiumpro_memset[2] = {
- {rep_prefix_4_byte, {{1024, unrolled_loop, false},
- {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
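The pentiumpro tables just above read as ordered (max_size, algorithm) pairs: the string-operation expander walks the list and takes the first algorithm whose max covers the requested block size, with -1 acting as the catch-all for anything larger. A self-contained sketch of that selection (illustrative names and a simplified entry layout, not GCC's actual decide_alg):

    #include <stdio.h>

    enum alg { LOOP, UNROLLED_LOOP, REP_PREFIX_4_BYTE, REP_PREFIX_1_BYTE };
    struct entry { long max; enum alg alg; };  /* max == -1: no upper bound.  */

    /* pentiumpro_memcpy's 32-bit entries, transcribed from the hunk above.  */
    static const struct entry table[] = {
      { 128,  LOOP },
      { 1024, UNROLLED_LOOP },
      { 8192, REP_PREFIX_4_BYTE },
      { -1,   REP_PREFIX_1_BYTE },
    };

    static enum alg pick (long size)
    {
      for (unsigned i = 0; i < sizeof table / sizeof *table; i++)
        if (table[i].max == -1 || size <= table[i].max)
          return table[i].alg;
      return REP_PREFIX_1_BYTE;
    }

    int main (void)
    {
      /* 64 bytes -> LOOP (0); 4096 bytes -> REP_PREFIX_4_BYTE (2).  */
      printf ("%d %d\n", pick (64), pick (4096));
      return 0;
    }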
-static const
-struct processor_costs pentiumpro_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (4)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (17), /* HI */
- COSTS_N_INSNS (17), /* SI */
- COSTS_N_INSNS (17), /* DI */
- COSTS_N_INSNS (17)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 6, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 2, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {2, 2, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 256, /* size of l2 cache */
- 32, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (5), /* cost of FMUL instruction. */
- COSTS_N_INSNS (56), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentiumpro_memcpy,
- pentiumpro_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs geode_memcpy[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs geode_memset[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs geode_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (2), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (7), /* SI */
- COSTS_N_INSNS (7), /* DI */
- COSTS_N_INSNS (7)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (23), /* HI */
- COSTS_N_INSNS (39), /* SI */
- COSTS_N_INSNS (39), /* DI */
- COSTS_N_INSNS (39)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 4, /* MOVE_RATIO */
- 1, /* cost for loading QImode using movzbl */
- {1, 1, 1}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {1, 1, 1}, /* cost of storing integer registers */
- 1, /* cost of reg,reg fld/fst */
- {1, 1, 1}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 6, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
-
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {2, 2, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 128, /* size of l2 cache. */
- 32, /* size of prefetch block */
- 1, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (11), /* cost of FMUL instruction. */
- COSTS_N_INSNS (47), /* cost of FDIV instruction. */
- COSTS_N_INSNS (1), /* cost of FABS instruction. */
- COSTS_N_INSNS (1), /* cost of FCHS instruction. */
- COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- geode_memcpy,
- geode_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs k6_memcpy[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs k6_memset[2] = {
- {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs k6_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (3), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (3), /* DI */
- COSTS_N_INSNS (3)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (18), /* HI */
- COSTS_N_INSNS (18), /* SI */
- COSTS_N_INSNS (18), /* DI */
- COSTS_N_INSNS (18)}, /* other */
- COSTS_N_INSNS (2), /* cost of movsx */
- COSTS_N_INSNS (2), /* cost of movzx */
- 8, /* "large" insn */
- 4, /* MOVE_RATIO */
- 3, /* cost for loading QImode using movzbl */
- {4, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 3, 2}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {6, 6, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 4}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {2, 2, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 6, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 32, /* size of l2 cache. Some models
- have integrated l2 cache, but
- optimizing for k6 is not important
- enough to worry about that. */
- 32, /* size of prefetch block */
- 1, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (2), /* cost of FMUL instruction. */
- COSTS_N_INSNS (56), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- k6_memcpy,
- k6_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* For some reason, Athlon deals better with REP prefix (relative to loops)
- compared to K8. Alignment becomes important after 8 bytes for memcpy and
- 128 bytes for memset. */
-static stringop_algs athlon_memcpy[2] = {
- {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs athlon_memset[2] = {
- {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-static const
-struct processor_costs athlon_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
- COSTS_N_INSNS (5), /* HI */
- COSTS_N_INSNS (5), /* SI */
- COSTS_N_INSNS (5), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 6}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 5, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (24), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- athlon_memcpy,
- athlon_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* K8 has optimized REP instruction for medium sized blocks, but for very
- small blocks it is better to use loop. For large blocks, libcall can
- do nontemporary accesses and beat inline considerably. */
-static stringop_algs k8_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs k8_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs k8_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 3, 6}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- k8_memcpy,
- k8_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 5, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 3, /* vec_unalign_load_cost. */
- 3, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall can
- do nontemporary accesses and beat inline considerably. */
-static stringop_algs amdfam10_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs amdfam10_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs amdfam10_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 64, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- amdfam10_memcpy,
- amdfam10_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 2, /* vec_store_cost. */
- 2, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER1 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver1_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver1_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-
-const struct processor_costs bdver1_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver1_memcpy,
- bdver1_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER2 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-
-static stringop_algs bdver2_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver2_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-
-const struct processor_costs bdver2_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver2_memcpy,
- bdver2_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-
- /* BDVER3 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver3_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver3_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs bdver3_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set number of simultaneous prefetches
- to a large constant to reflect this (it probably is not a good idea not
- to limit number of prefetches at all, as their execution also takes some
- time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver3_memcpy,
- bdver3_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
-/* BDVER4 has optimized REP instruction for medium sized blocks, but for
- very small blocks it is better to use loop. For large blocks, libcall
- can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver4_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs bdver4_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs bdver4_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (4), /* SI */
- COSTS_N_INSNS (6), /* DI */
- COSTS_N_INSNS (6)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 2, /* MMX or SSE register to integer */
- 16, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set the number of simultaneous prefetches
- to a large constant to reflect this (it is probably not a good idea to
- leave the number of prefetches unlimited either, as their execution also
- takes some time). */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- bdver4_memcpy,
- bdver4_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
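As background for the stringop tables moved throughout this file: each stringop_algs initializer pairs a size threshold with the algorithm to use up to that size, -1 marks the catch-all final entry, and the two array elements cover 32-bit and 64-bit code respectively. A minimal sketch of how such a table is consulted, assuming the stringop_strategy layout from i386.h (the authoritative lookup is decide_alg in i386.c, which also consults the unknown_size field omitted here):

#include <stdio.h>

/* Sketch of the stringop strategy lookup; layout mirrors i386.h.  */
enum stringop_alg { no_stringop, libcall, rep_prefix_1_byte, rep_prefix_4_byte,
                    rep_prefix_8_byte, loop_1_byte, loop, unrolled_loop };

struct stringop_strategy { int max; enum stringop_alg alg; int noalign; };

/* Return the algorithm for a block of SIZE bytes: the first entry whose
   MAX covers SIZE wins; MAX == -1 is the catch-all final entry.  */
static enum stringop_alg
pick_alg (const struct stringop_strategy *s, long size)
{
  for (;; s++)
    if (s->max == -1 || size <= s->max)
      return s->alg;
}

int
main (void)
{
  /* The 64-bit row of bdver4_memcpy below.  */
  struct stringop_strategy memcpy64[] =
    { {16, loop, 0}, {8192, rep_prefix_8_byte, 0}, {-1, libcall, 0} };
  printf ("%d\n", (int) pick_alg (memcpy64, 4096)); /* rep_prefix_8_byte */
  return 0;
}

With that row, pick_alg returns loop for sizes up to 16, rep_prefix_8_byte up to 8192, and libcall beyond.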
-
-/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
- very small blocks it is better to use a loop. For large blocks, a libcall
- can do nontemporal accesses and beat inline code considerably. */
-static stringop_algs znver1_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs znver1_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-struct processor_costs znver1_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction. */
- COSTS_N_INSNS (1), /* cost of a lea instruction. */
- COSTS_N_INSNS (1), /* variable shift costs. */
- COSTS_N_INSNS (1), /* constant shift costs. */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
- COSTS_N_INSNS (3), /* HI. */
- COSTS_N_INSNS (3), /* SI. */
- COSTS_N_INSNS (4), /* DI. */
- COSTS_N_INSNS (4)}, /* other. */
- 0, /* cost of multiply per each bit
- set. */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
- COSTS_N_INSNS (35), /* HI. */
- COSTS_N_INSNS (51), /* SI. */
- COSTS_N_INSNS (83), /* DI. */
- COSTS_N_INSNS (83)}, /* other. */
- COSTS_N_INSNS (1), /* cost of movsx. */
- COSTS_N_INSNS (1), /* cost of movzx. */
- 8, /* "large" insn. */
- 9, /* MOVE_RATIO. */
- 4, /* cost for loading QImode using
- movzbl. */
- {5, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer
- registers. */
- 2, /* cost of reg,reg fld/fst. */
- {5, 5, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode. */
- {4, 4, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode. */
- 2, /* cost of moving MMX register. */
- {4, 4}, /* cost of loading MMX registers
- in SImode and DImode. */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode. */
- 2, /* cost of moving SSE register. */
- {4, 4, 4}, /* cost of loading SSE registers
- in SImode, DImode and TImode. */
- {4, 4, 4}, /* cost of storing SSE registers
- in SImode, DImode and TImode. */
- 2, /* MMX or SSE register to integer. */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block. */
- /* New AMD processors never drop prefetches; if they cannot be performed
- immediately, they are queued. We set the number of simultaneous prefetches
- to a large constant to reflect this (it is probably not a good idea to
- leave the number of prefetches unlimited either, as their execution also
- takes some time). */
- 100, /* number of parallel prefetches. */
- 3, /* Branch cost. */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6), /* cost of FMUL instruction. */
- COSTS_N_INSNS (42), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
- /* Zen can execute 4 integer operations per cycle. FP operations take 3
- cycles and it can execute 2 integer additions and 2 multiplications, so
- reassociation may make sense up to a width of 6. SPEC2k6 benchmarks
- suggest that 4 works better than 6, probably due to register pressure.
-
- Integer vector operations are handled by the FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply. This is adjusted
- in ix86_reassociation_width. */
- 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
- znver1_memcpy,
- znver1_memset,
- 6, /* scalar_stmt_cost. */
- 4, /* scalar load_cost. */
- 4, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 4, /* vec_align_load_cost. */
- 4, /* vec_unalign_load_cost. */
- 4, /* vec_store_cost. */
- 4, /* cond_taken_branch_cost. */
- 2, /* cond_not_taken_branch_cost. */
-};
-
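A note on the units in these tables: rtl.h defines COSTS_N_INSNS (N) as (N) * 4, so costs are expressed in quarter-insn units, while the plain integers (register move costs) are on a separate scale relative to a reg-reg move of 2. A two-line check:

#include <stdio.h>

#define COSTS_N_INSNS(N) ((N) * 4)   /* As defined in rtl.h.  */

int
main (void)
{
  /* znver1_cost above: an add is 1 insn, a DImode divide 83 insns.  */
  printf ("add = %d, DI divide = %d\n",
          COSTS_N_INSNS (1), COSTS_N_INSNS (83));
  return 0;
}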
-/* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
- very small blocks it is better to use a loop. For large blocks, a libcall
- can do nontemporal accesses and beat inline code considerably. */
-static stringop_algs btver1_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs btver1_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-const struct processor_costs btver1_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- btver1_memcpy,
- btver1_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 2, /* vec_store_cost. */
- 2, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs btver2_memcpy[2] = {
- {libcall, {{6, loop, false}, {14, unrolled_loop, false},
- {-1, rep_prefix_4_byte, false}}},
- {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs btver2_memset[2] = {
- {libcall, {{8, loop, false}, {24, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-const struct processor_costs btver2_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (2), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (5)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (35), /* HI */
- COSTS_N_INSNS (51), /* SI */
- COSTS_N_INSNS (83), /* DI */
- COSTS_N_INSNS (83)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 9, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {3, 4, 3}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {3, 4, 3}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {4, 4, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {3, 3}, /* cost of loading MMX registers
- in SImode and DImode */
- {4, 4}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {4, 4, 3}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {4, 4, 5}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 3, /* MMX or SSE register to integer */
- /* On K8:
- MOVD reg64, xmmreg Double FSTORE 4
- MOVD reg32, xmmreg Double FSTORE 4
- On AMDFAM10:
- MOVD reg64, xmmreg Double FADD 3
- 1/1 1/1
- MOVD reg32, xmmreg Double FADD 3
- 1/1 1/1 */
- 32, /* size of l1 cache. */
- 2048, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 100, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (4), /* cost of FMUL instruction. */
- COSTS_N_INSNS (19), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- btver2_memcpy,
- btver2_memset,
- 4, /* scalar_stmt_cost. */
- 2, /* scalar load_cost. */
- 2, /* scalar_store_cost. */
- 6, /* vec_stmt_cost. */
- 0, /* vec_to_scalar_cost. */
- 2, /* scalar_to_vec_cost. */
- 2, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 2, /* vec_store_cost. */
- 2, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs pentium4_memcpy[2] = {
- {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
- DUMMY_STRINGOP_ALGS};
-static stringop_algs pentium4_memset[2] = {
- {libcall, {{6, loop_1_byte, false}, {48, loop, false},
- {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- DUMMY_STRINGOP_ALGS};
-
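Pentium 4 is a 32-bit-only tuning, so the 64-bit slot of each pair above is DUMMY_STRINGOP_ALGS rather than a real strategy. Its presumed definition (per the ChangeLog it moves to x86-tune-costs.h) is a single catch-all entry that always falls back to a library call:

/* Presumed placeholder for tunings that never emit 64-bit string ops:
   one catch-all entry selecting a libcall for any size.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}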
-static const
-struct processor_costs pentium4_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (3), /* cost of a lea instruction */
- COSTS_N_INSNS (4), /* variable shift costs */
- COSTS_N_INSNS (4), /* constant shift costs */
- {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
- COSTS_N_INSNS (15), /* HI */
- COSTS_N_INSNS (15), /* SI */
- COSTS_N_INSNS (15), /* DI */
- COSTS_N_INSNS (15)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (56), /* HI */
- COSTS_N_INSNS (56), /* SI */
- COSTS_N_INSNS (56), /* DI */
- COSTS_N_INSNS (56)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 16, /* "large" insn */
- 6, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
- {4, 5, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {2, 3, 2}, /* cost of storing integer registers */
- 2, /* cost of reg,reg fld/fst */
- {2, 2, 6}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 6}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {2, 2}, /* cost of loading MMX registers
- in SImode and DImode */
- {2, 2}, /* cost of storing MMX registers
- in SImode and DImode */
- 12, /* cost of moving SSE register */
- {12, 12, 12}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {2, 2, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 10, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 2, /* Branch cost */
- COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (7), /* cost of FMUL instruction. */
- COSTS_N_INSNS (43), /* cost of FDIV instruction. */
- COSTS_N_INSNS (2), /* cost of FABS instruction. */
- COSTS_N_INSNS (2), /* cost of FCHS instruction. */
- COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- pentium4_memcpy,
- pentium4_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs nocona_memcpy[2] = {
- {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
- {100000, unrolled_loop, false}, {-1, libcall, false}}}};
-
-static stringop_algs nocona_memset[2] = {
- {libcall, {{6, loop_1_byte, false}, {48, loop, false},
- {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {64, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-
-static const
-struct processor_costs nocona_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1), /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
- COSTS_N_INSNS (10), /* HI */
- COSTS_N_INSNS (10), /* SI */
- COSTS_N_INSNS (10), /* DI */
- COSTS_N_INSNS (10)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (66), /* HI */
- COSTS_N_INSNS (66), /* SI */
- COSTS_N_INSNS (66), /* DI */
- COSTS_N_INSNS (66)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 16, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 3, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {4, 4, 4}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 6, /* cost of moving MMX register */
- {12, 12}, /* cost of loading MMX registers
- in SImode and DImode */
- {12, 12}, /* cost of storing MMX registers
- in SImode and DImode */
- 6, /* cost of moving SSE register */
- {12, 12, 12}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {12, 12, 12}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 8, /* MMX or SSE register to integer */
- 8, /* size of l1 cache. */
- 1024, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 8, /* number of parallel prefetches */
- 1, /* Branch cost */
- COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (40), /* cost of FDIV instruction. */
- COSTS_N_INSNS (3), /* cost of FABS instruction. */
- COSTS_N_INSNS (3), /* cost of FCHS instruction. */
- COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
- 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- nocona_memcpy,
- nocona_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs atom_memcpy[2] = {
- {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs atom_memset[2] = {
- {libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs atom_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
- atom_memcpy,
- atom_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs slm_memcpy[2] = {
- {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs slm_memset[2] = {
- {libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs slm_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (3), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- slm_memcpy,
- slm_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 4, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-static stringop_algs intel_memcpy[2] = {
- {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs intel_memset[2] = {
- {libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs intel_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (3), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- intel_memcpy,
- intel_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 4, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* Generic should produce code tuned for Core-i7 (and newer chips)
- and btver1 (and newer chips). */
-
-static stringop_algs generic_memcpy[2] = {
- {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static stringop_algs generic_memset[2] = {
- {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
-static const
-struct processor_costs generic_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- /* On all chips taken into consideration lea takes 2 cycles or more. With
- this cost, however, our current implementation of synth_mult uses
- unnecessary temporary registers, causing regressions on several SPECfp
- benchmarks. */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
- value is increased to the perhaps more appropriate value of 5. */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
- generic_memcpy,
- generic_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* core_cost should produce code tuned for the Core family of CPUs. */
-static stringop_algs core_memcpy[2] = {
- {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
- {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
- {-1, libcall, false}}}};
-static stringop_algs core_memset[2] = {
- {libcall, {{6, loop_1_byte, true},
- {24, loop, true},
- {8192, rep_prefix_4_byte, true},
- {-1, libcall, false}}},
- {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
- {-1, libcall, false}}}};
-
-static const
-struct processor_costs core_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- /* On all chips taken into consideration lea takes 2 cycles or more. With
- this cost, however, our current implementation of synth_mult uses
- unnecessary temporary registers, causing regressions on several SPECfp
- benchmarks. */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 64, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- /* FIXME: perhaps a more appropriate value is 5. */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
- core_memcpy,
- core_memset,
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
/* Set by -mtune. */
-const struct processor_costs *ix86_tune_cost = &pentium_cost;
+const struct processor_costs *ix86_tune_cost = NULL;
/* Set by -mtune or -Os. */
-const struct processor_costs *ix86_cost = &pentium_cost;
+const struct processor_costs *ix86_cost = NULL;
/* Processor feature/optimization bitmasks. */
#define m_386 (1U<<PROCESSOR_I386)
/* Check whether x86 address PARTS is a pc-relative address. */
-static bool
-rip_relative_addr_p (struct ix86_address *parts)
+bool
+ix86_rip_relative_addr_p (struct ix86_address *parts)
{
rtx base, index, disp;
else if (disp && !base && !index)
{
len += 4;
- if (!rip_relative_addr_p (&parts))
+ if (!ix86_rip_relative_addr_p (&parts))
len++;
}
else
return 2 + 1;
}
\f
-/* Return the maximum number of instructions a cpu can issue. */
-
-static int
-ix86_issue_rate (void)
-{
- switch (ix86_tune)
- {
- case PROCESSOR_PENTIUM:
- case PROCESSOR_LAKEMONT:
- case PROCESSOR_BONNELL:
- case PROCESSOR_SILVERMONT:
- case PROCESSOR_KNL:
- case PROCESSOR_KNM:
- case PROCESSOR_INTEL:
- case PROCESSOR_K6:
- case PROCESSOR_BTVER2:
- case PROCESSOR_PENTIUM4:
- case PROCESSOR_NOCONA:
- return 2;
-
- case PROCESSOR_PENTIUMPRO:
- case PROCESSOR_ATHLON:
- case PROCESSOR_K8:
- case PROCESSOR_AMDFAM10:
- case PROCESSOR_GENERIC:
- case PROCESSOR_BTVER1:
- return 3;
-
- case PROCESSOR_BDVER1:
- case PROCESSOR_BDVER2:
- case PROCESSOR_BDVER3:
- case PROCESSOR_BDVER4:
- case PROCESSOR_ZNVER1:
- case PROCESSOR_CORE2:
- case PROCESSOR_NEHALEM:
- case PROCESSOR_SANDYBRIDGE:
- case PROCESSOR_HASWELL:
- return 4;
-
- default:
- return 1;
- }
-}
-
-/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
- by DEP_INSN and nothing else set by DEP_INSN. */
-
-static bool
-ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
-{
- rtx set, set2;
-
- /* Simplify the test for uninteresting insns. */
- if (insn_type != TYPE_SETCC
- && insn_type != TYPE_ICMOV
- && insn_type != TYPE_FCMOV
- && insn_type != TYPE_IBR)
- return false;
-
- if ((set = single_set (dep_insn)) != 0)
- {
- set = SET_DEST (set);
- set2 = NULL_RTX;
- }
- else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
- && XVECLEN (PATTERN (dep_insn), 0) == 2
- && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
- && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
- {
- set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
- set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
- }
- else
- return false;
-
- if (!REG_P (set) || REGNO (set) != FLAGS_REG)
- return false;
-
- /* This test is true if the dependent insn reads the flags but
- not any other potentially set register. */
- if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
- return false;
-
- if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
- return false;
-
- return true;
-}
-
-/* Return true iff USE_INSN has a memory address with operands set by
- SET_INSN. */
-
-bool
-ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
-{
- int i;
- extract_insn_cached (use_insn);
- for (i = recog_data.n_operands - 1; i >= 0; --i)
- if (MEM_P (recog_data.operand[i]))
- {
- rtx addr = XEXP (recog_data.operand[i], 0);
- if (modified_in_p (addr, set_insn) != 0)
- {
- /* No AGI stall if SET_INSN is a push or pop and USE_INSN
- has SP based memory (unless index reg is modified in a pop). */
- rtx set = single_set (set_insn);
- if (set
- && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
- || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
- {
- struct ix86_address parts;
- if (ix86_decompose_address (addr, &parts)
- && parts.base == stack_pointer_rtx
- && (parts.index == NULL_RTX
- || MEM_P (SET_DEST (set))
- || !modified_in_p (parts.index, set_insn)))
- return false;
- }
- return true;
- }
- return false;
- }
- return false;
-}
-
-/* Helper function for exact_store_load_dependency.
- Return true if ADDR is found in INSN. */
-static bool
-exact_dependency_1 (rtx addr, rtx insn)
-{
- enum rtx_code code;
- const char *format_ptr;
- int i, j;
-
- code = GET_CODE (insn);
- switch (code)
- {
- case MEM:
- if (rtx_equal_p (addr, insn))
- return true;
- break;
- case REG:
- CASE_CONST_ANY:
- case SYMBOL_REF:
- case CODE_LABEL:
- case PC:
- case CC0:
- case EXPR_LIST:
- return false;
- default:
- break;
- }
-
- format_ptr = GET_RTX_FORMAT (code);
- for (i = 0; i < GET_RTX_LENGTH (code); i++)
- {
- switch (*format_ptr++)
- {
- case 'e':
- if (exact_dependency_1 (addr, XEXP (insn, i)))
- return true;
- break;
- case 'E':
- for (j = 0; j < XVECLEN (insn, i); j++)
- if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
- return true;
- break;
- }
- }
- return false;
-}
-
-/* Return true if there is an exact dependency between a store and a load,
- i.e. the same memory address is used in both. */
-static bool
-exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
-{
- rtx set1, set2;
-
- set1 = single_set (store);
- if (!set1)
- return false;
- if (!MEM_P (SET_DEST (set1)))
- return false;
- set2 = single_set (load);
- if (!set2)
- return false;
- if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
- return true;
- return false;
-}
-
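exact_dependency_1 above is a textbook recursive RTX walk: stop at leaves that cannot contain a MEM, match MEM leaves against the store address, and recurse through 'e' and 'E' operands. A toy version over an invented tree type, just to show the shape of the recursion:

#include <stdbool.h>
#include <string.h>

/* Toy expression node standing in for an RTX: a MEM leaf carries an
   address string, inner nodes carry up to two children.  Invented for
   illustration only.  */
struct node
{
  const char *mem_addr;        /* non-NULL for a MEM leaf */
  const struct node *kid[2];
};

/* Mirror of exact_dependency_1: recursively search for a MEM whose
   address equals ADDR anywhere under N.  */
static bool
addr_found_in (const char *addr, const struct node *n)
{
  if (!n)
    return false;
  if (n->mem_addr && strcmp (n->mem_addr, addr) == 0)
    return true;
  return addr_found_in (addr, n->kid[0]) || addr_found_in (addr, n->kid[1]);
}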
-static int
-ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
- unsigned int)
-{
- enum attr_type insn_type, dep_insn_type;
- enum attr_memory memory;
- rtx set, set2;
- int dep_insn_code_number;
-
- /* Anti and output dependencies have zero cost on all CPUs. */
- if (dep_type != 0)
- return 0;
-
- dep_insn_code_number = recog_memoized (dep_insn);
-
- /* If we can't recognize the insns, we can't really do anything. */
- if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
- return cost;
-
- insn_type = get_attr_type (insn);
- dep_insn_type = get_attr_type (dep_insn);
-
- switch (ix86_tune)
- {
- case PROCESSOR_PENTIUM:
- case PROCESSOR_LAKEMONT:
- /* Address Generation Interlock adds a cycle of latency. */
- if (insn_type == TYPE_LEA)
- {
- rtx addr = PATTERN (insn);
-
- if (GET_CODE (addr) == PARALLEL)
- addr = XVECEXP (addr, 0, 0);
-
- gcc_assert (GET_CODE (addr) == SET);
-
- addr = SET_SRC (addr);
- if (modified_in_p (addr, dep_insn))
- cost += 1;
- }
- else if (ix86_agi_dependent (dep_insn, insn))
- cost += 1;
-
- /* ??? Compares pair with jump/setcc. */
- if (ix86_flags_dependent (insn, dep_insn, insn_type))
- cost = 0;
-
- /* Floating point stores require value to be ready one cycle earlier. */
- if (insn_type == TYPE_FMOV
- && get_attr_memory (insn) == MEMORY_STORE
- && !ix86_agi_dependent (dep_insn, insn))
- cost += 1;
- break;
-
- case PROCESSOR_PENTIUMPRO:
- /* INT->FP conversion is expensive. */
- if (get_attr_fp_int_src (dep_insn))
- cost += 5;
-
- /* There is one cycle extra latency between an FP op and a store. */
- if (insn_type == TYPE_FMOV
- && (set = single_set (dep_insn)) != NULL_RTX
- && (set2 = single_set (insn)) != NULL_RTX
- && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
- && MEM_P (SET_DEST (set2)))
- cost += 1;
-
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- /* Claim moves to take one cycle, as the core can issue one load
- at a time and the next load can start a cycle later. */
- if (dep_insn_type == TYPE_IMOV
- || dep_insn_type == TYPE_FMOV)
- cost = 1;
- else if (cost > 1)
- cost--;
- }
- break;
-
- case PROCESSOR_K6:
- /* The esp dependency is resolved before
- the instruction is really finished. */
- if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
- && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- return 1;
-
- /* INT->FP conversion is expensive. */
- if (get_attr_fp_int_src (dep_insn))
- cost += 5;
-
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- /* Claim moves to take one cycle, as the core can issue one load
- at a time and the next load can start a cycle later. */
- if (dep_insn_type == TYPE_IMOV
- || dep_insn_type == TYPE_FMOV)
- cost = 1;
- else if (cost > 2)
- cost -= 2;
- else
- cost = 1;
- }
- break;
-
- case PROCESSOR_AMDFAM10:
- case PROCESSOR_BDVER1:
- case PROCESSOR_BDVER2:
- case PROCESSOR_BDVER3:
- case PROCESSOR_BDVER4:
- case PROCESSOR_ZNVER1:
- case PROCESSOR_BTVER1:
- case PROCESSOR_BTVER2:
- case PROCESSOR_GENERIC:
- /* The stack engine executes push and pop instructions in parallel. */
- if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
- && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- return 0;
- /* FALLTHRU */
-
- case PROCESSOR_ATHLON:
- case PROCESSOR_K8:
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- enum attr_unit unit = get_attr_unit (insn);
- int loadcost = 3;
-
- /* Because of the difference between the length of integer and
- floating unit pipeline preparation stages, the memory operands
- for floating point are cheaper.
-
- ??? For Athlon the difference is most probably 2. */
- if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
- loadcost = 3;
- else
- loadcost = TARGET_ATHLON ? 2 : 0;
-
- if (cost >= loadcost)
- cost -= loadcost;
- else
- cost = 0;
- }
- break;
-
- case PROCESSOR_CORE2:
- case PROCESSOR_NEHALEM:
- case PROCESSOR_SANDYBRIDGE:
- case PROCESSOR_HASWELL:
- /* The stack engine executes push and pop instructions in parallel. */
- if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
- && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- return 0;
-
- memory = get_attr_memory (insn);
-
- /* Show the reorder buffer's ability to hide the latency of a load by
- executing it in parallel with the previous instruction when the
- previous instruction is not needed to compute the address. */
- if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (dep_insn, insn))
- {
- if (cost >= 4)
- cost -= 4;
- else
- cost = 0;
- }
- break;
-
- case PROCESSOR_SILVERMONT:
- case PROCESSOR_KNL:
- case PROCESSOR_KNM:
- case PROCESSOR_INTEL:
- if (!reload_completed)
- return cost;
-
- /* Increase cost of integer loads. */
- memory = get_attr_memory (dep_insn);
- if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- {
- enum attr_unit unit = get_attr_unit (dep_insn);
- if (unit == UNIT_INTEGER && cost == 1)
- {
- if (memory == MEMORY_LOAD)
- cost = 3;
- else
- {
- /* Increase cost of ld/st for short int types only
- because of store forwarding issue. */
- rtx set = single_set (dep_insn);
- if (set && (GET_MODE (SET_DEST (set)) == QImode
- || GET_MODE (SET_DEST (set)) == HImode))
- {
- /* Increase cost of the store/load pair if an exact
- dependence exists and INSN is a load insn. */
- enum attr_memory insn_memory = get_attr_memory (insn);
- if (insn_memory == MEMORY_LOAD
- && exact_store_load_dependency (dep_insn, insn))
- cost = 3;
- }
- }
- }
- }
-
- default:
- break;
- }
-
- return cost;
-}
-
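Two of the rules in ix86_adjust_cost are worth restating in isolation: anti and output dependencies are free on every CPU, and on tunings with a stack engine a dependent push/pop pair is free as well. A condensed model, with invented enums standing in for the insn attributes:

enum dep  { DEP_TRUE, DEP_ANTI, DEP_OUTPUT };
enum type { T_PUSH, T_POP, T_OTHER };

/* Condensed model of the early exits in ix86_adjust_cost.  */
static int
adjust_cost (enum dep d, enum type insn, enum type dep_insn,
             int cost, int has_stack_engine)
{
  if (d != DEP_TRUE)
    return 0;   /* Anti and output dependencies are free on all CPUs.  */
  if (has_stack_engine
      && (insn == T_PUSH || insn == T_POP)
      && (dep_insn == T_PUSH || dep_insn == T_POP))
    return 0;   /* The stack engine resolves esp updates internally.  */
  return cost;
}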
-/* How many alternative schedules to try. This should be as wide as the
- scheduling freedom in the DFA, but no wider. Making this value too
- large results in extra work for the scheduler. */
-
-static int
-ia32_multipass_dfa_lookahead (void)
-{
- /* Generally, we want haifa-sched:max_issue() to look ahead as far
- as many instructions can be executed on a cycle, i.e.,
- issue_rate. */
- if (reload_completed)
- return ix86_issue_rate ();
- /* Don't use lookahead for pre-reload schedule to save compile time. */
- return 0;
-}
-
-/* Return true if target platform supports macro-fusion. */
-
-static bool
-ix86_macro_fusion_p ()
-{
- return TARGET_FUSE_CMP_AND_BRANCH;
-}
-
-/* Check whether the current microarchitecture supports macro fusion
- for the insn pair "CONDGEN + CONDJMP". Refer to the
- "Intel Architectures Optimization Reference Manual". */
-
-static bool
-ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
-{
- rtx src, dest;
- enum rtx_code ccode;
- rtx compare_set = NULL_RTX, test_if, cond;
- rtx alu_set = NULL_RTX, addr = NULL_RTX;
-
- if (!any_condjump_p (condjmp))
- return false;
-
- unsigned int condreg1, condreg2;
- rtx cc_reg_1;
- ix86_fixed_condition_code_regs (&condreg1, &condreg2);
- cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
- if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
- || !condgen
- || !modified_in_p (cc_reg_1, condgen))
- return false;
-
- if (get_attr_type (condgen) != TYPE_TEST
- && get_attr_type (condgen) != TYPE_ICMP
- && get_attr_type (condgen) != TYPE_INCDEC
- && get_attr_type (condgen) != TYPE_ALU)
- return false;
-
- compare_set = single_set (condgen);
- if (compare_set == NULL_RTX
- && !TARGET_FUSE_ALU_AND_BRANCH)
- return false;
-
- if (compare_set == NULL_RTX)
- {
- int i;
- rtx pat = PATTERN (condgen);
- for (i = 0; i < XVECLEN (pat, 0); i++)
- if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
- {
- rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
- if (GET_CODE (set_src) == COMPARE)
- compare_set = XVECEXP (pat, 0, i);
- else
- alu_set = XVECEXP (pat, 0, i);
- }
- }
- if (compare_set == NULL_RTX)
- return false;
- src = SET_SRC (compare_set);
- if (GET_CODE (src) != COMPARE)
- return false;
-
- /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
- supported. */
- if ((MEM_P (XEXP (src, 0))
- && CONST_INT_P (XEXP (src, 1)))
- || (MEM_P (XEXP (src, 1))
- && CONST_INT_P (XEXP (src, 0))))
- return false;
-
- /* No fusion for RIP-relative address. */
- if (MEM_P (XEXP (src, 0)))
- addr = XEXP (XEXP (src, 0), 0);
- else if (MEM_P (XEXP (src, 1)))
- addr = XEXP (XEXP (src, 1), 0);
-
- if (addr)
- {
- ix86_address parts;
- int ok = ix86_decompose_address (addr, &parts);
- gcc_assert (ok);
-
- if (rip_relative_addr_p (&parts))
- return false;
- }
-
- test_if = SET_SRC (pc_set (condjmp));
- cond = XEXP (test_if, 0);
- ccode = GET_CODE (cond);
- /* Check whether the conditional jump uses the Sign or Overflow flags. */
- if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
- && (ccode == GE
- || ccode == GT
- || ccode == LE
- || ccode == LT))
- return false;
-
- /* Return true for TYPE_TEST and TYPE_ICMP. */
- if (get_attr_type (condgen) == TYPE_TEST
- || get_attr_type (condgen) == TYPE_ICMP)
- return true;
-
- /* The following handles the macro-fusion case for alu + jmp. */
- if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
- return false;
-
- /* No fusion for alu op with memory destination operand. */
- dest = SET_DEST (alu_set);
- if (MEM_P (dest))
- return false;
-
- /* Macro-fusion for inc/dec + unsigned conditional jump is not
- supported. */
- if (get_attr_type (condgen) == TYPE_INCDEC
- && (ccode == GEU
- || ccode == GTU
- || ccode == LEU
- || ccode == LTU))
- return false;
-
- return true;
-}
-
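Stripped of the operand-shape checks (MEM+immediate compares, RIP-relative addresses, memory destinations), the fusion rules above boil down to a small decision table: cmp/test always fuse, other alu ops fuse only with TARGET_FUSE_ALU_AND_BRANCH, inc/dec never fuse with unsigned conditions, and jumps reading the sign/overflow flags need TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS. A boiled-down sketch with invented enums:

#include <stdbool.h>

enum gen  { G_TEST, G_CMP, G_INCDEC, G_ALU };
enum cond { C_EQ, C_SIGNED, C_UNSIGNED };  /* je/jne; jl/jg/...; jb/ja/... */

/* Boiled-down fusion table from ix86_macro_fusion_pair_p; the operand
   checks are omitted.  */
static bool
fuses (enum gen g, enum cond c, bool fuse_soflags, bool fuse_alu)
{
  if (c == C_SIGNED && !fuse_soflags)
    return false;                 /* Jump reads sign/overflow flags.  */
  if (g == G_TEST || g == G_CMP)
    return true;
  if (!fuse_alu)
    return false;
  if (g == G_INCDEC && c == C_UNSIGNED)
    return false;                 /* inc/dec leave CF untouched.  */
  return true;
}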
-/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
- execution. It is applied if
- (1) an IMUL instruction is at the top of the list;
- (2) the ready list contains exactly one producer of an independent IMUL
- instruction.
- Return the index of the IMUL producer if it was found and -1 otherwise. */
-static int
-do_reorder_for_imul (rtx_insn **ready, int n_ready)
-{
- rtx_insn *insn;
- rtx set, insn1, insn2;
- sd_iterator_def sd_it;
- dep_t dep;
- int index = -1;
- int i;
-
- if (!TARGET_BONNELL)
- return index;
-
- /* Check that IMUL instruction is on the top of ready list. */
- insn = ready[n_ready - 1];
- set = single_set (insn);
- if (!set)
- return index;
- if (!(GET_CODE (SET_SRC (set)) == MULT
- && GET_MODE (SET_SRC (set)) == SImode))
- return index;
-
- /* Search for producer of independent IMUL instruction. */
- for (i = n_ready - 2; i >= 0; i--)
- {
- insn = ready[i];
- if (!NONDEBUG_INSN_P (insn))
- continue;
- /* Skip IMUL instruction. */
- insn2 = PATTERN (insn);
- if (GET_CODE (insn2) == PARALLEL)
- insn2 = XVECEXP (insn2, 0, 0);
- if (GET_CODE (insn2) == SET
- && GET_CODE (SET_SRC (insn2)) == MULT
- && GET_MODE (SET_SRC (insn2)) == SImode)
- continue;
-
- FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
- {
- rtx con;
- con = DEP_CON (dep);
- if (!NONDEBUG_INSN_P (con))
- continue;
- insn1 = PATTERN (con);
- if (GET_CODE (insn1) == PARALLEL)
- insn1 = XVECEXP (insn1, 0, 0);
-
- if (GET_CODE (insn1) == SET
- && GET_CODE (SET_SRC (insn1)) == MULT
- && GET_MODE (SET_SRC (insn1)) == SImode)
- {
- sd_iterator_def sd_it1;
- dep_t dep1;
- /* Check if there is no other dependee for IMUL. */
- index = i;
- FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
- {
- rtx pro;
- pro = DEP_PRO (dep1);
- if (!NONDEBUG_INSN_P (pro))
- continue;
- if (pro != insn)
- index = -1;
- }
- if (index >= 0)
- break;
- }
- }
- if (index >= 0)
- break;
- }
- return index;
-}
-
-/* Try to find the best candidate at the top of the ready list when two insns
- have the same priority - the candidate whose dependees were scheduled
- earlier is best. Applied to Silvermont only.
- Return true if the top 2 insns must be interchanged. */
-static bool
-swap_top_of_ready_list (rtx_insn **ready, int n_ready)
-{
- rtx_insn *top = ready[n_ready - 1];
- rtx_insn *next = ready[n_ready - 2];
- rtx set;
- sd_iterator_def sd_it;
- dep_t dep;
- int clock1 = -1;
- int clock2 = -1;
- #define INSN_TICK(INSN) (HID (INSN)->tick)
-
- if (!TARGET_SILVERMONT && !TARGET_INTEL)
- return false;
-
- if (!NONDEBUG_INSN_P (top))
- return false;
- if (!NONJUMP_INSN_P (top))
- return false;
- if (!NONDEBUG_INSN_P (next))
- return false;
- if (!NONJUMP_INSN_P (next))
- return false;
- set = single_set (top);
- if (!set)
- return false;
- set = single_set (next);
- if (!set)
- return false;
-
- if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
- {
- if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
- return false;
- /* Determine the winner more precisely. */
- FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
- {
- rtx pro;
- pro = DEP_PRO (dep);
- if (!NONDEBUG_INSN_P (pro))
- continue;
- if (INSN_TICK (pro) > clock1)
- clock1 = INSN_TICK (pro);
- }
- FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
- {
- rtx pro;
- pro = DEP_PRO (dep);
- if (!NONDEBUG_INSN_P (pro))
- continue;
- if (INSN_TICK (pro) > clock2)
- clock2 = INSN_TICK (pro);
- }
-
- if (clock1 == clock2)
- {
- /* Determine the winner - the load must win. */
- enum attr_memory memory1, memory2;
- memory1 = get_attr_memory (top);
- memory2 = get_attr_memory (next);
- if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
- return true;
- }
- return (bool) (clock2 < clock1);
- }
- return false;
- #undef INSN_TICK
-}
-
-/* Perform possible reordering of the ready list, for Atom/Silvermont only.
- Return the issue rate. */
-static int
-ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
- int *pn_ready, int clock_var)
-{
- int issue_rate = -1;
- int n_ready = *pn_ready;
- int i;
- rtx_insn *insn;
- int index = -1;
-
- /* Set up issue rate. */
- issue_rate = ix86_issue_rate ();
-
- /* Do reordering for BONNELL/SILVERMONT only. */
- if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
- return issue_rate;
-
- /* Nothing to do if ready list contains only 1 instruction. */
- if (n_ready <= 1)
- return issue_rate;
-
- /* Do reordering for the post-reload scheduler only. */
- if (!reload_completed)
- return issue_rate;
-
- if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
- {
- if (sched_verbose > 1)
- fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
- INSN_UID (ready[index]));
-
- /* Put IMUL producer (ready[index]) at the top of ready list. */
- insn = ready[index];
- for (i = index; i < n_ready - 1; i++)
- ready[i] = ready[i + 1];
- ready[n_ready - 1] = insn;
- return issue_rate;
- }
-
- /* Skip selective scheduling since HID is not populated in it. */
- if (clock_var != 0
- && !sel_sched_p ()
- && swap_top_of_ready_list (ready, n_ready))
- {
- if (sched_verbose > 1)
- fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
- INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
- /* Swap 2 top elements of ready list. */
- insn = ready[n_ready - 1];
- ready[n_ready - 1] = ready[n_ready - 2];
- ready[n_ready - 2] = insn;
- }
- return issue_rate;
-}
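Both list manipulations in ix86_sched_reorder are plain array moves; the IMUL case is a rotate-to-top, shown standalone below (the Silvermont case is just a swap of the top two elements):

/* Move READY[INDEX] to the top slot (N_READY - 1), shifting the entries
   above it down by one -- the loop used for the IMUL producer.  */
static void
rotate_to_top (void **ready, int n_ready, int index)
{
  void *insn = ready[index];
  for (int i = index; i < n_ready - 1; i++)
    ready[i] = ready[i + 1];
  ready[n_ready - 1] = insn;
}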
static bool
ix86_class_likely_spilled_p (reg_class_t);
return priority;
}
-/* Model the decoder of Core 2/i7.
- The hooks below, used by multipass scheduling (see haifa-sched.c:max_issue),
- track instruction fetch block boundaries and make sure that long
- (9+ byte) instructions are assigned to decoder D0. */
-
-/* Maximum length of an insn that can be handled by
- a secondary decoder unit. '8' for Core 2/i7. */
-static int core2i7_secondary_decoder_max_insn_size;
-
-/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
- '16' for Core 2/i7. */
-static int core2i7_ifetch_block_size;
-
-/* Maximum number of instructions decoder can handle per cycle.
- '6' for Core 2/i7. */
-static int core2i7_ifetch_block_max_insns;
-
-typedef struct ix86_first_cycle_multipass_data_ *
- ix86_first_cycle_multipass_data_t;
-typedef const struct ix86_first_cycle_multipass_data_ *
- const_ix86_first_cycle_multipass_data_t;
-
-/* A variable to store target state across calls to max_issue within
- one cycle. */
-static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
- *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
-
-/* Initialize DATA. */
-static void
-core2i7_first_cycle_multipass_init (void *_data)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
-
- data->ifetch_block_len = 0;
- data->ifetch_block_n_insns = 0;
- data->ready_try_change = NULL;
- data->ready_try_change_size = 0;
-}
-
-/* Advancing the cycle; reset ifetch block counts. */
-static void
-core2i7_dfa_post_advance_cycle (void)
-{
- ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
-
- gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
-
- data->ifetch_block_len = 0;
- data->ifetch_block_n_insns = 0;
-}
-
-static int min_insn_size (rtx_insn *);
-
-/* Filter out insns from ready_try that the core will not be able to issue
- on current cycle due to decoder. */
-static void
-core2i7_first_cycle_multipass_filter_ready_try
-(const_ix86_first_cycle_multipass_data_t data,
- signed char *ready_try, int n_ready, bool first_cycle_insn_p)
-{
- while (n_ready--)
- {
- rtx_insn *insn;
- int insn_size;
-
- if (ready_try[n_ready])
- continue;
-
- insn = get_ready_element (n_ready);
- insn_size = min_insn_size (insn);
-
- if (/* If this insn is too long for a secondary decoder ... */
- (!first_cycle_insn_p
- && insn_size > core2i7_secondary_decoder_max_insn_size)
- /* ... or it would not fit into the ifetch block ... */
- || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
- /* ... or the decoder is full already ... */
- || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
- /* ... mask the insn out. */
- {
- ready_try[n_ready] = 1;
-
- if (data->ready_try_change)
- bitmap_set_bit (data->ready_try_change, n_ready);
- }
- }
-}
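
A minimal standalone model of the three constraints applied above, using the Core 2/i7 parameters set later in ix86_sched_init_global (8-byte secondary decoders, 16-byte ifetch block, 6 insns per cycle); this is a sketch, not GCC code:

#include <assert.h>
#include <stdbool.h>

/* Decide whether an insn of INSN_SIZE bytes can still be issued this
   cycle given the decoder state.  */
static bool
decoder_can_issue (int block_len, int block_n_insns, int insn_size,
                   bool first_cycle_insn_p)
{
  const int secondary_max = 8;	/* core2i7_secondary_decoder_max_insn_size */
  const int ifetch_size = 16;	/* core2i7_ifetch_block_size */
  const int max_insns = 6;	/* core2i7_ifetch_block_max_insns */

  if (!first_cycle_insn_p && insn_size > secondary_max)
    return false;		/* Only decoder D0 takes 9+ byte insns.  */
  if (block_len + insn_size > ifetch_size)
    return false;		/* Would overflow the ifetch block.  */
  if (block_n_insns + 1 > max_insns)
    return false;		/* Decoder already at its per-cycle limit.  */
  return true;
}

int
main (void)
{
  assert (decoder_can_issue (0, 0, 9, true));	/* D0 takes a long insn.  */
  assert (!decoder_can_issue (0, 0, 9, false));	/* Secondary decoders do not.  */
  assert (!decoder_can_issue (12, 2, 5, true));	/* 12 + 5 > 16 bytes.  */
  assert (!decoder_can_issue (6, 6, 2, true));	/* 6 insns already decoded.  */
  return 0;
}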
-
-/* Prepare for a new round of multipass lookahead scheduling. */
-static void
-core2i7_first_cycle_multipass_begin (void *_data,
- signed char *ready_try, int n_ready,
- bool first_cycle_insn_p)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
- const_ix86_first_cycle_multipass_data_t prev_data
- = ix86_first_cycle_multipass_data;
-
- /* Restore the state from the end of the previous round. */
- data->ifetch_block_len = prev_data->ifetch_block_len;
- data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
-
- /* Filter instructions that cannot be issued on current cycle due to
- decoder restrictions. */
- core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
- first_cycle_insn_p);
-}
-
-/* INSN is being issued in current solution. Account for its impact on
- the decoder model. */
-static void
-core2i7_first_cycle_multipass_issue (void *_data,
- signed char *ready_try, int n_ready,
- rtx_insn *insn, const void *_prev_data)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
- const_ix86_first_cycle_multipass_data_t prev_data
- = (const_ix86_first_cycle_multipass_data_t) _prev_data;
-
- int insn_size = min_insn_size (insn);
-
- data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
- data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
- gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
- && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
-
- /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
- if (!data->ready_try_change)
- {
- data->ready_try_change = sbitmap_alloc (n_ready);
- data->ready_try_change_size = n_ready;
- }
- else if (data->ready_try_change_size < n_ready)
- {
- data->ready_try_change = sbitmap_resize (data->ready_try_change,
- n_ready, 0);
- data->ready_try_change_size = n_ready;
- }
- bitmap_clear (data->ready_try_change);
-
- /* Filter out insns from ready_try that the core will not be able to issue
- on current cycle due to decoder. */
- core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
- false);
-}
-
-/* Revert the effect on ready_try. */
-static void
-core2i7_first_cycle_multipass_backtrack (const void *_data,
- signed char *ready_try,
- int n_ready ATTRIBUTE_UNUSED)
-{
- const_ix86_first_cycle_multipass_data_t data
- = (const_ix86_first_cycle_multipass_data_t) _data;
- unsigned int i = 0;
- sbitmap_iterator sbi;
-
- gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
- EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
- {
- ready_try[i] = 0;
- }
-}
-
-/* Save the result of multipass lookahead scheduling for the next round. */
-static void
-core2i7_first_cycle_multipass_end (const void *_data)
-{
- const_ix86_first_cycle_multipass_data_t data
- = (const_ix86_first_cycle_multipass_data_t) _data;
- ix86_first_cycle_multipass_data_t next_data
- = ix86_first_cycle_multipass_data;
-
- if (data != NULL)
- {
- next_data->ifetch_block_len = data->ifetch_block_len;
- next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
- }
-}
-
-/* Deallocate target data. */
-static void
-core2i7_first_cycle_multipass_fini (void *_data)
-{
- ix86_first_cycle_multipass_data_t data
- = (ix86_first_cycle_multipass_data_t) _data;
-
- if (data->ready_try_change)
- {
- sbitmap_free (data->ready_try_change);
- data->ready_try_change = NULL;
- data->ready_try_change_size = 0;
- }
-}
-
/* Prepare for scheduling pass. */
static void
ix86_sched_init_global (FILE *, int, int)
to save compile time. */
if (reload_completed)
{
- targetm.sched.dfa_post_advance_cycle
- = core2i7_dfa_post_advance_cycle;
- targetm.sched.first_cycle_multipass_init
- = core2i7_first_cycle_multipass_init;
- targetm.sched.first_cycle_multipass_begin
- = core2i7_first_cycle_multipass_begin;
- targetm.sched.first_cycle_multipass_issue
- = core2i7_first_cycle_multipass_issue;
- targetm.sched.first_cycle_multipass_backtrack
- = core2i7_first_cycle_multipass_backtrack;
- targetm.sched.first_cycle_multipass_end
- = core2i7_first_cycle_multipass_end;
- targetm.sched.first_cycle_multipass_fini
- = core2i7_first_cycle_multipass_fini;
-
- /* Set decoder parameters. */
- core2i7_secondary_decoder_max_insn_size = 8;
- core2i7_ifetch_block_size = 16;
- core2i7_ifetch_block_max_insns = 6;
+ ix86_core2i7_init_hooks ();
break;
}
/* Fall through. */
address sizes. This is enough to eliminate unnecessary padding in
99% of cases. */
-static int
-min_insn_size (rtx_insn *insn)
+int
+ix86_min_insn_size (rtx_insn *insn)
{
int l = 0, len;
njumps--, isjump = true;
else
isjump = false;
- nbytes -= min_insn_size (start);
+ nbytes -= ix86_min_insn_size (start);
}
}
continue;
}
- min_size = min_insn_size (insn);
+ min_size = ix86_min_insn_size (insn);
nbytes += min_size;
if (dump_file)
fprintf (dump_file, "Insn %i estimated to %i bytes\n",
njumps--, isjump = true;
else
isjump = false;
- nbytes -= min_insn_size (start);
+ nbytes -= ix86_min_insn_size (start);
}
gcc_assert (njumps >= 0);
if (dump_file)
if (njumps == 3 && isjump && nbytes < 16)
{
- int padsize = 15 - nbytes + min_insn_size (insn);
+ int padsize = 15 - nbytes + ix86_min_insn_size (insn);
if (dump_file)
fprintf (dump_file, "Padding insn %i by %i bytes!\n",
}
#undef TARGET_SCHED_DISPATCH
-#define TARGET_SCHED_DISPATCH has_dispatch
+#define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
#undef TARGET_SCHED_DISPATCH_DO
-#define TARGET_SCHED_DISPATCH_DO do_dispatch
+#define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
#undef TARGET_SCHED_REORDER
-#define TARGET_SCHED_REORDER ix86_sched_reorder
+#define TARGET_SCHED_REORDER ix86_atom_sched_reorder
#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
ix86_dependencies_evaluation_hook
-/* The size of the dispatch window is the total number of bytes of
- object code allowed in a window. */
-#define DISPATCH_WINDOW_SIZE 16
-
-/* Number of dispatch windows considered for scheduling. */
-#define MAX_DISPATCH_WINDOWS 3
-
-/* Maximum number of instructions in a window. */
-#define MAX_INSN 4
-
-/* Maximum number of immediate operands in a window. */
-#define MAX_IMM 4
-
-/* Maximum number of immediate bits allowed in a window. */
-#define MAX_IMM_SIZE 128
-
-/* Maximum number of 32 bit immediates allowed in a window. */
-#define MAX_IMM_32 4
-
-/* Maximum number of 64 bit immediates allowed in a window. */
-#define MAX_IMM_64 2
-
-/* Maximum total of loads or prefetches allowed in a window. */
-#define MAX_LOAD 2
-
-/* Maximum total of stores allowed in a window. */
-#define MAX_STORE 1
-
-#undef BIG
-#define BIG 100
-
-
-/* Dispatch groups. Instructions that affect the mix in a dispatch window. */
-enum dispatch_group {
- disp_no_group = 0,
- disp_load,
- disp_store,
- disp_load_store,
- disp_prefetch,
- disp_imm,
- disp_imm_32,
- disp_imm_64,
- disp_branch,
- disp_cmp,
- disp_jcc,
- disp_last
-};
-
-/* Number of allowable groups in a dispatch window. It is an array
- indexed by dispatch_group enum. 100 is used as a big number,
- because the number of these kinds of operations does not have any
- effect in the dispatch window, but we need them for other reasons in
- the table. */
-static unsigned int num_allowable_groups[disp_last] = {
- 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
-};
-
-char group_name[disp_last + 1][16] = {
- "disp_no_group", "disp_load", "disp_store", "disp_load_store",
- "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
- "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
-};
-
-/* Instruction path. */
-enum insn_path {
- no_path = 0,
- path_single, /* Single micro op. */
- path_double, /* Double micro op. */
- path_multi, /* Instructions with more than 2 micro ops. */
- last_path
-};
-
-/* sched_insn_info describes one entry of a dispatch window: the
- scheduled instruction together with its group, path, byte length
- and immediate size.
-
- Windows are allocated for each basic block and are linked
- together. */
-typedef struct sched_insn_info_s {
- rtx insn;
- enum dispatch_group group;
- enum insn_path path;
- int byte_len;
- int imm_bytes;
-} sched_insn_info;
-
-/* Linked list of dispatch windows. This is a two-way list of
- dispatch windows of a basic block. It contains information about
- the number of uops in the window and the total number of
- instructions and of bytes in the object code for this dispatch
- window. */
-typedef struct dispatch_windows_s {
- int num_insn; /* Number of insn in the window. */
- int num_uops; /* Number of uops in the window. */
- int window_size; /* Number of bytes in the window. */
- int window_num; /* Window number, either 0 or 1. */
- int num_imm; /* Number of immediates in an insn. */
- int num_imm_32; /* Number of 32 bit immediates in an insn. */
- int num_imm_64; /* Number of 64 bit immediates in an insn. */
- int imm_size; /* Total immediates in the window. */
- int num_loads; /* Total memory loads in the window. */
- int num_stores; /* Total memory stores in the window. */
- int violation; /* Violation exists in window. */
- sched_insn_info *window; /* Pointer to the window. */
- struct dispatch_windows_s *next;
- struct dispatch_windows_s *prev;
-} dispatch_windows;
-
-/* Immediate values used in an insn. */
-typedef struct imm_info_s
- {
- int imm;
- int imm32;
- int imm64;
- } imm_info;
-
-static dispatch_windows *dispatch_window_list;
-static dispatch_windows *dispatch_window_list1;
-
-/* Get dispatch group of insn. */
-
-static enum dispatch_group
-get_mem_group (rtx_insn *insn)
-{
- enum attr_memory memory;
-
- if (INSN_CODE (insn) < 0)
- return disp_no_group;
- memory = get_attr_memory (insn);
- if (memory == MEMORY_STORE)
- return disp_store;
-
- if (memory == MEMORY_LOAD)
- return disp_load;
-
- if (memory == MEMORY_BOTH)
- return disp_load_store;
-
- return disp_no_group;
-}
-
-/* Return true if insn is a compare instruction. */
-
-static bool
-is_cmp (rtx_insn *insn)
-{
- enum attr_type type;
-
- type = get_attr_type (insn);
- return (type == TYPE_TEST
- || type == TYPE_ICMP
- || type == TYPE_FCMP
- || GET_CODE (PATTERN (insn)) == COMPARE);
-}
-
-/* Return true if a dispatch violation was encountered. */
-
-static bool
-dispatch_violation (void)
-{
- if (dispatch_window_list->next)
- return dispatch_window_list->next->violation;
- return dispatch_window_list->violation;
-}
-
-/* Return true if insn is a branch instruction. */
-
-static bool
-is_branch (rtx_insn *insn)
-{
- return (CALL_P (insn) || JUMP_P (insn));
-}
-
-/* Return true if insn is a prefetch instruction. */
-
-static bool
-is_prefetch (rtx_insn *insn)
-{
- return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
-}
-
-/* This function initializes a dispatch window and the list container holding a
- pointer to the window. */
-
-static void
-init_window (int window_num)
-{
- int i;
- dispatch_windows *new_list;
-
- if (window_num == 0)
- new_list = dispatch_window_list;
- else
- new_list = dispatch_window_list1;
-
- new_list->num_insn = 0;
- new_list->num_uops = 0;
- new_list->window_size = 0;
- new_list->next = NULL;
- new_list->prev = NULL;
- new_list->window_num = window_num;
- new_list->num_imm = 0;
- new_list->num_imm_32 = 0;
- new_list->num_imm_64 = 0;
- new_list->imm_size = 0;
- new_list->num_loads = 0;
- new_list->num_stores = 0;
- new_list->violation = false;
-
- for (i = 0; i < MAX_INSN; i++)
- {
- new_list->window[i].insn = NULL;
- new_list->window[i].group = disp_no_group;
- new_list->window[i].path = no_path;
- new_list->window[i].byte_len = 0;
- new_list->window[i].imm_bytes = 0;
- }
- return;
-}
-
-/* This function allocates and initializes a dispatch window and the
- list container holding a pointer to the window. */
-
-static dispatch_windows *
-allocate_window (void)
-{
- dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
- new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
-
- return new_list;
-}
-
-/* This routine initializes the dispatch scheduling information. It
- initiates building dispatch scheduler tables and constructs the
- first dispatch window. */
-
-static void
-init_dispatch_sched (void)
-{
- /* Allocate a dispatch list and a window. */
- dispatch_window_list = allocate_window ();
- dispatch_window_list1 = allocate_window ();
- init_window (0);
- init_window (1);
-}
-
-/* This function returns true if a branch is detected. End of a basic block
- does not have to be a branch, but here we assume only branches end a
- window. */
-
-static bool
-is_end_basic_block (enum dispatch_group group)
-{
- return group == disp_branch;
-}
-
-/* This function is called when the end of a window processing is reached. */
-
-static void
-process_end_window (void)
-{
- gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
- if (dispatch_window_list->next)
- {
- gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
- gcc_assert (dispatch_window_list->window_size
- + dispatch_window_list1->window_size <= 48);
- init_window (1);
- }
- init_window (0);
-}
-
-/* Allocates a new dispatch window and adds it to WINDOW_LIST.
- WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
- for 48 bytes of instructions. Note that these windows are not dispatch
- windows whose sizes are DISPATCH_WINDOW_SIZE. */
-
-static dispatch_windows *
-allocate_next_window (int window_num)
-{
- if (window_num == 0)
- {
- if (dispatch_window_list->next)
- init_window (1);
- init_window (0);
- return dispatch_window_list;
- }
-
- dispatch_window_list->next = dispatch_window_list1;
- dispatch_window_list1->prev = dispatch_window_list;
-
- return dispatch_window_list1;
-}
-
-/* Compute number of immediate operands of an instruction. */
-
-static void
-find_constant (rtx in_rtx, imm_info *imm_values)
-{
- if (INSN_P (in_rtx))
- in_rtx = PATTERN (in_rtx);
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
- if (const_rtx x = *iter)
- switch (GET_CODE (x))
- {
- case CONST:
- case SYMBOL_REF:
- case CONST_INT:
- (imm_values->imm)++;
- if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
- (imm_values->imm32)++;
- else
- (imm_values->imm64)++;
- break;
-
- case CONST_DOUBLE:
- case CONST_WIDE_INT:
- (imm_values->imm)++;
- (imm_values->imm64)++;
- break;
-
- case CODE_LABEL:
- if (LABEL_KIND (x) == LABEL_NORMAL)
- {
- (imm_values->imm)++;
- (imm_values->imm32)++;
- }
- break;
-
- default:
- break;
- }
-}
-
-/* Return the total size of the immediate operands of an instruction along
- with the number of corresponding immediate operands. It initializes its
- parameters to zero before calling FIND_CONSTANT.
- INSN is the input instruction. IMM is the total of immediates.
- IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
- bit immediates. */
-
-static int
-get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
-{
- imm_info imm_values = {0, 0, 0};
-
- find_constant (insn, &imm_values);
- *imm = imm_values.imm;
- *imm32 = imm_values.imm32;
- *imm64 = imm_values.imm64;
- return imm_values.imm32 * 4 + imm_values.imm64 * 8;
-}
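
The returned size is in bytes: each 32-bit immediate contributes 4 and each 64-bit immediate 8. A trivial standalone check of that arithmetic (illustrative only):

#include <assert.h>

int
main (void)
{
  int imm32 = 1, imm64 = 1;
  /* One 32-bit plus one 64-bit immediate: 1*4 + 1*8 = 12 bytes.  */
  assert (imm32 * 4 + imm64 * 8 == 12);
  return 0;
}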
-
-/* This function indicates whether an instruction has any immediate
- operand. */
-
-static bool
-has_immediate (rtx_insn *insn)
-{
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (insn)
- return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
- return false;
-}
-
-/* Return single or double path for instructions. */
-
-static enum insn_path
-get_insn_path (rtx_insn *insn)
-{
- enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
-
- if ((int)path == 0)
- return path_single;
-
- if ((int)path == 1)
- return path_double;
-
- return path_multi;
-}
-
-/* Return insn dispatch group. */
-
-static enum dispatch_group
-get_insn_group (rtx_insn *insn)
-{
- enum dispatch_group group = get_mem_group (insn);
- if (group)
- return group;
-
- if (is_branch (insn))
- return disp_branch;
-
- if (is_cmp (insn))
- return disp_cmp;
-
- if (has_immediate (insn))
- return disp_imm;
-
- if (is_prefetch (insn))
- return disp_prefetch;
-
- return disp_no_group;
-}
-
-/* Count number of GROUP restricted instructions in a dispatch
- window WINDOW_LIST. */
-
-static int
-count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
-{
- enum dispatch_group group = get_insn_group (insn);
- int imm_size;
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (group == disp_no_group)
- return 0;
-
- if (group == disp_imm)
- {
- imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
- if (window_list->imm_size + imm_size > MAX_IMM_SIZE
- || num_imm_operand + window_list->num_imm > MAX_IMM
- || (num_imm32_operand > 0
- && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
- || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
- || (num_imm64_operand > 0
- && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
- || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
- || (window_list->imm_size + imm_size == MAX_IMM_SIZE
- && num_imm64_operand > 0
- && ((window_list->num_imm_64 > 0
- && window_list->num_insn >= 2)
- || window_list->num_insn >= 3)))
- return BIG;
-
- return 1;
- }
-
- if ((group == disp_load_store
- && (window_list->num_loads >= MAX_LOAD
- || window_list->num_stores >= MAX_STORE))
- || ((group == disp_load
- || group == disp_prefetch)
- && window_list->num_loads >= MAX_LOAD)
- || (group == disp_store
- && window_list->num_stores >= MAX_STORE))
- return BIG;
-
- return 1;
-}
-
-/* This function returns true if insn satisfies dispatch rules on the
- last window scheduled. */
-
-static bool
-fits_dispatch_window (rtx_insn *insn)
-{
- dispatch_windows *window_list = dispatch_window_list;
- dispatch_windows *window_list_next = dispatch_window_list->next;
- unsigned int num_restrict;
- enum dispatch_group group = get_insn_group (insn);
- enum insn_path path = get_insn_path (insn);
- int sum;
-
- /* Make disp_cmp and disp_jcc get scheduled at the latest. These
- instructions should be given the lowest priority in the
- scheduling process in the Haifa scheduler to make sure they will be
- scheduled in the same dispatch window as the reference to them. */
- if (group == disp_jcc || group == disp_cmp)
- return false;
-
- /* Check nonrestricted. */
- if (group == disp_no_group || group == disp_branch)
- return true;
-
- /* Get last dispatch window. */
- if (window_list_next)
- window_list = window_list_next;
-
- if (window_list->window_num == 1)
- {
- sum = window_list->prev->window_size + window_list->window_size;
-
- if (sum == 32
- || (min_insn_size (insn) + sum) >= 48)
- /* Window 1 is full. Go for next window. */
- return true;
- }
-
- num_restrict = count_num_restricted (insn, window_list);
-
- if (num_restrict > num_allowable_groups[group])
- return false;
-
- /* See if it fits in the first window. */
- if (window_list->window_num == 0)
- {
- /* The first window should have only single and double path
- uops. */
- if (path == path_double
- && (window_list->num_uops + 2) > MAX_INSN)
- return false;
- else if (path != path_single)
- return false;
- }
- return true;
-}
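
The allowance test above works because count_num_restricted returns BIG (100) for any insn that would break a per-window limit, and that can never pass the num_allowable_groups comparison. A standalone illustration, using the disp_imm allowance of 4 from the table above (a sketch, not GCC code):

#include <assert.h>

#define BIG 100

int
main (void)
{
  const unsigned int allow_imm = 4;	/* num_allowable_groups[disp_imm] */
  unsigned int ok_insn = 1;		/* Counted normally.  */
  unsigned int violating_insn = BIG;	/* A window limit would be broken.  */

  assert (!(ok_insn > allow_imm));	/* Fits the window.  */
  assert (violating_insn > allow_imm);	/* Rejected.  */
  return 0;
}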
-
-/* Add an instruction INSN with NUM_UOPS micro-operations to the
- dispatch window WINDOW_LIST. */
-
-static void
-add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
-{
- int byte_len = min_insn_size (insn);
- int num_insn = window_list->num_insn;
- int imm_size;
- sched_insn_info *window = window_list->window;
- enum dispatch_group group = get_insn_group (insn);
- enum insn_path path = get_insn_path (insn);
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (!window_list->violation && group != disp_cmp
- && !fits_dispatch_window (insn))
- window_list->violation = true;
-
- imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
-
- /* Initialize window with new instruction. */
- window[num_insn].insn = insn;
- window[num_insn].byte_len = byte_len;
- window[num_insn].group = group;
- window[num_insn].path = path;
- window[num_insn].imm_bytes = imm_size;
-
- window_list->window_size += byte_len;
- window_list->num_insn = num_insn + 1;
- window_list->num_uops = window_list->num_uops + num_uops;
- window_list->imm_size += imm_size;
- window_list->num_imm += num_imm_operand;
- window_list->num_imm_32 += num_imm32_operand;
- window_list->num_imm_64 += num_imm64_operand;
-
- if (group == disp_store)
- window_list->num_stores += 1;
- else if (group == disp_load
- || group == disp_prefetch)
- window_list->num_loads += 1;
- else if (group == disp_load_store)
- {
- window_list->num_stores += 1;
- window_list->num_loads += 1;
- }
-}
-
-/* Adds a scheduled instruction, INSN, to the current dispatch window.
- If the total bytes of instructions or the number of instructions in
- the window exceed the allowable limits, it allocates a new window. */
-
-static void
-add_to_dispatch_window (rtx_insn *insn)
-{
- int byte_len;
- dispatch_windows *window_list;
- dispatch_windows *next_list;
- dispatch_windows *window0_list;
- enum insn_path path;
- enum dispatch_group insn_group;
- bool insn_fits;
- int num_insn;
- int num_uops;
- int window_num;
- int insn_num_uops;
- int sum;
-
- if (INSN_CODE (insn) < 0)
- return;
-
- byte_len = min_insn_size (insn);
- window_list = dispatch_window_list;
- next_list = window_list->next;
- path = get_insn_path (insn);
- insn_group = get_insn_group (insn);
-
- /* Get the last dispatch window. */
- if (next_list)
- window_list = dispatch_window_list->next;
-
- if (path == path_single)
- insn_num_uops = 1;
- else if (path == path_double)
- insn_num_uops = 2;
- else
- insn_num_uops = (int) path;
-
- /* If current window is full, get a new window.
- Window number zero is full if MAX_INSN uops are scheduled in it.
- Window number one is full if window zero's bytes plus window
- one's bytes is 32, or if the bytes of the new instruction added
- to the total make it greater than 48, or if it already has MAX_INSN
- instructions in it. */
- num_insn = window_list->num_insn;
- num_uops = window_list->num_uops;
- window_num = window_list->window_num;
- insn_fits = fits_dispatch_window (insn);
-
- if (num_insn >= MAX_INSN
- || num_uops + insn_num_uops > MAX_INSN
- || !(insn_fits))
- {
- window_num = ~window_num & 1;
- window_list = allocate_next_window (window_num);
- }
-
- if (window_num == 0)
- {
- add_insn_window (insn, window_list, insn_num_uops);
- if (window_list->num_insn >= MAX_INSN
- && insn_group == disp_branch)
- {
- process_end_window ();
- return;
- }
- }
- else if (window_num == 1)
- {
- window0_list = window_list->prev;
- sum = window0_list->window_size + window_list->window_size;
- if (sum == 32
- || (byte_len + sum) >= 48)
- {
- process_end_window ();
- window_list = dispatch_window_list;
- }
-
- add_insn_window (insn, window_list, insn_num_uops);
- }
- else
- gcc_unreachable ();
-
- if (is_end_basic_block (insn_group))
- {
- /* End of basic block is reached; do end-basic-block processing. */
- process_end_window ();
- return;
- }
-}
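
A standalone restatement of the byte budget applied both here and in fits_dispatch_window: the window pair is treated as full at exactly 32 bytes, or as soon as the incoming insn would push the pair to 48 bytes or more (illustrative sketch, not GCC code):

#include <assert.h>
#include <stdbool.h>

static bool
window_pair_full (int w0_bytes, int w1_bytes, int insn_bytes)
{
  int sum = w0_bytes + w1_bytes;
  return sum == 32 || insn_bytes + sum >= 48;
}

int
main (void)
{
  assert (window_pair_full (16, 16, 1));	/* Pair holds exactly 32 bytes.  */
  assert (window_pair_full (16, 25, 7));	/* 7 + 41 >= 48.  */
  assert (!window_pair_full (16, 14, 3));	/* 30 + 3 still leaves room.  */
  return 0;
}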
-
-/* Print the dispatch window, WINDOW_NUM, to FILE. */
-
-DEBUG_FUNCTION static void
-debug_dispatch_window_file (FILE *file, int window_num)
-{
- dispatch_windows *list;
- int i;
-
- if (window_num == 0)
- list = dispatch_window_list;
- else
- list = dispatch_window_list1;
-
- fprintf (file, "Window #%d:\n", list->window_num);
- fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
- list->num_insn, list->num_uops, list->window_size);
- fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
- list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
-
- fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
- list->num_stores);
- fprintf (file, " insn info:\n");
-
- for (i = 0; i < MAX_INSN; i++)
- {
- if (!list->window[i].insn)
- break;
- fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
- i, group_name[list->window[i].group],
- i, (void *)list->window[i].insn,
- i, list->window[i].path,
- i, list->window[i].byte_len,
- i, list->window[i].imm_bytes);
- }
-}
-
-/* Print to stdout a dispatch window. */
-
-DEBUG_FUNCTION void
-debug_dispatch_window (int window_num)
-{
- debug_dispatch_window_file (stdout, window_num);
-}
-
-/* Print INSN dispatch information to FILE. */
-
-DEBUG_FUNCTION static void
-debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
-{
- int byte_len;
- enum insn_path path;
- enum dispatch_group group;
- int imm_size;
- int num_imm_operand;
- int num_imm32_operand;
- int num_imm64_operand;
-
- if (INSN_CODE (insn) < 0)
- return;
-
- byte_len = min_insn_size (insn);
- path = get_insn_path (insn);
- group = get_insn_group (insn);
- imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
- &num_imm64_operand);
-
- fprintf (file, " insn info:\n");
- fprintf (file, " group = %s, path = %d, byte_len = %d\n",
- group_name[group], path, byte_len);
- fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
- num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
-}
-
-/* Print to STDERR the status of the ready list with respect to
- dispatch windows. */
-
-DEBUG_FUNCTION void
-debug_ready_dispatch (void)
-{
- int i;
- int no_ready = number_in_ready ();
-
- fprintf (stdout, "Number of ready: %d\n", no_ready);
-
- for (i = 0; i < no_ready; i++)
- debug_insn_dispatch_info_file (stdout, get_ready_element (i));
-}
-
-/* This routine is the driver of the dispatch scheduler. */
-
-static void
-do_dispatch (rtx_insn *insn, int mode)
-{
- if (mode == DISPATCH_INIT)
- init_dispatch_sched ();
- else if (mode == ADD_TO_DISPATCH_WINDOW)
- add_to_dispatch_window (insn);
-}
-
-/* Return TRUE if Dispatch Scheduling is supported. */
-
-static bool
-has_dispatch (rtx_insn *insn, int action)
-{
- /* The current implementation of the dispatch scheduler models Bulldozer only. */
- if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
- || TARGET_BDVER4) && flag_dispatch_scheduler)
- switch (action)
- {
- default:
- return false;
-
- case IS_DISPATCH_ON:
- return true;
-
- case IS_CMP:
- return is_cmp (insn);
-
- case DISPATCH_VIOLATION:
- return dispatch_violation ();
-
- case FITS_DISPATCH_WINDOW:
- return fits_dispatch_window (insn);
- }
-
- return false;
-}
/* Implementation of reassociation_width target hook used by
reassoc phase to identify parallelism level in reassociated
$(COMPILE) $<
$(POSTCOMPILE)
+x86-tune-sched.o: $(srcdir)/config/i386/x86-tune-sched.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+x86-tune-sched-bd.o: $(srcdir)/config/i386/x86-tune-sched-bd.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+x86-tune-sched-atom.o: $(srcdir)/config/i386/x86-tune-sched-atom.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+x86-tune-sched-core.o: $(srcdir)/config/i386/x86-tune-sched-core.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
i386.o: i386-builtin-types.inc
i386-builtin-types.inc: s-i386-bt ; @true
--- /dev/null
+
+/* Processor costs (relative to an add) */
+/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
+#define COSTS_N_BYTES(N) ((N) * 2)
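
A quick sanity check of the scale: with COSTS_N_INSNS (N) assumed to be (N) * 4 as the comment above states, a 2-byte addition costs COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so byte costs and insn costs are directly comparable. A standalone sketch under that assumption:

#include <assert.h>

#define COSTS_N_INSNS(N) ((N) * 4)	/* Assumed, per the comment above.  */
#define COSTS_N_BYTES(N) ((N) * 2)

int
main (void)
{
  /* A 2-byte add scores the same as a 1-insn add.  */
  assert (COSTS_N_BYTES (2) == COSTS_N_INSNS (1));
  return 0;
}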
+
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
+
+static stringop_algs ix86_size_memcpy[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
+static stringop_algs ix86_size_memset[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
+
+const
+struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+ COSTS_N_BYTES (2), /* cost of an add instruction */
+ COSTS_N_BYTES (3), /* cost of a lea instruction */
+ COSTS_N_BYTES (2), /* variable shift costs */
+ COSTS_N_BYTES (3), /* constant shift costs */
+ {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
+ COSTS_N_BYTES (3), /* HI */
+ COSTS_N_BYTES (3), /* SI */
+ COSTS_N_BYTES (3), /* DI */
+ COSTS_N_BYTES (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
+ COSTS_N_BYTES (3), /* HI */
+ COSTS_N_BYTES (3), /* SI */
+ COSTS_N_BYTES (3), /* DI */
+ COSTS_N_BYTES (5)}, /* other */
+ COSTS_N_BYTES (3), /* cost of movsx */
+ COSTS_N_BYTES (3), /* cost of movzx */
+ 0, /* "large" insn */
+ 2, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {2, 2, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 2, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 2}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {2, 2, 2}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 3, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {3, 3}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 3, /* cost of moving SSE register */
+ {3, 3, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {3, 3, 3}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 0, /* size of l1 cache */
+ 0, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
+ COSTS_N_BYTES (2), /* cost of FMUL instruction. */
+ COSTS_N_BYTES (2), /* cost of FDIV instruction. */
+ COSTS_N_BYTES (2), /* cost of FABS instruction. */
+ COSTS_N_BYTES (2), /* cost of FCHS instruction. */
+ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ ix86_size_memcpy,
+ ix86_size_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 1, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 1, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* Processor costs (relative to an add) */
+static stringop_algs i386_memcpy[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs i386_memset[2] = {
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs i386_cost = { /* 386 specific costs */
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (3), /* variable shift costs */
+ COSTS_N_INSNS (2), /* constant shift costs */
+ {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (6), /* HI */
+ COSTS_N_INSNS (6), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ COSTS_N_INSNS (1), /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (23), /* HI */
+ COSTS_N_INSNS (23), /* SI */
+ COSTS_N_INSNS (23), /* DI */
+ COSTS_N_INSNS (23)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 15, /* "large" insn */
+ 3, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {8, 8, 8}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {8, 8, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 0, /* size of l1 cache */
+ 0, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (27), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (88), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (22), /* cost of FABS instruction. */
+ COSTS_N_INSNS (24), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ i386_memcpy,
+ i386_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs i486_memcpy[2] = {
+ {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs i486_memset[2] = {
+ {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs i486_cost = { /* 486 specific costs */
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (3), /* variable shift costs */
+ COSTS_N_INSNS (2), /* constant shift costs */
+ {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (12), /* HI */
+ COSTS_N_INSNS (12), /* SI */
+ COSTS_N_INSNS (12), /* DI */
+ COSTS_N_INSNS (12)}, /* other */
+ 1, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (40), /* HI */
+ COSTS_N_INSNS (40), /* SI */
+ COSTS_N_INSNS (40), /* DI */
+ COSTS_N_INSNS (40)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 15, /* "large" insn */
+ 3, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {8, 8, 8}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {8, 8, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 4, /* size of l1 cache. 486 has 8kB cache
+ shared for code and data, so 4kB is
+ not really precise. */
+ 4, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (16), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (73), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (3), /* cost of FABS instruction. */
+ COSTS_N_INSNS (3), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ i486_memcpy,
+ i486_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs pentium_memcpy[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs pentium_memset[2] = {
+ {libcall, {{-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs pentium_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (4), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (11), /* HI */
+ COSTS_N_INSNS (11), /* SI */
+ COSTS_N_INSNS (11), /* DI */
+ COSTS_N_INSNS (11)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (25), /* HI */
+ COSTS_N_INSNS (25), /* SI */
+ COSTS_N_INSNS (25), /* DI */
+ COSTS_N_INSNS (25)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 8, /* "large" insn */
+ 6, /* MOVE_RATIO */
+ 6, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 8, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 8, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (3), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (39), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentium_memcpy,
+ pentium_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static const
+struct processor_costs lakemont_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (11), /* HI */
+ COSTS_N_INSNS (11), /* SI */
+ COSTS_N_INSNS (11), /* DI */
+ COSTS_N_INSNS (11)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (25), /* HI */
+ COSTS_N_INSNS (25), /* SI */
+ COSTS_N_INSNS (25), /* DI */
+ COSTS_N_INSNS (25)}, /* other */
+ COSTS_N_INSNS (3), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 6, /* cost for loading QImode using movzbl */
+ {2, 4, 2}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 4, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 8, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 8, 16}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 8, 16}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 8, /* size of l2 cache */
+ 0, /* size of prefetch block */
+ 0, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (3), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (39), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentium_memcpy,
+ pentium_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
+ (we ensure the alignment). For small blocks the inline loop is still a
+ noticeable win; for bigger blocks either rep movsl or rep movsb is the
+ way to go. Rep movsb apparently has a more expensive startup time in the
+ CPU, but after 4K the difference is down in the noise. */
+static stringop_algs pentiumpro_memcpy[2] = {
+ {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
+ {8192, rep_prefix_4_byte, false},
+ {-1, rep_prefix_1_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs pentiumpro_memset[2] = {
+ {rep_prefix_4_byte, {{1024, unrolled_loop, false},
+ {8192, rep_prefix_4_byte, false},
+ {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
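
How these stringop_algs tables read, as a sketch under the assumption that entries apply in order with max == -1 as the catch-all: for pentiumpro_memcpy, blocks up to 128 bytes use an inline loop, up to 1024 an unrolled loop, up to 8192 rep movsl, and anything larger rep movsb. A standalone model of that lookup (not the GCC implementation):

#include <stdio.h>

struct entry { long max; const char *alg; };

/* Return the first algorithm whose MAX covers SIZE; -1 matches anything.  */
static const char *
pick_alg (const struct entry *e, long size)
{
  for (;; e++)
    if (e->max == -1 || size <= e->max)
      return e->alg;
}

int
main (void)
{
  static const struct entry pentiumpro_like[] = {
    {128, "loop"}, {1024, "unrolled_loop"},
    {8192, "rep_prefix_4_byte"}, {-1, "rep_prefix_1_byte"}};
  printf ("%s\n", pick_alg (pentiumpro_like, 512));	/* unrolled_loop */
  return 0;
}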
+static const
+struct processor_costs pentiumpro_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (4)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (17), /* HI */
+ COSTS_N_INSNS (17), /* SI */
+ COSTS_N_INSNS (17), /* DI */
+ COSTS_N_INSNS (17)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 6, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 2, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {2, 2, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 256, /* size of l2 cache */
+ 32, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (56), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentiumpro_memcpy,
+ pentiumpro_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs geode_memcpy[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs geode_memset[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs geode_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (2), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (7), /* SI */
+ COSTS_N_INSNS (7), /* DI */
+ COSTS_N_INSNS (7)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (23), /* HI */
+ COSTS_N_INSNS (39), /* SI */
+ COSTS_N_INSNS (39), /* DI */
+ COSTS_N_INSNS (39)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 4, /* MOVE_RATIO */
+ 1, /* cost for loading QImode using movzbl */
+ {1, 1, 1}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {1, 1, 1}, /* cost of storing integer registers */
+ 1, /* cost of reg,reg fld/fst */
+ {1, 1, 1}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 6, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {2, 2, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 128, /* size of l2 cache. */
+ 32, /* size of prefetch block */
+ 1, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (11), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (47), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ geode_memcpy,
+ geode_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs k6_memcpy[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs k6_memset[2] = {
+ {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs k6_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (3), /* DI */
+ COSTS_N_INSNS (3)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (18), /* HI */
+ COSTS_N_INSNS (18), /* SI */
+ COSTS_N_INSNS (18), /* DI */
+ COSTS_N_INSNS (18)}, /* other */
+ COSTS_N_INSNS (2), /* cost of movsx */
+ COSTS_N_INSNS (2), /* cost of movzx */
+ 8, /* "large" insn */
+ 4, /* MOVE_RATIO */
+ 3, /* cost for loading QImode using movzbl */
+ {4, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 3, 2}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {6, 6, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 4}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {2, 2, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 6, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 32, /* size of l2 cache. Some models
+ have integrated l2 cache, but
+ optimizing for k6 is not important
+ enough to worry about that. */
+ 32, /* size of prefetch block */
+ 1, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (2), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (56), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ k6_memcpy,
+ k6_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* For some reason, Athlon deals better with REP prefix (relative to loops)
+ compared to K8. Alignment becomes important after 8 bytes for memcpy and
+ 128 bytes for memset. */
+static stringop_algs athlon_memcpy[2] = {
+ {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs athlon_memset[2] = {
+ {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs athlon_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (5), /* HI */
+ COSTS_N_INSNS (5), /* SI */
+ COSTS_N_INSNS (5), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 6}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 5, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (24), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ athlon_memcpy,
+ athlon_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* K8 has optimized REP instructions for medium sized blocks, but for very
+ small blocks it is better to use a loop. For large blocks, a libcall can
+ do nontemporal accesses and beat inline code considerably. */
+static stringop_algs k8_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs k8_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs k8_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 3, 6}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ k8_memcpy,
+ k8_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 5, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 3, /* vec_unalign_load_cost. */
+ 3, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
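
An aside on units, since every latency above goes through COSTS_N_INSNS:
that macro converts an instruction count into the middle-end's relative
cost units. A worked illustration, assuming the rtl.h definition of four
cost units per simple instruction:

/* Assumed from rtl.h: one "simple" instruction costs 4 units.  */
#define COSTS_N_INSNS(N) ((N) * 4)

/* So, for k8_cost above:
     COSTS_N_INSNS (1)  ==   4   add, shifts, movsx/movzx
     COSTS_N_INSNS (42) == 168   SImode divide/mod
   Bare integers (MOVE_RATIO, cache sizes, branch cost) are not
   scaled and are compared in their own units.  */
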
+
+/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall can
+   do nontemporal accesses and beat inline considerably. */
+static stringop_algs amdfam10_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs amdfam10_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs amdfam10_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 64, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ amdfam10_memcpy,
+ amdfam10_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* BDVER1 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs bdver1_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver1_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+
+const struct processor_costs bdver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver1_memcpy,
+ bdver1_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+/* BDVER2 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+
+static stringop_algs bdver2_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver2_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+
+const struct processor_costs bdver2_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver2_memcpy,
+ bdver2_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+
+/* BDVER3 has optimized REP instruction for medium sized blocks, but for
+   very small blocks it is better to use loop.  For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs bdver3_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver3_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs bdver3_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver3_memcpy,
+ bdver3_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+/* BDVER4 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs bdver4_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs bdver4_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs bdver4_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (4), /* SI */
+ COSTS_N_INSNS (6), /* DI */
+ COSTS_N_INSNS (6)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 2, /* MMX or SSE register to integer */
+ 16, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ bdver4_memcpy,
+ bdver4_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
+
+
+/* ZNVER1 has optimized REP instruction for medium sized blocks, but for
+ very small blocks it is better to use loop. For large blocks, libcall
+   can do nontemporal accesses and beat inline considerably. */
+static stringop_algs znver1_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs znver1_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+struct processor_costs znver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction. */
+ COSTS_N_INSNS (1), /* cost of a lea instruction. */
+ COSTS_N_INSNS (1), /* variable shift costs. */
+ COSTS_N_INSNS (1), /* constant shift costs. */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
+ COSTS_N_INSNS (3), /* HI. */
+ COSTS_N_INSNS (3), /* SI. */
+ COSTS_N_INSNS (4), /* DI. */
+ COSTS_N_INSNS (4)}, /* other. */
+ 0, /* cost of multiply per each bit
+ set. */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
+ COSTS_N_INSNS (35), /* HI. */
+ COSTS_N_INSNS (51), /* SI. */
+ COSTS_N_INSNS (83), /* DI. */
+ COSTS_N_INSNS (83)}, /* other. */
+ COSTS_N_INSNS (1), /* cost of movsx. */
+ COSTS_N_INSNS (1), /* cost of movzx. */
+ 8, /* "large" insn. */
+ 9, /* MOVE_RATIO. */
+ 4, /* cost for loading QImode using
+ movzbl. */
+ {5, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer
+ registers. */
+ 2, /* cost of reg,reg fld/fst. */
+ {5, 5, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode. */
+ {4, 4, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode. */
+ 2, /* cost of moving MMX register. */
+ {4, 4}, /* cost of loading MMX registers
+ in SImode and DImode. */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode. */
+ 2, /* cost of moving SSE register. */
+ {4, 4, 4}, /* cost of loading SSE registers
+ in SImode, DImode and TImode. */
+ {4, 4, 4}, /* cost of storing SSE registers
+ in SImode, DImode and TImode. */
+ 2, /* MMX or SSE register to integer. */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block. */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches. */
+ 3, /* Branch cost. */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (6), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (42), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
+  /* Zen can execute 4 integer operations per cycle.  FP operations take 3
+     cycles and it can execute 2 integer additions and 2 multiplications,
+     thus reassociation may make sense up to a width of 6.  SPEC2k6
+     benchmarks suggest that 4 works better than 6, probably due to
+     register pressure.
+
+     Integer vector operations are handled by the FP unit and execute 3
+     vector plus/minus operations per cycle but only one multiply.  This is
+     adjusted in ix86_reassociation_width.  */
+ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ znver1_memcpy,
+ znver1_memset,
+ 6, /* scalar_stmt_cost. */
+ 4, /* scalar load_cost. */
+ 4, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 4, /* vec_align_load_cost. */
+ 4, /* vec_unalign_load_cost. */
+ 4, /* vec_store_cost. */
+ 4, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
+};
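
The four reassoc values above feed the target's reassociation-width hook.
A hedged sketch of how such a hook could consult them (field names assumed
to mirror struct processor_costs; the real ix86_reassociation_width also
caps vector integer multiplies, as the comment in the table notes):

/* Sketch only: map an operation class to the tuned reassociation
   width recorded in the cost table.  */
static int
reassoc_width_sketch (const struct processor_costs *cost,
                      bool is_fp, bool is_vector)
{
  if (is_vector)
    return is_fp ? cost->reassoc_vec_fp : cost->reassoc_vec_int;
  return is_fp ? cost->reassoc_fp : cost->reassoc_int;
}
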
+
+/* BTVER1 has optimized REP instruction for medium sized blocks, but for
+   very small blocks it is better to use loop.  For large blocks, libcall can
+   do nontemporal accesses and beat inline considerably. */
+static stringop_algs btver1_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs btver1_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+const struct processor_costs btver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ btver1_memcpy,
+ btver1_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs btver2_memcpy[2] = {
+ {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+ {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs btver2_memset[2] = {
+ {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+const struct processor_costs btver2_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8:
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10:
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 32, /* size of l1 cache. */
+ 2048, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ btver2_memcpy,
+ btver2_memset,
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs pentium4_memcpy[2] = {
+ {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+ DUMMY_STRINGOP_ALGS};
+static stringop_algs pentium4_memset[2] = {
+ {libcall, {{6, loop_1_byte, false}, {48, loop, false},
+ {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ DUMMY_STRINGOP_ALGS};
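
DUMMY_STRINGOP_ALGS fills the 64-bit slot for tunings that only matter in
32-bit mode, such as Pentium 4 here. The macro itself is moved into this
header elsewhere in the patch; based on its prior definition in i386.c it
amounts to the following (treat the exact form as an assumption):

/* Always fall back to a library call, whatever the block size.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
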
+
+static const
+struct processor_costs pentium4_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (3), /* cost of a lea instruction */
+ COSTS_N_INSNS (4), /* variable shift costs */
+ COSTS_N_INSNS (4), /* constant shift costs */
+ {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (15), /* HI */
+ COSTS_N_INSNS (15), /* SI */
+ COSTS_N_INSNS (15), /* DI */
+ COSTS_N_INSNS (15)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (56), /* HI */
+ COSTS_N_INSNS (56), /* SI */
+ COSTS_N_INSNS (56), /* DI */
+ COSTS_N_INSNS (56)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 16, /* "large" insn */
+ 6, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {4, 5, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {2, 3, 2}, /* cost of storing integer registers */
+ 2, /* cost of reg,reg fld/fst */
+ {2, 2, 6}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 6}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {2, 2}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {2, 2}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 12, /* cost of moving SSE register */
+ {12, 12, 12}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {2, 2, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 10, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (7), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (43), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ pentium4_memcpy,
+ pentium4_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs nocona_memcpy[2] = {
+ {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
+ {100000, unrolled_loop, false}, {-1, libcall, false}}}};
+
+static stringop_algs nocona_memset[2] = {
+ {libcall, {{6, loop_1_byte, false}, {48, loop, false},
+ {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {64, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+
+static const
+struct processor_costs nocona_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (10), /* HI */
+ COSTS_N_INSNS (10), /* SI */
+ COSTS_N_INSNS (10), /* DI */
+ COSTS_N_INSNS (10)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (66), /* HI */
+ COSTS_N_INSNS (66), /* SI */
+ COSTS_N_INSNS (66), /* DI */
+ COSTS_N_INSNS (66)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 16, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 3, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {4, 4, 4}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 6, /* cost of moving MMX register */
+ {12, 12}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {12, 12}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 6, /* cost of moving SSE register */
+ {12, 12, 12}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {12, 12, 12}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 8, /* MMX or SSE register to integer */
+ 8, /* size of l1 cache. */
+ 1024, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 8, /* number of parallel prefetches */
+ 1, /* Branch cost */
+ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (40), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (3), /* cost of FABS instruction. */
+ COSTS_N_INSNS (3), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
+ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ nocona_memcpy,
+ nocona_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs atom_memcpy[2] = {
+ {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs atom_memset[2] = {
+ {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs atom_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ atom_memcpy,
+ atom_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs slm_memcpy[2] = {
+ {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs slm_memset[2] = {
+ {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs slm_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ slm_memcpy,
+ slm_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 4, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+static stringop_algs intel_memcpy[2] = {
+ {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+ {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs intel_memset[2] = {
+ {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+ {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+ {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+ {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs intel_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (3), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ intel_memcpy,
+ intel_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 4, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* Generic should produce code tuned for Core-i7 (and newer chips)
+ and btver1 (and newer chips). */
+
+static stringop_algs generic_memcpy[2] = {
+ {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+ {-1, libcall, false}}},
+ {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static stringop_algs generic_memset[2] = {
+ {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+ {-1, libcall, false}}},
+ {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+ {-1, libcall, false}}}};
+static const
+struct processor_costs generic_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+  /* On all chips taken into consideration, lea is 2 cycles or more.  With
+     this cost, however, our current implementation of synth_mult results in
+     the use of unnecessary temporary registers, causing regressions on
+     several SPECfp benchmarks.  */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
+     value is increased to the perhaps more appropriate value of 5.  */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ generic_memcpy,
+ generic_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
+/* core_cost should produce code tuned for the Core family of CPUs.  */
+static stringop_algs core_memcpy[2] = {
+ {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+ {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+ {-1, libcall, false}}}};
+static stringop_algs core_memset[2] = {
+ {libcall, {{6, loop_1_byte, true},
+ {24, loop, true},
+ {8192, rep_prefix_4_byte, true},
+ {-1, libcall, false}}},
+ {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+ {-1, libcall, false}}}};
+
+static const
+struct processor_costs core_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+  /* On all chips taken into consideration, lea is 2 cycles or more.  With
+     this cost, however, our current implementation of synth_mult results in
+     the use of unnecessary temporary registers, causing regressions on
+     several SPECfp benchmarks.  */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ /* FIXME perhaps more appropriate value is 5. */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ core_memcpy,
+ core_memset,
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
--- /dev/null
+/* Scheduler hooks for IA-32 which implement Atom+ specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
+   execution.  It is applied if
+   (1) an IMUL instruction is at the top of the list;
+   (2) there exists exactly one producer of an independent IMUL instruction
+       in the ready list.
+   Return the index of the IMUL producer if it was found and -1 otherwise.  */
+static int
+do_reorder_for_imul (rtx_insn **ready, int n_ready)
+{
+ rtx_insn *insn;
+ rtx set, insn1, insn2;
+ sd_iterator_def sd_it;
+ dep_t dep;
+ int index = -1;
+ int i;
+
+ if (!TARGET_BONNELL)
+ return index;
+
+ /* Check that IMUL instruction is on the top of ready list. */
+ insn = ready[n_ready - 1];
+ set = single_set (insn);
+ if (!set)
+ return index;
+ if (!(GET_CODE (SET_SRC (set)) == MULT
+ && GET_MODE (SET_SRC (set)) == SImode))
+ return index;
+
+ /* Search for producer of independent IMUL instruction. */
+ for (i = n_ready - 2; i >= 0; i--)
+ {
+ insn = ready[i];
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ /* Skip IMUL instruction. */
+ insn2 = PATTERN (insn);
+ if (GET_CODE (insn2) == PARALLEL)
+ insn2 = XVECEXP (insn2, 0, 0);
+ if (GET_CODE (insn2) == SET
+ && GET_CODE (SET_SRC (insn2)) == MULT
+ && GET_MODE (SET_SRC (insn2)) == SImode)
+ continue;
+
+ FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+ {
+ rtx con;
+ con = DEP_CON (dep);
+ if (!NONDEBUG_INSN_P (con))
+ continue;
+ insn1 = PATTERN (con);
+ if (GET_CODE (insn1) == PARALLEL)
+ insn1 = XVECEXP (insn1, 0, 0);
+
+ if (GET_CODE (insn1) == SET
+ && GET_CODE (SET_SRC (insn1)) == MULT
+ && GET_MODE (SET_SRC (insn1)) == SImode)
+ {
+ sd_iterator_def sd_it1;
+ dep_t dep1;
+	      /* Check that there is no other producer for this IMUL.  */
+ index = i;
+ FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep1);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (pro != insn)
+ index = -1;
+ }
+ if (index >= 0)
+ break;
+ }
+ }
+ if (index >= 0)
+ break;
+ }
+ return index;
+}
+
+/* Try to find the best candidate for the top of the ready list if two insns
+   have the same priority - the candidate is best if the insns it depends on
+   were scheduled earlier.  Applied for Silvermont only.
+   Return true if the top 2 insns must be interchanged.  */
+static bool
+swap_top_of_ready_list (rtx_insn **ready, int n_ready)
+{
+ rtx_insn *top = ready[n_ready - 1];
+ rtx_insn *next = ready[n_ready - 2];
+ rtx set;
+ sd_iterator_def sd_it;
+ dep_t dep;
+ int clock1 = -1;
+ int clock2 = -1;
+ #define INSN_TICK(INSN) (HID (INSN)->tick)
+
+ if (!TARGET_SILVERMONT && !TARGET_INTEL)
+ return false;
+
+ if (!NONDEBUG_INSN_P (top))
+ return false;
+ if (!NONJUMP_INSN_P (top))
+ return false;
+ if (!NONDEBUG_INSN_P (next))
+ return false;
+ if (!NONJUMP_INSN_P (next))
+ return false;
+ set = single_set (top);
+ if (!set)
+ return false;
+ set = single_set (next);
+ if (!set)
+ return false;
+
+ if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
+ {
+ if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
+ return false;
+      /* Determine the winner more precisely.  */
+ FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (INSN_TICK (pro) > clock1)
+ clock1 = INSN_TICK (pro);
+ }
+ FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (INSN_TICK (pro) > clock2)
+ clock2 = INSN_TICK (pro);
+ }
+
+ if (clock1 == clock2)
+ {
+ /* Determine the winner - the load must win. */
+ enum attr_memory memory1, memory2;
+ memory1 = get_attr_memory (top);
+ memory2 = get_attr_memory (next);
+ if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
+ return true;
+ }
+ return (bool) (clock2 < clock1);
+ }
+ return false;
+ #undef INSN_TICK
+}
+
+/* Perform possible reordering of the ready list for Atom/Silvermont only.
+ Return the issue rate. */
+int
+ix86_atom_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
+ int *pn_ready, int clock_var)
+{
+ int issue_rate = -1;
+ int n_ready = *pn_ready;
+ int i;
+ rtx_insn *insn;
+ int index = -1;
+
+ /* Set up issue rate. */
+ issue_rate = ix86_issue_rate ();
+
+ /* Do reordering for BONNELL/SILVERMONT/INTEL tuning only. */
+ if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
+ return issue_rate;
+
+ /* Nothing to do if ready list contains only 1 instruction. */
+ if (n_ready <= 1)
+ return issue_rate;
+
+ /* Do reordering for the post-reload scheduler only. */
+ if (!reload_completed)
+ return issue_rate;
+
+ if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
+ {
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
+ INSN_UID (ready[index]));
+
+ /* Put IMUL producer (ready[index]) at the top of ready list. */
+ insn = ready[index];
+ for (i = index; i < n_ready - 1; i++)
+ ready[i] = ready[i + 1];
+ ready[n_ready - 1] = insn;
+ return issue_rate;
+ }
+
+ /* Skip selective scheduling since HID is not populated in it. */
+ if (clock_var != 0
+ && !sel_sched_p ()
+ && swap_top_of_ready_list (ready, n_ready))
+ {
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
+ INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
+ /* Swap 2 top elements of ready list. */
+ insn = ready[n_ready - 1];
+ ready[n_ready - 1] = ready[n_ready - 2];
+ ready[n_ready - 2] = insn;
+ }
+ return issue_rate;
+}
--- /dev/null
+/* Scheduler hooks for IA-32 which implement bdver1-4 specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+/* The size of the dispatch window is the total number of bytes of
+ object code allowed in a window. */
+#define DISPATCH_WINDOW_SIZE 16
+
+/* Number of dispatch windows considered for scheduling. */
+#define MAX_DISPATCH_WINDOWS 3
+
+/* Maximum number of instructions in a window. */
+#define MAX_INSN 4
+
+/* Maximum number of immediate operands in a window. */
+#define MAX_IMM 4
+
+/* Maximum number of immediate bits allowed in a window. */
+#define MAX_IMM_SIZE 128
+
+/* Maximum number of 32 bit immediates allowed in a window. */
+#define MAX_IMM_32 4
+
+/* Maximum number of 64 bit immediates allowed in a window. */
+#define MAX_IMM_64 2
+
+/* Maximum total of loads or prefetches allowed in a window. */
+#define MAX_LOAD 2
+
+/* Maximum total of stores allowed in a window. */
+#define MAX_STORE 1
+
+#undef BIG
+#define BIG 100
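+
+/* MAX_DISPATCH_WINDOWS * DISPATCH_WINDOW_SIZE == 48 is the byte
+ budget that process_end_window and allocate_next_window verify
+ below. */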
+
+
+/* Dispatch groups. Instructions that affect the mix in a dispatch window. */
+enum dispatch_group {
+ disp_no_group = 0,
+ disp_load,
+ disp_store,
+ disp_load_store,
+ disp_prefetch,
+ disp_imm,
+ disp_imm_32,
+ disp_imm_64,
+ disp_branch,
+ disp_cmp,
+ disp_jcc,
+ disp_last
+};
+
+/* Number of instructions allowed per group in a dispatch window. The
+ array is indexed by the dispatch_group enum. 100 is used as a big
+ number because the count of these kinds of operations has no
+ effect on the dispatch window, but we need entries for them in
+ the table. */
+static unsigned int num_allowable_groups[disp_last] = {
+ 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
+};
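+
+/* I.e. each window may hold at most 2 loads, 1 store, 1 load+store,
+ 2 prefetches, 1 branch and 4 immediate operands, of which at most
+ 4 may be 32-bit and 2 64-bit; compares and conditional jumps are
+ not limited by this table. */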
+
+char group_name[disp_last + 1][16] = {
+ "disp_no_group", "disp_load", "disp_store", "disp_load_store",
+ "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
+ "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
+};
+
+/* Instruction path. */
+enum insn_path {
+ no_path = 0,
+ path_single, /* Single micro op. */
+ path_double, /* Double micro op. */
+ path_multi, /* Instructions with more than 2 micro ops. */
+ last_path
+};
+
+/* sched_insn_info describes one instruction scheduled into a dispatch
+ window: the insn itself together with its dispatch group, decode
+ path, length and immediate-operand size.
+
+ Windows are allocated for each basic block and are linked
+ together. */
+typedef struct sched_insn_info_s {
+ rtx insn;
+ enum dispatch_group group;
+ enum insn_path path;
+ int byte_len;
+ int imm_bytes;
+} sched_insn_info;
+
+/* Linked list of dispatch windows. This is a two way list of
+ dispatch windows of a basic block. It contains information about
+ the number of uops in the window and the total number of
+ instructions and of bytes in the object code for this dispatch
+ window. */
+typedef struct dispatch_windows_s {
+ int num_insn; /* Number of insn in the window. */
+ int num_uops; /* Number of uops in the window. */
+ int window_size; /* Number of bytes in the window. */
+ int window_num; /* Window number, 0 or 1. */
+ int num_imm; /* Number of immediates in the window. */
+ int num_imm_32; /* Number of 32 bit immediates in the window. */
+ int num_imm_64; /* Number of 64 bit immediates in the window. */
+ int imm_size; /* Total immediate bytes in the window. */
+ int num_loads; /* Total memory loads in the window. */
+ int num_stores; /* Total memory stores in the window. */
+ int violation; /* Violation exists in window. */
+ sched_insn_info *window; /* Pointer to the window. */
+ struct dispatch_windows_s *next;
+ struct dispatch_windows_s *prev;
+} dispatch_windows;
+
+/* Immediate values used in an insn. */
+typedef struct imm_info_s
+ {
+ int imm;
+ int imm32;
+ int imm64;
+ } imm_info;
+
+static dispatch_windows *dispatch_window_list;
+static dispatch_windows *dispatch_window_list1;
+
+/* Get dispatch group of insn. */
+
+static enum dispatch_group
+get_mem_group (rtx_insn *insn)
+{
+ enum attr_memory memory;
+
+ if (INSN_CODE (insn) < 0)
+ return disp_no_group;
+ memory = get_attr_memory (insn);
+ if (memory == MEMORY_STORE)
+ return disp_store;
+
+ if (memory == MEMORY_LOAD)
+ return disp_load;
+
+ if (memory == MEMORY_BOTH)
+ return disp_load_store;
+
+ return disp_no_group;
+}
+
+/* Return true if insn is a compare instruction. */
+
+static bool
+is_cmp (rtx_insn *insn)
+{
+ enum attr_type type;
+
+ type = get_attr_type (insn);
+ return (type == TYPE_TEST
+ || type == TYPE_ICMP
+ || type == TYPE_FCMP
+ || GET_CODE (PATTERN (insn)) == COMPARE);
+}
+
+/* Return true if a dispatch violation was encountered. */
+
+static bool
+dispatch_violation (void)
+{
+ if (dispatch_window_list->next)
+ return dispatch_window_list->next->violation;
+ return dispatch_window_list->violation;
+}
+
+/* Return true if insn is a branch instruction. */
+
+static bool
+is_branch (rtx_insn *insn)
+{
+ return (CALL_P (insn) || JUMP_P (insn));
+}
+
+/* Return true if insn is a prefetch instruction. */
+
+static bool
+is_prefetch (rtx_insn *insn)
+{
+ return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
+}
+
+/* This function initializes a dispatch window and the list container holding a
+ pointer to the window. */
+
+static void
+init_window (int window_num)
+{
+ int i;
+ dispatch_windows *new_list;
+
+ if (window_num == 0)
+ new_list = dispatch_window_list;
+ else
+ new_list = dispatch_window_list1;
+
+ new_list->num_insn = 0;
+ new_list->num_uops = 0;
+ new_list->window_size = 0;
+ new_list->next = NULL;
+ new_list->prev = NULL;
+ new_list->window_num = window_num;
+ new_list->num_imm = 0;
+ new_list->num_imm_32 = 0;
+ new_list->num_imm_64 = 0;
+ new_list->imm_size = 0;
+ new_list->num_loads = 0;
+ new_list->num_stores = 0;
+ new_list->violation = false;
+
+ for (i = 0; i < MAX_INSN; i++)
+ {
+ new_list->window[i].insn = NULL;
+ new_list->window[i].group = disp_no_group;
+ new_list->window[i].path = no_path;
+ new_list->window[i].byte_len = 0;
+ new_list->window[i].imm_bytes = 0;
+ }
+ return;
+}
+
+/* This function allocates and initializes a dispatch window and the
+ list container holding a pointer to the window. */
+
+static dispatch_windows *
+allocate_window (void)
+{
+ dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
+ new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
+
+ return new_list;
+}
+
+/* This routine initializes the dispatch scheduling information. It
+ initiates building dispatch scheduler tables and constructs the
+ first dispatch window. */
+
+static void
+init_dispatch_sched (void)
+{
+ /* Allocate a dispatch list and a window. */
+ dispatch_window_list = allocate_window ();
+ dispatch_window_list1 = allocate_window ();
+ init_window (0);
+ init_window (1);
+}
+
+/* This function returns true if a branch is detected. End of a basic block
+ does not have to be a branch, but here we assume only branches end a
+ window. */
+
+static bool
+is_end_basic_block (enum dispatch_group group)
+{
+ return group == disp_branch;
+}
+
+/* This function is called when the end of a window processing is reached. */
+
+static void
+process_end_window (void)
+{
+ gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
+ if (dispatch_window_list->next)
+ {
+ gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
+ gcc_assert (dispatch_window_list->window_size
+ + dispatch_window_list1->window_size <= 48);
+ init_window (1);
+ }
+ init_window (0);
+}
+
+/* Allocates a new dispatch window and adds it to WINDOW_LIST.
+ WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
+ for 48 bytes of instructions. Note that these windows are not dispatch
+ windows whose sizes are DISPATCH_WINDOW_SIZE. */
+
+static dispatch_windows *
+allocate_next_window (int window_num)
+{
+ if (window_num == 0)
+ {
+ if (dispatch_window_list->next)
+ init_window (1);
+ init_window (0);
+ return dispatch_window_list;
+ }
+
+ dispatch_window_list->next = dispatch_window_list1;
+ dispatch_window_list1->prev = dispatch_window_list;
+
+ return dispatch_window_list1;
+}
+
+/* Compute number of immediate operands of an instruction. */
+
+static void
+find_constant (rtx in_rtx, imm_info *imm_values)
+{
+ if (INSN_P (in_rtx))
+ in_rtx = PATTERN (in_rtx);
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
+ if (const_rtx x = *iter)
+ switch (GET_CODE (x))
+ {
+ case CONST:
+ case SYMBOL_REF:
+ case CONST_INT:
+ (imm_values->imm)++;
+ if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
+ (imm_values->imm32)++;
+ else
+ (imm_values->imm64)++;
+ break;
+
+ case CONST_DOUBLE:
+ case CONST_WIDE_INT:
+ (imm_values->imm)++;
+ (imm_values->imm64)++;
+ break;
+
+ case CODE_LABEL:
+ if (LABEL_KIND (x) == LABEL_NORMAL)
+ {
+ (imm_values->imm)++;
+ (imm_values->imm32)++;
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+/* Return the total size of the immediate operands of an instruction along
+ with the number of corresponding immediate operands. It initializes its
+ output parameters to zero before calling FIND_CONSTANT.
+ INSN is the input instruction. IMM is the total number of immediates.
+ IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
+ bit immediates. */
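+/* E.g. "movl $42, %eax" yields *IMM == *IMM32 == 1, *IMM64 == 0 and
+ a return value of 4 bytes. */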
+
+static int
+get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
+{
+ imm_info imm_values = {0, 0, 0};
+
+ find_constant (insn, &imm_values);
+ *imm = imm_values.imm;
+ *imm32 = imm_values.imm32;
+ *imm64 = imm_values.imm64;
+ return imm_values.imm32 * 4 + imm_values.imm64 * 8;
+}
+
+/* Return true if an operand of INSN is an immediate. */
+
+static bool
+has_immediate (rtx_insn *insn)
+{
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (insn)
+ return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+ return false;
+}
+
+/* Return the decode path (single, double or multi uop) for INSN. */
+
+static enum insn_path
+get_insn_path (rtx_insn *insn)
+{
+ enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
+
+ if ((int)path == 0)
+ return path_single;
+
+ if ((int)path == 1)
+ return path_double;
+
+ return path_multi;
+}
+
+/* Return insn dispatch group. */
+
+static enum dispatch_group
+get_insn_group (rtx_insn *insn)
+{
+ enum dispatch_group group = get_mem_group (insn);
+ if (group)
+ return group;
+
+ if (is_branch (insn))
+ return disp_branch;
+
+ if (is_cmp (insn))
+ return disp_cmp;
+
+ if (has_immediate (insn))
+ return disp_imm;
+
+ if (is_prefetch (insn))
+ return disp_prefetch;
+
+ return disp_no_group;
+}
+
+/* Count how INSN contributes to the restricted groups of dispatch
+ window WINDOW_LIST: return 0 if INSN is not restricted, BIG if
+ adding it would violate the window's limits, and 1 otherwise. */
+
+static int
+count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
+{
+ enum dispatch_group group = get_insn_group (insn);
+ int imm_size;
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (group == disp_no_group)
+ return 0;
+
+ if (group == disp_imm)
+ {
+ imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+ if (window_list->imm_size + imm_size > MAX_IMM_SIZE
+ || num_imm_operand + window_list->num_imm > MAX_IMM
+ || (num_imm32_operand > 0
+ && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
+ || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
+ || (num_imm64_operand > 0
+ && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
+ || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
+ || (window_list->imm_size + imm_size == MAX_IMM_SIZE
+ && num_imm64_operand > 0
+ && ((window_list->num_imm_64 > 0
+ && window_list->num_insn >= 2)
+ || window_list->num_insn >= 3)))
+ return BIG;
+
+ return 1;
+ }
+
+ if ((group == disp_load_store
+ && (window_list->num_loads >= MAX_LOAD
+ || window_list->num_stores >= MAX_STORE))
+ || ((group == disp_load
+ || group == disp_prefetch)
+ && window_list->num_loads >= MAX_LOAD)
+ || (group == disp_store
+ && window_list->num_stores >= MAX_STORE))
+ return BIG;
+
+ return 1;
+}
+
+/* This function returns true if insn satisfies dispatch rules on the
+ last window scheduled. */
+
+static bool
+fits_dispatch_window (rtx_insn *insn)
+{
+ dispatch_windows *window_list = dispatch_window_list;
+ dispatch_windows *window_list_next = dispatch_window_list->next;
+ unsigned int num_restrict;
+ enum dispatch_group group = get_insn_group (insn);
+ enum insn_path path = get_insn_path (insn);
+ int sum;
+
+ /* Make disp_cmp and disp_jcc insns be scheduled as late as possible.
+ These instructions should be given the lowest priority in the
+ Haifa scheduler to make sure they are scheduled in the same
+ dispatch window as the insns that reference them. */
+ if (group == disp_jcc || group == disp_cmp)
+ return false;
+
+ /* Check nonrestricted. */
+ if (group == disp_no_group || group == disp_branch)
+ return true;
+
+ /* Get last dispatch window. */
+ if (window_list_next)
+ window_list = window_list_next;
+
+ if (window_list->window_num == 1)
+ {
+ sum = window_list->prev->window_size + window_list->window_size;
+
+ if (sum == 32
+ || (ix86_min_insn_size (insn) + sum) >= 48)
+ /* Window 1 is full. Go for next window. */
+ return true;
+ }
+
+ num_restrict = count_num_restricted (insn, window_list);
+
+ if (num_restrict > num_allowable_groups[group])
+ return false;
+
+ /* See if it fits in the first window. */
+ if (window_list->window_num == 0)
+ {
+ /* The first window should have only single and double path
+ uops. */
+ if (path == path_double
+ && (window_list->num_uops + 2) > MAX_INSN)
+ return false;
+ else if (path != path_single)
+ return false;
+ }
+ return true;
+}
+
+/* Add an instruction INSN with NUM_UOPS micro-operations to the
+ dispatch window WINDOW_LIST. */
+
+static void
+add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
+{
+ int byte_len = ix86_min_insn_size (insn);
+ int num_insn = window_list->num_insn;
+ int imm_size;
+ sched_insn_info *window = window_list->window;
+ enum dispatch_group group = get_insn_group (insn);
+ enum insn_path path = get_insn_path (insn);
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (!window_list->violation && group != disp_cmp
+ && !fits_dispatch_window (insn))
+ window_list->violation = true;
+
+ imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+
+ /* Initialize window with new instruction. */
+ window[num_insn].insn = insn;
+ window[num_insn].byte_len = byte_len;
+ window[num_insn].group = group;
+ window[num_insn].path = path;
+ window[num_insn].imm_bytes = imm_size;
+
+ window_list->window_size += byte_len;
+ window_list->num_insn = num_insn + 1;
+ window_list->num_uops = window_list->num_uops + num_uops;
+ window_list->imm_size += imm_size;
+ window_list->num_imm += num_imm_operand;
+ window_list->num_imm_32 += num_imm32_operand;
+ window_list->num_imm_64 += num_imm64_operand;
+
+ if (group == disp_store)
+ window_list->num_stores += 1;
+ else if (group == disp_load
+ || group == disp_prefetch)
+ window_list->num_loads += 1;
+ else if (group == disp_load_store)
+ {
+ window_list->num_stores += 1;
+ window_list->num_loads += 1;
+ }
+}
+
+/* Adds a scheduled instruction, INSN, to the current dispatch window.
+ If the total bytes of instructions or the number of instructions in
+ the window exceed the allowable limits, it allocates a new window. */
+
+static void
+add_to_dispatch_window (rtx_insn *insn)
+{
+ int byte_len;
+ dispatch_windows *window_list;
+ dispatch_windows *next_list;
+ dispatch_windows *window0_list;
+ enum insn_path path;
+ enum dispatch_group insn_group;
+ bool insn_fits;
+ int num_insn;
+ int num_uops;
+ int window_num;
+ int insn_num_uops;
+ int sum;
+
+ if (INSN_CODE (insn) < 0)
+ return;
+
+ byte_len = ix86_min_insn_size (insn);
+ window_list = dispatch_window_list;
+ next_list = window_list->next;
+ path = get_insn_path (insn);
+ insn_group = get_insn_group (insn);
+
+ /* Get the last dispatch window. */
+ if (next_list)
+ window_list = dispatch_window_list->next;
+
+ if (path == path_single)
+ insn_num_uops = 1;
+ else if (path == path_double)
+ insn_num_uops = 2;
+ else
+ insn_num_uops = (int) path;
+
+ /* If the current window is full, get a new window.
+ Window number zero is full if MAX_INSN uops are scheduled in it.
+ Window number one is full if window zero's bytes plus window
+ one's bytes is 32, or if adding the bytes of the new instruction
+ to the total makes it greater than 48, or if it already has MAX_INSN
+ instructions in it. */
+ num_insn = window_list->num_insn;
+ num_uops = window_list->num_uops;
+ window_num = window_list->window_num;
+ insn_fits = fits_dispatch_window (insn);
+
+ if (num_insn >= MAX_INSN
+ || num_uops + insn_num_uops > MAX_INSN
+ || !(insn_fits))
+ {
+ window_num = ~window_num & 1;
+ window_list = allocate_next_window (window_num);
+ }
+
+ if (window_num == 0)
+ {
+ add_insn_window (insn, window_list, insn_num_uops);
+ if (window_list->num_insn >= MAX_INSN
+ && insn_group == disp_branch)
+ {
+ process_end_window ();
+ return;
+ }
+ }
+ else if (window_num == 1)
+ {
+ window0_list = window_list->prev;
+ sum = window0_list->window_size + window_list->window_size;
+ if (sum == 32
+ || (byte_len + sum) >= 48)
+ {
+ process_end_window ();
+ window_list = dispatch_window_list;
+ }
+
+ add_insn_window (insn, window_list, insn_num_uops);
+ }
+ else
+ gcc_unreachable ();
+
+ if (is_end_basic_block (insn_group))
+ {
+ /* End of basic block is reached; do end-basic-block processing. */
+ process_end_window ();
+ return;
+ }
+}
+
+/* Print the dispatch window, WINDOW_NUM, to FILE. */
+
+DEBUG_FUNCTION static void
+debug_dispatch_window_file (FILE *file, int window_num)
+{
+ dispatch_windows *list;
+ int i;
+
+ if (window_num == 0)
+ list = dispatch_window_list;
+ else
+ list = dispatch_window_list1;
+
+ fprintf (file, "Window #%d:\n", list->window_num);
+ fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
+ list->num_insn, list->num_uops, list->window_size);
+ fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+ list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
+
+ fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
+ list->num_stores);
+ fprintf (file, " insn info:\n");
+
+ for (i = 0; i < MAX_INSN; i++)
+ {
+ if (!list->window[i].insn)
+ break;
+ fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
+ i, group_name[list->window[i].group],
+ i, (void *)list->window[i].insn,
+ i, list->window[i].path,
+ i, list->window[i].byte_len,
+ i, list->window[i].imm_bytes);
+ }
+}
+
+/* Print to stdout a dispatch window. */
+
+DEBUG_FUNCTION void
+debug_dispatch_window (int window_num)
+{
+ debug_dispatch_window_file (stdout, window_num);
+}
+
+/* Print INSN dispatch information to FILE. */
+
+DEBUG_FUNCTION static void
+debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
+{
+ int byte_len;
+ enum insn_path path;
+ enum dispatch_group group;
+ int imm_size;
+ int num_imm_operand;
+ int num_imm32_operand;
+ int num_imm64_operand;
+
+ if (INSN_CODE (insn) < 0)
+ return;
+
+ byte_len = ix86_min_insn_size (insn);
+ path = get_insn_path (insn);
+ group = get_insn_group (insn);
+ imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+ &num_imm64_operand);
+
+ fprintf (file, " insn info:\n");
+ fprintf (file, " group = %s, path = %d, byte_len = %d\n",
+ group_name[group], path, byte_len);
+ fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+ num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
+}
+
+/* Print to stdout the status of the ready list with respect to
+ dispatch windows. */
+
+DEBUG_FUNCTION void
+debug_ready_dispatch (void)
+{
+ int i;
+ int no_ready = number_in_ready ();
+
+ fprintf (stdout, "Number of ready: %d\n", no_ready);
+
+ for (i = 0; i < no_ready; i++)
+ debug_insn_dispatch_info_file (stdout, get_ready_element (i));
+}
+
+/* This routine is the driver of the dispatch scheduler. */
+
+void
+ix86_bd_do_dispatch (rtx_insn *insn, int mode)
+{
+ if (mode == DISPATCH_INIT)
+ init_dispatch_sched ();
+ else if (mode == ADD_TO_DISPATCH_WINDOW)
+ add_to_dispatch_window (insn);
+}
+
+/* Return TRUE if Dispatch Scheduling is supported. */
+
+bool
+ix86_bd_has_dispatch (rtx_insn *insn, int action)
+{
+ /* The current implementation of the dispatch scheduler models Bulldozer only. */
+ if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
+ || TARGET_BDVER4) && flag_dispatch_scheduler)
+ switch (action)
+ {
+ default:
+ return false;
+
+ case IS_DISPATCH_ON:
+ return true;
+
+ case IS_CMP:
+ return is_cmp (insn);
+
+ case DISPATCH_VIOLATION:
+ return dispatch_violation ();
+
+ case FITS_DISPATCH_WINDOW:
+ return fits_dispatch_window (insn);
+ }
+
+ return false;
+}
--- /dev/null
+/* Scheduler hooks for IA-32 which implement Core 2/i7 specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "sched-int.h"
+
+
+/* Model decoder of Core 2/i7.
+ Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
+ track the instruction fetch block boundaries and make sure that long
+ (9+ bytes) instructions are assigned to D0. */
+
+/* Maximum length of an insn that can be handled by
+ a secondary decoder unit. '8' for Core 2/i7. */
+static int core2i7_secondary_decoder_max_insn_size;
+
+/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
+ '16' for Core 2/i7. */
+static int core2i7_ifetch_block_size;
+
+/* Maximum number of instructions decoder can handle per cycle.
+ '6' for Core 2/i7. */
+static int core2i7_ifetch_block_max_insns;
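+
+/* E.g. a 9-byte insn decoded on D0 followed by a 7-byte insn exactly
+ fills a 16-byte ifetch block; any further insn is filtered out of
+ the ready list for that cycle. */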
+
+typedef struct ix86_first_cycle_multipass_data_ *
+ ix86_first_cycle_multipass_data_t;
+typedef const struct ix86_first_cycle_multipass_data_ *
+ const_ix86_first_cycle_multipass_data_t;
+
+/* A variable to store target state across calls to max_issue within
+ one cycle. */
+static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
+ *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
+
+/* Initialize DATA. */
+static void
+core2i7_first_cycle_multipass_init (void *_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+
+ data->ifetch_block_len = 0;
+ data->ifetch_block_n_insns = 0;
+ data->ready_try_change = NULL;
+ data->ready_try_change_size = 0;
+}
+
+/* Advancing the cycle; reset ifetch block counts. */
+static void
+core2i7_dfa_post_advance_cycle (void)
+{
+ ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
+
+ gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+ data->ifetch_block_len = 0;
+ data->ifetch_block_n_insns = 0;
+}
+
+/* Filter out insns from ready_try that the core will not be able to issue
+ on current cycle due to decoder. */
+static void
+core2i7_first_cycle_multipass_filter_ready_try
+(const_ix86_first_cycle_multipass_data_t data,
+ signed char *ready_try, int n_ready, bool first_cycle_insn_p)
+{
+ while (n_ready--)
+ {
+ rtx_insn *insn;
+ int insn_size;
+
+ if (ready_try[n_ready])
+ continue;
+
+ insn = get_ready_element (n_ready);
+ insn_size = ix86_min_insn_size (insn);
+
+ if (/* If this is too long an insn for a secondary decoder ... */
+ (!first_cycle_insn_p
+ && insn_size > core2i7_secondary_decoder_max_insn_size)
+ /* ... or it would not fit into the ifetch block ... */
+ || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
+ /* ... or the decoder is full already ... */
+ || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
+ /* ... mask the insn out. */
+ {
+ ready_try[n_ready] = 1;
+
+ if (data->ready_try_change)
+ bitmap_set_bit (data->ready_try_change, n_ready);
+ }
+ }
+}
+
+/* Prepare for a new round of multipass lookahead scheduling. */
+static void
+core2i7_first_cycle_multipass_begin (void *_data,
+ signed char *ready_try, int n_ready,
+ bool first_cycle_insn_p)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+ const_ix86_first_cycle_multipass_data_t prev_data
+ = ix86_first_cycle_multipass_data;
+
+ /* Restore the state from the end of the previous round. */
+ data->ifetch_block_len = prev_data->ifetch_block_len;
+ data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
+
+ /* Filter instructions that cannot be issued on current cycle due to
+ decoder restrictions. */
+ core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+ first_cycle_insn_p);
+}
+
+/* INSN is being issued in current solution. Account for its impact on
+ the decoder model. */
+static void
+core2i7_first_cycle_multipass_issue (void *_data,
+ signed char *ready_try, int n_ready,
+ rtx_insn *insn, const void *_prev_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+ const_ix86_first_cycle_multipass_data_t prev_data
+ = (const_ix86_first_cycle_multipass_data_t) _prev_data;
+
+ int insn_size = ix86_min_insn_size (insn);
+
+ data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
+ data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
+ gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
+ && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+ /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
+ if (!data->ready_try_change)
+ {
+ data->ready_try_change = sbitmap_alloc (n_ready);
+ data->ready_try_change_size = n_ready;
+ }
+ else if (data->ready_try_change_size < n_ready)
+ {
+ data->ready_try_change = sbitmap_resize (data->ready_try_change,
+ n_ready, 0);
+ data->ready_try_change_size = n_ready;
+ }
+ bitmap_clear (data->ready_try_change);
+
+ /* Filter out insns from ready_try that the core will not be able to issue
+ on current cycle due to decoder. */
+ core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+ false);
+}
+
+/* Revert the effect on ready_try. */
+static void
+core2i7_first_cycle_multipass_backtrack (const void *_data,
+ signed char *ready_try,
+ int n_ready ATTRIBUTE_UNUSED)
+{
+ const_ix86_first_cycle_multipass_data_t data
+ = (const_ix86_first_cycle_multipass_data_t) _data;
+ unsigned int i = 0;
+ sbitmap_iterator sbi;
+
+ gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
+ EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
+ {
+ ready_try[i] = 0;
+ }
+}
+
+/* Save the result of multipass lookahead scheduling for the next round. */
+static void
+core2i7_first_cycle_multipass_end (const void *_data)
+{
+ const_ix86_first_cycle_multipass_data_t data
+ = (const_ix86_first_cycle_multipass_data_t) _data;
+ ix86_first_cycle_multipass_data_t next_data
+ = ix86_first_cycle_multipass_data;
+
+ if (data != NULL)
+ {
+ next_data->ifetch_block_len = data->ifetch_block_len;
+ next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
+ }
+}
+
+/* Deallocate target data. */
+static void
+core2i7_first_cycle_multipass_fini (void *_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+
+ if (data->ready_try_change)
+ {
+ sbitmap_free (data->ready_try_change);
+ data->ready_try_change = NULL;
+ data->ready_try_change_size = 0;
+ }
+}
+
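+/* Install the Core 2/i7 decoder model hooks and set the decoder
+ parameters. */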
+void
+ix86_core2i7_init_hooks (void)
+{
+ targetm.sched.dfa_post_advance_cycle
+ = core2i7_dfa_post_advance_cycle;
+ targetm.sched.first_cycle_multipass_init
+ = core2i7_first_cycle_multipass_init;
+ targetm.sched.first_cycle_multipass_begin
+ = core2i7_first_cycle_multipass_begin;
+ targetm.sched.first_cycle_multipass_issue
+ = core2i7_first_cycle_multipass_issue;
+ targetm.sched.first_cycle_multipass_backtrack
+ = core2i7_first_cycle_multipass_backtrack;
+ targetm.sched.first_cycle_multipass_end
+ = core2i7_first_cycle_multipass_end;
+ targetm.sched.first_cycle_multipass_fini
+ = core2i7_first_cycle_multipass_fini;
+
+ /* Set decoder parameters. */
+ core2i7_secondary_decoder_max_insn_size = 8;
+ core2i7_ifetch_block_size = 16;
+ core2i7_ifetch_block_max_insns = 6;
+}
--- /dev/null
+/* Scheduler hooks for IA-32 which implement CPU specific logic.
+ Copyright (C) 1988-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "cfghooks.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "target.h"
+
+/* Return the maximum number of instructions a cpu can issue. */
+
+int
+ix86_issue_rate (void)
+{
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM:
+ case PROCESSOR_LAKEMONT:
+ case PROCESSOR_BONNELL:
+ case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
+ case PROCESSOR_KNM:
+ case PROCESSOR_INTEL:
+ case PROCESSOR_K6:
+ case PROCESSOR_BTVER2:
+ case PROCESSOR_PENTIUM4:
+ case PROCESSOR_NOCONA:
+ return 2;
+
+ case PROCESSOR_PENTIUMPRO:
+ case PROCESSOR_ATHLON:
+ case PROCESSOR_K8:
+ case PROCESSOR_AMDFAM10:
+ case PROCESSOR_GENERIC:
+ case PROCESSOR_BTVER1:
+ return 3;
+
+ case PROCESSOR_BDVER1:
+ case PROCESSOR_BDVER2:
+ case PROCESSOR_BDVER3:
+ case PROCESSOR_BDVER4:
+ case PROCESSOR_ZNVER1:
+ case PROCESSOR_CORE2:
+ case PROCESSOR_NEHALEM:
+ case PROCESSOR_SANDYBRIDGE:
+ case PROCESSOR_HASWELL:
+ return 4;
+
+ default:
+ return 1;
+ }
+}
+
+/* Return true iff USE_INSN has a memory address with operands set by
+ SET_INSN. */
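+/* E.g. "addl $4, %eax" followed by "movl (%eax), %edx" is such a
+ pair: the load's address register is set by the ADD, so address
+ generation must wait for it. */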
+
+bool
+ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
+{
+ int i;
+ extract_insn_cached (use_insn);
+ for (i = recog_data.n_operands - 1; i >= 0; --i)
+ if (MEM_P (recog_data.operand[i]))
+ {
+ rtx addr = XEXP (recog_data.operand[i], 0);
+ if (modified_in_p (addr, set_insn) != 0)
+ {
+ /* No AGI stall if SET_INSN is a push or pop and USE_INSN
+ has SP based memory (unless index reg is modified in a pop). */
+ rtx set = single_set (set_insn);
+ if (set
+ && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
+ || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
+ {
+ struct ix86_address parts;
+ if (ix86_decompose_address (addr, &parts)
+ && parts.base == stack_pointer_rtx
+ && (parts.index == NULL_RTX
+ || MEM_P (SET_DEST (set))
+ || !modified_in_p (parts.index, set_insn)))
+ return false;
+ }
+ return true;
+ }
+ return false;
+ }
+ return false;
+}
+
+/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
+ by DEP_INSN and nothing else set by DEP_INSN. */
+
+static bool
+ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
+{
+ rtx set, set2;
+
+ /* Simplify the test for uninteresting insns. */
+ if (insn_type != TYPE_SETCC
+ && insn_type != TYPE_ICMOV
+ && insn_type != TYPE_FCMOV
+ && insn_type != TYPE_IBR)
+ return false;
+
+ if ((set = single_set (dep_insn)) != 0)
+ {
+ set = SET_DEST (set);
+ set2 = NULL_RTX;
+ }
+ else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
+ && XVECLEN (PATTERN (dep_insn), 0) == 2
+ && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
+ && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
+ {
+ set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
+ set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
+ }
+ else
+ return false;
+
+ if (!REG_P (set) || REGNO (set) != FLAGS_REG)
+ return false;
+
+ /* This test is true if the dependent insn reads the flags but
+ not any other potentially set register. */
+ if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
+ return false;
+
+ if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
+ return false;
+
+ return true;
+}
+
+/* Helper function for exact_store_load_dependency.
+ Return true if ADDR is found in INSN. */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+ enum rtx_code code;
+ const char *format_ptr;
+ int i, j;
+
+ code = GET_CODE (insn);
+ switch (code)
+ {
+ case MEM:
+ if (rtx_equal_p (addr, insn))
+ return true;
+ break;
+ case REG:
+ CASE_CONST_ANY:
+ case SYMBOL_REF:
+ case CODE_LABEL:
+ case PC:
+ case CC0:
+ case EXPR_LIST:
+ return false;
+ default:
+ break;
+ }
+
+ format_ptr = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); i++)
+ {
+ switch (*format_ptr++)
+ {
+ case 'e':
+ if (exact_dependency_1 (addr, XEXP (insn, i)))
+ return true;
+ break;
+ case 'E':
+ for (j = 0; j < XVECLEN (insn, i); j++)
+ if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+ return true;
+ break;
+ }
+ }
+ return false;
+}
+
+/* Return true if there exists exact dependency for store & load, i.e.
+ the same memory address is used in them. */
+static bool
+exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
+{
+ rtx set1, set2;
+
+ set1 = single_set (store);
+ if (!set1)
+ return false;
+ if (!MEM_P (SET_DEST (set1)))
+ return false;
+ set2 = single_set (load);
+ if (!set2)
+ return false;
+ if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+ return true;
+ return false;
+}
+
+
+/* This function corrects the value of COST (latency) based on the relationship
+ between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
+ DW. It should return the new value.
+
+ On x86 CPUs this is most commonly used to model the fact that values of
+ registers used to compute the address of a memory operand need to be ready
+ earlier than values of registers used in the actual operation. */
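+/* E.g. on Core 2/Nehalem/Sandybridge/Haswell the code below discounts
+ by up to 4 cycles a true dependence feeding a load insn whenever
+ the producer is not needed to compute the load's address. */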
+
+int
+ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
+ unsigned int)
+{
+ enum attr_type insn_type, dep_insn_type;
+ enum attr_memory memory;
+ rtx set, set2;
+ int dep_insn_code_number;
+
+ /* Anti and output dependencies have zero cost on all CPUs. */
+ if (dep_type != 0)
+ return 0;
+
+ dep_insn_code_number = recog_memoized (dep_insn);
+
+ /* If we can't recognize the insns, we can't really do anything. */
+ if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
+ return cost;
+
+ insn_type = get_attr_type (insn);
+ dep_insn_type = get_attr_type (dep_insn);
+
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM:
+ case PROCESSOR_LAKEMONT:
+ /* Address Generation Interlock adds a cycle of latency. */
+ if (insn_type == TYPE_LEA)
+ {
+ rtx addr = PATTERN (insn);
+
+ if (GET_CODE (addr) == PARALLEL)
+ addr = XVECEXP (addr, 0, 0);
+
+ gcc_assert (GET_CODE (addr) == SET);
+
+ addr = SET_SRC (addr);
+ if (modified_in_p (addr, dep_insn))
+ cost += 1;
+ }
+ else if (ix86_agi_dependent (dep_insn, insn))
+ cost += 1;
+
+ /* ??? Compares pair with jump/setcc. */
+ if (ix86_flags_dependent (insn, dep_insn, insn_type))
+ cost = 0;
+
+ /* Floating point stores require value to be ready one cycle earlier. */
+ if (insn_type == TYPE_FMOV
+ && get_attr_memory (insn) == MEMORY_STORE
+ && !ix86_agi_dependent (dep_insn, insn))
+ cost += 1;
+ break;
+
+ case PROCESSOR_PENTIUMPRO:
+ /* INT->FP conversion is expensive. */
+ if (get_attr_fp_int_src (dep_insn))
+ cost += 5;
+
+ /* There is one cycle extra latency between an FP op and a store. */
+ if (insn_type == TYPE_FMOV
+ && (set = single_set (dep_insn)) != NULL_RTX
+ && (set2 = single_set (insn)) != NULL_RTX
+ && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
+ && MEM_P (SET_DEST (set2)))
+ cost += 1;
+
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ /* Claim moves to take one cycle, as the core can issue one load
+ at a time and the next load can start a cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV)
+ cost = 1;
+ else if (cost > 1)
+ cost--;
+ }
+ break;
+
+ case PROCESSOR_K6:
+ /* The esp dependency is resolved before
+ the instruction is really finished. */
+ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+ && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+ return 1;
+
+ /* INT->FP conversion is expensive. */
+ if (get_attr_fp_int_src (dep_insn))
+ cost += 5;
+
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ /* Claim moves to take one cycle, as the core can issue one load
+ at a time and the next load can start a cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV)
+ cost = 1;
+ else if (cost > 2)
+ cost -= 2;
+ else
+ cost = 1;
+ }
+ break;
+
+ case PROCESSOR_AMDFAM10:
+ case PROCESSOR_BDVER1:
+ case PROCESSOR_BDVER2:
+ case PROCESSOR_BDVER3:
+ case PROCESSOR_BDVER4:
+ case PROCESSOR_ZNVER1:
+ case PROCESSOR_BTVER1:
+ case PROCESSOR_BTVER2:
+ case PROCESSOR_GENERIC:
+ /* The stack engine allows executing push&pop instructions in parallel. */
+ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+ && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+ return 0;
+ /* FALLTHRU */
+
+ case PROCESSOR_ATHLON:
+ case PROCESSOR_K8:
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ enum attr_unit unit = get_attr_unit (insn);
+ int loadcost = 3;
+
+ /* Because of the difference between the length of integer and
+ floating unit pipeline preparation stages, the memory operands
+ for floating point are cheaper.
+
+ ??? For Athlon the difference is most probably 2. */
+ if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
+ loadcost = 3;
+ else
+ loadcost = TARGET_ATHLON ? 2 : 0;
+
+ if (cost >= loadcost)
+ cost -= loadcost;
+ else
+ cost = 0;
+ }
+ break;
+
+ case PROCESSOR_CORE2:
+ case PROCESSOR_NEHALEM:
+ case PROCESSOR_SANDYBRIDGE:
+ case PROCESSOR_HASWELL:
+ /* The stack engine allows executing push&pop instructions in parallel. */
+ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+ && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+ return 0;
+
+ memory = get_attr_memory (insn);
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependent (dep_insn, insn))
+ {
+ if (cost >= 4)
+ cost -= 4;
+ else
+ cost = 0;
+ }
+ break;
+
+ case PROCESSOR_SILVERMONT:
+ case PROCESSOR_KNL:
+ case PROCESSOR_KNM:
+ case PROCESSOR_INTEL:
+ if (!reload_completed)
+ return cost;
+
+ /* Increase cost of integer loads. */
+ memory = get_attr_memory (dep_insn);
+ if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ {
+ enum attr_unit unit = get_attr_unit (dep_insn);
+ if (unit == UNIT_INTEGER && cost == 1)
+ {
+ if (memory == MEMORY_LOAD)
+ cost = 3;
+ else
+ {
+ /* Increase cost of ld/st for short int types only
+ because of store forwarding issue. */
+ rtx set = single_set (dep_insn);
+ if (set && (GET_MODE (SET_DEST (set)) == QImode
+ || GET_MODE (SET_DEST (set)) == HImode))
+ {
+ /* Increase cost of store/load insn if exact
+ dependence exists and it is load insn. */
+ enum attr_memory insn_memory = get_attr_memory (insn);
+ if (insn_memory == MEMORY_LOAD
+ && exact_store_load_dependency (dep_insn, insn))
+ cost = 3;
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return cost;
+}
+
+/* How many alternative schedules to try. This should be as wide as the
+ scheduling freedom in the DFA, but no wider. Making this value too
+ large results in extra work for the scheduler. */
+
+int
+ia32_multipass_dfa_lookahead (void)
+{
+ /* Generally, we want haifa-sched:max_issue() to look ahead as far
+ as many instructions can be executed on a cycle, i.e.,
+ issue_rate. */
+ if (reload_completed)
+ return ix86_issue_rate ();
+ /* Don't use lookahead for pre-reload schedule to save compile time. */
+ return 0;
+}
+
+/* Return true if target platform supports macro-fusion. */
+
+bool
+ix86_macro_fusion_p ()
+{
+ return TARGET_FUSE_CMP_AND_BRANCH;
+}
+
+/* Check whether the current microarchitecture supports macro fusion
+ for the insn pair "CONDGEN + CONDJMP". Refer to the
+ "Intel Architectures Optimization Reference Manual". */
+
+bool
+ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
+{
+ rtx src, dest;
+ enum rtx_code ccode;
+ rtx compare_set = NULL_RTX, test_if, cond;
+ rtx alu_set = NULL_RTX, addr = NULL_RTX;
+
+ if (!any_condjump_p (condjmp))
+ return false;
+
+ unsigned int condreg1, condreg2;
+ rtx cc_reg_1;
+ targetm.fixed_condition_code_regs (&condreg1, &condreg2);
+ cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+ if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
+ || !condgen
+ || !modified_in_p (cc_reg_1, condgen))
+ return false;
+
+ if (get_attr_type (condgen) != TYPE_TEST
+ && get_attr_type (condgen) != TYPE_ICMP
+ && get_attr_type (condgen) != TYPE_INCDEC
+ && get_attr_type (condgen) != TYPE_ALU)
+ return false;
+
+ compare_set = single_set (condgen);
+ if (compare_set == NULL_RTX
+ && !TARGET_FUSE_ALU_AND_BRANCH)
+ return false;
+
+ if (compare_set == NULL_RTX)
+ {
+ int i;
+ rtx pat = PATTERN (condgen);
+ for (i = 0; i < XVECLEN (pat, 0); i++)
+ if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
+ {
+ rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
+ if (GET_CODE (set_src) == COMPARE)
+ compare_set = XVECEXP (pat, 0, i);
+ else
+ alu_set = XVECEXP (pat, 0, i);
+ }
+ }
+ if (compare_set == NULL_RTX)
+ return false;
+ src = SET_SRC (compare_set);
+ if (GET_CODE (src) != COMPARE)
+ return false;
+
+ /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+ supported. */
+ if ((MEM_P (XEXP (src, 0))
+ && CONST_INT_P (XEXP (src, 1)))
+ || (MEM_P (XEXP (src, 1))
+ && CONST_INT_P (XEXP (src, 0))))
+ return false;
+
+ /* No fusion for RIP-relative address. */
+ if (MEM_P (XEXP (src, 0)))
+ addr = XEXP (XEXP (src, 0), 0);
+ else if (MEM_P (XEXP (src, 1)))
+ addr = XEXP (XEXP (src, 1), 0);
+
+ if (addr)
+ {
+ ix86_address parts;
+ int ok = ix86_decompose_address (addr, &parts);
+ gcc_assert (ok);
+
+ if (ix86_rip_relative_addr_p (&parts))
+ return false;
+ }
+
+ test_if = SET_SRC (pc_set (condjmp));
+ cond = XEXP (test_if, 0);
+ ccode = GET_CODE (cond);
+ /* Check whether the conditional jump uses Sign or Overflow Flags. */
+ if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
+ && (ccode == GE
+ || ccode == GT
+ || ccode == LE
+ || ccode == LT))
+ return false;
+
+ /* Return true for TYPE_TEST and TYPE_ICMP. */
+ if (get_attr_type (condgen) == TYPE_TEST
+ || get_attr_type (condgen) == TYPE_ICMP)
+ return true;
+
+ /* The following handles the macro-fusion case of alu + jmp. */
+ if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
+ return false;
+
+ /* No fusion for alu op with memory destination operand. */
+ dest = SET_DEST (alu_set);
+ if (MEM_P (dest))
+ return false;
+
+ /* Macro-fusion for inc/dec + unsigned conditional jump is not
+ supported. */
+ if (get_attr_type (condgen) == TYPE_INCDEC
+ && (ccode == GEU
+ || ccode == GTU
+ || ccode == LEU
+ || ccode == LTU))
+ return false;
+
+ return true;
+}
+