i386: Add clear_ratio to processor_costs

[gcc.git] / gcc / config / i386 / x86-tune-costs.h
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h

index 50ecb35cbdece59e764580b8429b0080f9f00f0f..99816aeaebc1fcaa900807db156149613c4aeb68 100644 (file)
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1,5 +1,5 @@
  /* Costs of operations of individual x86 CPUs.
-   Copyright (C) 1988-2018 Free Software Foundation, Inc.
+   Copyright (C) 1988-2019 Free Software Foundation, Inc.
  
  This file is part of GCC.
  
@@ -36,6 +36,32 @@ static stringop_algs ix86_size_memset[2] = {
  
  const
  struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,                                /* cost for loading QImode using movzbl */
+  {2, 2, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 2},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {2, 2, 2},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  3,                                   /* cost of moving MMX register */
+  {3, 3},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {3, 3},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
+  {3, 3, 3, 3, 3},                     /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},                     /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_BYTES (2),                   /* cost of an add instruction */
    COSTS_N_BYTES (3),                   /* cost of a lea instruction */
    COSTS_N_BYTES (2),                   /* variable shift costs */
@@ -55,33 +81,21 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
    COSTS_N_BYTES (3),                   /* cost of movzx */
    0,                                   /* "large" insn */
    2,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2. */
-  2,                                /* cost for loading QImode using movzbl */
+  2,                                   /* CLEAR_RATIO */
    {2, 2, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 2},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {2, 2, 2},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  3,                                   /* cost of moving MMX register */
-  {3, 3},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {3, 3},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
-  {3, 3, 3, 3, 3},                     /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},                     /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {3, 3, 3, 3, 3},                     /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {3, 3, 3, 3, 3},                     /* cost of unaligned SSE load
                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {3, 3, 3, 3, 3},                     /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {3, 3, 3, 3, 3},                             /* cost of unaligned SSE store
+  {3, 3, 3, 3, 3},                     /* cost of unaligned SSE store
                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    5, 0,                                        /* Gather load static, per_elt.  */
    5, 0,                                        /* Gather store static, per_elt.  */
    0,                                   /* size of l1 cache  */
@@ -127,6 +141,32 @@ static stringop_algs i386_memset[2] = {
  
  static const
  struct processor_costs i386_cost = {   /* 386 specific costs */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {8, 8, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (3),                   /* variable shift costs */
@@ -146,32 +186,19 @@ struct processor_costs i386_cost = {      /* 386 specific costs */
    COSTS_N_INSNS (2),                   /* cost of movzx */
    15,                                  /* "large" insn */
    3,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  3,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {8, 8, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    0,                                   /* size of l1 cache  */
@@ -216,6 +243,32 @@ static stringop_algs i486_memset[2] = {
  
  static const
  struct processor_costs i486_cost = {   /* 486 specific costs */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {8, 8, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (3),                   /* variable shift costs */
@@ -235,32 +288,19 @@ struct processor_costs i486_cost = {      /* 486 specific costs */
    COSTS_N_INSNS (2),                   /* cost of movzx */
    15,                                  /* "large" insn */
    3,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  3,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {8, 8, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    4,                                   /* size of l1 cache.  486 has 8kB cache
@@ -307,6 +347,32 @@ static stringop_algs pentium_memset[2] = {
  
  static const
  struct processor_costs pentium_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (4),                   /* variable shift costs */
@@ -326,32 +392,19 @@ struct processor_costs pentium_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  8,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -389,6 +442,32 @@ struct processor_costs pentium_cost = {
  
  static const
  struct processor_costs lakemont_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -408,32 +487,19 @@ struct processor_costs lakemont_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  8,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -486,6 +552,32 @@ static stringop_algs pentiumpro_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs pentiumpro_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -505,32 +597,19 @@ struct processor_costs pentiumpro_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -574,28 +653,8 @@ static stringop_algs geode_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs geode_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (2),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (7),                  /*                               SI */
-   COSTS_N_INSNS (7),                  /*                               DI */
-   COSTS_N_INSNS (7)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (15),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (23),                 /*                          HI */
-   COSTS_N_INSNS (39),                 /*                          SI */
-   COSTS_N_INSNS (39),                 /*                          DI */
-   COSTS_N_INSNS (39)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  4,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    2,                                /* cost for loading QImode using movzbl */
    {2, 2, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -606,7 +665,6 @@ struct processor_costs geode_cost = {
                                            in SFmode, DFmode and XFmode */
    {4, 6, 6},                           /* cost of storing fp registers
                                            in SFmode, DFmode and XFmode */
-
    2,                                   /* cost of moving MMX register */
    {2, 2},                              /* cost of loading MMX registers
                                            in SImode and DImode */
@@ -615,11 +673,44 @@ struct processor_costs geode_cost = {
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
    {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
    {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
    6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (2),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),                  /*                               HI */
+   COSTS_N_INSNS (7),                  /*                               SI */
+   COSTS_N_INSNS (7),                  /*                               DI */
+   COSTS_N_INSNS (7)},                 /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (15),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (23),                 /*                          HI */
+   COSTS_N_INSNS (39),                 /*                          SI */
+   COSTS_N_INSNS (39),                 /*                          DI */
+   COSTS_N_INSNS (39)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  4,                                   /* MOVE_RATIO */
+  4,                                   /* CLEAR_RATIO */
+  {2, 2, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
+  {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    2, 2,                                        /* Gather load static, per_elt.  */
    2, 2,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -663,6 +754,32 @@ static stringop_algs k6_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs k6_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  3,                                /* cost for loading QImode using movzbl */
+  {4, 5, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 3, 2},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 4},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -682,32 +799,19 @@ struct processor_costs k6_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    4,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  3,                                /* cost for loading QImode using movzbl */
+  4,                                   /* CLEAR_RATIO */
    {4, 5, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 3, 2},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 4},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    2, 2,                                        /* Gather load static, per_elt.  */
    2, 2,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -757,6 +861,32 @@ static stringop_algs athlon_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs athlon_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {4, 4, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 4},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 4},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 12, 12, 24},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -776,32 +906,19 @@ struct processor_costs athlon_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {3, 4, 3},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {4, 4, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 4},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 4},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 12, 12, 24},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 12, 12, 24},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 4, 12, 12, 24},                  /* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
-  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  5,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -853,6 +970,32 @@ static stringop_algs k8_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs k8_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {4, 4, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {3, 3},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 4},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 3, 12, 12, 24},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -872,32 +1015,19 @@ struct processor_costs k8_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {3, 4, 3},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {4, 4, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {3, 3},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 4},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 12, 12, 24},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 3, 12, 12, 24},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 3, 12, 12, 24},                  /* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
-  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  5,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -953,28 +1083,8 @@ static stringop_algs amdfam10_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  struct processor_costs amdfam10_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (2),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (3),                  /*                               SI */
-   COSTS_N_INSNS (4),                  /*                               DI */
-   COSTS_N_INSNS (5)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),                 /*                          HI */
-   COSTS_N_INSNS (51),                 /*                          SI */
-   COSTS_N_INSNS (83),                 /*                          DI */
-   COSTS_N_INSNS (83)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    4,                                /* cost for loading QImode using movzbl */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -993,11 +1103,10 @@ struct processor_costs amdfam10_cost = {
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
    {4, 4, 3, 6, 12},                    /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
    {4, 4, 5, 10, 20},                   /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+
                                         /* On K8:
                                             MOVD reg64, xmmreg Double FSTORE 4
                                             MOVD reg32, xmmreg Double FSTORE 4
@@ -1006,6 +1115,41 @@ struct processor_costs amdfam10_cost = {
                                                                1/1  1/1
                                             MOVD reg32, xmmreg Double FADD 3
                                                                1/1  1/1 */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (2),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (1),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),                  /*                               HI */
+   COSTS_N_INSNS (3),                  /*                               SI */
+   COSTS_N_INSNS (4),                  /*                               DI */
+   COSTS_N_INSNS (5)},                 /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),                 /*                          HI */
+   COSTS_N_INSNS (51),                 /*                          SI */
+   COSTS_N_INSNS (83),                 /*                          DI */
+   COSTS_N_INSNS (83)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  9,                                   /* MOVE_RATIO */
+  6,                                   /* CLEAR_RATIO */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  {4, 4, 3, 6, 12},                    /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 5, 10, 20},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -1062,6 +1206,32 @@ static stringop_algs bdver_memset[2] = {
               {-1, libcall, false}}}};
  
  const struct processor_costs bdver_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {8, 8, 8},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {10, 10, 18},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {12, 12},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {10, 10},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 40, 60},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 40, 60},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  16, 20,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1081,32 +1251,19 @@ const struct processor_costs bdver_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {8, 8, 8},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 8},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 40, 60},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 40, 60},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 10, 40, 60},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {12, 12, 10, 40, 60},                        /* cost of unaligned loads.  */
-  {10, 10, 10, 40, 60},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 10, 40, 60},                        /* cost of unaligned stores.  */
-  16, 20,                              /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  16,                                  /* cost of moving SSE register to integer.  */
    12, 12,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
@@ -1164,8 +1321,41 @@ static stringop_algs znver1_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
  struct processor_costs znver1_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle. Because reg-reg move cost is 2 and the following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make practical difference to bump prices up even more.  */
+  6,                                   /* cost for loading QImode using
+                                          movzbl.  */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  2,                                   /* cost of reg,reg fld/fst.  */
+  {6, 6, 16},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode.  */
+  {8, 8, 16},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode.  */
+  2,                                   /* cost of moving MMX register.  */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode.  */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode.  */
+  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
+  {6, 6, 6, 12, 24},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit.  */
+  6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
    COSTS_N_INSNS (1),                   /* variable shift costs.  */
    COSTS_N_INSNS (1),                   /* constant shift costs.  */
    {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
@@ -1186,12 +1376,94 @@ struct processor_costs znver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx.  */
    8,                                   /* "large" insn.  */
    9,                                   /* MOVE_RATIO.  */
+  6,                                   /* CLEAR_RATIO */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  {6, 6, 6, 12, 24},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 12, 24},                   /* cost of unaligned loads.  */
+  {8, 8, 8, 16, 32},                   /* cost of unaligned stores.  */
+  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
+  6,                                   /* cost of moving SSE register to integer.  */
+  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
+     throughput 12.  Approx 9 uops do not depend on vector size and every load
+     is 7 uops.  */
+  18, 8,                               /* Gather load static, per_elt.  */
+  18, 10,                              /* Gather store static, per_elt.  */
+  32,                                  /* size of l1 cache.  */
+  512,                                 /* size of l2 cache.  */
+  64,                                  /* size of prefetch block.  */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,                                 /* number of parallel prefetches.  */
+  3,                                   /* Branch cost.  */
+  COSTS_N_INSNS (5),                   /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),                   /* cost of FMUL instruction.  */
+  /* Latency of fdiv is 8-15.  */
+  COSTS_N_INSNS (15),                  /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
+  /* Latency of fsqrt is 4-10.  */
+  COSTS_N_INSNS (10),                  /* cost of FSQRT instruction.  */
  
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
+  /* 9-13  */
+  COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
+  /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
+     and it can execute 2 integer additions and 2 multiplications thus
+     reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
+     that 4 works better than 6 probably due to register pressure.
+
+     Integer vector operations are taken by FP unit and execute 3 vector
+     plus/minus operations per cycle but only one multiply.  This is adjusted
+     in ix86_reassociation_width.  */
+  4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  znver1_memcpy,
+  znver1_memset,
+  COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
+  "16",                                        /* Loop alignment.  */
+  "16",                                        /* Jump alignment.  */
+  "0:0:8",                             /* Label alignment.  */
+  "16",                                        /* Func alignment.  */
+};
+
+/*  ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
+    very small blocks it is better to use a loop.  For large blocks, libcall
+    can do non-temporal accesses and beat inline considerably.  */
+static stringop_algs znver2_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+            {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
+            {-1, libcall, false}}}};
+static stringop_algs znver2_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+            {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
+            {-1, libcall, false}}}};
+
+struct processor_costs znver2_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
  
    /* reg-reg moves are done by renaming and thus they are even cheaper than
-     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
+     1 cycle.  Because reg-reg move cost is 2 and following tables correspond
       to doubles of latencies, we do not model this correctly.  It does not
       seem to make practical difference to bump prices up even more.  */
    6,                                   /* cost for loading QImode using
@@ -1203,22 +1475,62 @@ struct processor_costs znver1_cost = {
                                            registers.  */
    2,                                   /* cost of reg,reg fld/fst.  */
    {6, 6, 16},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode.  */
+                                          in SFmode, DFmode and XFmode.  */
    {8, 8, 16},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode.  */
+                                          in SFmode, DFmode and XFmode.  */
    2,                                   /* cost of moving MMX register.  */
    {6, 6},                              /* cost of loading MMX registers
                                            in SImode and DImode.  */
    {8, 8},                              /* cost of storing MMX registers
                                            in SImode and DImode.  */
-  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 12, 24},                   /* cost of loading SSE registers
+  2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
+                                          register.  */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 12, 24},                   /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+  {8, 8, 8, 8, 16},                    /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 16, 32},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  6, 6,                                        /* SSE->integer and integer->SSE
+                                          moves.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),                   /* variable shift costs.  */
+  COSTS_N_INSNS (1),                   /* constant shift costs.  */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),                  /*                               HI.  */
+   COSTS_N_INSNS (3),                  /*                               SI.  */
+   COSTS_N_INSNS (3),                  /*                               DI.  */
+   COSTS_N_INSNS (3)},                 /*                      other.  */
+  0,                                   /* cost of multiply per each bit
+                                          set.  */
+   /* Depending on parameters, idiv can get faster on Ryzen.  This is an upper
+      bound.  */
+  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),                 /*                          HI.  */
+   COSTS_N_INSNS (30),                 /*                          SI.  */
+   COSTS_N_INSNS (45),                 /*                          DI.  */
+   COSTS_N_INSNS (45)},                        /*                          other.  */
+  COSTS_N_INSNS (1),                   /* cost of movsx.  */
+  COSTS_N_INSNS (1),                   /* cost of movzx.  */
+  8,                                   /* "large" insn.  */
+  9,                                   /* MOVE_RATIO.  */
+  6,                                   /* CLEAR_RATIO */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 8, 16},                    /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
+  {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
+  2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
+                                          register.  */
+  6,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
       throughput 12.  Approx 9 uops do not depend on vector size and every load
       is 7 uops.  */
@@ -1246,25 +1558,26 @@ struct processor_costs znver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
    COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
    COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
-  COSTS_N_INSNS (4),                   /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of MULSD instruction.  */
    COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
    COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
    COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
-  /* 9-13  */
+  /* 9-13.  */
    COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
    COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
    COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
-     and it can execute 2 integer additions and 2 multiplications thus
-     reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
+  /* Zen can execute 4 integer operations per cycle.  FP operations
+     take 3 cycles and it can execute 2 integer additions and 2
+     multiplications thus reassociation may make sense up to width of 6.
+     SPEC2k6 benchmarks suggest
       that 4 works better than 6 probably due to register pressure.
  
       Integer vector operations are taken by FP unit and execute 3 vector
       plus/minus operations per cycle but only one multiply.  This is adjusted
       in ix86_reassociation_width.  */
    4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
-  znver1_memcpy,
-  znver1_memset,
+  znver2_memcpy,
+  znver2_memset,
    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
    COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
    "16",                                        /* Loop alignment.  */
@@ -1289,6 +1602,32 @@ static stringop_algs skylake_memset[2] = {
  
  static const
  struct processor_costs skylake_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 12, 24},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1)+1,         /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1310,30 +1649,19 @@ struct processor_costs skylake_cost = {
    COSTS_N_INSNS (0),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
-  {6, 6, 3},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  {6, 6, 6, 10, 20},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 12, 24},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
-  {8, 8, 8, 12, 24},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
-  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  2,                                   /* cost of moving SSE register to integer.  */
    20, 8,                               /* Gather load static, per_elt.  */
    22, 10,                              /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -1382,6 +1710,32 @@ static stringop_algs btver1_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  const struct processor_costs btver1_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {6, 8, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 8, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {12, 12, 38},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {10, 10},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  14, 14,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1401,32 +1755,19 @@ const struct processor_costs btver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {6, 8, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 8, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {12, 12, 38},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {10, 10},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
-  14, 14,                              /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  14,                                  /* cost of moving SSE register to integer.  */
    10, 10,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1473,6 +1814,32 @@ static stringop_algs btver2_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  const struct processor_costs btver2_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {8, 8, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {12, 12, 38},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {10, 10},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  14, 14,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1492,32 +1859,19 @@ const struct processor_costs btver2_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {8, 8, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {12, 12, 38},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {10, 10},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
-  14, 14,                              /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  14,                                  /* cost of moving SSE register to integer.  */
    10, 10,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1563,28 +1917,8 @@ static stringop_algs pentium4_memset[2] = {
  
  static const
  struct processor_costs pentium4_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (3),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (4),                   /* variable shift costs */
-  COSTS_N_INSNS (4),                   /* constant shift costs */
-  {COSTS_N_INSNS (15),                 /* cost of starting multiply for QI */
-   COSTS_N_INSNS (15),                 /*                               HI */
-   COSTS_N_INSNS (15),                 /*                               SI */
-   COSTS_N_INSNS (15),                 /*                               DI */
-   COSTS_N_INSNS (15)},                        /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (56),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (56),                 /*                          HI */
-   COSTS_N_INSNS (56),                 /*                          SI */
-   COSTS_N_INSNS (56),                 /*                          DI */
-   COSTS_N_INSNS (56)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  16,                                  /* "large" insn */
-  6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    5,                                /* cost for loading QImode using movzbl */
    {4, 5, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -1603,11 +1937,44 @@ struct processor_costs pentium4_cost = {
    12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
    {16, 16, 16, 32, 64},                        /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  20, 12,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (3),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (4),                   /* variable shift costs */
+  COSTS_N_INSNS (4),                   /* constant shift costs */
+  {COSTS_N_INSNS (15),                 /* cost of starting multiply for QI */
+   COSTS_N_INSNS (15),                 /*                               HI */
+   COSTS_N_INSNS (15),                 /*                               SI */
+   COSTS_N_INSNS (15),                 /*                               DI */
+   COSTS_N_INSNS (15)},                        /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (56),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (56),                 /*                          HI */
+   COSTS_N_INSNS (56),                 /*                          SI */
+   COSTS_N_INSNS (56),                 /*                          DI */
+   COSTS_N_INSNS (56)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  16,                                  /* "large" insn */
+  6,                                   /* MOVE_RATIO */
+  6,                                   /* CLEAR_RATIO */
+  {4, 5, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 3, 2},                           /* cost of storing integer registers */
+  {16, 16, 16, 32, 64},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {16, 16, 16, 32, 64},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned loads.  */
-  {16, 16, 16, 32, 64},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned stores.  */
-  20, 12,                              /* SSE->integer and integer->SSE moves */
+  12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
+  20,                                  /* cost of moving SSE register to integer.  */
    16, 16,                              /* Gather load static, per_elt.  */
    16, 16,                              /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -1656,6 +2023,32 @@ static stringop_algs nocona_memset[2] = {
  
  static const
  struct processor_costs nocona_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {4, 4, 4},                           /* cost of storing integer registers */
+  12,                                  /* cost of reg,reg fld/fst */
+  {14, 14, 14},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {14, 14, 14},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  14,                                  /* cost of moving MMX register */
+  {12, 12},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 12, 24, 48},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  20, 12,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1675,32 +2068,19 @@ struct processor_costs nocona_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    16,                                  /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {4, 4, 4},                           /* cost of storing integer registers */
-  12,                                  /* cost of reg,reg fld/fst */
-  {14, 14, 14},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {14, 14, 14},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  14,                                  /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 12, 24, 48},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {12, 12, 12, 24, 48},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned loads.  */
-  {12, 12, 12, 24, 48},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned stores.  */
-  20, 12,                              /* SSE->integer and integer->SSE moves */
+  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
+  20,                                  /* cost of moving SSE register to integer.  */
    12, 12,                              /* Gather load static, per_elt.  */
    12, 12,                              /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -1747,6 +2127,32 @@ static stringop_algs atom_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs atom_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                   /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 18},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {14, 14, 24},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {10, 10},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1766,32 +2172,19 @@ struct processor_costs atom_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                   /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 18},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {14, 14, 24},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
-  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  8,                                   /* cost of moving SSE register to integer.  */
    8, 8,                                        /* Gather load static, per_elt.  */
    8, 8,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1838,6 +2231,32 @@ static stringop_algs slm_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs slm_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                   /* cost for loading QImode using movzbl */
+  {8, 8, 8},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 18},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 18},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1857,32 +2276,19 @@ struct processor_costs slm_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                   /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {8, 8, 8},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 18},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 18},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
-  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  8,                                   /* cost of moving SSE register to integer.  */
    8, 8,                                        /* Gather load static, per_elt.  */
    8, 8,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1929,6 +2335,32 @@ static stringop_algs intel_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs intel_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 6},                     /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},                     /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  4, 4,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1948,32 +2380,19 @@ struct processor_costs intel_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},                     /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},                     /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 6},                     /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
-  {6, 6, 6, 6, 6},                     /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned stores.  */
-  4, 4,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
+  4,                                   /* cost of moving SSE register to integer.  */
    6, 6,                                        /* Gather load static, per_elt.  */
    6, 6,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1988,7 +2407,7 @@ struct processor_costs intel_cost = {
    COSTS_N_INSNS (8),                   /* cost of FCHS instruction.  */
    COSTS_N_INSNS (40),                  /* cost of FSQRT instruction.  */
  
-  COSTS_N_INSNS (8),                   /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
    COSTS_N_INSNS (8),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
    COSTS_N_INSNS (8),                   /* cost of MULSS instruction.  */
    COSTS_N_INSNS (8),                   /* cost of MULSD instruction.  */
@@ -2024,6 +2443,32 @@ static stringop_algs generic_memset[2] = {
               {-1, libcall, false}}}};
  static const
  struct processor_costs generic_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 12},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    /* Setting cost to 2 makes our current implementation of synth_mult result in
       use of unnecessary temporary registers causing regression on several
@@ -2046,32 +2491,19 @@ struct processor_costs generic_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 12},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
-  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {6, 6, 6, 10, 15},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    18, 6,                               /* Gather load static, per_elt.  */
    18, 6,                               /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2124,6 +2556,32 @@ static stringop_algs core_memset[2] = {
  
  static const
  struct processor_costs core_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},                    /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    /* On all chips taken into consideration lea is 2 cycles and more.  With
       this cost however our current implementation of synth_mult results in
@@ -2150,32 +2608,19 @@ struct processor_costs core_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},                    /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
-  {6, 6, 6, 6, 12},                    /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned stores.  */
-  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  2,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
       rec. throughput 6.
       So 5 uops statically and one uops per load.  */