i386: Add clear_ratio to processor_costs

[gcc.git] / gcc / config / i386 / x86-tune-costs.h
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h

index 71a5854c09a6183e91a52b0ed910aac2384b8e5a..99816aeaebc1fcaa900807db156149613c4aeb68 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1,5 +1,5 @@
  /* Costs of operations of individual x86 CPUs.
-   Copyright (C) 1988-2018 Free Software Foundation, Inc.
+   Copyright (C) 1988-2019 Free Software Foundation, Inc.
  
  This file is part of GCC.
  
@@ -36,6 +36,32 @@ static stringop_algs ix86_size_memset[2] = {
  
  const
  struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+  2,                                /* cost for loading QImode using movzbl */
+  {2, 2, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 2},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {2, 2, 2},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  3,                                   /* cost of moving MMX register */
+  {3, 3},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {3, 3},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
+  {3, 3, 3, 3, 3},                     /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},                     /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_BYTES (2),                   /* cost of an add instruction */
    COSTS_N_BYTES (3),                   /* cost of a lea instruction */
    COSTS_N_BYTES (2),                   /* variable shift costs */
@@ -55,33 +81,21 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
    COSTS_N_BYTES (3),                   /* cost of movzx */
    0,                                   /* "large" insn */
    2,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2. */
-  2,                                /* cost for loading QImode using movzbl */
+  2,                                   /* CLEAR_RATIO */
    {2, 2, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 2},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {2, 2, 2},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  3,                                   /* cost of moving MMX register */
-  {3, 3},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {3, 3},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
-  {3, 3, 3, 3, 3},                     /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},                     /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {3, 3, 3, 3, 3},                     /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {3, 3, 3, 3, 3},                     /* cost of unaligned SSE load
                                            in 128bit, 256bit and 512bit */
-  {3, 3, 3, 3, 3},                     /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {3, 3, 3, 3, 3},                             /* cost of unaligned SSE store
+  {3, 3, 3, 3, 3},                     /* cost of unaligned SSE store
                                            in 128bit, 256bit and 512bit */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    5, 0,                                        /* Gather load static, per_elt.  */
    5, 0,                                        /* Gather store static, per_elt.  */
    0,                                   /* size of l1 cache  */
@@ -127,6 +141,32 @@ static stringop_algs i386_memset[2] = {
  
  static const
  struct processor_costs i386_cost = {   /* 386 specific costs */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+  4,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {8, 8, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (3),                   /* variable shift costs */
@@ -146,32 +186,19 @@ struct processor_costs i386_cost = {      /* 386 specific costs */
    COSTS_N_INSNS (2),                   /* cost of movzx */
    15,                                  /* "large" insn */
    3,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  3,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {8, 8, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    0,                                   /* size of l1 cache  */
@@ -216,6 +243,32 @@ static stringop_algs i486_memset[2] = {
  
  static const
  struct processor_costs i486_cost = {   /* 486 specific costs */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+  4,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {8, 8, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (3),                   /* variable shift costs */
@@ -235,32 +288,19 @@ struct processor_costs i486_cost = {      /* 486 specific costs */
    COSTS_N_INSNS (2),                   /* cost of movzx */
    15,                                  /* "large" insn */
    3,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  3,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {8, 8, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    4,                                   /* size of l1 cache.  486 has 8kB cache
@@ -307,6 +347,32 @@ static stringop_algs pentium_memset[2] = {
  
  static const
  struct processor_costs pentium_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (4),                   /* variable shift costs */
@@ -326,32 +392,19 @@ struct processor_costs pentium_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  8,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -389,6 +442,32 @@ struct processor_costs pentium_cost = {
  
  static const
  struct processor_costs lakemont_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -408,32 +487,19 @@ struct processor_costs lakemont_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  8,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -486,6 +552,32 @@ static stringop_algs pentiumpro_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs pentiumpro_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+  2,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -505,32 +597,19 @@ struct processor_costs pentiumpro_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -574,28 +653,8 @@ static stringop_algs geode_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs geode_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (2),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (7),                  /*                               SI */
-   COSTS_N_INSNS (7),                  /*                               DI */
-   COSTS_N_INSNS (7)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (15),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (23),                 /*                          HI */
-   COSTS_N_INSNS (39),                 /*                          SI */
-   COSTS_N_INSNS (39),                 /*                          DI */
-   COSTS_N_INSNS (39)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  4,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
    2,                                /* cost for loading QImode using movzbl */
    {2, 2, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -606,7 +665,6 @@ struct processor_costs geode_cost = {
                                            in SFmode, DFmode and XFmode */
    {4, 6, 6},                           /* cost of storing fp registers
                                            in SFmode, DFmode and XFmode */
-
    2,                                   /* cost of moving MMX register */
    {2, 2},                              /* cost of loading MMX registers
                                            in SImode and DImode */
@@ -615,11 +673,44 @@ struct processor_costs geode_cost = {
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
    {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
    {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
    6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (2),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),                  /*                               HI */
+   COSTS_N_INSNS (7),                  /*                               SI */
+   COSTS_N_INSNS (7),                  /*                               DI */
+   COSTS_N_INSNS (7)},                 /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (15),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (23),                 /*                          HI */
+   COSTS_N_INSNS (39),                 /*                          SI */
+   COSTS_N_INSNS (39),                 /*                          DI */
+   COSTS_N_INSNS (39)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  4,                                   /* MOVE_RATIO */
+  4,                                   /* CLEAR_RATIO */
+  {2, 2, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
+  {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    2, 2,                                        /* Gather load static, per_elt.  */
    2, 2,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -663,6 +754,32 @@ static stringop_algs k6_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs k6_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  3,                                /* cost for loading QImode using movzbl */
+  {4, 5, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 3, 2},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 4},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -682,32 +799,19 @@ struct processor_costs k6_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    4,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  3,                                /* cost for loading QImode using movzbl */
+  4,                                   /* CLEAR_RATIO */
    {4, 5, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 3, 2},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 4},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    2, 2,                                        /* Gather load static, per_elt.  */
    2, 2,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -757,6 +861,32 @@ static stringop_algs athlon_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs athlon_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {4, 4, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 4},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 4},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 12, 12, 24},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -776,32 +906,19 @@ struct processor_costs athlon_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {3, 4, 3},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {4, 4, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 4},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 4},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
+  {4, 4, 12, 12, 24},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 12, 12, 24},                  /* cost of unaligned loads.  */
+  {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 6, 12, 24},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {4, 4, 6, 12, 24},                   /* cost of unaligned loads.  */
-  {4, 4, 5, 10, 20},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
-  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  5,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -853,6 +970,32 @@ static stringop_algs k8_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs k8_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {4, 4, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {3, 3},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 4},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 3, 12, 12, 24},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -872,32 +1015,19 @@ struct processor_costs k8_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {3, 4, 3},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {4, 4, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {3, 3},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 4},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
+  {4, 3, 12, 12, 24},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 3, 12, 12, 24},                  /* cost of unaligned loads.  */
+  {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 6, 12, 24},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {4, 3, 6, 12, 24},                   /* cost of unaligned loads.  */
-  {4, 4, 5, 10, 20},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
-  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  5,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -953,28 +1083,8 @@ static stringop_algs amdfam10_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  struct processor_costs amdfam10_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (2),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (3),                  /*                               SI */
-   COSTS_N_INSNS (4),                  /*                               DI */
-   COSTS_N_INSNS (5)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),                 /*                          HI */
-   COSTS_N_INSNS (51),                 /*                          SI */
-   COSTS_N_INSNS (83),                 /*                          DI */
-   COSTS_N_INSNS (83)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    4,                                /* cost for loading QImode using movzbl */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -993,11 +1103,10 @@ struct processor_costs amdfam10_cost = {
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
    {4, 4, 3, 6, 12},                    /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
    {4, 4, 5, 10, 20},                   /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+
                                         /* On K8:
                                             MOVD reg64, xmmreg Double FSTORE 4
                                             MOVD reg32, xmmreg Double FSTORE 4
@@ -1006,6 +1115,41 @@ struct processor_costs amdfam10_cost = {
                                                                1/1  1/1
                                             MOVD reg32, xmmreg Double FADD 3
                                                                1/1  1/1 */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (2),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (1),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),                  /*                               HI */
+   COSTS_N_INSNS (3),                  /*                               SI */
+   COSTS_N_INSNS (4),                  /*                               DI */
+   COSTS_N_INSNS (5)},                 /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),                 /*                          HI */
+   COSTS_N_INSNS (51),                 /*                          SI */
+   COSTS_N_INSNS (83),                 /*                          DI */
+   COSTS_N_INSNS (83)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  9,                                   /* MOVE_RATIO */
+  6,                                   /* CLEAR_RATIO */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  {4, 4, 3, 6, 12},                    /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 5, 10, 20},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -1047,21 +1191,47 @@ struct processor_costs amdfam10_cost = {
    "32",                                        /* Func alignment.  */
  };
  
-/*  BDVER1 has optimized REP instruction for medium sized blocks, but for
+/*  BDVER has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver1_memcpy[2] = {
+static stringop_algs bdver_memcpy[2] = {
    {libcall, {{6, loop, false}, {14, unrolled_loop, false},
               {-1, rep_prefix_4_byte, false}}},
    {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
-static stringop_algs bdver1_memset[2] = {
+static stringop_algs bdver_memset[2] = {
    {libcall, {{8, loop, false}, {24, unrolled_loop, false},
               {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  
-const struct processor_costs bdver1_cost = {
+const struct processor_costs bdver_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {8, 8, 8},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {10, 10, 18},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {12, 12},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {10, 10},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 40, 60},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 40, 60},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  16, 20,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1081,32 +1251,19 @@ const struct processor_costs bdver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {8, 8, 8},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 8},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
+  {12, 12, 10, 40, 60},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 10, 40, 60},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {12, 12, 10, 40, 60},                        /* cost of unaligned loads.  */
+  {10, 10, 10, 40, 60},                        /* cost of unaligned stores.  */
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},                        /* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
-  16, 20,                              /* SSE->integer and integer->SSE moves */
+  16,                                  /* cost of moving SSE register to integer.  */
    12, 12,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
@@ -1139,8 +1296,8 @@ const struct processor_costs bdver1_cost = {
    COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
    COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
    1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
-  bdver1_memcpy,
-  bdver1_memset,
+  bdver_memcpy,
+  bdver_memset,
    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
    COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
    "16:11:8",                           /* Loop alignment.  */
@@ -1149,355 +1306,164 @@ const struct processor_costs bdver1_cost = {
    "11",                                        /* Func alignment.  */
  };
  
-/*  BDVER2 has optimized REP instruction for medium sized blocks, but for
-    very small blocks it is better to use loop. For large blocks, libcall
-    can do nontemporary accesses and beat inline considerably.  */
  
-static stringop_algs bdver2_memcpy[2] = {
+/*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
+    very small blocks it is better to use loop.  For large blocks, libcall
+    can do nontemporary accesses and beat inline considerably.  */
+static stringop_algs znver1_memcpy[2] = {
    {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
+            {-1, rep_prefix_4_byte, false}}},
    {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver2_memset[2] = {
+            {-1, libcall, false}}}};
+static stringop_algs znver1_memset[2] = {
    {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+            {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+            {-1, libcall, false}}}};
+struct processor_costs znver1_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
  
-const struct processor_costs bdver2_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (4),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (4),                  /*                               SI */
-   COSTS_N_INSNS (6),                  /*                               DI */
-   COSTS_N_INSNS (6)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),                 /*                          HI */
-   COSTS_N_INSNS (51),                 /*                          SI */
-   COSTS_N_INSNS (83),                 /*                          DI */
-   COSTS_N_INSNS (83)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
-  {8, 8, 8},                           /* cost of loading integer registers
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle. Because reg-reg move cost is 2 and the following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make practical difference to bump prices up even more.  */
+  6,                                   /* cost for loading QImode using
+                                          movzbl.  */
+  {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
-  {8, 8, 8},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},                        /* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
-  16, 20,                              /* SSE->integer and integer->SSE moves */
-  12, 12,                              /* Gather load static, per_elt.  */
-  10, 10,                              /* Gather store static, per_elt.  */
-  16,                                  /* size of l1 cache.  */
-  2048,                                        /* size of l2 cache.  */
-  64,                                  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,                                 /* number of parallel prefetches */
-  2,                                   /* Branch cost */
-  COSTS_N_INSNS (6),                   /* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),                   /* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),                  /* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),                   /* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),                   /* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),                  /* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (2),                   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (6),                   /* cost of MULSS instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of FMA SD instruction.  */
-  /* 9-24  */
-  COSTS_N_INSNS (24),                  /* cost of DIVSS instruction.  */
-  /* 9-27  */
-  COSTS_N_INSNS (27),                  /* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
-  bdver2_memcpy,
-  bdver2_memset,
-  COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
-  "16:11:8",                           /* Loop alignment.  */
-  "16:8:8",                            /* Jump alignment.  */
-  "0:0:8",                             /* Label alignment.  */
-  "11",                                        /* Func alignment.  */
-};
-
-
-  /*  BDVER3 has optimized REP instruction for medium sized blocks, but for
-      very small blocks it is better to use loop. For large blocks, libcall
-      can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver3_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver3_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-struct processor_costs bdver3_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (4),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (4),                  /*                               SI */
-   COSTS_N_INSNS (6),                  /*                               DI */
-   COSTS_N_INSNS (6)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),                 /*                          HI */
-   COSTS_N_INSNS (51),                 /*                          SI */
-   COSTS_N_INSNS (83),                 /*                          DI */
-   COSTS_N_INSNS (83)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  9,                                   /* MOVE_RATIO */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  2,                                   /* cost of reg,reg fld/fst.  */
+  {6, 6, 16},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode.  */
+  {8, 8, 16},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode.  */
+  2,                                   /* cost of moving MMX register.  */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode.  */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode.  */
+  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
+  {6, 6, 6, 12, 24},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit.  */
+  6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  /* End of register allocator costs.  */
+  },
  
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
-  {8, 8, 8},                           /* cost of loading integer registers
+  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),                   /* variable shift costs.  */
+  COSTS_N_INSNS (1),                   /* constant shift costs.  */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),                  /*                               HI.  */
+   COSTS_N_INSNS (3),                  /*                               SI.  */
+   COSTS_N_INSNS (3),                  /*                               DI.  */
+   COSTS_N_INSNS (3)},                 /*                            other.  */
+  0,                                   /* cost of multiply per each bit
+                                           set.  */
+   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
+      bound.  */
+  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),                 /*                          HI.  */
+   COSTS_N_INSNS (30),                 /*                          SI.  */
+   COSTS_N_INSNS (45),                 /*                          DI.  */
+   COSTS_N_INSNS (45)},                        /*                          other.  */
+  COSTS_N_INSNS (1),                   /* cost of movsx.  */
+  COSTS_N_INSNS (1),                   /* cost of movzx.  */
+  8,                                   /* "large" insn.  */
+  9,                                   /* MOVE_RATIO.  */
+  6,                                   /* CLEAR_RATIO */
+  {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
-  {8, 8, 8},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},                        /* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
-  16, 20,                              /* SSE->integer and integer->SSE moves */
-  12, 12,                              /* Gather load static, per_elt.  */
-  10, 10,                              /* Gather store static, per_elt.  */
-  16,                                  /* size of l1 cache.  */
-  2048,                                        /* size of l2 cache.  */
-  64,                                  /* size of prefetch block */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  {6, 6, 6, 12, 24},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 12, 24},                   /* cost of unaligned loads.  */
+  {8, 8, 8, 16, 32},                   /* cost of unaligned stores.  */
+  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
+  6,                                   /* cost of moving SSE register to integer.  */
+  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
+     throughput 12.  Approx 9 uops do not depend on vector size and every load
+     is 7 uops.  */
+  18, 8,                               /* Gather load static, per_elt.  */
+  18, 10,                              /* Gather store static, per_elt.  */
+  32,                                  /* size of l1 cache.  */
+  512,                                 /* size of l2 cache.  */
+  64,                                  /* size of prefetch block.  */
    /* New AMD processors never drop prefetches; if they cannot be performed
       immediately, they are queued.  We set number of simultaneous prefetches
       to a large constant to reflect this (it probably is not a good idea not
       to limit number of prefetches at all, as their execution also takes some
       time).  */
-  100,                                 /* number of parallel prefetches */
-  2,                                   /* Branch cost */
-  COSTS_N_INSNS (6),                   /* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),                   /* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),                  /* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),                   /* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),                   /* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),                  /* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (2),                   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (6),                   /* cost of MULSS instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of FMA SD instruction.  */
-  /* 9-24  */
-  COSTS_N_INSNS (24),                  /* cost of DIVSS instruction.  */
-  /* 9-27  */
-  COSTS_N_INSNS (27),                  /* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
-  bdver3_memcpy,
-  bdver3_memset,
-  COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
-  "16:11:8",                           /* Loop alignment.  */
-  "16:8:8",                            /* Jump alignment.  */
-  "0:0:8",                             /* Label alignment.  */
-  "11",                                        /* Func alignment.  */
-};
-
-/*  BDVER4 has optimized REP instruction for medium sized blocks, but for
-    very small blocks it is better to use loop. For large blocks, libcall
-    can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs bdver4_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver4_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-struct processor_costs bdver4_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (4),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (4),                  /*                               SI */
-   COSTS_N_INSNS (6),                  /*                               DI */
-   COSTS_N_INSNS (6)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),                 /*                          HI */
-   COSTS_N_INSNS (51),                 /*                          SI */
-   COSTS_N_INSNS (83),                 /*                          DI */
-   COSTS_N_INSNS (83)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  9,                                   /* MOVE_RATIO */
+  100,                                 /* number of parallel prefetches.  */
+  3,                                   /* Branch cost.  */
+  COSTS_N_INSNS (5),                   /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),                   /* cost of FMUL instruction.  */
+  /* Latency of fdiv is 8-15.  */
+  COSTS_N_INSNS (15),                  /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
+  /* Latency of fsqrt is 4-10.  */
+  COSTS_N_INSNS (10),                  /* cost of FSQRT instruction.  */
  
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
-  {8, 8, 8},                           /* cost of loading integer registers
-                                          in QImode, HImode and SImode.
-                                          Relative to reg-reg move (2).  */
-  {8, 8, 8},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},                        /* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
-  16, 20,                              /* SSE->integer and integer->SSE moves */
-  12, 12,                              /* Gather load static, per_elt.  */
-  10, 10,                              /* Gather store static, per_elt.  */
-  16,                                  /* size of l1 cache.  */
-  2048,                                        /* size of l2 cache.  */
-  64,                                  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued.  We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time).  */
-  100,                                 /* number of parallel prefetches */
-  2,                                   /* Branch cost */
-  COSTS_N_INSNS (6),                   /* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),                   /* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),                  /* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),                   /* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),                   /* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),                  /* cost of FSQRT instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
+  /* 9-13  */
+  COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
+  /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
+     and it can execute 2 integer additions and 2 multiplications thus
+     reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
+     that 4 works better than 6 probably due to register pressure.
  
-  COSTS_N_INSNS (2),                   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (6),                   /* cost of MULSS instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),                   /* cost of FMA SD instruction.  */
-  /* 9-24  */
-  COSTS_N_INSNS (24),                  /* cost of DIVSS instruction.  */
-  /* 9-27  */
-  COSTS_N_INSNS (27),                  /* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
-  1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
-  bdver4_memcpy,
-  bdver4_memset,
+     Integer vector operations are taken by FP unit and execute 3 vector
+     plus/minus operations per cycle but only one multiply.  This is adjusted
+     in ix86_reassociation_width.  */
+  4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  znver1_memcpy,
+  znver1_memset,
    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
    COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
-  "16:11:8",                           /* Loop alignment.  */
-  "16:8:8",                            /* Jump alignment.  */
+  "16",                                        /* Loop alignment.  */
+  "16",                                        /* Jump alignment.  */
    "0:0:8",                             /* Label alignment.  */
-  "11",                                        /* Func alignment.  */
+  "16",                                        /* Func alignment.  */
  };
  
-
-/*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
+/*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop.  For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-static stringop_algs znver1_memcpy[2] = {
+static stringop_algs znver2_memcpy[2] = {
    {libcall, {{6, loop, false}, {14, unrolled_loop, false},
              {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+  {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
              {-1, libcall, false}}}};
-static stringop_algs znver1_memset[2] = {
+static stringop_algs znver2_memset[2] = {
    {libcall, {{8, loop, false}, {24, unrolled_loop, false},
              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+  {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
-struct processor_costs znver1_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
-  COSTS_N_INSNS (1),                   /* variable shift costs.  */
-  COSTS_N_INSNS (1),                   /* constant shift costs.  */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
-   COSTS_N_INSNS (3),                  /*                               HI.  */
-   COSTS_N_INSNS (3),                  /*                               SI.  */
-   COSTS_N_INSNS (3),                  /*                               DI.  */
-   COSTS_N_INSNS (3)},                 /*                            other.  */
-  0,                                   /* cost of multiply per each bit
-                                           set.  */
-   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
-      bound.  */
-  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI.  */
-   COSTS_N_INSNS (22),                 /*                          HI.  */
-   COSTS_N_INSNS (30),                 /*                          SI.  */
-   COSTS_N_INSNS (45),                 /*                          DI.  */
-   COSTS_N_INSNS (45)},                        /*                          other.  */
-  COSTS_N_INSNS (1),                   /* cost of movsx.  */
-  COSTS_N_INSNS (1),                   /* cost of movzx.  */
-  8,                                   /* "large" insn.  */
-  9,                                   /* MOVE_RATIO.  */
  
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+struct processor_costs znver2_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
  
    /* reg-reg moves are done by renaming and thus they are even cheaper than
-     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
+     1 cycle.  Because reg-reg move cost is 2 and following tables correspond
       to doubles of latencies, we do not model this correctly.  It does not
       seem to make practical difference to bump prices up even more.  */
    6,                                   /* cost for loading QImode using
@@ -1509,22 +1475,62 @@ struct processor_costs znver1_cost = {
                                            registers.  */
    2,                                   /* cost of reg,reg fld/fst.  */
    {6, 6, 16},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode.  */
+                                          in SFmode, DFmode and XFmode.  */
    {8, 8, 16},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode.  */
+                                          in SFmode, DFmode and XFmode.  */
    2,                                   /* cost of moving MMX register.  */
    {6, 6},                              /* cost of loading MMX registers
                                            in SImode and DImode.  */
    {8, 8},                              /* cost of storing MMX registers
                                            in SImode and DImode.  */
-  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
+  2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
+                                          register.  */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
    {8, 8, 8, 8, 16},                    /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit.  */
+  6, 6,                                        /* SSE->integer and integer->SSE
+                                          moves.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),                   /* variable shift costs.  */
+  COSTS_N_INSNS (1),                   /* constant shift costs.  */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),                  /*                               HI.  */
+   COSTS_N_INSNS (3),                  /*                               SI.  */
+   COSTS_N_INSNS (3),                  /*                               DI.  */
+   COSTS_N_INSNS (3)},                 /*                      other.  */
+  0,                                   /* cost of multiply per each bit
+                                          set.  */
+   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
+      bound.  */
+  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),                 /*                          HI.  */
+   COSTS_N_INSNS (30),                 /*                          SI.  */
+   COSTS_N_INSNS (45),                 /*                          DI.  */
+   COSTS_N_INSNS (45)},                        /*                          other.  */
+  COSTS_N_INSNS (1),                   /* cost of movsx.  */
+  COSTS_N_INSNS (1),                   /* cost of movzx.  */
+  8,                                   /* "large" insn.  */
+  9,                                   /* MOVE_RATIO.  */
+  6,                                   /* CLEAR_RATIO.  */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit.  */
+  {8, 8, 8, 8, 16},                    /* cost of storing SSE registers
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit.  */
+  {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
+                                          register.  */
+  6,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
       throughput 12.  Approx 9 uops do not depend on vector size and every load
       is 7 uops.  */
@@ -1552,25 +1558,26 @@ struct processor_costs znver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
    COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
    COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
-  COSTS_N_INSNS (4),                   /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of MULSD instruction.  */
    COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
    COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
    COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
-  /* 9-13  */
+  /* 9-13.  */
    COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
    COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
    COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
-     and it can execute 2 integer additions and 2 multiplications thus
-     reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
+  /* Zen can execute 4 integer operations per cycle.  FP operations
+     take 3 cycles and it can execute 2 integer additions and 2
+     multiplications thus reassociation may make sense up to width of 6.
+     SPEC2k6 benchmarks suggest
       that 4 works better than 6 probably due to register pressure.
  
       Integer vector operations are taken by FP unit and execute 3 vector
       plus/minus operations per cycle but only one multiply.  This is adjusted
       in ix86_reassociation_width.  */
    4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
-  znver1_memcpy,
-  znver1_memset,
+  znver2_memcpy,
+  znver2_memset,
    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
    COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
    "16",                                        /* Loop alignment.  */
@@ -1595,6 +1602,32 @@ static stringop_algs skylake_memset[2] = {
  
  static const
  struct processor_costs skylake_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 12, 24},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1)+1,         /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1616,30 +1649,19 @@ struct processor_costs skylake_cost = {
    COSTS_N_INSNS (0),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
-  {6, 6, 3},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  {6, 6, 6, 10, 20},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 12, 24},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
-  {8, 8, 8, 12, 24},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
-  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  2,                                   /* cost of moving SSE register to integer.  */
    20, 8,                               /* Gather load static, per_elt.  */
    22, 10,                              /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -1688,6 +1710,32 @@ static stringop_algs btver1_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  const struct processor_costs btver1_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {6, 8, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 8, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {12, 12, 38},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {10, 10},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  14, 14,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1707,32 +1755,19 @@ const struct processor_costs btver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {6, 8, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 8, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {12, 12, 38},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {10, 10},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
+  {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 24, 48},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},                        /* cost of unaligned loads.  */
-  {10, 10, 12, 24, 48},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},                        /* cost of unaligned stores.  */
-  14, 14,                              /* SSE->integer and integer->SSE moves */
+  14,                                  /* cost of moving SSE register to integer.  */
    10, 10,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1779,6 +1814,32 @@ static stringop_algs btver2_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  const struct processor_costs btver2_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {8, 8, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {12, 12, 38},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {10, 10},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  14, 14,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1798,32 +1859,19 @@ const struct processor_costs btver2_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {8, 8, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {12, 12, 38},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {10, 10},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
+  {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 24, 48},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},                        /* cost of unaligned loads.  */
-  {10, 10, 12, 24, 48},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},                        /* cost of unaligned stores.  */
-  14, 14,                              /* SSE->integer and integer->SSE moves */
+  14,                                  /* cost of moving SSE register to integer.  */
    10, 10,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1869,28 +1917,8 @@ static stringop_algs pentium4_memset[2] = {
  
  static const
  struct processor_costs pentium4_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (3),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (4),                   /* variable shift costs */
-  COSTS_N_INSNS (4),                   /* constant shift costs */
-  {COSTS_N_INSNS (15),                 /* cost of starting multiply for QI */
-   COSTS_N_INSNS (15),                 /*                               HI */
-   COSTS_N_INSNS (15),                 /*                               SI */
-   COSTS_N_INSNS (15),                 /*                               DI */
-   COSTS_N_INSNS (15)},                        /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (56),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (56),                 /*                          HI */
-   COSTS_N_INSNS (56),                 /*                          SI */
-   COSTS_N_INSNS (56),                 /*                          DI */
-   COSTS_N_INSNS (56)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  16,                                  /* "large" insn */
-  6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    5,                                /* cost for loading QImode using movzbl */
    {4, 5, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -1909,11 +1937,44 @@ struct processor_costs pentium4_cost = {
    12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
    {16, 16, 16, 32, 64},                        /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  20, 12,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (3),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (4),                   /* variable shift costs */
+  COSTS_N_INSNS (4),                   /* constant shift costs */
+  {COSTS_N_INSNS (15),                 /* cost of starting multiply for QI */
+   COSTS_N_INSNS (15),                 /*                               HI */
+   COSTS_N_INSNS (15),                 /*                               SI */
+   COSTS_N_INSNS (15),                 /*                               DI */
+   COSTS_N_INSNS (15)},                        /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (56),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (56),                 /*                          HI */
+   COSTS_N_INSNS (56),                 /*                          SI */
+   COSTS_N_INSNS (56),                 /*                          DI */
+   COSTS_N_INSNS (56)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  16,                                  /* "large" insn */
+  6,                                   /* MOVE_RATIO */
+  6,                                   /* CLEAR_RATIO */
+  {4, 5, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 3, 2},                           /* cost of storing integer registers */
+  {16, 16, 16, 32, 64},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {16, 16, 16, 32, 64},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned loads.  */
-  {16, 16, 16, 32, 64},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned stores.  */
-  20, 12,                              /* SSE->integer and integer->SSE moves */
+  12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
+  20,                                  /* cost of moving SSE register to integer.  */
    16, 16,                              /* Gather load static, per_elt.  */
    16, 16,                              /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -1962,6 +2023,32 @@ static stringop_algs nocona_memset[2] = {
  
  static const
  struct processor_costs nocona_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {4, 4, 4},                           /* cost of storing integer registers */
+  12,                                  /* cost of reg,reg fld/fst */
+  {14, 14, 14},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {14, 14, 14},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  14,                                  /* cost of moving MMX register */
+  {12, 12},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 12, 24, 48},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  20, 12,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1981,32 +2068,19 @@ struct processor_costs nocona_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    16,                                  /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {4, 4, 4},                           /* cost of storing integer registers */
-  12,                                  /* cost of reg,reg fld/fst */
-  {14, 14, 14},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {14, 14, 14},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  14,                                  /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 12, 24, 48},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {12, 12, 12, 24, 48},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned loads.  */
-  {12, 12, 12, 24, 48},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned stores.  */
-  20, 12,                              /* SSE->integer and integer->SSE moves */
+  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
+  20,                                  /* cost of moving SSE register to integer.  */
    12, 12,                              /* Gather load static, per_elt.  */
    12, 12,                              /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -2053,6 +2127,32 @@ static stringop_algs atom_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs atom_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                   /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 18},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {14, 14, 24},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {10, 10},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -2072,32 +2172,19 @@ struct processor_costs atom_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                   /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 18},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {14, 14, 24},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
-  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  8,                                   /* cost of moving SSE register to integer.  */
    8, 8,                                        /* Gather load static, per_elt.  */
    8, 8,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2144,6 +2231,32 @@ static stringop_algs slm_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs slm_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                   /* cost for loading QImode using movzbl */
+  {8, 8, 8},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 18},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 18},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -2163,32 +2276,19 @@ struct processor_costs slm_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                   /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {8, 8, 8},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 18},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 18},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
-  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  8,                                   /* cost of moving SSE register to integer.  */
    8, 8,                                        /* Gather load static, per_elt.  */
    8, 8,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2235,6 +2335,32 @@ static stringop_algs intel_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs intel_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 6},                     /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},                     /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  4, 4,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -2254,32 +2380,19 @@ struct processor_costs intel_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},                     /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},                     /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 6},                     /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
-  {6, 6, 6, 6, 6},                     /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned stores.  */
-  4, 4,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
+  4,                                   /* cost of moving SSE register to integer.  */
    6, 6,                                        /* Gather load static, per_elt.  */
    6, 6,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2294,7 +2407,7 @@ struct processor_costs intel_cost = {
    COSTS_N_INSNS (8),                   /* cost of FCHS instruction.  */
    COSTS_N_INSNS (40),                  /* cost of FSQRT instruction.  */
  
-  COSTS_N_INSNS (8),                   /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
    COSTS_N_INSNS (8),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
    COSTS_N_INSNS (8),                   /* cost of MULSS instruction.  */
    COSTS_N_INSNS (8),                   /* cost of MULSD instruction.  */
@@ -2330,6 +2443,32 @@ static stringop_algs generic_memset[2] = {
               {-1, libcall, false}}}};
  static const
  struct processor_costs generic_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 12},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    /* Setting cost to 2 makes our current implementation of synth_mult result in
       use of unnecessary temporary registers causing regression on several
@@ -2352,32 +2491,19 @@ struct processor_costs generic_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 12},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
-  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {6, 6, 6, 10, 15},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    18, 6,                               /* Gather load static, per_elt.  */
    18, 6,                               /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2430,6 +2556,32 @@ static stringop_algs core_memset[2] = {
  
  static const
  struct processor_costs core_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},                    /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+  },
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    /* On all chips taken into consideration lea is 2 cycles and more.  With
       this cost however our current implementation of synth_mult results in
@@ -2456,32 +2608,19 @@ struct processor_costs core_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
+  6,                                   /* CLEAR_RATIO */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},                    /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
-  {6, 6, 6, 6, 12},                    /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned stores.  */
-  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  2,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
       rec. throughput 6.
       So 5 uops statically and one uops per load.  */