i386: Separate costs of pseudo registers from hard registers

author H.J. Lu <hongjiu.lu@intel.com>

Thu, 15 Aug 2019 18:15:33 +0000 (18:15 +0000)

committer H.J. Lu <hjl@gcc.gnu.org>

Thu, 15 Aug 2019 18:15:33 +0000 (11:15 -0700)
author H.J. Lu <hongjiu.lu@intel.com>
Thu, 15 Aug 2019 18:15:33 +0000 (18:15 +0000)
committer H.J. Lu <hjl@gcc.gnu.org>
Thu, 15 Aug 2019 18:15:33 +0000 (11:15 -0700)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index eba58f69f802bec6108ec8c61ad3ec598f89cb31..aa295f09368dbc362893b626127e51e070c438c9 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2019-08-15  H.J. Lu  <hongjiu.lu@intel.com>
+
+       PR target/90878
+       * config/i386/i386.c (inline_memory_move_cost): Use hard_register
+       for costs of hard register moves.
+       (ix86_register_move_cost): Likewise.
+       * config/i386/i386.h (processor_costs): Move costs of hard
+       register moves to hard_register.  Add int_load, int_store,
+       xmm_move, ymm_move, zmm_move, sse_to_integer, sse_load,
+       sse_store, sse_unaligned_load and sse_unaligned_store for
+       costs of RTL expressions.
+       * config/i386/x86-tune-costs.h: Move costs of hard register
+       moves to hard_register.  Duplicate int_load, int_store,
+       xmm_move, ymm_move, zmm_move, sse_to_integer, sse_load and
+       sse_store for costs of RTL expressions.
+
  2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
  
         * target.def (setup_incoming_vararg_bounds): Remove.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index 5eb625ce724ff652847d036f1d10e67b27ab9f4d..647bcbef0506f4f5ad561535c1e303a9cd045682 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18464,8 +18464,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
             return 100;
         }
        if (in == 2)
-        return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
-      return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
+        return MAX (ix86_cost->hard_register.fp_load [index],
+                   ix86_cost->hard_register.fp_store [index]);
+      return in ? ix86_cost->hard_register.fp_load [index]
+               : ix86_cost->hard_register.fp_store [index];
      }
    if (SSE_CLASS_P (regclass))
      {
@@ -18473,8 +18475,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
        if (index == -1)
         return 100;
        if (in == 2)
-        return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
-      return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
+        return MAX (ix86_cost->hard_register.sse_load [index],
+                   ix86_cost->hard_register.sse_store [index]);
+      return in ? ix86_cost->hard_register.sse_load [index]
+               : ix86_cost->hard_register.sse_store [index];
      }
    if (MMX_CLASS_P (regclass))
      {
@@ -18491,8 +18495,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
             return 100;
         }
        if (in == 2)
-        return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
-      return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
+        return MAX (ix86_cost->hard_register.mmx_load [index],
+                   ix86_cost->hard_register.mmx_store [index]);
+      return in ? ix86_cost->hard_register.mmx_load [index]
+               : ix86_cost->hard_register.mmx_store [index];
      }
    switch (GET_MODE_SIZE (mode))
      {
@@ -18500,37 +18506,41 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
         if (Q_CLASS_P (regclass) || TARGET_64BIT)
           {
             if (!in)
-             return ix86_cost->int_store[0];
+             return ix86_cost->hard_register.int_store[0];
             if (TARGET_PARTIAL_REG_DEPENDENCY
                 && optimize_function_for_speed_p (cfun))
-             cost = ix86_cost->movzbl_load;
+             cost = ix86_cost->hard_register.movzbl_load;
             else
-             cost = ix86_cost->int_load[0];
+             cost = ix86_cost->hard_register.int_load[0];
             if (in == 2)
-             return MAX (cost, ix86_cost->int_store[0]);
+             return MAX (cost, ix86_cost->hard_register.int_store[0]);
             return cost;
           }
         else
           {
            if (in == 2)
-            return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
+            return MAX (ix86_cost->hard_register.movzbl_load,
+                        ix86_cost->hard_register.int_store[0] + 4);
            if (in)
-            return ix86_cost->movzbl_load;
+            return ix86_cost->hard_register.movzbl_load;
            else
-            return ix86_cost->int_store[0] + 4;
+            return ix86_cost->hard_register.int_store[0] + 4;
           }
         break;
        case 2:
         if (in == 2)
-         return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
-       return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
+         return MAX (ix86_cost->hard_register.int_load[1],
+                     ix86_cost->hard_register.int_store[1]);
+       return in ? ix86_cost->hard_register.int_load[1]
+                 : ix86_cost->hard_register.int_store[1];
        default:
         if (in == 2)
-         cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
+         cost = MAX (ix86_cost->hard_register.int_load[2],
+                     ix86_cost->hard_register.int_store[2]);
         else if (in)
-         cost = ix86_cost->int_load[2];
+         cost = ix86_cost->hard_register.int_load[2];
         else
-         cost = ix86_cost->int_store[2];
+         cost = ix86_cost->hard_register.int_store[2];
         /* Multiply with the number of GPR moves needed.  */
         return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
      }
@@ -18600,20 +18610,21 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
         because of missing QImode and HImode moves to, from or between
         MMX/SSE registers.  */
      return MAX (8, SSE_CLASS_P (class1)
-               ? ix86_cost->sse_to_integer : ix86_cost->integer_to_sse);
+               ? ix86_cost->hard_register.sse_to_integer
+               : ix86_cost->hard_register.integer_to_sse);
  
    if (MAYBE_FLOAT_CLASS_P (class1))
-    return ix86_cost->fp_move;
+    return ix86_cost->hard_register.fp_move;
    if (MAYBE_SSE_CLASS_P (class1))
      {
        if (GET_MODE_BITSIZE (mode) <= 128)
-       return ix86_cost->xmm_move;
+       return ix86_cost->hard_register.xmm_move;
        if (GET_MODE_BITSIZE (mode) <= 256)
-       return ix86_cost->ymm_move;
-      return ix86_cost->zmm_move;
+       return ix86_cost->hard_register.ymm_move;
+      return ix86_cost->hard_register.zmm_move;
      }
    if (MAYBE_MMX_CLASS_P (class1))
-    return ix86_cost->mmx_move;
+    return ix86_cost->hard_register.mmx_move;
    return 2;
  }
  
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

index 800d7c4c4e34ec71d0715114e87910b71084cc6b..e0a77e1fb25af1df0e083517d5ede380c0429e71 100644 (file)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -237,9 +237,46 @@ struct stringop_algs
    } size [MAX_STRINGOP_ALGS];
  };
  
-/* Define the specific costs for a given cpu */
+/* Define the specific costs for a given cpu.  NB: hard_register is used
+   by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute
+   hard register move costs by register allocator.  Relative costs of
+   pseudo register load and store versus pseudo register moves in RTL
+   expressions for TARGET_RTX_COSTS can be different from relative
+   costs of hard registers to get the most efficient operations with
+   pseudo registers.  */
  
  struct processor_costs {
+  /* Costs used by register allocator.  integer->integer register move
+     cost is 2.  */
+  struct
+    {
+      const int movzbl_load;   /* cost of loading using movzbl */
+      const int int_load[3];   /* cost of loading integer registers
+                                  in QImode, HImode and SImode relative
+                                  to reg-reg move (2).  */
+      const int int_store[3];  /* cost of storing integer register
+                                  in QImode, HImode and SImode */
+      const int fp_move;       /* cost of reg,reg fld/fst */
+      const int fp_load[3];    /* cost of loading FP register
+                                  in SFmode, DFmode and XFmode */
+      const int fp_store[3];   /* cost of storing FP register
+                                  in SFmode, DFmode and XFmode */
+      const int mmx_move;      /* cost of moving MMX register.  */
+      const int mmx_load[2];   /* cost of loading MMX register
+                                  in SImode and DImode */
+      const int mmx_store[2];  /* cost of storing MMX register
+                                  in SImode and DImode */
+      const int xmm_move;      /* cost of moving XMM register.  */
+      const int ymm_move;      /* cost of moving YMM register.  */
+      const int zmm_move;      /* cost of moving ZMM register.  */
+      const int sse_load[5];   /* cost of loading SSE register
+                                  in 32bit, 64bit, 128bit, 256bit and 512bit */
+      const int sse_store[5];  /* cost of storing SSE register
+                                  in 32bit, 64bit, 128bit, 256bit and 512bit */
+      const int sse_to_integer;        /* cost of moving SSE register to integer.  */
+      const int integer_to_sse;        /* cost of moving integer register to SSE. */
+    } hard_register;
+
    const int add;               /* cost of an add instruction */
    const int lea;               /* cost of a lea instruction */
    const int shift_var;         /* variable shift costs */
@@ -254,32 +291,20 @@ struct processor_costs {
    const int large_insn;                /* insns larger than this cost more */
    const int move_ratio;                /* The threshold of number of scalar
                                    memory-to-memory move insns.  */
-  const int movzbl_load;       /* cost of loading using movzbl */
    const int int_load[3];       /* cost of loading integer registers
                                    in QImode, HImode and SImode relative
                                    to reg-reg move (2).  */
    const int int_store[3];      /* cost of storing integer register
                                    in QImode, HImode and SImode */
-  const int fp_move;           /* cost of reg,reg fld/fst */
-  const int fp_load[3];                /* cost of loading FP register
-                                  in SFmode, DFmode and XFmode */
-  const int fp_store[3];       /* cost of storing FP register
-                                  in SFmode, DFmode and XFmode */
-  const int mmx_move;          /* cost of moving MMX register.  */
-  const int mmx_load[2];       /* cost of loading MMX register
-                                  in SImode and DImode */
-  const int mmx_store[2];      /* cost of storing MMX register
-                                  in SImode and DImode */
-  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
-           zmm_move;
    const int sse_load[5];       /* cost of loading SSE register
                                    in 32bit, 64bit, 128bit, 256bit and 512bit */
-  const int sse_unaligned_load[5];/* cost of unaligned load.  */
    const int sse_store[5];      /* cost of storing SSE register
-                                  in SImode, DImode and TImode.  */
+                                  in 32bit, 64bit, 128bit, 256bit and 512bit */
+  const int sse_unaligned_load[5];/* cost of unaligned load.  */
    const int sse_unaligned_store[5];/* cost of unaligned store.  */
+  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
+           zmm_move;
    const int sse_to_integer;    /* cost of moving SSE register to integer.  */
-  const int integer_to_sse;    /* cost of moving integer register to SSE. */
    const int gather_static, gather_per_elt; /* Cost of gather load is computed
                                    as static + per_item * nelts. */
    const int scatter_static, scatter_per_elt; /* Cost of gather store is
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h

index 8b963c07051d5d2f1aa11936b7a215a7a8265c89..ad9ea4bfa089d50ec1b26493ab8c6878f238a776 100644 (file)
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -36,6 +36,30 @@ static stringop_algs ix86_size_memset[2] = {
  
  const
  struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,                                /* cost for loading QImode using movzbl */
+  {2, 2, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 2},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {2, 2, 2},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  3,                                   /* cost of moving MMX register */
+  {3, 3},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {3, 3},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
+  {3, 3, 3, 3, 3},                     /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},                     /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_BYTES (2),                   /* cost of an add instruction */
    COSTS_N_BYTES (3),                   /* cost of a lea instruction */
    COSTS_N_BYTES (2),                   /* variable shift costs */
@@ -55,33 +79,20 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
    COSTS_N_BYTES (3),                   /* cost of movzx */
    0,                                   /* "large" insn */
    2,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2. */
-  2,                                /* cost for loading QImode using movzbl */
    {2, 2, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 2},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {2, 2, 2},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  3,                                   /* cost of moving MMX register */
-  {3, 3},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {3, 3},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
-  {3, 3, 3, 3, 3},                     /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},                     /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {3, 3, 3, 3, 3},                     /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {3, 3, 3, 3, 3},                     /* cost of unaligned SSE load
                                            in 128bit, 256bit and 512bit */
-  {3, 3, 3, 3, 3},                     /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
-  {3, 3, 3, 3, 3},                             /* cost of unaligned SSE store
+  {3, 3, 3, 3, 3},                     /* cost of unaligned SSE store
                                            in 128bit, 256bit and 512bit */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  3, 3, 3,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    5, 0,                                        /* Gather load static, per_elt.  */
    5, 0,                                        /* Gather store static, per_elt.  */
    0,                                   /* size of l1 cache  */
@@ -127,6 +138,30 @@ static stringop_algs i386_memset[2] = {
  
  static const
  struct processor_costs i386_cost = {   /* 386 specific costs */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {8, 8, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (3),                   /* variable shift costs */
@@ -146,32 +181,18 @@ struct processor_costs i386_cost = {      /* 386 specific costs */
    COSTS_N_INSNS (2),                   /* cost of movzx */
    15,                                  /* "large" insn */
    3,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {8, 8, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    0,                                   /* size of l1 cache  */
@@ -216,6 +237,30 @@ static stringop_algs i486_memset[2] = {
  
  static const
  struct processor_costs i486_cost = {   /* 486 specific costs */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {8, 8, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (3),                   /* variable shift costs */
@@ -235,32 +280,18 @@ struct processor_costs i486_cost = {      /* 486 specific costs */
    COSTS_N_INSNS (2),                   /* cost of movzx */
    15,                                  /* "large" insn */
    3,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {8, 8, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    4,                                   /* size of l1 cache.  486 has 8kB cache
@@ -307,6 +338,30 @@ static stringop_algs pentium_memset[2] = {
  
  static const
  struct processor_costs pentium_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (4),                   /* variable shift costs */
@@ -326,32 +381,18 @@ struct processor_costs pentium_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  8,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -389,6 +430,30 @@ struct processor_costs pentium_cost = {
  
  static const
  struct processor_costs lakemont_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {2, 4, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 4, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  8,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -408,32 +473,18 @@ struct processor_costs lakemont_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
    {2, 4, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 4, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  8,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -486,6 +537,30 @@ static stringop_algs pentiumpro_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs pentiumpro_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -505,32 +580,18 @@ struct processor_costs pentiumpro_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,                                /* cost for loading QImode using movzbl */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
-  3, 3,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -574,6 +635,30 @@ static stringop_algs geode_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs geode_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,                                /* cost for loading QImode using movzbl */
+  {2, 2, 2},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 2, 2},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {2, 2, 2},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 6, 6},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (2),                   /* variable shift costs */
@@ -593,33 +678,18 @@ struct processor_costs geode_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    4,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,                                /* cost for loading QImode using movzbl */
    {2, 2, 2},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 2, 2},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {2, 2, 2},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 6, 6},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    2, 2,                                        /* Gather load static, per_elt.  */
    2, 2,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -663,6 +733,30 @@ static stringop_algs k6_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs k6_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  3,                                /* cost for loading QImode using movzbl */
+  {4, 5, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 3, 2},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 6},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {4, 4, 4},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {2, 2},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {2, 2},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -682,32 +776,18 @@ struct processor_costs k6_cost = {
    COSTS_N_INSNS (2),                   /* cost of movzx */
    8,                                   /* "large" insn */
    4,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  3,                                /* cost for loading QImode using movzbl */
    {4, 5, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {2, 3, 2},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 6},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {4, 4, 4},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {2, 2},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {2, 2},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    2, 2,                                        /* Gather load static, per_elt.  */
    2, 2,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -757,6 +837,30 @@ static stringop_algs athlon_memset[2] = {
    DUMMY_STRINGOP_ALGS};
  static const
  struct processor_costs athlon_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {4, 4, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {4, 4},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 4},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 12, 12, 24},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -776,32 +880,18 @@ struct processor_costs athlon_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {3, 4, 3},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {4, 4, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {4, 4},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 4},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 12, 12, 24},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 12, 12, 24},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 4, 12, 12, 24},                  /* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
-  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  5,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -853,6 +943,30 @@ static stringop_algs k8_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs k8_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {4, 4, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 8},                           /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {3, 3},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {4, 4},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {4, 3, 12, 12, 24},                  /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -872,32 +986,18 @@ struct processor_costs k8_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {3, 4, 3},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {4, 4, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 8},                           /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {3, 3},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {4, 4},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 12, 12, 24},                  /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {4, 3, 12, 12, 24},                  /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},                  /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {4, 3, 12, 12, 24},                  /* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},                  /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {4, 4, 10, 10, 20},                  /* cost of unaligned stores.  */
-  5, 5,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  5,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -953,28 +1053,7 @@ static stringop_algs amdfam10_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  struct processor_costs amdfam10_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (2),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),                   /* variable shift costs */
-  COSTS_N_INSNS (1),                   /* constant shift costs */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),                  /*                               HI */
-   COSTS_N_INSNS (3),                  /*                               SI */
-   COSTS_N_INSNS (4),                  /*                               DI */
-   COSTS_N_INSNS (5)},                 /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),                 /*                          HI */
-   COSTS_N_INSNS (51),                 /*                          SI */
-   COSTS_N_INSNS (83),                 /*                          DI */
-   COSTS_N_INSNS (83)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  8,                                   /* "large" insn */
-  9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    4,                                /* cost for loading QImode using movzbl */
    {3, 4, 3},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -993,11 +1072,10 @@ struct processor_costs amdfam10_cost = {
    2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
    {4, 4, 3, 6, 12},                    /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
    {4, 4, 5, 10, 20},                   /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+
                                         /* On K8:
                                             MOVD reg64, xmmreg Double FSTORE 4
                                             MOVD reg32, xmmreg Double FSTORE 4
@@ -1006,6 +1084,39 @@ struct processor_costs amdfam10_cost = {
                                                                1/1  1/1
                                             MOVD reg32, xmmreg Double FADD 3
                                                                1/1  1/1 */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (2),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (1),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),                  /*                               HI */
+   COSTS_N_INSNS (3),                  /*                               SI */
+   COSTS_N_INSNS (4),                  /*                               DI */
+   COSTS_N_INSNS (5)},                 /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),                 /*                          HI */
+   COSTS_N_INSNS (51),                 /*                          SI */
+   COSTS_N_INSNS (83),                 /*                          DI */
+   COSTS_N_INSNS (83)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  9,                                   /* MOVE_RATIO */
+  {3, 4, 3},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {3, 4, 3},                           /* cost of storing integer registers */
+  {4, 4, 3, 6, 12},                    /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 5, 10, 20},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 3, 7, 12},                    /* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  3,                                   /* cost of moving SSE register to integer.  */
    4, 4,                                        /* Gather load static, per_elt.  */
    4, 4,                                        /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -1062,6 +1173,30 @@ static stringop_algs bdver_memset[2] = {
               {-1, libcall, false}}}};
  
  const struct processor_costs bdver_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {8, 8, 8},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {10, 10, 18},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {12, 12},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {10, 10},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 40, 60},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 40, 60},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  16, 20,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1081,32 +1216,18 @@ const struct processor_costs bdver_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
    {8, 8, 8},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 8},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 40, 60},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 40, 60},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 10, 40, 60},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {12, 12, 10, 40, 60},                        /* cost of unaligned loads.  */
-  {10, 10, 10, 40, 60},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 10, 40, 60},                        /* cost of unaligned stores.  */
-  16, 20,                              /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  16,                                  /* cost of moving SSE register to integer.  */
    12, 12,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
@@ -1164,6 +1285,37 @@ static stringop_algs znver1_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
  struct processor_costs znver1_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle. Because reg-reg move cost is 2 and the following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make practical difference to bump prices up even more.  */
+  6,                                   /* cost for loading QImode using
+                                          movzbl.  */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  2,                                   /* cost of reg,reg fld/fst.  */
+  {6, 6, 16},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode.  */
+  {8, 8, 16},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode.  */
+  2,                                   /* cost of moving MMX register.  */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode.  */
+  {8, 8},                              /* cost of storing MMX registers
+                                          in SImode and DImode.  */
+  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
+  {6, 6, 6, 12, 24},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit.  */
+  6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
    COSTS_N_INSNS (1),                   /* variable shift costs.  */
@@ -1186,39 +1338,19 @@ struct processor_costs znver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx.  */
    8,                                   /* "large" insn.  */
    9,                                   /* MOVE_RATIO.  */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-
-  /* reg-reg moves are done by renaming and thus they are even cheaper than
-     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
-     to doubles of latencies, we do not model this correctly.  It does not
-     seem to make practical difference to bump prices up even more.  */
-  6,                                   /* cost for loading QImode using
-                                          movzbl.  */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 8},                           /* cost of storing integer
                                            registers.  */
-  2,                                   /* cost of reg,reg fld/fst.  */
-  {6, 6, 16},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode.  */
-  {8, 8, 16},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode.  */
-  2,                                   /* cost of moving MMX register.  */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode.  */
-  {8, 8},                              /* cost of storing MMX registers
-                                          in SImode and DImode.  */
-  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 12, 24},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit.  */
+  {6, 6, 6, 12, 24},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 12, 24},                   /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit.  */
    {8, 8, 8, 16, 32},                   /* cost of unaligned stores.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  2, 3, 6,                             /* cost of moving XMM,YMM,ZMM register.  */
+  6,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
       throughput 12.  Approx 9 uops do not depend on vector size and every load
       is 7 uops.  */
@@ -1288,31 +1420,7 @@ static stringop_algs znver2_memset[2] = {
              {-1, libcall, false}}}};
  
  struct processor_costs znver2_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
-  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
-  COSTS_N_INSNS (1),                   /* variable shift costs.  */
-  COSTS_N_INSNS (1),                   /* constant shift costs.  */
-  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
-   COSTS_N_INSNS (3),                  /*                               HI.  */
-   COSTS_N_INSNS (3),                  /*                               SI.  */
-   COSTS_N_INSNS (3),                  /*                               DI.  */
-   COSTS_N_INSNS (3)},                 /*                      other.  */
-  0,                                   /* cost of multiply per each bit
-                                          set.  */
-   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
-      bound.  */
-  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI.  */
-   COSTS_N_INSNS (22),                 /*                          HI.  */
-   COSTS_N_INSNS (30),                 /*                          SI.  */
-   COSTS_N_INSNS (45),                 /*                          DI.  */
-   COSTS_N_INSNS (45)},                        /*                          other.  */
-  COSTS_N_INSNS (1),                   /* cost of movsx.  */
-  COSTS_N_INSNS (1),                   /* cost of movzx.  */
-  8,                                   /* "large" insn.  */
-  9,                                   /* MOVE_RATIO.  */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2.  */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
  
    /* reg-reg moves are done by renaming and thus they are even cheaper than
       1 cycle.  Because reg-reg move cost is 2 and following tables correspond
@@ -1339,12 +1447,48 @@ struct processor_costs znver2_cost = {
                                            register.  */
    {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
    {8, 8, 8, 8, 16},                    /* cost of storing SSE registers
                                            in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
    6, 6,                                        /* SSE->integer and integer->SSE
                                            moves.  */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),                   /* variable shift costs.  */
+  COSTS_N_INSNS (1),                   /* constant shift costs.  */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),                  /*                               HI.  */
+   COSTS_N_INSNS (3),                  /*                               SI.  */
+   COSTS_N_INSNS (3),                  /*                               DI.  */
+   COSTS_N_INSNS (3)},                 /*                      other.  */
+  0,                                   /* cost of multiply per each bit
+                                          set.  */
+   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
+      bound.  */
+  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),                 /*                          HI.  */
+   COSTS_N_INSNS (30),                 /*                          SI.  */
+   COSTS_N_INSNS (45),                 /*                          DI.  */
+   COSTS_N_INSNS (45)},                        /*                          other.  */
+  COSTS_N_INSNS (1),                   /* cost of movsx.  */
+  COSTS_N_INSNS (1),                   /* cost of movzx.  */
+  8,                                   /* "large" insn.  */
+  9,                                   /* MOVE_RATIO.  */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 8},                           /* cost of storing integer
+                                          registers.  */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 8, 16},                    /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
+  {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
+  2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
+                                          register.  */
+  6,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
       throughput 12.  Approx 9 uops do not depend on vector size and every load
       is 7 uops.  */
@@ -1416,6 +1560,30 @@ static stringop_algs skylake_memset[2] = {
  
  static const
  struct processor_costs skylake_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 3},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 12, 24},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1)+1,         /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1437,30 +1605,18 @@ struct processor_costs skylake_cost = {
    COSTS_N_INSNS (0),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  6,                                /* cost for loading QImode using movzbl */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 3},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 20},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 12, 24},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
-  {8, 8, 8, 12, 24},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
-  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  2,                                   /* cost of moving SSE register to integer.  */
    20, 8,                               /* Gather load static, per_elt.  */
    22, 10,                              /* Gather store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
@@ -1509,6 +1665,30 @@ static stringop_algs btver1_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  const struct processor_costs btver1_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {6, 8, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 8, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {12, 12, 38},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {10, 10},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  14, 14,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1528,32 +1708,18 @@ const struct processor_costs btver1_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
    {6, 8, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 8, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {12, 12, 38},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {10, 10},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
-  14, 14,                              /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  14,                                  /* cost of moving SSE register to integer.  */
    10, 10,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1600,6 +1766,30 @@ static stringop_algs btver2_memset[2] = {
    {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
               {-1, libcall, false}}}};
  const struct processor_costs btver2_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                /* cost for loading QImode using movzbl */
+  {8, 8, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {8, 8, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {12, 12, 28},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {12, 12, 38},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  4,                                   /* cost of moving MMX register */
+  {10, 10},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  14, 14,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (2),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1619,32 +1809,18 @@ const struct processor_costs btver2_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    9,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                /* cost for loading QImode using movzbl */
    {8, 8, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {8, 8, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {12, 12, 38},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  4,                                   /* cost of moving MMX register */
-  {10, 10},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 12, 48, 96},                        /* cost of unaligned stores.  */
-  14, 14,                              /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  14,                                  /* cost of moving SSE register to integer.  */
    10, 10,                              /* Gather load static, per_elt.  */
    10, 10,                              /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1690,28 +1866,7 @@ static stringop_algs pentium4_memset[2] = {
  
  static const
  struct processor_costs pentium4_cost = {
-  COSTS_N_INSNS (1),                   /* cost of an add instruction */
-  COSTS_N_INSNS (3),                   /* cost of a lea instruction */
-  COSTS_N_INSNS (4),                   /* variable shift costs */
-  COSTS_N_INSNS (4),                   /* constant shift costs */
-  {COSTS_N_INSNS (15),                 /* cost of starting multiply for QI */
-   COSTS_N_INSNS (15),                 /*                               HI */
-   COSTS_N_INSNS (15),                 /*                               SI */
-   COSTS_N_INSNS (15),                 /*                               DI */
-   COSTS_N_INSNS (15)},                        /*                            other */
-  0,                                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (56),                 /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (56),                 /*                          HI */
-   COSTS_N_INSNS (56),                 /*                          SI */
-   COSTS_N_INSNS (56),                 /*                          DI */
-   COSTS_N_INSNS (56)},                        /*                          other */
-  COSTS_N_INSNS (1),                   /* cost of movsx */
-  COSTS_N_INSNS (1),                   /* cost of movzx */
-  16,                                  /* "large" insn */
-  6,                                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
    5,                                /* cost for loading QImode using movzbl */
    {4, 5, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
@@ -1730,11 +1885,42 @@ struct processor_costs pentium4_cost = {
    12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
    {16, 16, 16, 32, 64},                        /* cost of loading SSE registers
                                            in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  20, 12,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  COSTS_N_INSNS (3),                   /* cost of a lea instruction */
+  COSTS_N_INSNS (4),                   /* variable shift costs */
+  COSTS_N_INSNS (4),                   /* constant shift costs */
+  {COSTS_N_INSNS (15),                 /* cost of starting multiply for QI */
+   COSTS_N_INSNS (15),                 /*                               HI */
+   COSTS_N_INSNS (15),                 /*                               SI */
+   COSTS_N_INSNS (15),                 /*                               DI */
+   COSTS_N_INSNS (15)},                        /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (56),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (56),                 /*                          HI */
+   COSTS_N_INSNS (56),                 /*                          SI */
+   COSTS_N_INSNS (56),                 /*                          DI */
+   COSTS_N_INSNS (56)},                        /*                          other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  16,                                  /* "large" insn */
+  6,                                   /* MOVE_RATIO */
+  {4, 5, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {2, 3, 2},                           /* cost of storing integer registers */
+  {16, 16, 16, 32, 64},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {16, 16, 16, 32, 64},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned loads.  */
-  {16, 16, 16, 32, 64},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned stores.  */
-  20, 12,                              /* SSE->integer and integer->SSE moves */
+  12, 24, 48,                          /* cost of moving XMM,YMM,ZMM register */
+  20,                                  /* cost of moving SSE register to integer.  */
    16, 16,                              /* Gather load static, per_elt.  */
    16, 16,                              /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -1783,6 +1969,30 @@ static stringop_algs nocona_memset[2] = {
  
  static const
  struct processor_costs nocona_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {4, 4, 4},                           /* cost of storing integer registers */
+  12,                                  /* cost of reg,reg fld/fst */
+  {14, 14, 14},                                /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {14, 14, 14},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  14,                                  /* cost of moving MMX register */
+  {12, 12},                            /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {12, 12},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 12, 24, 48},                        /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},                        /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  20, 12,                              /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1),                   /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1802,32 +2012,18 @@ struct processor_costs nocona_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    16,                                  /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,                                /* cost for loading QImode using movzbl */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {4, 4, 4},                           /* cost of storing integer registers */
-  12,                                  /* cost of reg,reg fld/fst */
-  {14, 14, 14},                                /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {14, 14, 14},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  14,                                  /* cost of moving MMX register */
-  {12, 12},                            /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {12, 12},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 12, 24, 48},                        /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},                        /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {12, 12, 12, 24, 48},                        /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned loads.  */
-  {12, 12, 12, 24, 48},                        /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned stores.  */
-  20, 12,                              /* SSE->integer and integer->SSE moves */
+  6, 12, 24,                           /* cost of moving XMM,YMM,ZMM register */
+  20,                                  /* cost of moving SSE register to integer.  */
    12, 12,                              /* Gather load static, per_elt.  */
    12, 12,                              /* Gather store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
@@ -1874,6 +2070,30 @@ static stringop_algs atom_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs atom_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                   /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 18},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {14, 14, 24},                                /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {10, 10},                            /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1893,32 +2113,18 @@ struct processor_costs atom_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                   /* cost for loading QImode using movzbl */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 18},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {14, 14, 24},                                /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {10, 10},                            /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
-  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  8,                                   /* cost of moving SSE register to integer.  */
    8, 8,                                        /* Gather load static, per_elt.  */
    8, 8,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -1965,6 +2171,30 @@ static stringop_algs slm_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs slm_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,                                   /* cost for loading QImode using movzbl */
+  {8, 8, 8},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {8, 8, 18},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 18},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {8, 8},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -1984,32 +2214,18 @@ struct processor_costs slm_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                                   /* cost for loading QImode using movzbl */
    {8, 8, 8},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {8, 8, 18},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 18},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {8, 8},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
-  8, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 4, 8,                             /* cost of moving XMM,YMM,ZMM register */
+  8,                                   /* cost of moving SSE register to integer.  */
    8, 8,                                        /* Gather load static, per_elt.  */
    8, 8,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2056,6 +2272,30 @@ static stringop_algs intel_memset[2] = {
               {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
  static const
  struct processor_costs intel_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 6},                     /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},                     /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  4, 4,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
    COSTS_N_INSNS (1),                   /* variable shift costs */
@@ -2075,32 +2315,18 @@ struct processor_costs intel_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},                     /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},                     /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 6},                     /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
-  {6, 6, 6, 6, 6},                     /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
-  4, 4,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM register */
+  4,                                   /* cost of moving SSE register to integer.  */
    6, 6,                                        /* Gather load static, per_elt.  */
    6, 6,                                        /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2151,6 +2377,30 @@ static stringop_algs generic_memset[2] = {
               {-1, libcall, false}}}};
  static const
  struct processor_costs generic_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 12},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    /* Setting cost to 2 makes our current implementation of synth_mult result in
       use of unnecessary temporary registers causing regression on several
@@ -2173,32 +2423,18 @@ struct processor_costs generic_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
    {6, 6, 6},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  4,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 12},                          /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 12},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
-  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
-  6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  6,                                   /* cost of moving SSE register to integer.  */
    18, 6,                               /* Gather load static, per_elt.  */
    18, 6,                               /* Gather store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
@@ -2251,6 +2487,30 @@ static stringop_algs core_memset[2] = {
  
  static const
  struct processor_costs core_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {4, 4, 4},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  2,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 8},                           /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 10},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},                    /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
    COSTS_N_INSNS (1),                   /* cost of an add instruction */
    /* On all chips taken into consideration lea is 2 cycles and more.  With
       this cost however our current implementation of synth_mult results in
@@ -2277,32 +2537,18 @@ struct processor_costs core_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,                                /* cost for loading QImode using movzbl */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
    {6, 6, 6},                           /* cost of storing integer registers */
-  2,                                   /* cost of reg,reg fld/fst */
-  {6, 6, 8},                           /* cost of loading fp registers
-                                          in SFmode, DFmode and XFmode */
-  {6, 6, 10},                          /* cost of storing fp registers
-                                          in SFmode, DFmode and XFmode */
-  2,                                   /* cost of moving MMX register */
-  {6, 6},                              /* cost of loading MMX registers
-                                          in SImode and DImode */
-  {6, 6},                              /* cost of storing MMX registers
-                                          in SImode and DImode */
-  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
-                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},                    /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},                    /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
-  {6, 6, 6, 6, 12},                    /* cost of storing SSE registers
-                                          in 32,64,128,256 and 512-bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned stores.  */
-  2, 2,                                        /* SSE->integer and integer->SSE moves */
+  2, 2, 4,                             /* cost of moving XMM,YMM,ZMM register */
+  2,                                   /* cost of moving SSE register to integer.  */
    /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
       rec. throughput 6.
       So 5 uops statically and one uops per load.  */
author	H.J. Lu <hongjiu.lu@intel.com>
	Thu, 15 Aug 2019 18:15:33 +0000 (18:15 +0000)
committer	H.J. Lu <hjl@gcc.gnu.org>
	Thu, 15 Aug 2019 18:15:33 +0000 (11:15 -0700)
gcc/ChangeLog		patch \| blob \| history
gcc/config/i386/i386.c		patch \| blob \| history
gcc/config/i386/i386.h		patch \| blob \| history
gcc/config/i386/x86-tune-costs.h		patch \| blob \| history