From: H.J. Lu <hongjiu.lu@intel.com>
Date: Thu, 15 Aug 2019 18:15:33 +0000 (+0000)
Subject: i386: Separate costs of pseudo registers from hard registers
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d321551cea11f27a9afd67ece9bbda095a579950;p=gcc.git

i386: Separate costs of pseudo registers from hard registers

processor_costs has costs of RTL expressions with pseudo registers and
and costs of hard register moves:

1. Costs of RTL expressions are used to generate the most efficient RTL
operations with pseudo registers.

2. Costs of hard register moves are used by register allocator to
decide how to allocate and move hard registers.

Since relative costs of pseudo register load and store versus pseudo
register moves in RTL expressions can be different from relative costs
of hard registers, we should separate costs of RTL expressions with
pseudo registers from costs of hard registers so that register allocator
and RTL expressions can be improved independently.

This patch moves costs of hard register moves to the new hard_register
field and duplicates costs of moves which are also used for costs of RTL
expressions.

	PR target/90878
	* config/i386/i386.c (inline_memory_move_cost): Use hard_register
	for costs of hard register moves.
	(ix86_register_move_cost): Likewise.
	* config/i386/i386.h (processor_costs): Move costs of hard
	register moves to hard_register.  Add int_load, int_store,
	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
	sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
	for costs of RTL expressions.
	* config/i386/x86-tune-costs.h: Move costs of hard register
	moves to hard_register.  Duplicate int_load, int_store,
	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
	sse_load, sse_store for costs of RTL expressions.

From-SVN: r274543
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index eba58f69f80..aa295f09368 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2019-08-15  H.J. Lu  <hongjiu.lu@intel.com>
+
+	PR target/90878
+	* config/i386/i386.c (inline_memory_move_cost): Use hard_register
+	for costs of hard register moves.
+	(ix86_register_move_cost): Likewise.
+	* config/i386/i386.h (processor_costs): Move costs of hard
+	register moves to hard_register.  Add int_load, int_store,
+	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
+	sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
+	for costs of RTL expressions.
+	* config/i386/x86-tune-costs.h: Move costs of hard register
+	moves to hard_register.  Duplicate int_load, int_store,
+	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
+	sse_load, sse_store for costs of RTL expressions.
+
 2019-08-15  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* target.def (setup_incoming_vararg_bounds): Remove.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5eb625ce724..647bcbef050 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18464,8 +18464,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
 	    return 100;
 	}
       if (in == 2)
-        return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
-      return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
+        return MAX (ix86_cost->hard_register.fp_load [index],
+		    ix86_cost->hard_register.fp_store [index]);
+      return in ? ix86_cost->hard_register.fp_load [index]
+		: ix86_cost->hard_register.fp_store [index];
     }
   if (SSE_CLASS_P (regclass))
     {
@@ -18473,8 +18475,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
       if (index == -1)
 	return 100;
       if (in == 2)
-        return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
-      return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
+        return MAX (ix86_cost->hard_register.sse_load [index],
+		    ix86_cost->hard_register.sse_store [index]);
+      return in ? ix86_cost->hard_register.sse_load [index]
+		: ix86_cost->hard_register.sse_store [index];
     }
   if (MMX_CLASS_P (regclass))
     {
@@ -18491,8 +18495,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
 	    return 100;
 	}
       if (in == 2)
-        return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
-      return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
+        return MAX (ix86_cost->hard_register.mmx_load [index],
+		    ix86_cost->hard_register.mmx_store [index]);
+      return in ? ix86_cost->hard_register.mmx_load [index]
+		: ix86_cost->hard_register.mmx_store [index];
     }
   switch (GET_MODE_SIZE (mode))
     {
@@ -18500,37 +18506,41 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
 	if (Q_CLASS_P (regclass) || TARGET_64BIT)
 	  {
 	    if (!in)
-	      return ix86_cost->int_store[0];
+	      return ix86_cost->hard_register.int_store[0];
 	    if (TARGET_PARTIAL_REG_DEPENDENCY
 	        && optimize_function_for_speed_p (cfun))
-	      cost = ix86_cost->movzbl_load;
+	      cost = ix86_cost->hard_register.movzbl_load;
 	    else
-	      cost = ix86_cost->int_load[0];
+	      cost = ix86_cost->hard_register.int_load[0];
 	    if (in == 2)
-	      return MAX (cost, ix86_cost->int_store[0]);
+	      return MAX (cost, ix86_cost->hard_register.int_store[0]);
 	    return cost;
 	  }
 	else
 	  {
 	   if (in == 2)
-	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
+	     return MAX (ix86_cost->hard_register.movzbl_load,
+			 ix86_cost->hard_register.int_store[0] + 4);
 	   if (in)
-	     return ix86_cost->movzbl_load;
+	     return ix86_cost->hard_register.movzbl_load;
 	   else
-	     return ix86_cost->int_store[0] + 4;
+	     return ix86_cost->hard_register.int_store[0] + 4;
 	  }
 	break;
       case 2:
 	if (in == 2)
-	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
-	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
+	  return MAX (ix86_cost->hard_register.int_load[1],
+		      ix86_cost->hard_register.int_store[1]);
+	return in ? ix86_cost->hard_register.int_load[1]
+		  : ix86_cost->hard_register.int_store[1];
       default:
 	if (in == 2)
-	  cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
+	  cost = MAX (ix86_cost->hard_register.int_load[2],
+		      ix86_cost->hard_register.int_store[2]);
 	else if (in)
-	  cost = ix86_cost->int_load[2];
+	  cost = ix86_cost->hard_register.int_load[2];
 	else
-	  cost = ix86_cost->int_store[2];
+	  cost = ix86_cost->hard_register.int_store[2];
 	/* Multiply with the number of GPR moves needed.  */
 	return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
     }
@@ -18600,20 +18610,21 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
        because of missing QImode and HImode moves to, from or between
        MMX/SSE registers.  */
     return MAX (8, SSE_CLASS_P (class1)
-		? ix86_cost->sse_to_integer : ix86_cost->integer_to_sse);
+		? ix86_cost->hard_register.sse_to_integer
+		: ix86_cost->hard_register.integer_to_sse);
 
   if (MAYBE_FLOAT_CLASS_P (class1))
-    return ix86_cost->fp_move;
+    return ix86_cost->hard_register.fp_move;
   if (MAYBE_SSE_CLASS_P (class1))
     {
       if (GET_MODE_BITSIZE (mode) <= 128)
-	return ix86_cost->xmm_move;
+	return ix86_cost->hard_register.xmm_move;
       if (GET_MODE_BITSIZE (mode) <= 256)
-	return ix86_cost->ymm_move;
-      return ix86_cost->zmm_move;
+	return ix86_cost->hard_register.ymm_move;
+      return ix86_cost->hard_register.zmm_move;
     }
   if (MAYBE_MMX_CLASS_P (class1))
-    return ix86_cost->mmx_move;
+    return ix86_cost->hard_register.mmx_move;
   return 2;
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 800d7c4c4e3..e0a77e1fb25 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -237,9 +237,46 @@ struct stringop_algs
   } size [MAX_STRINGOP_ALGS];
 };
 
-/* Define the specific costs for a given cpu */
+/* Define the specific costs for a given cpu.  NB: hard_register is used
+   by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute
+   hard register move costs by register allocator.  Relative costs of
+   pseudo register load and store versus pseudo register moves in RTL
+   expressions for TARGET_RTX_COSTS can be different from relative
+   costs of hard registers to get the most efficient operations with
+   pseudo registers.  */
 
 struct processor_costs {
+  /* Costs used by register allocator.  integer->integer register move
+     cost is 2.  */
+  struct
+    {
+      const int movzbl_load;	/* cost of loading using movzbl */
+      const int int_load[3];	/* cost of loading integer registers
+				   in QImode, HImode and SImode relative
+				   to reg-reg move (2).  */
+      const int int_store[3];	/* cost of storing integer register
+				   in QImode, HImode and SImode */
+      const int fp_move;	/* cost of reg,reg fld/fst */
+      const int fp_load[3];	/* cost of loading FP register
+				   in SFmode, DFmode and XFmode */
+      const int fp_store[3];	/* cost of storing FP register
+				   in SFmode, DFmode and XFmode */
+      const int mmx_move;	/* cost of moving MMX register.  */
+      const int mmx_load[2];	/* cost of loading MMX register
+				   in SImode and DImode */
+      const int mmx_store[2];	/* cost of storing MMX register
+				   in SImode and DImode */
+      const int xmm_move;	/* cost of moving XMM register.  */
+      const int ymm_move;	/* cost of moving XMM register.  */
+      const int zmm_move;	/* cost of moving XMM register.  */
+      const int sse_load[5];	/* cost of loading SSE register
+				   in 32bit, 64bit, 128bit, 256bit and 512bit */
+      const int sse_store[5];	/* cost of storing SSE register
+				   in SImode, DImode and TImode.  */
+      const int sse_to_integer;	/* cost of moving SSE register to integer.  */
+      const int integer_to_sse;	/* cost of moving integer register to SSE. */
+    } hard_register;
+
   const int add;		/* cost of an add instruction */
   const int lea;		/* cost of a lea instruction */
   const int shift_var;		/* variable shift costs */
@@ -254,32 +291,20 @@ struct processor_costs {
   const int large_insn;		/* insns larger than this cost more */
   const int move_ratio;		/* The threshold of number of scalar
 				   memory-to-memory move insns.  */
-  const int movzbl_load;	/* cost of loading using movzbl */
   const int int_load[3];	/* cost of loading integer registers
 				   in QImode, HImode and SImode relative
 				   to reg-reg move (2).  */
   const int int_store[3];	/* cost of storing integer register
 				   in QImode, HImode and SImode */
-  const int fp_move;		/* cost of reg,reg fld/fst */
-  const int fp_load[3];		/* cost of loading FP register
-				   in SFmode, DFmode and XFmode */
-  const int fp_store[3];	/* cost of storing FP register
-				   in SFmode, DFmode and XFmode */
-  const int mmx_move;		/* cost of moving MMX register.  */
-  const int mmx_load[2];	/* cost of loading MMX register
-				   in SImode and DImode */
-  const int mmx_store[2];	/* cost of storing MMX register
-				   in SImode and DImode */
-  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
-	    zmm_move;
   const int sse_load[5];	/* cost of loading SSE register
 				   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_store[5];	/* cost of storing SSE register
-				   in SImode, DImode and TImode.  */
+				   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_unaligned_store[5];/* cost of unaligned store.  */
+  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
+	    zmm_move;
   const int sse_to_integer;	/* cost of moving SSE register to integer.  */
-  const int integer_to_sse;	/* cost of moving integer register to SSE. */
   const int gather_static, gather_per_elt; /* Cost of gather load is computed
 				   as static + per_item * nelts. */
   const int scatter_static, scatter_per_elt; /* Cost of gather store is
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 8b963c07051..ad9ea4bfa08 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -36,6 +36,30 @@ static stringop_algs ix86_size_memset[2] = {
 
 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {2, 2, 2},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  3,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {3, 3},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+  {3, 3, 3, 3, 3},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_BYTES (2),			/* cost of an add instruction */
   COSTS_N_BYTES (3),			/* cost of a lea instruction */
   COSTS_N_BYTES (2),			/* variable shift costs */
@@ -55,33 +79,20 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (3),			/* cost of movzx */
   0,					/* "large" insn */
   2,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2. */
-  2,				     /* cost for loading QImode using movzbl */
   {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 2, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 2},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {2, 2, 2},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  3,					/* cost of moving MMX register */
-  {3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {3, 3},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
-  {3, 3, 3, 3, 3},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {3, 3, 3, 3, 3},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
 					   in 128bit, 256bit and 512bit */
-  {3, 3, 3, 3, 3},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {3, 3, 3, 3, 3},				/* cost of unaligned SSE store
+  {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   5, 0,					/* Gather load static, per_elt.  */
   5, 0,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
@@ -127,6 +138,30 @@ static stringop_algs i386_memset[2] = {
 
 static const
 struct processor_costs i386_cost = {	/* 386 specific costs */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (3),			/* variable shift costs */
@@ -146,32 +181,18 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (2),			/* cost of movzx */
   15,					/* "large" insn */
   3,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {8, 8, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {8, 8, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
@@ -216,6 +237,30 @@ static stringop_algs i486_memset[2] = {
 
 static const
 struct processor_costs i486_cost = {	/* 486 specific costs */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (3),			/* variable shift costs */
@@ -235,32 +280,18 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (2),			/* cost of movzx */
   15,					/* "large" insn */
   3,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {8, 8, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {8, 8, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   4,					/* size of l1 cache.  486 has 8kB cache
@@ -307,6 +338,30 @@ static stringop_algs pentium_memset[2] = {
 
 static const
 struct processor_costs pentium_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  8,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (4),			/* variable shift costs */
@@ -326,32 +381,18 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   6,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  8,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -389,6 +430,30 @@ struct processor_costs pentium_cost = {
 
 static const
 struct processor_costs lakemont_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  8,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -408,32 +473,18 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  8,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -486,6 +537,30 @@ static stringop_algs pentiumpro_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs pentiumpro_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -505,32 +580,18 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   6,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 2, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -574,6 +635,30 @@ static stringop_algs geode_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs geode_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 6, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (2),			/* variable shift costs */
@@ -593,33 +678,18 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   4,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,				     /* cost for loading QImode using movzbl */
   {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 2, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 2},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 6, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
   2, 2,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -663,6 +733,30 @@ static stringop_algs k6_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs k6_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  3,				     /* cost for loading QImode using movzbl */
+  {4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 3, 2},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 4},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -682,32 +776,18 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   4,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  3,				     /* cost for loading QImode using movzbl */
   {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 3, 2},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {6, 6, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 4},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
   2, 2,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -757,6 +837,30 @@ static stringop_algs athlon_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs athlon_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 12, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  5, 5,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -776,32 +880,18 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {3, 4, 3},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {4, 4, 12},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 4},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 12, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 4, 12, 12, 24},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
-  5, 5,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -853,6 +943,30 @@ static stringop_algs k8_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs k8_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 3, 12, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  5, 5,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -872,32 +986,18 @@ struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {3, 4, 3},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {4, 4, 12},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 12, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 3, 12, 12, 24},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
-  5, 5,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -953,28 +1053,7 @@ static stringop_algs amdfam10_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 struct processor_costs amdfam10_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (2),			/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),			/*				 HI */
-   COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (5)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),			/*			    HI */
-   COSTS_N_INSNS (51),			/*			    SI */
-   COSTS_N_INSNS (83),			/*			    DI */
-   COSTS_N_INSNS (83)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
   4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -993,11 +1072,10 @@ struct processor_costs amdfam10_cost = {
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+
   					/* On K8:
   					    MOVD reg64, xmmreg Double FSTORE 4
 					    MOVD reg32, xmmreg Double FSTORE 4
@@ -1006,6 +1084,39 @@ struct processor_costs amdfam10_cost = {
 							       1/1  1/1
 					    MOVD reg32, xmmreg Double FADD 3
 							       1/1  1/1 */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  {4, 4, 3, 6, 12},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 5, 10, 20},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1062,6 +1173,30 @@ static stringop_algs bdver_memset[2] = {
              {-1, libcall, false}}}};
 
 const struct processor_costs bdver_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 8},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {10, 10, 18},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  4,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {10, 10},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  16, 20,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1081,32 +1216,18 @@ const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
   {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {8, 8, 8},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode */
-  {10, 10, 18},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 40, 60},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
-  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
-  16, 20,				/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  16,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
@@ -1164,6 +1285,37 @@ static stringop_algs znver1_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 	     {-1, libcall, false}}}};
 struct processor_costs znver1_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make practical difference to bump prices up even more.  */
+  6,					/* cost for loading QImode using
+					   movzbl.  */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 8},				/* cost of storing integer
+					   registers.  */
+  2,					/* cost of reg,reg fld/fst.  */
+  {6, 6, 16},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode.  */
+  {8, 8, 16},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode.  */
+  2,					/* cost of moving MMX register.  */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode.  */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode.  */
+  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
+  {6, 6, 6, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+  6, 6,					/* SSE->integer and integer->SSE moves.  */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
   COSTS_N_INSNS (1),			/* variable shift costs.  */
@@ -1186,39 +1338,19 @@ struct processor_costs znver1_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx.  */
   8,					/* "large" insn.  */
   9,					/* MOVE_RATIO.  */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-
-  /* reg-reg moves are done by renaming and thus they are even cheaper than
-     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
-     to doubles of latencies, we do not model this correctly.  It does not
-     seem to make practical difference to bump prices up even more.  */
-  6,					/* cost for loading QImode using
-					   movzbl.  */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {8, 8, 8},				/* cost of storing integer
 					   registers.  */
-  2,					/* cost of reg,reg fld/fst.  */
-  {6, 6, 16},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode.  */
-  {8, 8, 16},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode.  */
-  2,					/* cost of moving MMX register.  */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode.  */
-  {8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode.  */
-  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit.  */
+  {6, 6, 6, 12, 24},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit.  */
   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves.  */
+  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
+  6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
      is 7 uops.  */
@@ -1288,31 +1420,7 @@ static stringop_algs znver2_memset[2] = {
 	     {-1, libcall, false}}}};
 
 struct processor_costs znver2_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction.  */
-  COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
-  COSTS_N_INSNS (1),			/* variable shift costs.  */
-  COSTS_N_INSNS (1),			/* constant shift costs.  */
-  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
-   COSTS_N_INSNS (3),			/* 				 HI.  */
-   COSTS_N_INSNS (3),			/*				 SI.  */
-   COSTS_N_INSNS (3),			/*				 DI.  */
-   COSTS_N_INSNS (3)},			/*			other.  */
-  0,					/* cost of multiply per each bit
-					   set.  */
-   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
-      bound.  */
-  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
-   COSTS_N_INSNS (22),			/* 			    HI.  */
-   COSTS_N_INSNS (30),			/*			    SI.  */
-   COSTS_N_INSNS (45),			/*			    DI.  */
-   COSTS_N_INSNS (45)},			/*			    other.  */
-  COSTS_N_INSNS (1),			/* cost of movsx.  */
-  COSTS_N_INSNS (1),			/* cost of movzx.  */
-  8,					/* "large" insn.  */
-  9,					/* MOVE_RATIO.  */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2.  */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
 
   /* reg-reg moves are done by renaming and thus they are even cheaper than
      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
@@ -1339,12 +1447,48 @@ struct processor_costs znver2_cost = {
 					   register.  */
   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE
 					   moves.  */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction.  */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),			/* variable shift costs.  */
+  COSTS_N_INSNS (1),			/* constant shift costs.  */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),			/* 				 HI.  */
+   COSTS_N_INSNS (3),			/*				 SI.  */
+   COSTS_N_INSNS (3),			/*				 DI.  */
+   COSTS_N_INSNS (3)},			/*			other.  */
+  0,					/* cost of multiply per each bit
+					   set.  */
+   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
+      bound.  */
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),			/* 			    HI.  */
+   COSTS_N_INSNS (30),			/*			    SI.  */
+   COSTS_N_INSNS (45),			/*			    DI.  */
+   COSTS_N_INSNS (45)},			/*			    other.  */
+  COSTS_N_INSNS (1),			/* cost of movsx.  */
+  COSTS_N_INSNS (1),			/* cost of movzx.  */
+  8,					/* "large" insn.  */
+  9,					/* MOVE_RATIO.  */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 8},				/* cost of storing integer
+					   registers.  */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 8, 16},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
+  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  2, 2, 3,				/* cost of moving XMM,YMM,ZMM
+					   register.  */
+  6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
      is 7 uops.  */
@@ -1416,6 +1560,30 @@ static stringop_algs skylake_memset[2] = {
 
 static const
 struct processor_costs skylake_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 3},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  2, 2,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1437,30 +1605,18 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (0),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 3},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 20},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 12, 24},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
-  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
-  2, 2,					/* SSE->integer and integer->SSE moves */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  2,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
   22, 10,				/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1509,6 +1665,30 @@ static stringop_algs btver1_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 const struct processor_costs btver1_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {6, 8, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 8, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {12, 12, 38},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  4,					/* cost of moving MMX register */
+  {10, 10},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  14, 14,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1528,32 +1708,18 @@ const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
   {6, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 8, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {12, 12, 38},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {10, 10},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
-  14, 14,				/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -1600,6 +1766,30 @@ static stringop_algs btver2_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 const struct processor_costs btver2_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {12, 12, 38},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  4,					/* cost of moving MMX register */
+  {10, 10},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  14, 14,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1619,32 +1809,18 @@ const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
   {8, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {8, 8, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {12, 12, 38},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {10, 10},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
-  14, 14,				/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -1690,28 +1866,7 @@ static stringop_algs pentium4_memset[2] = {
 
 static const
 struct processor_costs pentium4_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (3),			/* cost of a lea instruction */
-  COSTS_N_INSNS (4),			/* variable shift costs */
-  COSTS_N_INSNS (4),			/* constant shift costs */
-  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (15),			/*				 HI */
-   COSTS_N_INSNS (15),			/*				 SI */
-   COSTS_N_INSNS (15),			/*				 DI */
-   COSTS_N_INSNS (15)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (56),			/*			    HI */
-   COSTS_N_INSNS (56),			/*			    SI */
-   COSTS_N_INSNS (56),			/*			    DI */
-   COSTS_N_INSNS (56)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  16,					/* "large" insn */
-  6,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
   5,				     /* cost for loading QImode using movzbl */
   {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -1730,11 +1885,42 @@ struct processor_costs pentium4_cost = {
   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  20, 12,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (3),			/* cost of a lea instruction */
+  COSTS_N_INSNS (4),			/* variable shift costs */
+  COSTS_N_INSNS (4),			/* constant shift costs */
+  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (15),			/*				 HI */
+   COSTS_N_INSNS (15),			/*				 SI */
+   COSTS_N_INSNS (15),			/*				 DI */
+   COSTS_N_INSNS (15)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (56),			/*			    HI */
+   COSTS_N_INSNS (56),			/*			    SI */
+   COSTS_N_INSNS (56),			/*			    DI */
+   COSTS_N_INSNS (56)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  16,					/* "large" insn */
+  6,					/* MOVE_RATIO */
+  {4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 3, 2},				/* cost of storing integer registers */
+  {16, 16, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {16, 16, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
-  {16, 16, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
-  20, 12,				/* SSE->integer and integer->SSE moves */
+  12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
+  20,					/* cost of moving SSE register to integer.  */
   16, 16,				/* Gather load static, per_elt.  */
   16, 16,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -1783,6 +1969,30 @@ static stringop_algs nocona_memset[2] = {
 
 static const
 struct processor_costs nocona_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  12,					/* cost of reg,reg fld/fst */
+  {14, 14, 14},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {14, 14, 14},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  14,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 12, 24, 48},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  20, 12,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1802,32 +2012,18 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   16,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {4, 4, 4},				/* cost of storing integer registers */
-  12,					/* cost of reg,reg fld/fst */
-  {14, 14, 14},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {14, 14, 14},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  14,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 12, 24, 48},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {12, 12, 12, 24, 48},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
-  {12, 12, 12, 24, 48},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
-  20, 12,				/* SSE->integer and integer->SSE moves */
+  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
+  20,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
   12, 12,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -1874,6 +2070,30 @@ static stringop_algs atom_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs atom_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,					/* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 18},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {14, 14, 24},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {10, 10},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  8, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1893,32 +2113,18 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,					/* cost for loading QImode using movzbl */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {6, 6, 18},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {14, 14, 24},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
-  8, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
   8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -1965,6 +2171,30 @@ static stringop_algs slm_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs slm_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,					/* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 18},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 18},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  8, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1984,32 +2214,18 @@ struct processor_costs slm_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,					/* cost for loading QImode using movzbl */
   {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {8, 8, 18},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 18},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE register
+					   in SImode, DImode and TImode.  */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
-  8, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
   8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2056,6 +2272,30 @@ static stringop_algs intel_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs intel_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  4, 4,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -2075,32 +2315,18 @@ struct processor_costs intel_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 6},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  4, 4,					/* SSE->integer and integer->SSE moves */
+  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
+  4,					/* cost of moving SSE register to integer.  */
   6, 6,					/* Gather load static, per_elt.  */
   6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2151,6 +2377,30 @@ static stringop_algs generic_memset[2] = {
              {-1, libcall, false}}}};
 static const
 struct processor_costs generic_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 12},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* Setting cost to 2 makes our current implementation of synth_mult result in
      use of unnecessary temporary registers causing regression on several
@@ -2173,32 +2423,18 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {6, 6, 12},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 12},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
-  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
-  6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
   18, 6,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2251,6 +2487,30 @@ static stringop_algs core_memset[2] = {
 
 static const
 struct processor_costs core_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  2, 2,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
      this cost however our current implementation of synth_mult results in
@@ -2277,32 +2537,18 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
-  {6, 6, 6, 6, 12},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
-  2, 2,					/* SSE->integer and integer->SSE moves */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  2,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
      rec. throughput 6.
      So 5 uops statically and one uops per load.  */