From 262767053ee39b37d4e34b4fae01bfb1697bd081 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Thu, 2 Sep 2004 02:00:55 +0000 Subject: [PATCH] expmed.c (enum alg_code): Remove long unused enumeration values. * expmed.c (enum alg_code): Remove long unused enumeration values. (struct mult_cost): New structure to hold the "score" of a synthetic multiply sequence, including both a rtx_cost and a latency field. (MULT_COST_LESS): New macro to compare mult_cost to a constant. (CHEAPER_MULT_COST): New macro to compare two mult_costs. (struct algorithm): Change type of cost field to be mult_cost. (synth_mult): Change type of cost_limit argument to be a pointer to a mult_cost. Update all cost comparisons to use the new mult_cost infrastructure. For alg_add_factor and alg_sub_factor operations, latency is lower than the rtx_cost. (choose_mult_variant): Update calls to synth_mult. Perform cost comparisons using the new mult_cost infrastructure. (expand_mult_highpart): Use alg.cost.cost instead of alg.cost to optain the total rtx_cost of a synth_mult "algorithm". From-SVN: r86954 --- gcc/ChangeLog | 17 ++++ gcc/expmed.c | 257 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 201 insertions(+), 73 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index cfa6e35f3c1..8a4a14b86ba 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2004-09-01 Roger Sayle + + * expmed.c (enum alg_code): Remove long unused enumeration values. + (struct mult_cost): New structure to hold the "score" of a synthetic + multiply sequence, including both a rtx_cost and a latency field. + (MULT_COST_LESS): New macro to compare mult_cost to a constant. + (CHEAPER_MULT_COST): New macro to compare two mult_costs. + (struct algorithm): Change type of cost field to be mult_cost. + (synth_mult): Change type of cost_limit argument to be a + pointer to a mult_cost. Update all cost comparisons to use the + new mult_cost infrastructure. 
For alg_add_factor and + alg_sub_factor operations, latency is lower than the rtx_cost. + (choose_mult_variant): Update calls to synth_mult. Perform + cost comparisons using the new mult_cost infrastructure. + (expand_mult_highpart): Use alg.cost.cost instead of alg.cost + to obtain the total rtx_cost of a synth_mult "algorithm". + 2004-09-01 David Edelsohn * config/rs6000/power4.md: Increase store latency to 12. diff --git a/gcc/expmed.c b/gcc/expmed.c index 034e49a293d..10084e5ae8b 100644 --- a/gcc/expmed.c +++ b/gcc/expmed.c @@ -2153,8 +2153,37 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted, enum alg_code { alg_zero, alg_m, alg_shift, alg_add_t_m2, alg_sub_t_m2, alg_add_factor, alg_sub_factor, - alg_add_t2_m, alg_sub_t2_m, - alg_add, alg_subtract, alg_factor, alg_shiftop }; + alg_add_t2_m, alg_sub_t2_m }; + +/* This structure holds the "cost" of a multiply sequence. The + "cost" field holds the total rtx_cost of every operator in the + synthetic multiplication sequence, hence cost(a op b) is defined + as rtx_cost(op) + cost(a) + cost(b), where cost(leaf) is zero. + The "latency" field holds the minimum possible latency of the + synthetic multiply, on a hypothetical infinitely parallel CPU. + This is the critical path, or the maximum height, of the expression + tree which is the sum of rtx_costs on the most expensive path from + any leaf to the root. Hence latency(a op b) is defined as zero for + leaves and rtx_cost(op) + max(latency(a), latency(b)) otherwise. */ + +struct mult_cost { + short cost; /* Total rtx_cost of the multiplication sequence. */ + short latency; /* The latency of the multiplication sequence. */ +}; + +/* This macro is used to compare a pointer to a mult_cost against a + single integer "rtx_cost" value. This is equivalent to the macro + CHEAPER_MULT_COST(X,Z) where Z = {Y,Y}. 
*/ +#define MULT_COST_LESS(X,Y) ((X)->cost < (Y) \ + || ((X)->cost == (Y) && (X)->latency < (Y))) + +/* This macro is used to compare two pointers to mult_costs against + each other. The macro returns true if X is cheaper than Y. + Currently, the cheaper of two mult_costs is the one with the + lower "cost". If "cost"s are tied, the lower latency is cheaper. */ +#define CHEAPER_MULT_COST(X,Y) ((X)->cost < (Y)->cost \ + || ((X)->cost == (Y)->cost \ + && (X)->latency < (Y)->latency)) /* This structure records a sequence of operations. `ops' is the number of operations recorded. @@ -2177,7 +2206,7 @@ enum alg_code { alg_zero, alg_m, alg_shift, struct algorithm { - short cost; + struct mult_cost cost; short ops; /* The size of the OP and LOG fields are not directly related to the word size, but the worst-case algorithms will be if we have few @@ -2195,7 +2224,7 @@ struct algorithm enum mult_variant {basic_variant, negate_variant, add_variant}; static void synth_mult (struct algorithm *, unsigned HOST_WIDE_INT, - int, enum machine_mode mode); + const struct mult_cost *, enum machine_mode mode); static bool choose_mult_variant (enum machine_mode, HOST_WIDE_INT, struct algorithm *, enum mult_variant *, int); static rtx expand_mult_const (enum machine_mode, rtx, HOST_WIDE_INT, rtx, @@ -2215,19 +2244,22 @@ static rtx expand_mult_highpart_optab (enum machine_mode, rtx, rtx, rtx, static void synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, - int cost_limit, enum machine_mode mode) + const struct mult_cost *cost_limit, enum machine_mode mode) { int m; struct algorithm *alg_in, *best_alg; - int cost; + struct mult_cost best_cost; + struct mult_cost new_limit; + int op_cost, op_latency; unsigned HOST_WIDE_INT q; int maxm = MIN (BITS_PER_WORD, GET_MODE_BITSIZE (mode)); /* Indicate that no algorithm is yet found. If no algorithm is found, this value will be returned and indicate failure. 
*/ - alg_out->cost = cost_limit; + alg_out->cost.cost = cost_limit->cost + 1; - if (cost_limit <= 0) + if (cost_limit->cost < 0 + || (cost_limit->cost == 0 && cost_limit->latency <= 0)) return; /* Restrict the bits of "t" to the multiplication's mode. */ @@ -2237,7 +2269,8 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, if (t == 1) { alg_out->ops = 1; - alg_out->cost = 0; + alg_out->cost.cost = 0; + alg_out->cost.latency = 0; alg_out->op[0] = alg_m; return; } @@ -2246,12 +2279,13 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, fail now. */ if (t == 0) { - if (zero_cost >= cost_limit) + if (MULT_COST_LESS (cost_limit, zero_cost)) return; else { alg_out->ops = 1; - alg_out->cost = zero_cost; + alg_out->cost.cost = zero_cost; + alg_out->cost.latency = zero_cost; alg_out->op[0] = alg_zero; return; } @@ -2261,6 +2295,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, alg_in = alloca (sizeof (struct algorithm)); best_alg = alloca (sizeof (struct algorithm)); + best_cost = *cost_limit; /* If we have a group of zero bits at the low-order part of T, try multiplying by the remaining bits and then doing a shift. */ @@ -2274,19 +2309,22 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, /* The function expand_shift will choose between a shift and a sequence of additions, so the observed cost is given as MIN (m * add_cost[mode], shift_cost[mode][m]). 
*/ - cost = m * add_cost[mode]; - if (shift_cost[mode][m] < cost) - cost = shift_cost[mode][m]; - synth_mult (alg_in, q, cost_limit - cost, mode); - - cost += alg_in->cost; - if (cost < cost_limit) + op_cost = m * add_cost[mode]; + if (shift_cost[mode][m] < op_cost) + op_cost = shift_cost[mode][m]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, q, &new_limit, mode); + + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_shift; - cost_limit = cost; } } } @@ -2311,34 +2349,40 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, { /* T ends with ...111. Multiply by (T + 1) and subtract 1. */ - cost = add_cost[mode]; - synth_mult (alg_in, t + 1, cost_limit - cost, mode); + op_cost = add_cost[mode]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, t + 1, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = 0; best_alg->op[best_alg->ops] = alg_sub_t_m2; - cost_limit = cost; } } else { /* T ends with ...01 or ...011. Multiply by (T - 1) and add 1. 
*/ - cost = add_cost[mode]; - synth_mult (alg_in, t - 1, cost_limit - cost, mode); + op_cost = add_cost[mode]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, t - 1, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = 0; best_alg->op[best_alg->ops] = alg_add_t_m2; - cost_limit = cost; } } } @@ -2360,19 +2404,36 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, d = ((unsigned HOST_WIDE_INT) 1 << m) + 1; if (t % d == 0 && t > d && m < maxm) { - cost = add_cost[mode] + shift_cost[mode][m]; - if (shiftadd_cost[mode][m] < cost) - cost = shiftadd_cost[mode][m]; - synth_mult (alg_in, t / d, cost_limit - cost, mode); + /* If the target has a cheap shift-and-add instruction use + that in preference to a shift insn followed by an add insn. + Assume that the shift-and-add is "atomic" with a latency + equal to it's cost, otherwise assume that on superscalar + hardware the shift may be executed concurrently with the + earlier steps in the algorithm. 
*/ + op_cost = add_cost[mode] + shift_cost[mode][m]; + if (shiftadd_cost[mode][m] < op_cost) + { + op_cost = shiftadd_cost[mode][m]; + op_latency = op_cost; + } + else + op_latency = add_cost[mode]; + + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_latency; + synth_mult (alg_in, t / d, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_latency; + if (alg_in->cost.latency < op_cost) + alg_in->cost.latency = op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_add_factor; - cost_limit = cost; } /* Other factors will have been taken care of in the recursion. */ break; @@ -2381,19 +2442,36 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, d = ((unsigned HOST_WIDE_INT) 1 << m) - 1; if (t % d == 0 && t > d && m < maxm) { - cost = add_cost[mode] + shift_cost[mode][m]; - if (shiftsub_cost[mode][m] < cost) - cost = shiftsub_cost[mode][m]; - synth_mult (alg_in, t / d, cost_limit - cost, mode); + /* If the target has a cheap shift-and-subtract insn use + that in preference to a shift insn followed by a sub insn. + Assume that the shift-and-sub is "atomic" with a latency + equal to it's cost, otherwise assume that on superscalar + hardware the shift may be executed concurrently with the + earlier steps in the algorithm. 
*/ + op_cost = add_cost[mode] + shift_cost[mode][m]; + if (shiftsub_cost[mode][m] < op_cost) + { + op_cost = shiftsub_cost[mode][m]; + op_latency = op_cost; + } + else + op_latency = add_cost[mode]; + + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_latency; + synth_mult (alg_in, t / d, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_latency; + if (alg_in->cost.latency < op_cost) + alg_in->cost.latency = op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_sub_factor; - cost_limit = cost; } break; } @@ -2408,17 +2486,20 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, m = exact_log2 (q); if (m >= 0 && m < maxm) { - cost = shiftadd_cost[mode][m]; - synth_mult (alg_in, (t - 1) >> m, cost_limit - cost, mode); - - cost += alg_in->cost; - if (cost < cost_limit) + op_cost = shiftadd_cost[mode][m]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, (t - 1) >> m, &new_limit, mode); + + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_add_t2_m; - cost_limit = cost; } } @@ -2427,36 +2508,38 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, m = exact_log2 (q); if (m >= 0 && m < maxm) { - cost = shiftsub_cost[mode][m]; - synth_mult (alg_in, (t + 1) >> m, cost_limit - cost, mode); - - cost += alg_in->cost; - if (cost < cost_limit) + op_cost = shiftsub_cost[mode][m]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + 
synth_mult (alg_in, (t + 1) >> m, &new_limit, mode); + + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_sub_t2_m; - cost_limit = cost; } } } - /* If cost_limit has not decreased since we stored it in alg_out->cost, - we have not found any algorithm. */ - if (cost_limit == alg_out->cost) - return; - /* If we are getting a too long sequence for `struct algorithm' to record, make this search fail. */ if (best_alg->ops == MAX_BITS_PER_WORD) return; + /* If best_cost has not decreased, we have not found any algorithm. */ + if (!CHEAPER_MULT_COST (&best_cost, cost_limit)) + return; + /* Copy the algorithm from temporary space to the space at alg_out. We avoid using structure assignment because the majority of best_alg is normally undefined, and this is a critical function. 
*/ alg_out->ops = best_alg->ops + 1; - alg_out->cost = cost_limit; + alg_out->cost = best_cost; memcpy (alg_out->op, best_alg->op, alg_out->ops * sizeof *alg_out->op); memcpy (alg_out->log, best_alg->log, @@ -2479,29 +2562,57 @@ choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val, int mult_cost) { struct algorithm alg2; + struct mult_cost limit; + int op_cost; *variant = basic_variant; - synth_mult (alg, val, mult_cost, mode); + limit.cost = mult_cost; + limit.latency = mult_cost; + synth_mult (alg, val, &limit, mode); /* This works only if the inverted value actually fits in an `unsigned int' */ if (HOST_BITS_PER_INT >= GET_MODE_BITSIZE (mode)) { - synth_mult (&alg2, -val, MIN (alg->cost, mult_cost) - neg_cost[mode], - mode); - alg2.cost += neg_cost[mode]; - if (alg2.cost < alg->cost) + op_cost = neg_cost[mode]; + if (MULT_COST_LESS (&alg->cost, mult_cost)) + { + limit.cost = alg->cost.cost - op_cost; + limit.latency = alg->cost.latency - op_cost; + } + else + { + limit.cost = mult_cost - op_cost; + limit.latency = mult_cost - op_cost; + } + + synth_mult (&alg2, -val, &limit, mode); + alg2.cost.cost += op_cost; + alg2.cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg2.cost, &alg->cost)) *alg = alg2, *variant = negate_variant; } /* This proves very useful for division-by-constant. 
*/ - synth_mult (&alg2, val - 1, MIN (alg->cost, mult_cost) - add_cost[mode], - mode); - alg2.cost += add_cost[mode]; - if (alg2.cost < alg->cost) + op_cost = add_cost[mode]; + if (MULT_COST_LESS (&alg->cost, mult_cost)) + { + limit.cost = alg->cost.cost - op_cost; + limit.latency = alg->cost.latency - op_cost; + } + else + { + limit.cost = mult_cost - op_cost; + limit.latency = mult_cost - op_cost; + } + + synth_mult (&alg2, val - 1, &limit, mode); + alg2.cost.cost += op_cost; + alg2.cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg2.cost, &alg->cost)) *alg = alg2, *variant = add_variant; - return alg->cost < mult_cost; + return MULT_COST_LESS (&alg->cost, mult_cost); } /* A subroutine of expand_mult, used for constant multiplications. @@ -3074,8 +3185,8 @@ expand_mult_highpart (enum machine_mode mode, rtx op0, { /* See whether the specialized multiplication optabs are cheaper than the shift/add version. */ - tem = expand_mult_highpart_optab (mode, op0, op1, target, - unsignedp, alg.cost + extra_cost); + tem = expand_mult_highpart_optab (mode, op0, op1, target, unsignedp, + alg.cost.cost + extra_cost); if (tem) return tem; -- 2.30.2