From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Tue, 21 Apr 2015 12:56:39 +0000 (+0000)
Subject: [expmed] Properly account for the cost and latency of shift+add ops when synthesizing mults
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=35430ca0c6c34943932b111cb55475f292a0a208;p=gcc.git

[expmed] Properly account for the cost and latency of shift+add ops when synthesizing mults

        * expmed.c (synth_mult): Only assume overlapping
        shift with previous steps in alg_sub_t_m2 case.

        * gcc.target/aarch64/mult-synth_1.c: New test.
        * gcc.target/aarch64/mult-synth_2.c: Likewise.
        * gcc.target/aarch64/mult-synth_3.c: Likewise.
        * gcc.target/aarch64/mult-synth_4.c: Likewise.
        * gcc.target/aarch64/mult-synth_5.c: Likewise.
        * gcc.target/aarch64/mult-synth_6.c: Likewise.

From-SVN: r222268
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 85c2ba33aed..1e56a373bdc 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2015-04-21  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* expmed.c (synth_mult): Only assume overlapping
+	shift with previous steps in alg_sub_t_m2 case.
+
 2015-04-21  Richard Biener  <rguenther@suse.de>
 
 	PR tree-optimization/65650
diff --git a/gcc/expmed.c b/gcc/expmed.c
index 6327629d458..6679f501c2c 100644
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -2664,14 +2664,28 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       m = exact_log2 (-orig_t + 1);
       if (m >= 0 && m < maxm)
 	{
-	  op_cost = shiftsub1_cost (speed, mode, m);
+	  op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
+	  /* If the target has a cheap shift-and-subtract insn use
+	     that in preference to a shift insn followed by a sub insn.
+	     Assume that the shift-and-sub is "atomic" with a latency
+	     equal to its cost, otherwise assume that on superscalar
+	     hardware the shift may be executed concurrently with the
+	     earlier steps in the algorithm.  */
+	  if (shiftsub1_cost (speed, mode, m) <= op_cost)
+	    {
+	      op_cost = shiftsub1_cost (speed, mode, m);
+	      op_latency = op_cost;
+	    }
+	  else
+	    op_latency = add_cost (speed, mode);
+
 	  new_limit.cost = best_cost.cost - op_cost;
-	  new_limit.latency = best_cost.latency - op_cost;
+	  new_limit.latency = best_cost.latency - op_latency;
 	  synth_mult (alg_in, (unsigned HOST_WIDE_INT) (-orig_t + 1) >> m,
 		      &new_limit, mode);
 
 	  alg_in->cost.cost += op_cost;
-	  alg_in->cost.latency += op_cost;
+	  alg_in->cost.latency += op_latency;
 	  if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
 	    {
 	      best_cost = alg_in->cost;
@@ -2704,20 +2718,12 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       if (t % d == 0 && t > d && m < maxm
 	  && (!cache_hit || cache_alg == alg_add_factor))
 	{
-	  /* If the target has a cheap shift-and-add instruction use
-	     that in preference to a shift insn followed by an add insn.
-	     Assume that the shift-and-add is "atomic" with a latency
-	     equal to its cost, otherwise assume that on superscalar
-	     hardware the shift may be executed concurrently with the
-	     earlier steps in the algorithm.  */
 	  op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
-	  if (shiftadd_cost (speed, mode, m) < op_cost)
-	    {
-	      op_cost = shiftadd_cost (speed, mode, m);
-	      op_latency = op_cost;
-	    }
-	  else
-	    op_latency = add_cost (speed, mode);
+	  if (shiftadd_cost (speed, mode, m) <= op_cost)
+	    op_cost = shiftadd_cost (speed, mode, m);
+
+	  op_latency = op_cost;
+
 
 	  new_limit.cost = best_cost.cost - op_cost;
 	  new_limit.latency = best_cost.latency - op_latency;
@@ -2742,20 +2748,11 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       if (t % d == 0 && t > d && m < maxm
 	  && (!cache_hit || cache_alg == alg_sub_factor))
 	{
-	  /* If the target has a cheap shift-and-subtract insn use
-	     that in preference to a shift insn followed by a sub insn.
-	     Assume that the shift-and-sub is "atomic" with a latency
-	     equal to it's cost, otherwise assume that on superscalar
-	     hardware the shift may be executed concurrently with the
-	     earlier steps in the algorithm.  */
 	  op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
-	  if (shiftsub0_cost (speed, mode, m) < op_cost)
-	    {
-	      op_cost = shiftsub0_cost (speed, mode, m);
-	      op_latency = op_cost;
-	    }
-	  else
-	    op_latency = add_cost (speed, mode);
+	  if (shiftsub0_cost (speed, mode, m) <= op_cost)
+	    op_cost = shiftsub0_cost (speed, mode, m);
+
+	  op_latency = op_cost;
 
 	  new_limit.cost = best_cost.cost - op_cost;
 	  new_limit.latency = best_cost.latency - op_latency;
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 5573e423ca8..c031a3afc88 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,12 @@
+2015-04-21  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* gcc.target/aarch64/mult-synth_1.c: New test.
+	* gcc.target/aarch64/mult-synth_2.c: Likewise.
+	* gcc.target/aarch64/mult-synth_3.c: Likewise.
+	* gcc.target/aarch64/mult-synth_4.c: Likewise.
+	* gcc.target/aarch64/mult-synth_5.c: Likewise.
+	* gcc.target/aarch64/mult-synth_6.c: Likewise.
+
 2015-04-21  Richard Biener  <rguenther@suse.de>
 
 	PR tree-optimization/65650
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c
new file mode 100644
index 00000000000..37f079d5beb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 100;
+}
+
+/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c
new file mode 100644
index 00000000000..4d2e5bf3dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 25;
+}
+
+/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c
new file mode 100644
index 00000000000..03e83e97b50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 11;
+}
+
+/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c
new file mode 100644
index 00000000000..05a82bdffe0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+long
+foo (int x, int y)
+{
+   return (long)x * 6L;
+}
+
+/* { dg-final { scan-assembler-times "smull\tx\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c
new file mode 100644
index 00000000000..8cf3314aa2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 10;
+}
+
+/* { dg-final { scan-assembler-not "\tw1" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c
new file mode 100644
index 00000000000..e941b72351e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 20;
+}
+
+/* { dg-final { scan-assembler-not "\tw1" } } */
+/* { dg-final { cleanup-saved-temps } } */