[expmed] Properly account for the cost and latency of shift+add ops when synthesizing...

author Kyrylo Tkachov <kyrylo.tkachov@arm.com>

Tue, 21 Apr 2015 12:56:39 +0000 (12:56 +0000)

committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>

Tue, 21 Apr 2015 12:56:39 +0000 (12:56 +0000)
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Tue, 21 Apr 2015 12:56:39 +0000 (12:56 +0000)
committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>
Tue, 21 Apr 2015 12:56:39 +0000 (12:56 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 85c2ba33aed3cfe02b26a0c4ad4b8f0ac26fc5c4..1e56a373bdc07074226b44b88f14dc39e219204d 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2015-04-21  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * expmed.c (synth_mult): Only assume overlapping
+       shift with previous steps in alg_sub_t_m2 case.
+
  2015-04-21  Richard Biener  <rguenther@suse.de>
  
         PR tree-optimization/65650
diff --git a/gcc/expmed.c b/gcc/expmed.c

index 6327629d458fc734fbf26c16a9457e7307f52213..6679f501c2c1eafb63b93f60681dcc305847249a 100644 (file)
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -2664,14 +2664,28 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
        m = exact_log2 (-orig_t + 1);
        if (m >= 0 && m < maxm)
         {
-         op_cost = shiftsub1_cost (speed, mode, m);
+         op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
+         /* If the target has a cheap shift-and-subtract insn use
+            that in preference to a shift insn followed by a sub insn.
+            Assume that the shift-and-sub is "atomic" with a latency
+            equal to its cost, otherwise assume that on superscalar
+            hardware the shift may be executed concurrently with the
+            earlier steps in the algorithm.  */
+         if (shiftsub1_cost (speed, mode, m) <= op_cost)
+           {
+             op_cost = shiftsub1_cost (speed, mode, m);
+             op_latency = op_cost;
+           }
+         else
+           op_latency = add_cost (speed, mode);
+
           new_limit.cost = best_cost.cost - op_cost;
-         new_limit.latency = best_cost.latency - op_cost;
+         new_limit.latency = best_cost.latency - op_latency;
           synth_mult (alg_in, (unsigned HOST_WIDE_INT) (-orig_t + 1) >> m,
                       &new_limit, mode);
  
           alg_in->cost.cost += op_cost;
-         alg_in->cost.latency += op_cost;
+         alg_in->cost.latency += op_latency;
           if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
             {
               best_cost = alg_in->cost;
@@ -2704,20 +2718,12 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
        if (t % d == 0 && t > d && m < maxm
           && (!cache_hit || cache_alg == alg_add_factor))
         {
-         /* If the target has a cheap shift-and-add instruction use
-            that in preference to a shift insn followed by an add insn.
-            Assume that the shift-and-add is "atomic" with a latency
-            equal to its cost, otherwise assume that on superscalar
-            hardware the shift may be executed concurrently with the
-            earlier steps in the algorithm.  */
           op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
-         if (shiftadd_cost (speed, mode, m) < op_cost)
-           {
-             op_cost = shiftadd_cost (speed, mode, m);
-             op_latency = op_cost;
-           }
-         else
-           op_latency = add_cost (speed, mode);
+         if (shiftadd_cost (speed, mode, m) <= op_cost)
+           op_cost = shiftadd_cost (speed, mode, m);
+
+         op_latency = op_cost;
+
  
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_latency;
@@ -2742,20 +2748,11 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
        if (t % d == 0 && t > d && m < maxm
           && (!cache_hit || cache_alg == alg_sub_factor))
         {
-         /* If the target has a cheap shift-and-subtract insn use
-            that in preference to a shift insn followed by a sub insn.
-            Assume that the shift-and-sub is "atomic" with a latency
-            equal to it's cost, otherwise assume that on superscalar
-            hardware the shift may be executed concurrently with the
-            earlier steps in the algorithm.  */
           op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
-         if (shiftsub0_cost (speed, mode, m) < op_cost)
-           {
-             op_cost = shiftsub0_cost (speed, mode, m);
-             op_latency = op_cost;
-           }
-         else
-           op_latency = add_cost (speed, mode);
+         if (shiftsub0_cost (speed, mode, m) <= op_cost)
+           op_cost = shiftsub0_cost (speed, mode, m);
+
+         op_latency = op_cost;
  
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_latency;
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 5573e423ca8616dcc91821f3215a248362eca808..c031a3afc880bd80803db070dd517cbe7c60c1fe 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,12 @@
+2015-04-21  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * gcc.target/aarch64/mult-synth_1.c: New test.
+       * gcc.target/aarch64/mult-synth_2.c: Likewise.
+       * gcc.target/aarch64/mult-synth_3.c: Likewise.
+       * gcc.target/aarch64/mult-synth_4.c: Likewise.
+       * gcc.target/aarch64/mult-synth_5.c: Likewise.
+       * gcc.target/aarch64/mult-synth_6.c: Likewise.
+
  2015-04-21  Richard Biener  <rguenther@suse.de>
  
         PR tree-optimization/65650
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c

new file mode 100644 (file)

index 0000000..37f079d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 100;
+}
+
+/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c

new file mode 100644 (file)

index 0000000..4d2e5bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 25;
+}
+
+/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c

new file mode 100644 (file)

index 0000000..03e83e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 11;
+}
+
+/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c

new file mode 100644 (file)

index 0000000..05a82bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+long
+foo (int x, int y)
+{
+   return (long)x * 6L;
+}
+
+/* { dg-final { scan-assembler-times "smull\tx\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c

new file mode 100644 (file)

index 0000000..8cf3314
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 10;
+}
+
+/* { dg-final { scan-assembler-not "\tw1" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c

new file mode 100644 (file)

index 0000000..e941b72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
+
+int
+foo (int x)
+{
+  return x * 20;
+}
+
+/* { dg-final { scan-assembler-not "\tw1" } } */
+/* { dg-final { cleanup-saved-temps } } */
author	Kyrylo Tkachov <kyrylo.tkachov@arm.com>
	Tue, 21 Apr 2015 12:56:39 +0000 (12:56 +0000)
committer	Kyrylo Tkachov <ktkachov@gcc.gnu.org>
	Tue, 21 Apr 2015 12:56:39 +0000 (12:56 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/expmed.c		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/mult-synth_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/mult-synth_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/mult-synth_3.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/mult-synth_4.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/mult-synth_5.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/mult-synth_6.c	[new file with mode: 0644]	patch \| blob