widening_mul: restrict ops to be defined in the same basic-block when convert plusminus to widen
author Richard Biener <rguenther@suse.de>
Thu, 26 Mar 2020 07:33:57 +0000 (08:33 +0100)
committer Richard Biener <rguenther@suse.de>
Thu, 26 Mar 2020 07:36:42 +0000 (08:36 +0100)
In the testcase for PR94269, widening_mul moves two multiply
instructions from outside the loop to inside the loop, merging them
with two add instructions separately.  This increases the cost of the
loop.  Like FMA detection in the same pass, simply restrict ops to be
defined in the same basic-block to avoid possibly moving a multiply to
a different block with a higher execution frequency.

2020-03-26  Felix Yang  <felix.yang@huawei.com>

PR tree-optimization/94269
* tree-ssa-math-opts.c (convert_plusminus_to_widen): Restrict this
operation to single basic block.

* gcc.dg/pr94269.c: New test.

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/pr94269.c [new file with mode: 0644]
gcc/tree-ssa-math-opts.c

index 3ad7a7aae0d2b59969c501faf0eaa6d7ac53d635..9f7927d748f203cdc0bccac44926e33b2e5a5609 100644 (file)
@@ -1,3 +1,10 @@
+2020-03-26  Felix Yang  <felix.yang@huawei.com>
+
+       PR tree-optimization/94269
+       * tree-ssa-math-opts.c (convert_plusminus_to_widen): Restrict
+       this
+       operation to single basic block.
+
 2020-03-25  Jeff Law  <law@redhat.com>
 
        PR rtl-optimization/90275
index e750dcbc7900157e7524034bf851797c5bc2942f..f8512d3fb3277623accfdbd059a2a5b2ef102a92 100644 (file)
@@ -1,3 +1,8 @@
+2020-03-26  Felix Yang  <felix.yang@huawei.com>
+
+       PR tree-optimization/94269
+       * gcc.dg/pr94269.c: New test.
+
 2020-03-25  Andrew Stubbs  <ams@codesourcery.com>
 
        * gcc.dg/vect/bb-slp-pr69907.c: Disable the dump scan for amdgcn.
diff --git a/gcc/testsuite/gcc.dg/pr94269.c b/gcc/testsuite/gcc.dg/pr94269.c
new file mode 100644 (file)
index 0000000..49d5704
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+/* { dg-options "-O2 -ftree-loop-vectorize -funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256" } */
+
+float  /* returns the accumulated sum of y[iy] * x[ix] products */
+foo(long n, float *x, int inc_x,
+            float *y, int inc_y)
+{
+  float dot = 0.0;  /* running sum; returned unchanged when n < 0 */
+  int ix = 0, iy = 0;  /* current indices into x and y */
+
+  if (n < 0) {
+    return dot;
+  }
+
+  int i = 0;
+  while (i < n) {  /* runs n times; body is skipped when n == 0 */
+    dot += y[iy] * x[ix];
+    ix  += inc_x;  /* step each index by its caller-supplied increment */
+    iy  += inc_y;
+    i++;
+  }
+
+  return dot;
+}
+
+/* { dg-final { scan-assembler-not "smaddl" { target aarch64*-*-* } } } */
index 54ba035f5ee1cbd949b1b4bef835a8dfb3b16155..969c1a6b6c6d788342cca54f1a401b6b20813f3a 100644 (file)
@@ -2715,11 +2715,14 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
      multiply-and-accumulate instructions.
 
      If the widened-multiplication result has more than one uses, it is
-     probably wiser not to do the conversion.  */
+     probably wiser not to do the conversion.  Also restrict this operation
+     to single basic block to avoid moving the multiply to a different block
+     with a higher execution frequency.  */
   if (code == PLUS_EXPR
       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
     {
       if (!has_single_use (rhs1)
+         || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
@@ -2729,6 +2732,7 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
     {
       if (!has_single_use (rhs2)
+         || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;