re PR middle-end/90796 (GCC: O2 vs O3 output differs on simple test)

author Michael Matz <matz@suse.de>

Tue, 22 Oct 2019 12:25:03 +0000 (12:25 +0000)

committer Michael Matz <matz@gcc.gnu.org>

Tue, 22 Oct 2019 12:25:03 +0000 (12:25 +0000)
author Michael Matz <matz@suse.de>
Tue, 22 Oct 2019 12:25:03 +0000 (12:25 +0000)
committer Michael Matz <matz@gcc.gnu.org>
Tue, 22 Oct 2019 12:25:03 +0000 (12:25 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 8f0af771b5f500af98e4b70a8032da4794460e79..34ce686e1a0382bd986f29893d41042a88f68dd7 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2019-10-22  Michael Matz  <matz@suse.de>
+
+       PR middle-end/90796
+       * gimple-loop-jam.c (any_access_function_variant_p): New function.
+       (adjust_unroll_factor): Use it to constrain safety, new parameter.
+       (tree_loop_unroll_and_jam): Adjust call and profitable unroll factor.
+
  2019-10-22  Richard Biener  <rguenther@suse.de>
  
         PR tree-optimization/92173
diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c

index 11153f5402555737fa888bb8f4b4e558756ccf2a..899653b0863475bd9f809bd5b7ec485c6b936ea0 100644 (file)
--- a/gcc/gimple-loop-jam.c
+++ b/gcc/gimple-loop-jam.c
@@ -360,16 +360,33 @@ fuse_loops (class loop *loop)
    rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
  }
  
+/* Return true if any of the access functions for dataref A
+   isn't invariant with respect to loop LOOP_NEST.  */
+static bool
+any_access_function_variant_p (const struct data_reference *a,
+                              const class loop *loop_nest)
+{
+  unsigned int i;
+  vec<tree> fns = DR_ACCESS_FNS (a);
+  tree t;
+
+  FOR_EACH_VEC_ELT (fns, i, t)
+    if (!evolution_function_is_invariant_p (t, loop_nest->num))
+      return true;
+
+  return false;
+}
+
  /* Returns true if the distance in DDR can be determined and adjusts
     the unroll factor in *UNROLL to make unrolling valid for that distance.
-   Otherwise return false.
+   Otherwise return false.  DDR is with respect to the outer loop of INNER.
  
     If this data dep can lead to a removed memory reference, increment
     *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
     for this to happen.  */
  
  static bool
-adjust_unroll_factor (struct data_dependence_relation *ddr,
+adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
                       unsigned *unroll, unsigned *profit_unroll,
                       unsigned *removed)
  {
@@ -392,9 +409,59 @@ adjust_unroll_factor (struct data_dependence_relation *ddr,
             gcc_unreachable ();
           else if ((unsigned)dist >= *unroll)
             ;
-         else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
-                  || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
-                      && dist > 0))
+         else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
+           {
+             /* We have (a,0) with a < N, so this will be transformed into
+                (0,0) after unrolling by N.  This might potentially be a
+                problem, if it's not a read-read dependency.  */
+             if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
+               ;
+             else
+               {
+                 /* So, at least one is a write, and we might reduce the
+                    distance vector to (0,0).  This is still no problem
+                    if both data-refs are affine with respect to the inner
+                    loops.  But if one of them is invariant with respect
+                    to an inner loop our reordering implicit in loop fusion
+                    corrupts the program, as our data dependences don't
+                    capture this.  E.g. for:
+                      for (0 <= i < n)
+                        for (0 <= j < m)
+                          a[i][0] = a[i+1][0] + 2;    // (1)
+                          b[i][j] = b[i+1][j] + 2;    // (2)
+                    the distance vector for both statements is (-1,0),
+                    but exchanging the order for (2) is okay, while
+                    for (1) it is not.  To see this, write out the original
+                    accesses (assume m is 2):
+                      a i j original
+                      0 0 0 r a[1][0] b[1][0]
+                      1 0 0 w a[0][0] b[0][0]
+                      2 0 1 r a[1][0] b[1][1]
+                      3 0 1 w a[0][0] b[0][1]
+                      4 1 0 r a[2][0] b[2][0]
+                      5 1 0 w a[1][0] b[1][0]
+                    after unroll-by-2 and fusion the accesses are done in
+                    this order (from column a): 0,1, 4,5, 2,3, i.e. this:
+                      a i j transformed
+                      0 0 0 r a[1][0] b[1][0]
+                      1 0 0 w a[0][0] b[0][0]
+                      4 1 0 r a[2][0] b[2][0]
+                      5 1 0 w a[1][0] b[1][0]
+                      2 0 1 r a[1][0] b[1][1]  
+                      3 0 1 w a[0][0] b[0][1]
+                    Note how access 2 accesses the same element as access 5
+                    for array 'a' but not for array 'b'.  */
+                 if (any_access_function_variant_p (DDR_A (ddr), inner)
+                     && any_access_function_variant_p (DDR_B (ddr), inner))
+                   ;
+                 else
+                   /* And if any dataref of this pair is invariant with
+                      respect to the inner loop, we have no chance than
+                      to reduce the unroll factor.  */
+                   *unroll = dist;
+               }
+           }
+         else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
             ;
           else
             *unroll = dist;
@@ -486,7 +553,7 @@ tree_loop_unroll_and_jam (void)
           /* Now check the distance vector, for determining a sensible
              outer unroll factor, and for validity of merging the inner
              loop copies.  */
-         if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
+         if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
                                      &removed))
             {
               /* Couldn't get the distance vector.  For two reads that's
@@ -506,7 +573,7 @@ tree_loop_unroll_and_jam (void)
          to ignore all profitability concerns and apply the transformation
          always.  */
        if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
-       profit_unroll = 2;
+       profit_unroll = MAX(2, profit_unroll);
        else if (removed * 100 / datarefs.length ()
           < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
         profit_unroll = 1;
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 9827b38387d03dc2cdc498ebba068d693c91c5dc..9caf60787d4dab9152d039c9fe85ac0177c6b361 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2019-10-22  Michael Matz  <matz@suse.de>
+
+       PR middle-end/90796
+       * gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.
+
  2019-10-22  Richard Biener  <rguenther@suse.de>
  
         PR tree-optimization/92173
diff --git a/gcc/testsuite/gcc.dg/unroll-and-jam.c b/gcc/testsuite/gcc.dg/unroll-and-jam.c

index 70910d318cbc703986ede1adf62a6a16645db6fc..bcfe1bda9b0b9cc242dde43210199f87218b3a08 100644 (file)
--- a/gcc/testsuite/gcc.dg/unroll-and-jam.c
+++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c
@@ -31,10 +31,10 @@ void checkb(void)
    //printf("  %d\n", sum);
  }
  
+unsigned i, j;
  #define TEST(name, body, test) \
  static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
  { \
-  unsigned long i, j; \
    for (i = 1; i < m; i++) { \
        for (j = 1; j < n; j++) { \
           body; \
@@ -58,9 +58,14 @@ TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1
  TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
  TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
  TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
+TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0
+TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) //notok, not affine
+TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) //notok, not affine
  TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
  TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
  TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
+extern int f;
+TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce unroll factor to 2, (it would be incorrect with unroll-by-3, which the profitability would suggest)
  
  /* foo8 should work as well, but currently doesn't because the distance
     vectors we compute are too pessimistic.  We compute
@@ -68,6 +73,7 @@ TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
     and the last one causes us to lose.  */
  TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
  
+int f;
  unsigned int a[1024];
  unsigned int b[1024];
  unsigned int aa[16][1024];
@@ -88,10 +94,12 @@ void init(void)
      printf(" %s\n", #name); \
      init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
      init();for(i=0;i<4;i++)name(32,8); \
+    if (checka != checksum) fail = 1; \
      printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
  
  int main()
  {
+  int fail = 0;
    int i;
    unsigned checka;
    RUN(foo1);
@@ -100,12 +108,18 @@ int main()
    RUN(foo4);
    RUN(foo5);
    RUN(foo6);
+  RUN(foo61);
+  RUN(foo62);
+  RUN(foo63);
    RUN(foo7);
    RUN(foo8);
    RUN(foo9);
    RUN(foo10);
-  return 0;
+  RUN(foo11);
+  if (fail)
+    __builtin_abort();
+  return fail;
  }
  
-/* Five loops should be unroll-jammed (actually six, but see above).  */
-/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */
+/* Six loops should be unroll-jammed (actually seven, but see above).  */
+/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" } } */
author	Michael Matz <matz@suse.de>
	Tue, 22 Oct 2019 12:25:03 +0000 (12:25 +0000)
committer	Michael Matz <matz@gcc.gnu.org>
	Tue, 22 Oct 2019 12:25:03 +0000 (12:25 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/gimple-loop-jam.c		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.dg/unroll-and-jam.c		patch \| blob \| history