From 92781ff1da896b2f92b1dcc06953be493371bf21 Mon Sep 17 00:00:00 2001 From: Michael Matz Date: Tue, 22 Oct 2019 12:25:03 +0000 Subject: [PATCH] re PR middle-end/90796 (GCC: O2 vs O3 output differs on simple test) Fix PR middle-end/90796 PR middle-end/90796 * gimple-loop-jam.c (any_access_function_variant_p): New function. (adjust_unroll_factor): Use it to constrain safety, new parameter. (tree_loop_unroll_and_jam): Adjust call and profitable unroll factor. testsuite/ * gcc.dg/unroll-and-jam.c: Add three invalid and one valid case. From-SVN: r277287 --- gcc/ChangeLog | 7 +++ gcc/gimple-loop-jam.c | 81 ++++++++++++++++++++++++--- gcc/testsuite/ChangeLog | 5 ++ gcc/testsuite/gcc.dg/unroll-and-jam.c | 22 ++++++-- 4 files changed, 104 insertions(+), 11 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8f0af771b5f..34ce686e1a0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2019-10-22 Michael Matz + + PR middle-end/90796 + * gimple-loop-jam.c (any_access_function_variant_p): New function. + (adjust_unroll_factor): Use it to constrain safety, new parameter. + (tree_loop_unroll_and_jam): Adjust call and profitable unroll factor. + 2019-10-22 Richard Biener PR tree-optimization/92173 diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c index 11153f54025..899653b0863 100644 --- a/gcc/gimple-loop-jam.c +++ b/gcc/gimple-loop-jam.c @@ -360,16 +360,33 @@ fuse_loops (class loop *loop) rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); } +/* Return true if any of the access functions for dataref A + isn't invariant with respect to loop LOOP_NEST. */ +static bool +any_access_function_variant_p (const struct data_reference *a, + const class loop *loop_nest) +{ + unsigned int i; + vec fns = DR_ACCESS_FNS (a); + tree t; + + FOR_EACH_VEC_ELT (fns, i, t) + if (!evolution_function_is_invariant_p (t, loop_nest->num)) + return true; + + return false; +} + /* Returns true if the distance in DDR can be determined and adjusts the unroll factor in *UNROLL to make unrolling valid for that distance. - Otherwise return false. + Otherwise return false. DDR is with respect to the outer loop of INNER. If this data dep can lead to a removed memory reference, increment *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor for this to happen. */ static bool -adjust_unroll_factor (struct data_dependence_relation *ddr, +adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr, unsigned *unroll, unsigned *profit_unroll, unsigned *removed) { @@ -392,9 +409,59 @@ adjust_unroll_factor (struct data_dependence_relation *ddr, gcc_unreachable (); else if ((unsigned)dist >= *unroll) ; - else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) - || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) - && dist > 0)) + else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) + { + /* We have (a,0) with a < N, so this will be transformed into + (0,0) after unrolling by N. This might potentially be a + problem, if it's not a read-read dependency. */ + if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))) + ; + else + { + /* So, at least one is a write, and we might reduce the + distance vector to (0,0). This is still no problem + if both data-refs are affine with respect to the inner + loops. But if one of them is invariant with respect + to an inner loop our reordering implicit in loop fusion + corrupts the program, as our data dependences don't + capture this. E.g. for: + for (0 <= i < n) + for (0 <= j < m) + a[i][0] = a[i+1][0] + 2; // (1) + b[i][j] = b[i+1][j] + 2; // (2) + the distance vector for both statements is (-1,0), + but exchanging the order for (2) is okay, while + for (1) it is not. To see this, write out the original + accesses (assume m is 2): + a i j original + 0 0 0 r a[1][0] b[1][0] + 1 0 0 w a[0][0] b[0][0] + 2 0 1 r a[1][0] b[1][1] + 3 0 1 w a[0][0] b[0][1] + 4 1 0 r a[2][0] b[2][0] + 5 1 0 w a[1][0] b[1][0] + after unroll-by-2 and fusion the accesses are done in + this order (from column a): 0,1, 4,5, 2,3, i.e. this: + a i j transformed + 0 0 0 r a[1][0] b[1][0] + 1 0 0 w a[0][0] b[0][0] + 4 1 0 r a[2][0] b[2][0] + 5 1 0 w a[1][0] b[1][0] + 2 0 1 r a[1][0] b[1][1] + 3 0 1 w a[0][0] b[0][1] + Note how access 2 accesses the same element as access 5 + for array 'a' but not for array 'b'. */ + if (any_access_function_variant_p (DDR_A (ddr), inner) + && any_access_function_variant_p (DDR_B (ddr), inner)) + ; + else + /* And if any dataref of this pair is invariant with + respect to the inner loop, we have no chance than + to reduce the unroll factor. */ + *unroll = dist; + } + } + else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) ; else *unroll = dist; @@ -486,7 +553,7 @@ tree_loop_unroll_and_jam (void) /* Now check the distance vector, for determining a sensible outer unroll factor, and for validity of merging the inner loop copies. */ - if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, + if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll, &removed)) { /* Couldn't get the distance vector. For two reads that's @@ -506,7 +573,7 @@ tree_loop_unroll_and_jam (void) to ignore all profitability concerns and apply the transformation always. */ if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) - profit_unroll = 2; + profit_unroll = MAX(2, profit_unroll); else if (removed * 100 / datarefs.length () < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) profit_unroll = 1; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9827b38387d..9caf60787d4 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-10-22 Michael Matz + + PR middle-end/90796 + * gcc.dg/unroll-and-jam.c: Add three invalid and one valid case. + 2019-10-22 Richard Biener PR tree-optimization/92173 diff --git a/gcc/testsuite/gcc.dg/unroll-and-jam.c b/gcc/testsuite/gcc.dg/unroll-and-jam.c index 70910d318cb..bcfe1bda9b0 100644 --- a/gcc/testsuite/gcc.dg/unroll-and-jam.c +++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c @@ -31,10 +31,10 @@ void checkb(void) //printf(" %d\n", sum); } +unsigned i, j; #define TEST(name, body, test) \ static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \ { \ - unsigned long i, j; \ for (i = 1; i < m; i++) { \ for (j = 1; j < n; j++) { \ body; \ @@ -58,9 +58,14 @@ TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1 TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1 TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1 TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0 +TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0 +TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) //notok, not affine +TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) //notok, not affine TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0 TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1 TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0 +extern int f; +TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce unroll factor to 2, (it would be incorrect with unroll-by-3, which the profitability would suggest) /* foo8 should work as well, but currently doesn't because the distance vectors we compute are too pessimistic. We compute @@ -68,6 +73,7 @@ TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0 and the last one causes us to lose. */ TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1 +int f; unsigned int a[1024]; unsigned int b[1024]; unsigned int aa[16][1024]; @@ -88,10 +94,12 @@ void init(void) printf(" %s\n", #name); \ init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \ init();for(i=0;i<4;i++)name(32,8); \ + if (checka != checksum) fail = 1; \ printf("%sok %s\n", checka != checksum ? "NOT " : "", #name); int main() { + int fail = 0; int i; unsigned checka; RUN(foo1); @@ -100,12 +108,18 @@ int main() RUN(foo4); RUN(foo5); RUN(foo6); + RUN(foo61); + RUN(foo62); + RUN(foo63); RUN(foo7); RUN(foo8); RUN(foo9); RUN(foo10); - return 0; + RUN(foo11); + if (fail) + __builtin_abort(); + return fail; } -/* Five loops should be unroll-jammed (actually six, but see above). */ -/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */ +/* Six loops should be unroll-jammed (actually seven, but see above). */ +/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" } } */ -- 2.30.2