rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
+/* Return true if any of the access functions for dataref A
+ isn't invariant with respect to loop LOOP_NEST. */
+static bool
+any_access_function_variant_p (const struct data_reference *a,
+ const class loop *loop_nest)
+{
+ unsigned int i;
+ vec<tree> fns = DR_ACCESS_FNS (a);
+ tree t;
+
+ FOR_EACH_VEC_ELT (fns, i, t)
+ if (!evolution_function_is_invariant_p (t, loop_nest->num))
+ return true;
+
+ return false;
+}
+
/* Returns true if the distance in DDR can be determined and adjusts
the unroll factor in *UNROLL to make unrolling valid for that distance.
- Otherwise return false.
+ Otherwise return false. DDR is with respect to the outer loop of INNER.
If this data dep can lead to a removed memory reference, increment
*REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
for this to happen. */
static bool
-adjust_unroll_factor (struct data_dependence_relation *ddr,
+adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
unsigned *unroll, unsigned *profit_unroll,
unsigned *removed)
{
gcc_unreachable ();
else if ((unsigned)dist >= *unroll)
;
- else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
- || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
- && dist > 0))
+ else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
+ {
+ /* We have (a,0) with a < N, so this will be transformed into
+ (0,0) after unrolling by N. This might potentially be a
+ problem, if it's not a read-read dependency. */
+ if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
+ ;
+ else
+ {
+ /* So, at least one is a write, and we might reduce the
+ distance vector to (0,0). This is still no problem
+ if both data-refs are affine with respect to the inner
+ loops. But if one of them is invariant with respect
+ to an inner loop our reordering implicit in loop fusion
+ corrupts the program, as our data dependences don't
+ capture this. E.g. for:
+ for (0 <= i < n)
+ for (0 <= j < m)
+ a[i][0] = a[i+1][0] + 2; // (1)
+ b[i][j] = b[i+1][j] + 2; // (2)
+ the distance vector for both statements is (-1,0),
+ but exchanging the order for (2) is okay, while
+ for (1) it is not. To see this, write out the original
+ accesses (assume m is 2):
+ a i j original
+ 0 0 0 r a[1][0] b[1][0]
+ 1 0 0 w a[0][0] b[0][0]
+ 2 0 1 r a[1][0] b[1][1]
+ 3 0 1 w a[0][0] b[0][1]
+ 4 1 0 r a[2][0] b[2][0]
+ 5 1 0 w a[1][0] b[1][0]
+ after unroll-by-2 and fusion the accesses are done in
+ this order (from column a): 0,1, 4,5, 2,3, i.e. this:
+ a i j transformed
+ 0 0 0 r a[1][0] b[1][0]
+ 1 0 0 w a[0][0] b[0][0]
+ 4 1 0 r a[2][0] b[2][0]
+ 5 1 0 w a[1][0] b[1][0]
+ 2 0 1 r a[1][0] b[1][1]
+ 3 0 1 w a[0][0] b[0][1]
+ Note how access 2 accesses the same element as access 5
+ for array 'a' but not for array 'b'. */
+ if (any_access_function_variant_p (DDR_A (ddr), inner)
+ && any_access_function_variant_p (DDR_B (ddr), inner))
+ ;
+ else
+ /* And if any dataref of this pair is invariant with
+ respect to the inner loop, we have no chance than
+ to reduce the unroll factor. */
+ *unroll = dist;
+ }
+ }
+ else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
;
else
*unroll = dist;
/* Now check the distance vector, for determining a sensible
outer unroll factor, and for validity of merging the inner
loop copies. */
- if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
+ if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
&removed))
{
/* Couldn't get the distance vector. For two reads that's
to ignore all profitability concerns and apply the transformation
always. */
if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
- profit_unroll = 2;
+ profit_unroll = MAX(2, profit_unroll);
else if (removed * 100 / datarefs.length ()
< (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
profit_unroll = 1;
//printf(" %d\n", sum);
}
+unsigned i, j;
#define TEST(name, body, test) \
static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
{ \
- unsigned long i, j; \
for (i = 1; i < m; i++) { \
for (j = 1; j < n; j++) { \
body; \
TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
+TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0
+TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) //notok, not affine
+TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) //notok, not affine
TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
+extern int f;
+TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce unroll factor to 2, (it would be incorrect with unroll-by-3, which the profitability would suggest)
/* foo8 should work as well, but currently doesn't because the distance
vectors we compute are too pessimistic. We compute
and the last one causes us to lose. */
TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
+int f;
unsigned int a[1024];
unsigned int b[1024];
unsigned int aa[16][1024];
printf(" %s\n", #name); \
init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
init();for(i=0;i<4;i++)name(32,8); \
+ if (checka != checksum) fail = 1; \
printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
int main()
{
+ int fail = 0;
int i;
unsigned checka;
RUN(foo1);
RUN(foo4);
RUN(foo5);
RUN(foo6);
+ RUN(foo61);
+ RUN(foo62);
+ RUN(foo63);
RUN(foo7);
RUN(foo8);
RUN(foo9);
RUN(foo10);
- return 0;
+ RUN(foo11);
+ if (fail)
+ __builtin_abort();
+ return fail;
}
-/* Five loops should be unroll-jammed (actually six, but see above). */
-/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */
+/* Six loops should be unroll-jammed (actually seven, but see above). */
+/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" } } */