From 5b37e8664b05336df60996a2411b4d61a852c613 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 2 May 2016 13:16:22 +0000 Subject: [PATCH] omp-low.c (struct oacc_loop): Add 'inner' field. gcc/ * omp-low.c (struct oacc_loop): Add 'inner' field. (new_oacc_loop_raw): Initialize it to zero. (oacc_loop_fixed_partitions): Initialize it. (oacc_loop_auto_partitions): Partition outermost loop to outermost available partitioning. gcc/testsuite/ * c-c++-common/goacc/loop-auto-1.c: Adjust expected warnings. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust expected partitioning. From-SVN: r235756 --- gcc/ChangeLog | 8 +++ gcc/omp-low.c | 52 +++++++++++++----- gcc/testsuite/ChangeLog | 4 ++ .../c-c++-common/goacc/loop-auto-1.c | 8 +-- libgomp/ChangeLog | 5 ++ .../libgomp.oacc-c-c++-common/loop-auto-1.c | 54 ++++++++++--------- 6 files changed, 88 insertions(+), 43 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a098272d42f..ff0b573b0c9 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2016-05-02 Nathan Sidwell + + * omp-low.c (struct oacc_loop): Add 'inner' field. + (new_oacc_loop_raw): Initialize it to zero. + (oacc_loop_fixed_partitions): Initialize it. + (oacc_loop_auto_partitions): Partition outermost loop to outermost + available partitioning. + 2016-05-02 Claudiu Zissulescu * config/arc/arc.md (mulsidi3): Change operand 0 predicate to diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 50ad68ece77..e4a1e4746db 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -241,6 +241,7 @@ struct oacc_loop tree routine; /* Pseudo-loop enclosing a routine. */ unsigned mask; /* Partitioning mask. */ + unsigned inner; /* Partitioning of inner loops. */ unsigned flags; /* Partitioning flags. */ unsigned ifns; /* Contained loop abstraction functions. */ tree chunk_size; /* Chunk size. */ @@ -18921,7 +18922,7 @@ new_oacc_loop_raw (oacc_loop *parent, location_t loc) memset (loop->tails, 0, sizeof (loop->tails)); loop->routine = NULL_TREE; - loop->mask = loop->flags = 0; + loop->mask = loop->flags = loop->inner = 0; loop->ifns = 0; loop->chunk_size = 0; loop->head_end = NULL; @@ -19449,8 +19450,11 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) mask_all |= this_mask; if (loop->child) - mask_all |= oacc_loop_fixed_partitions (loop->child, - outer_mask | this_mask); + { + loop->inner = oacc_loop_fixed_partitions (loop->child, + outer_mask | this_mask); + mask_all |= loop->inner; + } if (loop->sibling) mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask); @@ -19466,7 +19470,7 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) static unsigned oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) { - unsigned inner_mask = 0; + bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT); bool noisy = true; #ifdef ACCEL_COMPILER @@ -19475,16 +19479,33 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) noisy = false; #endif + if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1)) + { + /* Allocate the outermost loop at the outermost available + level. */ + unsigned this_mask = outer_mask + 1; + + if (!(this_mask & loop->inner)) + loop->mask = this_mask; + } + if (loop->child) - inner_mask |= oacc_loop_auto_partitions (loop->child, - outer_mask | loop->mask); + { + unsigned child_mask = outer_mask | loop->mask; + + if (loop->mask || assign) + child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX); - if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT)) + loop->inner = oacc_loop_auto_partitions (loop->child, child_mask); + } + + if (assign && !loop->mask) { + /* Allocate the loop at the innermost available level. */ unsigned this_mask = 0; /* Determine the outermost partitioning used within this loop. */ - this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX); + this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX); this_mask = (this_mask & -this_mask); /* Pick the partitioning just inside that one. */ @@ -19497,17 +19518,20 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) warning_at (loop->loc, 0, "insufficient partitioning available to parallelize loop"); - if (dump_file) - fprintf (dump_file, "Auto loop %s:%d assigned %d\n", - LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc), - this_mask); - loop->mask = this_mask; } - inner_mask |= loop->mask; + + if (assign && dump_file) + fprintf (dump_file, "Auto loop %s:%d assigned %d\n", + LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc), + loop->mask); + + unsigned inner_mask = 0; if (loop->sibling) inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask); + + inner_mask |= loop->inner | loop->mask; return inner_mask; } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 3d6051cf884..27a5972024e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-05-02 Nathan Sidwell + + * c-c++-common/goacc/loop-auto-1.c: Adjust expected warnings. + 2016-05-02 Marek Polacek PR c/70851 diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c index ee6d28c2e8c..33d53409fe3 100644 --- a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c +++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c @@ -186,10 +186,10 @@ void Worker (void) for (int jx = 0; jx < 10; jx++) {} } -#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ +#pragma acc loop auto for (int ix = 0; ix < 10; ix++) { -#pragma acc loop auto +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ for (int jx = 0; jx < 10; jx++) { #pragma acc loop auto @@ -214,10 +214,10 @@ void Vector (void) #pragma acc loop auto for (int ix = 0; ix < 10; ix++) {} -#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ +#pragma acc loop auto for (int ix = 0; ix < 10; ix++) { -#pragma acc loop auto +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ for (int jx = 0; jx < 10; jx++) {} } } diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 351c2392530..9de04f57d73 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,8 @@ +2016-05-02 Nathan Sidwell + + * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust + expected partitioning. + 2016-04-29 Cesar Philippidis PR middle-end/70626 diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c index 622bbdffaea..8a755b88038 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c @@ -103,9 +103,11 @@ int vector_1 (int *ary, int size) #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) { +#pragma acc loop gang + for (int jx = 0; jx < 1; jx++) #pragma acc loop auto - for (int ix = 0; ix < size; ix++) - ary[ix] = place (); + for (int ix = 0; ix < size; ix++) + ary[ix] = place (); } return check (ary, size, 0, 0, 1); @@ -118,7 +120,7 @@ int vector_2 (int *ary, int size) #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) { #pragma acc loop worker - for (int jx = 0; jx < size / 64; jx++) + for (int jx = 0; jx < size / 64; jx++) #pragma acc loop auto for (int ix = 0; ix < 64; ix++) ary[ix + jx * 64] = place (); @@ -133,30 +135,16 @@ int worker_1 (int *ary, int size) #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) { +#pragma acc loop gang + for (int kx = 0; kx < 1; kx++) #pragma acc loop auto - for (int jx = 0; jx < size / 64; jx++) + for (int jx = 0; jx < size / 64; jx++) #pragma acc loop vector - for (int ix = 0; ix < 64; ix++) - ary[ix + jx * 64] = place (); - } - - return check (ary, size, 0, 1, 1); -} - -int worker_2 (int *ary, int size) -{ - clear (ary, size); - -#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) - { -#pragma acc loop auto - for (int jx = 0; jx < size / 64; jx++) -#pragma acc loop auto - for (int ix = 0; ix < 64; ix++) - ary[ix + jx * 64] = place (); + for (int ix = 0; ix < 64; ix++) + ary[ix + jx * 64] = place (); } - return check (ary, size, 0, 1, 1); + return check (ary, size, 0, 1, 1); } int gang_1 (int *ary, int size) @@ -193,6 +181,22 @@ int gang_2 (int *ary, int size) return check (ary, size, 1, 1, 1); } +int gang_3 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int jx = 0; jx < size / 64; jx++) +#pragma acc loop auto + for (int ix = 0; ix < 64; ix++) + ary[ix + jx * 64] = place (); + } + + return check (ary, size, 1, 0, 1); +} + #define N (32*32*32) int main () { @@ -214,13 +218,13 @@ int main () if (worker_1 (ary, N)) return 1; - if (worker_2 (ary, N)) - return 1; if (gang_1 (ary, N)) return 1; if (gang_2 (ary, N)) return 1; + if (gang_3 (ary, N)) + return 1; return 0; } -- 2.30.2