From c5a64cfec7388ddb685071d99dc7b23c918af16c Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Fri, 13 Nov 2015 21:51:32 +0000 Subject: [PATCH] omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO & SEQ. gcc/ * gcc/omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO & SEQ. (oacc_loop_fixed_partitions): Correct return type to bool. (oacc_loop_auto_partitions): New. (oacc_loop_partition): Take mask argument, call oacc_loop_auto_partitions. (execute_oacc_device_lower): Provide mask to oacc_loop_partition. gcc/testsuite/ * c-c++-common/goacc/loop-auto-1.c: New. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New. From-SVN: r230354 --- gcc/ChangeLog | 10 + gcc/omp-low.c | 80 ++++-- gcc/testsuite/ChangeLog | 2 + .../c-c++-common/goacc/loop-auto-1.c | 230 ++++++++++++++++++ libgomp/ChangeLog | 2 + .../libgomp.oacc-c-c++-common/loop-auto-1.c | 225 +++++++++++++++++ 6 files changed, 532 insertions(+), 17 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/loop-auto-1.c create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 13133b1b395..00d587753c0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2015-11-13 Nathan Sidwell + + * gcc/omp-low.c (scan_sharing_clauses): Accept INDEPENDENT, AUTO & + SEQ. + (oacc_loop_fixed_partitions): Correct return type to bool. + (oacc_loop_auto_partitions): New. + (oacc_loop_partition): Take mask argument, call + oacc_loop_auto_partitions. + (execute_oacc_device_lower): Provide mask to oacc_loop_partition. + 2015-11-13 Michael Meissner * config/rs6000/constraints.md (we constraint): New constraint for diff --git a/gcc/omp-low.c b/gcc/omp-low.c index f7584deb3ac..4b2b4777221 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -2124,6 +2124,9 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) case OMP_CLAUSE_WORKER: case OMP_CLAUSE_VECTOR: case OMP_CLAUSE_TILE: + case OMP_CLAUSE_INDEPENDENT: + case OMP_CLAUSE_AUTO: + case OMP_CLAUSE_SEQ: break; case OMP_CLAUSE_ALIGNED: @@ -2136,9 +2139,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) case OMP_CLAUSE_DEVICE_RESIDENT: case OMP_CLAUSE_USE_DEVICE: case OMP_CLAUSE__CACHE_: - case OMP_CLAUSE_INDEPENDENT: - case OMP_CLAUSE_AUTO: - case OMP_CLAUSE_SEQ: sorry ("Clause not supported yet"); break; @@ -2299,14 +2299,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) case OMP_CLAUSE_WORKER: case OMP_CLAUSE_VECTOR: case OMP_CLAUSE_TILE: + case OMP_CLAUSE_INDEPENDENT: + case OMP_CLAUSE_AUTO: + case OMP_CLAUSE_SEQ: break; case OMP_CLAUSE_DEVICE_RESIDENT: case OMP_CLAUSE_USE_DEVICE: case OMP_CLAUSE__CACHE_: - case OMP_CLAUSE_INDEPENDENT: - case OMP_CLAUSE_AUTO: - case OMP_CLAUSE_SEQ: sorry ("Clause not supported yet"); break; @@ -19230,10 +19230,10 @@ oacc_loop_process (oacc_loop *loop) /* Walk the OpenACC loop heirarchy checking and assigning the programmer-specified partitionings. OUTER_MASK is the partitioning - this loop is contained within. Return partitiong mask used within - this loop nest. */ + this loop is contained within. Return true if we contain an + auto-partitionable loop. */ -static unsigned +static bool oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) { unsigned this_mask = loop->mask; @@ -19337,18 +19337,63 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) return has_auto; } +/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops. + OUTER_MASK is the partitioning this loop is contained within. + Return the cumulative partitioning used by this loop, siblings and + children. */ + +static unsigned +oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) +{ + unsigned inner_mask = 0; + bool noisy = true; + +#ifdef ACCEL_COMPILER + /* When device_type is supported, we want the device compiler to be + noisy, if the loop parameters are device_type-specific. */ + noisy = false; +#endif + + if (loop->child) + inner_mask |= oacc_loop_auto_partitions (loop->child, + outer_mask | loop->mask); + + if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT)) + { + unsigned this_mask = 0; + + /* Determine the outermost partitioning used within this loop. */ + this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX); + this_mask = (this_mask & -this_mask); + + /* Pick the partitioning just inside that one. */ + this_mask >>= 1; + + /* And avoid picking one use by an outer loop. */ + this_mask &= ~outer_mask; + + if (!this_mask && noisy) + warning_at (loop->loc, 0, + "insufficient partitioning available to parallelize loop"); + + loop->mask = this_mask; + } + inner_mask |= loop->mask; + + if (loop->sibling) + inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask); + + return inner_mask; +} + /* Walk the OpenACC loop heirarchy to check and assign partitioning axes. */ static void -oacc_loop_partition (oacc_loop *loop, int fn_level) +oacc_loop_partition (oacc_loop *loop, unsigned outer_mask) { - unsigned outer_mask = 0; - - if (fn_level >= 0) - outer_mask = GOMP_DIM_MASK (fn_level) - 1; - - oacc_loop_fixed_partitions (loop, outer_mask); + if (oacc_loop_fixed_partitions (loop, outer_mask)) + oacc_loop_auto_partitions (loop, outer_mask); } /* Default fork/join early expander. Delete the function calls if @@ -19429,7 +19474,8 @@ execute_oacc_device_lower () /* Discover, partition and process the loops. */ oacc_loop *loops = oacc_loop_discovery (); - oacc_loop_partition (loops, fn_level); + unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0; + oacc_loop_partition (loops, outer_mask); oacc_loop_process (loops); if (dump_file) { diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 68cf4e946fe..33011306f35 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,7 @@ 2015-11-13 Nathan Sidwell + * c-c++-common/goacc/loop-auto-1.c: New. + * lib/target-supports.exp (check_effective_target_offload_nvptx): New. * gcc.dg/goacc/nvptx-merged-loop.c: New. diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c new file mode 100644 index 00000000000..ee6d28c2e8c --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c @@ -0,0 +1,230 @@ + +void Foo () +{ + +#pragma acc parallel num_gangs(10) num_workers(32) vector_length(32) + { +#pragma acc loop vector + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop seq + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) {} + } + +#pragma acc loop worker + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop vector + for (int kx = 0; kx < 10; kx++) {} + } + } + +#pragma acc loop gang + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + +#pragma acc loop worker + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + +#pragma acc loop vector + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int kx = 0; kx < 10; kx++) {} + } + +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop vector + for (int kx = 0; kx < 10; kx++) {} + } + + } + +#pragma acc loop auto + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + } + } +} + +#pragma acc routine gang +void Gang (void) +{ +#pragma acc loop vector + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop seq + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) {} + } + +#pragma acc loop worker + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop vector + for (int kx = 0; kx < 10; kx++) {} + } + } + +#pragma acc loop gang + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + +#pragma acc loop worker + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + +#pragma acc loop vector + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int kx = 0; kx < 10; kx++) {} + } + +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop vector + for (int kx = 0; kx < 10; kx++) {} + } + + } + +#pragma acc loop auto + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + } +} + +#pragma acc routine worker +void Worker (void) +{ +#pragma acc loop vector + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop seq + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) {} + } + +#pragma acc loop worker + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop vector + for (int kx = 0; kx < 10; kx++) {} + } + } + +#pragma acc loop auto + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + } + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) + { +#pragma acc loop auto + for (int kx = 0; kx < 10; kx++) {} + } + } +} + +#pragma acc routine vector +void Vector (void) +{ +#pragma acc loop vector + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop seq + for (int jx = 0; jx < 10; jx++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int jx = 0; jx < 10; jx++) {} + } + +#pragma acc loop auto + for (int ix = 0; ix < 10; ix++) {} + +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int ix = 0; ix < 10; ix++) + { +#pragma acc loop auto + for (int jx = 0; jx < 10; jx++) {} + } +} + +#pragma acc routine seq +void Seq (void) +{ +#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */ + for (int ix = 0; ix < 10; ix++) {} +} diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index eb1db836410..2f5a4d189da 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,5 +1,7 @@ 2015-11-13 Nathan Sidwell + * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New. + * testsuite/libgomp.oacc-c-c++-common/collapse-2.c: Sequential loop is sequential. diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c new file mode 100644 index 00000000000..174af919792 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c @@ -0,0 +1,225 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include +#include + +int check (const int *ary, int size, int gp, int wp, int vp) +{ + int exit = 0; + int ix; + int gangs[32], workers[32], vectors[32]; + + for (ix = 0; ix < 32; ix++) + gangs[ix] = workers[ix] = vectors[ix] = 0; + + for (ix = 0; ix < size; ix++) + { + vectors[ary[ix] & 0xff]++; + workers[(ary[ix] >> 8) & 0xff]++; + gangs[(ary[ix] >> 16) & 0xff]++; + } + + for (ix = 0; ix < 32; ix++) + { + if (gp) + { + int expect = gangs[0]; + if (gangs[ix] != expect) + { + exit = 1; + printf ("gang %d not used %d times\n", ix, expect); + } + } + else if (ix && gangs[ix]) + { + exit = 1; + printf ("gang %d unexpectedly used\n", ix); + } + + if (wp) + { + int expect = workers[0]; + if (workers[ix] != expect) + { + exit = 1; + printf ("worker %d not used %d times\n", ix, expect); + } + } + else if (ix && workers[ix]) + { + exit = 1; + printf ("worker %d unexpectedly used\n", ix); + } + + if (vp) + { + int expect = vectors[0]; + if (vectors[ix] != expect) + { + exit = 1; + printf ("vector %d not used %d times\n", ix, expect); + } + } + else if (ix && vectors[ix]) + { + exit = 1; + printf ("vector %d unexpectedly used\n", ix); + } + + } + return exit; +} + +#pragma acc routine seq +static int __attribute__((noinline)) place () +{ + int r = 0; + + if (acc_on_device (acc_device_nvidia)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + r = (g << 16) | (w << 8) | v; + } + return r; +} + +static void clear (int *ary, int size) +{ + int ix; + + for (ix = 0; ix < size; ix++) + ary[ix] = -1; +} + +int vector_1 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int ix = 0; ix < size; ix++) + ary[ix] = place (); + } + + return check (ary, size, 0, 0, 1); +} + +int vector_2 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop worker + for (int jx = 0; jx < size / 64; jx++) +#pragma acc loop auto + for (int ix = 0; ix < 64; ix++) + ary[ix + jx * 64] = place (); + } + + return check (ary, size, 0, 1, 1); +} + +int worker_1 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int jx = 0; jx < size / 64; jx++) +#pragma acc loop vector + for (int ix = 0; ix < 64; ix++) + ary[ix + jx * 64] = place (); + } + + return check (ary, size, 0, 1, 1); +} + +int worker_2 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int jx = 0; jx < size / 64; jx++) +#pragma acc loop auto + for (int ix = 0; ix < 64; ix++) + ary[ix + jx * 64] = place (); + } + + return check (ary, size, 0, 1, 1); +} + +int gang_1 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int jx = 0; jx < size / 64; jx++) +#pragma acc loop worker + for (int ix = 0; ix < 64; ix++) + ary[ix + jx * 64] = place (); + } + + return check (ary, size, 1, 1, 0); +} + +int gang_2 (int *ary, int size) +{ + clear (ary, size); + +#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size) + { +#pragma acc loop auto + for (int kx = 0; kx < size / (32 * 32); kx++) +#pragma acc loop auto + for (int jx = 0; jx < 32; jx++) +#pragma acc loop auto + for (int ix = 0; ix < 32; ix++) + ary[ix + jx * 32 + kx * 32 * 32] = place (); + } + + return check (ary, size, 1, 1, 1); +} + +#define N (32*32*32) +int main () +{ + int ondev = 0; + +#pragma acc parallel copy(ondev) + { + ondev = acc_on_device (acc_device_not_host); + } + if (!ondev) + return 0; + + int ary[N]; + + if (vector_1 (ary, N)) + return 1; + if (vector_2 (ary, N)) + return 1; + + if (worker_1 (ary, N)) + return 1; + if (worker_2 (ary, N)) + return 1; + + if (gang_1 (ary, N)) + return 1; + if (gang_2 (ary, N)) + return 1; + + return 0; +} -- 2.30.2