From e91eba31fdc49d928090a9b0424247fd6029f044 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Fri, 20 Apr 2018 13:46:07 +0000 Subject: [PATCH] [nvptx] Fix calls to vector and worker routines 2018-04-20 Nathan Sidwell Tom de Vries PR target/85445 * config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining): Emit insns for calls too. (nvptx_find_par): Always look for worker-level predecessor insn. (nvptx_propagate): Add is_call parm, return bool. Copy frame for calls. (nvptx_vpropagate, nvptx_wpropagate): Adjust. (nvptx_process_pars): Propagate frames for calls. * testsuite/libgomp.oacc-c++/ref-1.C: New. Co-Authored-By: Tom de Vries From-SVN: r259523 --- gcc/ChangeLog | 12 +++ gcc/config/nvptx/nvptx.c | 106 ++++++++++++--------- libgomp/ChangeLog | 6 ++ libgomp/testsuite/libgomp.oacc-c++/ref-1.C | 78 +++++++++++++++ 4 files changed, 156 insertions(+), 46 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c++/ref-1.C diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 44619827a89..7152a1942a7 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,15 @@ +2018-04-20 Nathan Sidwell + Tom de Vries + + PR target/85445 + * config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining): + Emit insns for calls too. + (nvptx_find_par): Always look for worker-level predecessor insn. + (nvptx_propagate): Add is_call parm, return bool. Copy frame for + calls. + (nvptx_vpropagate, nvptx_wpropagate): Adjust. + (nvptx_process_pars): Propagate frames for calls. + 2018-04-20 H.J. Lu PR target/85469 diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 131b495098e..ca3fea3002d 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call) it creates a block with a single successor before entering a partitooned region. That is a good candidate for the end of an SESE region. */ - if (!is_call) - emit_insn (gen_nvptx_fork (op)); + emit_insn (gen_nvptx_fork (op)); emit_insn (gen_nvptx_forked (op)); } } @@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call) /* Emit joining for all non-call pars to ensure there's a single predecessor for the block the join insn ends up in. This is needed for skipping entire loops. */ - if (!is_call) - emit_insn (gen_nvptx_joining (op)); + emit_insn (gen_nvptx_joining (op)); emit_insn (gen_nvptx_join (op)); } } @@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) par = new parallel (par, mask); par->forked_block = block; par->forked_insn = end; - if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) - && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) + if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) par->fork_insn = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); } @@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) gcc_assert (par->mask == mask); par->join_block = block; par->join_insn = end; - if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) - && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) + if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) par->joining_insn = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); par = par->parent; @@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec &blocks, bb_pair_vec_t ®ions) #undef BB_SET_SESE #undef BB_GET_SESE -/* Propagate live state at the start of a partitioned region. BLOCK - provides the live register information, and might not contain - INSN. Propagation is inserted just after INSN. RW indicates whether - we are reading and/or writing state. This +/* Propagate live state at the start of a partitioned region. IS_CALL + indicates whether the propagation is for a (partitioned) call + instruction. BLOCK provides the live register information, and + might not contain INSN. Propagation is inserted just after INSN. RW + indicates whether we are reading and/or writing state. This separation is needed for worker-level proppagation where we essentially do a spill & fill. FN is the underlying worker function to generate the propagation instructions for single register. DATA is user data. - We propagate the live register set and the entire frame. We could - do better by (a) propagating just the live set that is used within - the partitioned regions and (b) only propagating stack entries that - are used. The latter might be quite hard to determine. */ + Returns true if we didn't emit any instructions. + + We propagate the live register set for non-calls and the entire + frame for calls and non-calls. We could do better by (a) + propagating just the live set that is used within the partitioned + regions and (b) only propagating stack entries that are used. The + latter might be quite hard to determine. */ typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); -static void -nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, - propagator_fn fn, void *data) +static bool +nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, + propagate_mask rw, propagator_fn fn, void *data) { bitmap live = DF_LIVE_IN (block); bitmap_iterator iterator; unsigned ix; + bool empty = true; /* Copy the frame array. */ HOST_WIDE_INT fs = get_frame_size (); @@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, rtx pred = NULL_RTX; rtx_code_label *label = NULL; + empty = false; /* The frame size might not be DImode compatible, but the frame array's declaration will be. So it's ok to round up here. */ fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); @@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, insn = emit_insn_after (cpy, insn); } - /* Copy live registers. */ - EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) - { - rtx reg = regno_reg_rtx[ix]; + if (!is_call) + /* Copy live registers. */ + EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) + { + rtx reg = regno_reg_rtx[ix]; - if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) - { - rtx bcast = fn (reg, rw, 0, data); + if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) + { + rtx bcast = fn (reg, rw, 0, data); - insn = emit_insn_after (bcast, insn); - } - } + insn = emit_insn_after (bcast, insn); + empty = false; + } + } + return empty; } /* Worker for nvptx_vpropagate. */ @@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm, } /* Propagate state that is live at start of BLOCK across the vectors - of a single warp. Propagation is inserted just after INSN. */ + of a single warp. Propagation is inserted just after INSN. + IS_CALL and return as for nvptx_propagate. */ -static void -nvptx_vpropagate (basic_block block, rtx_insn *insn) +static bool +nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn) { - nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0); + return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0); } /* Worker for nvptx_wpropagate. */ @@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_) /* Spill or fill live state that is live at start of BLOCK. PRE_P indicates if this is just before partitioned mode (do spill), or just after it starts (do fill). Sequence is inserted just after - INSN. */ + INSN. IS_CALL and return as for nvptx_propagate. */ -static void -nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) +static bool +nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn) { wcast_data_t data; @@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) data.offset = 0; data.ptr = NULL_RTX; - nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data); + bool empty = nvptx_propagate (is_call, block, insn, + pre_p ? PM_read : PM_write, wprop_gen, &data); + gcc_assert (empty == !data.offset); if (data.offset) { /* Stuff was emitted, initialize the base pointer now. */ @@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) if (worker_bcast_size < data.offset) worker_bcast_size = data.offset; } + return empty; } /* Emit a worker-level synchronization barrier. We use different @@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par) inner_mask |= par->inner_mask; } - if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) - /* No propagation needed for a call. */; - else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; + + if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) { - nvptx_wpropagate (false, par->forked_block, par->forked_insn); - nvptx_wpropagate (true, par->forked_block, par->fork_insn); - /* Insert begin and end synchronizations. */ - emit_insn_before (nvptx_wsync (false), par->forked_insn); - emit_insn_before (nvptx_wsync (true), par->join_insn); + nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn); + bool empty = nvptx_wpropagate (true, is_call, + par->forked_block, par->fork_insn); + + if (!empty || !is_call) + { + /* Insert begin and end synchronizations. */ + emit_insn_before (nvptx_wsync (false), par->forked_insn); + emit_insn_before (nvptx_wsync (true), par->join_insn); + } } else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) - nvptx_vpropagate (par->forked_block, par->forked_insn); + nvptx_vpropagate (is_call, par->forked_block, par->forked_insn); /* Now do siblings. */ if (par->next) diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index f68a4a83fad..1c9fa440564 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,9 @@ +2018-04-20 Nathan Sidwell + Tom de Vries + + PR target/85445 + * testsuite/libgomp.oacc-c++/ref-1.C: New. + 2018-04-19 Thomas Schwinge PR libgomp/85463 diff --git a/libgomp/testsuite/libgomp.oacc-c++/ref-1.C b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C new file mode 100644 index 00000000000..b3aaf0ff5fb --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C @@ -0,0 +1,78 @@ +/* { dg-do run } */ + +#include + +#pragma acc routine vector +void __attribute__((noinline, noclone)) +Vector (int *ptr, int n, const int &inc) +{ +#pragma acc loop vector + for (unsigned ix = 0; ix < n; ix++) + ptr[ix] += inc; +} + +#pragma acc routine worker +void __attribute__((noinline, noclone)) +Worker (int *ptr, int m, int n, const int &inc) +{ +#pragma acc loop worker + for (unsigned ix = 0; ix < m; ix++) + Vector(ptr + ix * n, n, inc); +} + +int +main (void) +{ + const int n = 32, m = 32; + + int ary[m][n]; + unsigned ix, iy; + + for (ix = m; ix--;) + for (iy = n; iy--;) + ary[ix][iy] = (ix << 8) + iy; + +#pragma acc parallel copy(ary) + { + Worker (&ary[0][0], m, n, 1 << 16); + } + + int err = 0; + + for (ix = m; ix--;) + for (iy = n; iy--;) + if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy)) + { + printf ("ary[%u][%u] = %x expected %x\n", + ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy)); + err++; + } + + if (err) + { + printf ("%d failed\n", err); + return 1; + } + +#pragma acc parallel copy(ary) + { + Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16)); + } + + for (ix = m; ix--;) + for (iy = n; iy--;) + if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy)) + { + printf ("ary[%u][%u] = %x expected %x\n", + ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy)); + err++; + } + + if (err) + { + printf ("%d failed\n", err); + return 1; + } + + return 0; +} -- 2.30.2