From 33f47f427920586a6f21dd8f1b1adc582b9fb7af Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Wed, 18 Nov 2015 13:49:17 +0000 Subject: [PATCH] nvptx.c (global_lock_var): New. gcc/ * config/nvptx/nvptx.c (global_lock_var): New. (nvptx_global_lock_addr): New. (nvptx_lockless_update): Recomment and adjust for clarity. (nvptx_lockfull_update): New. (nvptx_reduction_update): New. (nvptx_goacc_reduction_fini): Call it. libgcc/ * config/nvptx/reduction.c: New. * config/nvptx/t-nvptx (LIB2ADD): Add it. libgomp/ * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add worker & gang cases. * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise. From-SVN: r230545 --- gcc/ChangeLog | 9 + gcc/config/nvptx/nvptx.c | 240 +++++++++++++++--- libgcc/ChangeLog | 5 + libgcc/config/nvptx/reduction.c | 31 +++ libgcc/config/nvptx/t-nvptx | 3 +- libgomp/ChangeLog | 6 + .../reduction-cplx-dbl.c | 96 +++++-- .../reduction-cplx-flt.c | 96 +++++-- 8 files changed, 412 insertions(+), 74 deletions(-) create mode 100644 libgcc/config/nvptx/reduction.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index abf01d336f7..d569816265c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2015-11-18 Nathan Sidwell + + * config/nvptx/nvptx.c (global_lock_var): New. + (nvptx_global_lock_addr): New. + (nvptx_lockless_update): Recomment and adjust for clarity. + (nvptx_lockfull_update): New. + (nvptx_reduction_update): New. + (nvptx_goacc_reduction_fini): Call it. + 2015-11-18 Bernd Schmidt * regrename.h (struct du_head): Add target_data_1 and target_data_2 diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 26c2e961051..4436ac4de94 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -114,6 +114,9 @@ static unsigned worker_red_align; #define worker_red_name "__worker_red" static GTY(()) rtx worker_red_sym; +/* Global lock variable, needed for 128bit worker & gang reductions. */ +static GTY(()) tree global_lock_var; + /* Allocate a new, cleared machine_function structure. */ static struct machine_function * @@ -3681,8 +3684,45 @@ nvptx_generate_vector_shuffle (location_t loc, gimplify_assign (dest_var, expr, seq); } -/* Insert code to locklessly update *PTR with *PTR OP VAR just before - GSI. */ +/* Lazily generate the global lock var decl and return its address. */ + +static tree +nvptx_global_lock_addr () +{ + tree v = global_lock_var; + + if (!v) + { + tree name = get_identifier ("__reduction_lock"); + tree type = build_qualified_type (unsigned_type_node, + TYPE_QUAL_VOLATILE); + v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type); + global_lock_var = v; + DECL_ARTIFICIAL (v) = 1; + DECL_EXTERNAL (v) = 1; + TREE_STATIC (v) = 1; + TREE_PUBLIC (v) = 1; + TREE_USED (v) = 1; + mark_addressable (v); + mark_decl_referenced (v); + } + + return build_fold_addr_expr (v); +} + +/* Insert code to locklessly update *PTR with *PTR OP VAR just before + GSI. We use a lockless scheme for nearly all case, which looks + like: + actual = initval(OP); + do { + guess = actual; + write = guess OP myval; + actual = cmp&swap (ptr, guess, write) + } while (actual bit-different-to guess); + return write; + + This relies on a cmp&swap instruction, which is available for 32- + and 64-bit types. Larger types must use a locking scheme. */ static tree nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, @@ -3690,46 +3730,30 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, { unsigned fn = NVPTX_BUILTIN_CMP_SWAP; tree_code code = NOP_EXPR; - tree type = unsigned_type_node; - - enum machine_mode mode = TYPE_MODE (TREE_TYPE (var)); + tree arg_type = unsigned_type_node; + tree var_type = TREE_TYPE (var); - if (!INTEGRAL_MODE_P (mode)) + if (TREE_CODE (var_type) == COMPLEX_TYPE + || TREE_CODE (var_type) == REAL_TYPE) code = VIEW_CONVERT_EXPR; - if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode)) + + if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node)) { + arg_type = long_long_unsigned_type_node; fn = NVPTX_BUILTIN_CMP_SWAPLL; - type = long_long_unsigned_type_node; } + tree swap_fn = nvptx_builtin_decl (fn, true); + gimple_seq init_seq = NULL; - tree init_var = make_ssa_name (type); - tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var)); - init_expr = fold_build1 (code, type, init_expr); + tree init_var = make_ssa_name (arg_type); + tree init_expr = omp_reduction_init_op (loc, op, var_type); + init_expr = fold_build1 (code, arg_type, init_expr); gimplify_assign (init_var, init_expr, &init_seq); gimple *init_end = gimple_seq_last (init_seq); gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT); - gimple_seq loop_seq = NULL; - tree expect_var = make_ssa_name (type); - tree actual_var = make_ssa_name (type); - tree write_var = make_ssa_name (type); - - tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var); - write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var); - write_expr = fold_build1 (code, type, write_expr); - gimplify_assign (write_var, write_expr, &loop_seq); - - tree swap_expr = nvptx_builtin_decl (fn, true); - swap_expr = build_call_expr_loc (loc, swap_expr, 3, - ptr, expect_var, write_var); - gimplify_assign (actual_var, swap_expr, &loop_seq); - - gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, - NULL_TREE, NULL_TREE); - gimple_seq_add_stmt (&loop_seq, cond); - /* Split the block just after the init stmts. */ basic_block pre_bb = gsi_bb (*gsi); edge pre_edge = split_block (pre_bb, init_end); @@ -3738,12 +3762,34 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, /* Reset the iterator. */ *gsi = gsi_for_stmt (gsi_stmt (*gsi)); - /* Insert the loop statements. */ - gimple *loop_end = gimple_seq_last (loop_seq); - gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT); + tree expect_var = make_ssa_name (arg_type); + tree actual_var = make_ssa_name (arg_type); + tree write_var = make_ssa_name (arg_type); + + /* Build and insert the reduction calculation. */ + gimple_seq red_seq = NULL; + tree write_expr = fold_build1 (code, var_type, expect_var); + write_expr = fold_build2 (op, var_type, write_expr, var); + write_expr = fold_build1 (code, arg_type, write_expr); + gimplify_assign (write_var, write_expr, &red_seq); + + gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); + + /* Build & insert the cmp&swap sequence. */ + gimple_seq latch_seq = NULL; + tree swap_expr = build_call_expr_loc (loc, swap_fn, 3, + ptr, expect_var, write_var); + gimplify_assign (actual_var, swap_expr, &latch_seq); + + gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&latch_seq, cond); + + gimple *latch_end = gimple_seq_last (latch_seq); + gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT); - /* Split the block just after the loop stmts. */ - edge post_edge = split_block (loop_bb, loop_end); + /* Split the block just after the latch stmts. */ + edge post_edge = split_block (loop_bb, latch_end); basic_block post_bb = post_edge->dest; loop_bb = post_edge->src; *gsi = gsi_for_stmt (gsi_stmt (*gsi)); @@ -3762,7 +3808,123 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, loop->latch = loop_bb; add_loop (loop, loop_bb->loop_father); - return fold_build1 (code, TREE_TYPE (var), write_var); + return fold_build1 (code, var_type, write_var); +} + +/* Insert code to lockfully update *PTR with *PTR OP VAR just before + GSI. This is necessary for types larger than 64 bits, where there + is no cmp&swap instruction to implement a lockless scheme. We use + a lock variable in global memory. + + while (cmp&swap (&lock_var, 0, 1)) + continue; + T accum = *ptr; + accum = accum OP var; + *ptr = accum; + cmp&swap (&lock_var, 1, 0); + return accum; + + A lock in global memory is necessary to force execution engine + descheduling and avoid resource starvation that can occur if the + lock is in .shared memory. */ + +static tree +nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, + tree ptr, tree var, tree_code op) +{ + tree var_type = TREE_TYPE (var); + tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); + tree uns_unlocked = build_int_cst (unsigned_type_node, 0); + tree uns_locked = build_int_cst (unsigned_type_node, 1); + + /* Split the block just before the gsi. Insert a gimple nop to make + this easier. */ + gimple *nop = gimple_build_nop (); + gsi_insert_before (gsi, nop, GSI_SAME_STMT); + basic_block entry_bb = gsi_bb (*gsi); + edge entry_edge = split_block (entry_bb, nop); + basic_block lock_bb = entry_edge->dest; + /* Reset the iterator. */ + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); + + /* Build and insert the locking sequence. */ + gimple_seq lock_seq = NULL; + tree lock_var = make_ssa_name (unsigned_type_node); + tree lock_expr = nvptx_global_lock_addr (); + lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr, + uns_unlocked, uns_locked); + gimplify_assign (lock_var, lock_expr, &lock_seq); + gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&lock_seq, cond); + gimple *lock_end = gimple_seq_last (lock_seq); + gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT); + + /* Split the block just after the lock sequence. */ + edge locked_edge = split_block (lock_bb, lock_end); + basic_block update_bb = locked_edge->dest; + lock_bb = locked_edge->src; + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); + + /* Create the lock loop ... */ + locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; + make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE); + set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb); + + /* ... and the loop structure. */ + loop *lock_loop = alloc_loop (); + lock_loop->header = lock_bb; + lock_loop->latch = lock_bb; + lock_loop->nb_iterations_estimate = 1; + lock_loop->any_estimate = true; + add_loop (lock_loop, entry_bb->loop_father); + + /* Build and insert the reduction calculation. */ + gimple_seq red_seq = NULL; + tree acc_in = make_ssa_name (var_type); + tree ref_in = build_simple_mem_ref (ptr); + TREE_THIS_VOLATILE (ref_in) = 1; + gimplify_assign (acc_in, ref_in, &red_seq); + + tree acc_out = make_ssa_name (var_type); + tree update_expr = fold_build2 (op, var_type, ref_in, var); + gimplify_assign (acc_out, update_expr, &red_seq); + + tree ref_out = build_simple_mem_ref (ptr); + TREE_THIS_VOLATILE (ref_out) = 1; + gimplify_assign (ref_out, acc_out, &red_seq); + + gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); + + /* Build & insert the unlock sequence. */ + gimple_seq unlock_seq = NULL; + tree unlock_expr = nvptx_global_lock_addr (); + unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr, + uns_locked, uns_unlocked); + gimplify_and_add (unlock_expr, &unlock_seq); + gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT); + + return acc_out; +} + +/* Emit a sequence to update a reduction accumlator at *PTR with the + value held in VAR using operator OP. Return the updated value. + + TODO: optimize for atomic ops and indepedent complex ops. */ + +static tree +nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, + tree ptr, tree var, tree_code op) +{ + tree type = TREE_TYPE (var); + tree size = TYPE_SIZE (type); + + if (size == TYPE_SIZE (unsigned_type_node) + || size == TYPE_SIZE (long_long_unsigned_type_node)) + return nvptx_lockless_update (loc, gsi, ptr, var, op); + else + return nvptx_lockfull_update (loc, gsi, ptr, var, op); } /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ @@ -3944,11 +4106,11 @@ nvptx_goacc_reduction_fini (gcall *call) if (accum) { - /* Locklessly update the accumulator. */ + /* UPDATE the accumulator. */ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); seq = NULL; - r = nvptx_lockless_update (gimple_location (call), &gsi, - accum, var, op); + r = nvptx_reduction_update (gimple_location (call), &gsi, + accum, var, op); } } diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index a4a17e0b2f2..8ab02ab3f08 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,3 +1,8 @@ +2015-11-18 Nathan Sidwell + + * config/nvptx/reduction.c: New. + * config/nvptx/t-nvptx (LIB2ADD): Add it. + 2015-11-15 David Edelsohn * config/rs6000/on_exit.c: New file. diff --git a/libgcc/config/nvptx/reduction.c b/libgcc/config/nvptx/reduction.c new file mode 100644 index 00000000000..11bad4c3306 --- /dev/null +++ b/libgcc/config/nvptx/reduction.c @@ -0,0 +1,31 @@ +/* Oversized reductions lock variable + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Mentor Graphics. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + + +/* We use a global lock variable for reductions on objects larger than + 64 bits. Until and unless proven that lock contention for + different reduction is a problem, a single lock will suffice. */ + +unsigned volatile __reduction_lock = 0; diff --git a/libgcc/config/nvptx/t-nvptx b/libgcc/config/nvptx/t-nvptx index 34d68cca6cd..e66188f8722 100644 --- a/libgcc/config/nvptx/t-nvptx +++ b/libgcc/config/nvptx/t-nvptx @@ -1,6 +1,7 @@ LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \ $(srcdir)/config/nvptx/free.asm \ - $(srcdir)/config/nvptx/realloc.c + $(srcdir)/config/nvptx/realloc.c \ + $(srcdir)/config/nvptx/reduction.c LIB2ADDEH= LIB2FUNCS_EXCLUDE=__main diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 73a3b91bcaf..d3a5f47e7cb 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,9 @@ +2015-11-18 Nathan Sidwell + + * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add + worker & gang cases. + * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise. + 2015-11-17 Cesar Philippidis * config/nvptx/priority_queue.c: New file. diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c index 314e5118be9..94b29b55925 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c @@ -14,28 +14,41 @@ int close_enough (double _Complex a, double _Complex b) return mag2_diff / mag2_a < (FRAC * FRAC); } -int main (void) -{ #define N 100 - double _Complex ary[N], sum, prod, tsum, tprod; - int ix; - sum = tsum = 0; - prod = tprod = 1; - - for (ix = 0; ix < N; ix++) - { - double frac = ix * (1.0 / 1024) + 1.0; - - ary[ix] = frac + frac * 2.0i - 1.0i; - sum += ary[ix]; - prod *= ary[ix]; - } +static int __attribute__ ((noinline)) +vector (double _Complex ary[N], double _Complex sum, double _Complex prod) +{ + double _Complex tsum = 0, tprod = 1; -#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod) +#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) { #pragma acc loop vector reduction(+:tsum) reduction (*:tprod) - for (ix = 0; ix < N; ix++) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +worker (double _Complex ary[N], double _Complex sum, double _Complex prod) +{ + double _Complex tsum = 0, tprod = 1; + +#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop worker reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) { tsum += ary[ix]; tprod *= ary[ix]; @@ -50,3 +63,52 @@ int main (void) return 0; } + +static int __attribute__ ((noinline)) +gang (double _Complex ary[N], double _Complex sum, double _Complex prod) +{ + double _Complex tsum = 0, tprod = 1; + +#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop gang reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +int main (void) +{ + double _Complex ary[N], sum = 0, prod = 1; + + for (int ix = 0; ix < N; ix++) + { + double frac = ix * (1.0 / 1024) + 1.0; + + ary[ix] = frac + frac * 2.0i - 1.0i; + sum += ary[ix]; + prod *= ary[ix]; + } + + if (vector (ary, sum, prod)) + return 1; + + if (worker (ary, sum, prod)) + return 1; + + if (gang (ary, sum, prod)) + return 1; + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c index b3bde656079..d76bf6b8de6 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c @@ -14,28 +14,41 @@ int close_enough (float _Complex a, float _Complex b) return mag2_diff / mag2_a < (FRAC * FRAC); } -int main (void) -{ #define N 100 - float _Complex ary[N], sum, prod, tsum, tprod; - int ix; - sum = tsum = 0; - prod = tprod = 1; - - for (ix = 0; ix < N; ix++) - { - float frac = ix * (1.0f / 1024) + 1.0f; - - ary[ix] = frac + frac * 2.0i - 1.0i; - sum += ary[ix]; - prod *= ary[ix]; - } +static int __attribute__ ((noinline)) +vector (float _Complex ary[N], float _Complex sum, float _Complex prod) +{ + float _Complex tsum = 0, tprod = 1; -#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod) +#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) { #pragma acc loop vector reduction(+:tsum) reduction (*:tprod) - for (ix = 0; ix < N; ix++) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +worker (float _Complex ary[N], float _Complex sum, float _Complex prod) +{ + float _Complex tsum = 0, tprod = 1; + +#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop worker reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) { tsum += ary[ix]; tprod *= ary[ix]; @@ -50,3 +63,52 @@ int main (void) return 0; } + +static int __attribute__ ((noinline)) +gang (float _Complex ary[N], float _Complex sum, float _Complex prod) +{ + float _Complex tsum = 0, tprod = 1; + +#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop gang reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +int main (void) +{ + float _Complex ary[N], sum = 0, prod = 1; + + for (int ix = 0; ix < N; ix++) + { + float frac = ix * (1.0f / 1024) + 1.0f; + + ary[ix] = frac + frac * 2.0i - 1.0i; + sum += ary[ix]; + prod *= ary[ix]; + } + + if (vector (ary, sum, prod)) + return 1; + + if (worker (ary, sum, prod)) + return 1; + + if (gang (ary, sum, prod)) + return 1; + + return 0; +} -- 2.30.2