From e4834818d22f5c663b24940fd5b04da266e11fe8 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell
Date: Wed, 28 Oct 2015 03:00:50 +0000
Subject: [PATCH] omp-low.c (struct omp_context): Remove gwv_below, gwv_this
 fields.

	* omp-low.c (struct omp_context): Remove gwv_below, gwv_this
	fields.
	(is_oacc_parallel, is_oacc_kernels): New.
	(enclosing_target_ctx): May return NULL.
	(ctx_in_oacc_kernels_region): New.
	(check_oacc_kernel_gwv): New.
	(oacc_loop_or_target_p): Delete.
	(scan_omp_for): Don't calculate gwv mask.  Check parallel clause
	operands.  Strip reductions from kernels.
	(scan_omp_target): Don't calculate gwv mask.
	(lower_oacc_head_mark, lower_oacc_loop_marker,
	lower_oacc_head_tail): New.
	(struct oacc_collapse): New.
	(expand_oacc_collapse_init, expand_oacc_collapse_vars): New.
	(expand_omp_for_static_nochunk, expand_omp_for_static_chunk):
	Remove OpenACC handling.
	(expand_oacc_for): New.
	(expand_omp_for): Call expand_oacc_for.
	(lower_omp_for): Call lower_oacc_head_tail.

From-SVN: r229472
---
 gcc/ChangeLog |  22 ++
 gcc/omp-low.c | 993 ++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 909 insertions(+), 106 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2e88f597586..463e6012d00 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,25 @@
+2015-10-27  Nathan Sidwell
+
+	* omp-low.c (struct omp_context): Remove gwv_below, gwv_this
+	fields.
+	(is_oacc_parallel, is_oacc_kernels): New.
+	(enclosing_target_ctx): May return NULL.
+	(ctx_in_oacc_kernels_region): New.
+	(check_oacc_kernel_gwv): New.
+	(oacc_loop_or_target_p): Delete.
+	(scan_omp_for): Don't calculate gwv mask.  Check parallel clause
+	operands.  Strip reductions from kernels.
+	(scan_omp_target): Don't calculate gwv mask.
+	(lower_oacc_head_mark, lower_oacc_loop_marker,
+	lower_oacc_head_tail): New.
+	(struct oacc_collapse): New.
+	(expand_oacc_collapse_init, expand_oacc_collapse_vars): New.
+	(expand_omp_for_static_nochunk, expand_omp_for_static_chunk):
+	Remove OpenACC handling.
+	(expand_oacc_for): New.
+	(expand_omp_for): Call expand_oacc_for.
+	(lower_omp_for): Call lower_oacc_head_tail.
+
 2015-10-27  Mikhail Maltsev

 	* attribs.c (check_attribute_tables): New function, broken out from...
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 754783537a2..c4411669c46 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -200,14 +200,6 @@ struct omp_context

   /* True if this construct can be cancelled.  */
   bool cancellable;
-
-  /* For OpenACC loops, a mask of gang, worker and vector used at
-     levels below this one.  */
-  int gwv_below;
-  /* For OpenACC loops, a mask of gang, worker and vector used at
-     this level and above.  For parallel and kernels clauses, a mask
-     indicating which of num_gangs/num_workers/num_vectors was used.  */
-  int gwv_this;
 };

 /* A structure holding the elements of:
@@ -299,6 +291,28 @@ static gphi *find_phi_with_arg_on_edge (tree, edge);
       *handled_ops_p = false; \
       break;

+/* Return true if CTX corresponds to an oacc parallel region.  */
+
+static bool
+is_oacc_parallel (omp_context *ctx)
+{
+  enum gimple_code outer_type = gimple_code (ctx->stmt);
+  return ((outer_type == GIMPLE_OMP_TARGET)
+	  && (gimple_omp_target_kind (ctx->stmt)
+	      == GF_OMP_TARGET_KIND_OACC_PARALLEL));
+}
+
+/* Return true if CTX corresponds to an oacc kernels region.  */
+
+static bool
+is_oacc_kernels (omp_context *ctx)
+{
+  enum gimple_code outer_type = gimple_code (ctx->stmt);
+  return ((outer_type == GIMPLE_OMP_TARGET)
+	  && (gimple_omp_target_kind (ctx->stmt)
+	      == GF_OMP_TARGET_KIND_OACC_KERNELS));
+}
+
 /* Helper function to get the name of the array containing the partial
    reductions for OpenACC reductions.  */
 static const char *
@@ -2933,28 +2947,95 @@ finish_taskreg_scan (omp_context *ctx)
     }
 }

+/* Find the enclosing offload context.  */
+
 static omp_context *
 enclosing_target_ctx (omp_context *ctx)
 {
-  while (ctx != NULL
-	 && gimple_code (ctx->stmt) != GIMPLE_OMP_TARGET)
-    ctx = ctx->outer;
-  gcc_assert (ctx != NULL);
+  for (; ctx; ctx = ctx->outer)
+    if (gimple_code (ctx->stmt) == GIMPLE_OMP_TARGET)
+      break;
+
   return ctx;
 }

+/* Return true if CTX is part of an oacc kernels region.  */
+
 static bool
-oacc_loop_or_target_p (gimple *stmt)
+ctx_in_oacc_kernels_region (omp_context *ctx)
 {
-  enum gimple_code outer_type = gimple_code (stmt);
-  return ((outer_type == GIMPLE_OMP_TARGET
-	   && ((gimple_omp_target_kind (stmt)
-		== GF_OMP_TARGET_KIND_OACC_PARALLEL)
-	       || (gimple_omp_target_kind (stmt)
-		   == GF_OMP_TARGET_KIND_OACC_KERNELS)))
-	  || (outer_type == GIMPLE_OMP_FOR
-	      && gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_OACC_LOOP));
+  for (; ctx != NULL; ctx = ctx->outer)
+    {
+      gimple *stmt = ctx->stmt;
+
+      if (gimple_code (stmt) == GIMPLE_OMP_TARGET
+	  && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+	return true;
+    }
+
+  return false;
+}
+
+/* Check the parallelism clauses inside a kernels region.
+   Until kernels handling moves to use the same loop indirection
+   scheme as parallel, we need to do this checking early.  */
+
+static unsigned
+check_oacc_kernel_gwv (gomp_for *stmt, omp_context *ctx)
+{
+  bool checking = true;
+  unsigned outer_mask = 0;
+  unsigned this_mask = 0;
+  bool has_seq = false, has_auto = false;
+
+  if (ctx->outer)
+    outer_mask = check_oacc_kernel_gwv (NULL, ctx->outer);
+  if (!stmt)
+    {
+      checking = false;
+      if (gimple_code (ctx->stmt) != GIMPLE_OMP_FOR)
+	return outer_mask;
+      stmt = as_a <gomp_for *> (ctx->stmt);
+    }
+
+  for (tree c = gimple_omp_for_clauses (stmt); c; c = OMP_CLAUSE_CHAIN (c))
+    {
+      switch (OMP_CLAUSE_CODE (c))
+	{
+	case OMP_CLAUSE_GANG:
+	  this_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
+	  break;
+	case OMP_CLAUSE_WORKER:
+	  this_mask |= GOMP_DIM_MASK (GOMP_DIM_WORKER);
+	  break;
+	case OMP_CLAUSE_VECTOR:
+	  this_mask |= GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+	  break;
+	case OMP_CLAUSE_SEQ:
+	  has_seq = true;
+	  break;
+	case OMP_CLAUSE_AUTO:
+	  has_auto = true;
+	  break;
+	default:
+	  break;
+	}
+    }
+
+  if (checking)
+    {
+      if (has_seq && (this_mask || has_auto))
+	error_at (gimple_location (stmt), "%<seq%> overrides other"
+		  " OpenACC loop specifiers");
+      else if (has_auto && this_mask)
+	error_at (gimple_location (stmt), "%<auto%> conflicts with other"
+		  " OpenACC loop specifiers");
+
+      if (this_mask & outer_mask)
+	error_at (gimple_location (stmt), "inner loop uses same"
+		  " OpenACC parallelism as containing loop");
+    }
+
+  return outer_mask | this_mask;
+}
+
 /* Scan a GIMPLE_OMP_FOR.  */
@@ -2962,52 +3043,62 @@ oacc_loop_or_target_p (gimple *stmt)
 static void
 scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
 {
-  enum gimple_code outer_type = GIMPLE_ERROR_MARK;
   omp_context *ctx;
   size_t i;
   tree clauses = gimple_omp_for_clauses (stmt);

-  if (outer_ctx)
-    outer_type = gimple_code (outer_ctx->stmt);
-
   ctx = new_omp_context (stmt, outer_ctx);

   if (is_gimple_omp_oacc (stmt))
     {
-      if (outer_ctx && outer_type == GIMPLE_OMP_FOR)
-	ctx->gwv_this = outer_ctx->gwv_this;
-      for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+      omp_context *tgt = enclosing_target_ctx (outer_ctx);
+
+      if (!tgt || is_oacc_parallel (tgt))
+	for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+	  {
+	    char const *check = NULL;
+
+	    switch (OMP_CLAUSE_CODE (c))
+	      {
+	      case OMP_CLAUSE_GANG:
+		check = "gang";
+		break;
+
+	      case OMP_CLAUSE_WORKER:
+		check = "worker";
+		break;
+
+	      case OMP_CLAUSE_VECTOR:
+		check = "vector";
+		break;
+
+	      default:
+		break;
+	      }
+
+	    if (check && OMP_CLAUSE_OPERAND (c, 0))
+	      error_at (gimple_location (stmt),
+			"argument not permitted on %qs clause in"
+			" OpenACC %<parallel%>", check);
+	  }
+
+      if (tgt && is_oacc_kernels (tgt))
 	{
-	  int val;
-	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_GANG)
-	    val = MASK_GANG;
-	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WORKER)
-	    val = MASK_WORKER;
-	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR)
-	    val = MASK_VECTOR;
-	  else
-	    continue;
-	  ctx->gwv_this |= val;
-	  if (!outer_ctx)
-	    {
-	      /* Skip; not nested inside a region.  */
-	      continue;
-	    }
-	  if (!oacc_loop_or_target_p (outer_ctx->stmt))
-	    {
-	      /* Skip; not nested inside an OpenACC region.  */
-	      continue;
-	    }
-	  if (outer_type == GIMPLE_OMP_FOR)
-	    outer_ctx->gwv_below |= val;
-	  if (OMP_CLAUSE_OPERAND (c, 0) != NULL_TREE)
+	  /* Strip out reductions, as they are not handled yet.  */
+	  tree *prev_ptr = &clauses;
+
+	  while (tree probe = *prev_ptr)
 	    {
-	      omp_context *enclosing = enclosing_target_ctx (outer_ctx);
-	      if (gimple_omp_target_kind (enclosing->stmt)
-		  == GF_OMP_TARGET_KIND_OACC_PARALLEL)
-		error_at (gimple_location (stmt),
-			  "no arguments allowed to gang, worker and vector clauses inside parallel");
+	      tree *next_ptr = &OMP_CLAUSE_CHAIN (probe);
+
+	      if (OMP_CLAUSE_CODE (probe) == OMP_CLAUSE_REDUCTION)
+		*prev_ptr = *next_ptr;
+	      else
+		prev_ptr = next_ptr;
 	    }
+
+	  gimple_omp_for_set_clauses (stmt, clauses);
+	  check_oacc_kernel_gwv (stmt, ctx);
 	}
     }
@@ -3022,19 +3113,6 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
       scan_omp_op (gimple_omp_for_incr_ptr (stmt, i), ctx);
     }
   scan_omp (gimple_omp_body_ptr (stmt), ctx);
-
-  if (is_gimple_omp_oacc (stmt))
-    {
-      if (ctx->gwv_this & ctx->gwv_below)
-	error_at (gimple_location (stmt),
-		  "gang, worker and vector may occur only once in a loop nest");
-      else if (ctx->gwv_below != 0
-	       && ctx->gwv_this > ctx->gwv_below)
-	error_at (gimple_location (stmt),
-		  "gang, worker and vector must occur in this order in a loop nest");
-      if (outer_ctx && outer_type == GIMPLE_OMP_FOR)
-	outer_ctx->gwv_below |= ctx->gwv_below;
-    }
 }

 /* Scan an OpenMP sections directive.  */
@@ -3105,19 +3183,6 @@ scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
       gimple_omp_target_set_child_fn (stmt, ctx->cb.dst_fn);
     }

-  if (is_gimple_omp_oacc (stmt))
-    {
-      for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-	{
-	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS)
-	    ctx->gwv_this |= MASK_GANG;
-	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS)
-	    ctx->gwv_this |= MASK_WORKER;
-	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
-	    ctx->gwv_this |= MASK_VECTOR;
-	}
-    }
-
   scan_sharing_clauses (clauses, ctx);
   scan_omp (gimple_omp_body_ptr (stmt), ctx);

@@ -5850,6 +5915,176 @@ lower_send_shared_vars (gimple_seq *ilist, gimple_seq *olist, omp_context *ctx)
     }
 }

+/* Emit an OpenACC head marker call, encapsulating the partitioning and
+   other information that must be processed by the target compiler.
+   Return the maximum number of dimensions the associated loop might
+   be partitioned over.  */
+
+static unsigned
+lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
+		      gimple_seq *seq, omp_context *ctx)
+{
+  unsigned levels = 0;
+  unsigned tag = 0;
+  tree gang_static = NULL_TREE;
+  auto_vec<tree, 5> args;
+
+  args.quick_push (build_int_cst
+		   (integer_type_node, IFN_UNIQUE_OACC_HEAD_MARK));
+  args.quick_push (ddvar);
+  for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    {
+      switch (OMP_CLAUSE_CODE (c))
+	{
+	case OMP_CLAUSE_GANG:
+	  tag |= OLF_DIM_GANG;
+	  gang_static = OMP_CLAUSE_GANG_STATIC_EXPR (c);
+	  /* static:* is represented by -1, and we can ignore it, as
+	     scheduling is always static.  */
+	  if (gang_static && integer_minus_onep (gang_static))
+	    gang_static = NULL_TREE;
+	  levels++;
+	  break;
+
+	case OMP_CLAUSE_WORKER:
+	  tag |= OLF_DIM_WORKER;
+	  levels++;
+	  break;
+
+	case OMP_CLAUSE_VECTOR:
+	  tag |= OLF_DIM_VECTOR;
+	  levels++;
+	  break;
+
+	case OMP_CLAUSE_SEQ:
+	  tag |= OLF_SEQ;
+	  break;
+
+	case OMP_CLAUSE_AUTO:
+	  tag |= OLF_AUTO;
+	  break;
+
+	case OMP_CLAUSE_INDEPENDENT:
+	  tag |= OLF_INDEPENDENT;
+	  break;
+
+	default:
+	  continue;
+	}
+    }
+
+  if (gang_static)
+    {
+      if (DECL_P (gang_static))
+	gang_static = build_outer_var_ref (gang_static, ctx);
+      tag |= OLF_GANG_STATIC;
+    }
+
+  /* In a parallel region, loops are implicitly INDEPENDENT.  */
+  omp_context *tgt = enclosing_target_ctx (ctx);
+  if (!tgt || is_oacc_parallel (tgt))
+    tag |= OLF_INDEPENDENT;
+
+  /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO.  */
+  if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE)
+	       | OLF_SEQ)))
+    tag |= OLF_AUTO;
+
+  /* Ensure at least one level.  */
+  if (!levels)
+    levels++;
+
+  args.quick_push (build_int_cst (integer_type_node, levels));
+  args.quick_push (build_int_cst (integer_type_node, tag));
+  if (gang_static)
+    args.quick_push (gang_static);
+
+  gcall *call = gimple_build_call_internal_vec (IFN_UNIQUE, args);
+  gimple_set_location (call, loc);
+  gimple_set_lhs (call, ddvar);
+  gimple_seq_add_stmt (seq, call);
+
+  return levels;
+}
+
+/* Emit an OpenACC loop head or tail marker to SEQ.  LEVEL is the
+   partitioning level of the enclosed region.  */
+
+static void
+lower_oacc_loop_marker (location_t loc, tree ddvar, bool head,
+			tree tofollow, gimple_seq *seq)
+{
+  int marker_kind = (head ? IFN_UNIQUE_OACC_HEAD_MARK
+		     : IFN_UNIQUE_OACC_TAIL_MARK);
+  tree marker = build_int_cst (integer_type_node, marker_kind);
+  int nargs = 2 + (tofollow != NULL_TREE);
+  gcall *call = gimple_build_call_internal (IFN_UNIQUE, nargs,
+					    marker, ddvar, tofollow);
+  gimple_set_location (call, loc);
+  gimple_set_lhs (call, ddvar);
+  gimple_seq_add_stmt (seq, call);
+}
+
+/* Generate the before and after OpenACC loop sequences.  CLAUSES are
+   the loop clauses, from which we extract reductions.  Initialize
+   HEAD and TAIL.  */
+
+static void
+lower_oacc_head_tail (location_t loc, tree clauses,
+		      gimple_seq *head, gimple_seq *tail, omp_context *ctx)
+{
+  bool inner = false;
+  tree ddvar = create_tmp_var (integer_type_node, ".data_dep");
+  gimple_seq_add_stmt (head, gimple_build_assign (ddvar, integer_zero_node));
+
+  unsigned count = lower_oacc_head_mark (loc, ddvar, clauses, head, ctx);
+  if (!count)
+    lower_oacc_loop_marker (loc, ddvar, false, integer_zero_node, tail);
+
+  tree fork_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK);
+  tree join_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN);
+
+  for (unsigned done = 1; count; count--, done++)
+    {
+      gimple_seq fork_seq = NULL;
+      gimple_seq join_seq = NULL;
+
+      tree place = build_int_cst (integer_type_node, -1);
+      gcall *fork = gimple_build_call_internal (IFN_UNIQUE, 3,
+						fork_kind, ddvar, place);
+      gimple_set_location (fork, loc);
+      gimple_set_lhs (fork, ddvar);
+
+      gcall *join = gimple_build_call_internal (IFN_UNIQUE, 3,
+						join_kind, ddvar, place);
+      gimple_set_location (join, loc);
+      gimple_set_lhs (join, ddvar);
+
+      /* Mark the beginning of this level sequence.  */
+      if (inner)
+	lower_oacc_loop_marker (loc, ddvar, true,
+				build_int_cst (integer_type_node, count),
+				&fork_seq);
+      lower_oacc_loop_marker (loc, ddvar, false,
+			      build_int_cst (integer_type_node, done),
+			      &join_seq);
+
+      gimple_seq_add_stmt (&fork_seq, fork);
+      gimple_seq_add_stmt (&join_seq, join);
+
+      /* Append this level to head.  */
+      gimple_seq_add_seq (head, fork_seq);
+      /* Prepend it to tail.  */
+      gimple_seq_add_seq (&join_seq, *tail);
+      *tail = join_seq;
+
+      inner = true;
+    }
+
+  /* Mark the end of the sequence.  */
+  lower_oacc_loop_marker (loc, ddvar, true, NULL_TREE, head);
+  lower_oacc_loop_marker (loc, ddvar, false, NULL_TREE, tail);
+}

 /* A convenience function to build an empty GIMPLE_COND with just the
    condition.  */
@@ -6760,6 +6995,149 @@ expand_omp_taskreg (struct omp_region *region)
   update_ssa (TODO_update_ssa_only_virtuals);
 }

+/* Information about members of an OpenACC collapsed loop nest.  */
+
+struct oacc_collapse
+{
+  tree base;  /* Base value.  */
+  tree iters; /* Number of steps.  */
+  tree step;  /* Step size.  */
+};
+
+/* Helper for expand_oacc_for.  Determine collapsed loop information.
+   Fill in COUNTS array.  Emit any initialization code before GSI.
+   Return the calculated outer loop bound of BOUND_TYPE.  */
+
+static tree
+expand_oacc_collapse_init (const struct omp_for_data *fd,
+			   gimple_stmt_iterator *gsi,
+			   oacc_collapse *counts, tree bound_type)
+{
+  tree total = build_int_cst (bound_type, 1);
+  int ix;
+
+  gcc_assert (integer_onep (fd->loop.step));
+  gcc_assert (integer_zerop (fd->loop.n1));
+
+  for (ix = 0; ix != fd->collapse; ix++)
+    {
+      const omp_for_data_loop *loop = &fd->loops[ix];
+
+      tree iter_type = TREE_TYPE (loop->v);
+      tree diff_type = iter_type;
+      tree plus_type = iter_type;
+
+      gcc_assert (loop->cond_code == fd->loop.cond_code);
+
+      if (POINTER_TYPE_P (iter_type))
+	plus_type = sizetype;
+      if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
+	diff_type = signed_type_for (diff_type);
+
+      tree b = loop->n1;
+      tree e = loop->n2;
+      tree s = loop->step;
+      bool up = loop->cond_code == LT_EXPR;
+      tree dir = build_int_cst (diff_type, up ? +1 : -1);
+      bool negating;
+      tree expr;
+
+      b = force_gimple_operand_gsi (gsi, b, true, NULL_TREE,
+				    true, GSI_SAME_STMT);
+      e = force_gimple_operand_gsi (gsi, e, true, NULL_TREE,
+				    true, GSI_SAME_STMT);
+
+      /* Convert the step, avoiding possible unsigned->signed overflow.  */
+      negating = !up && TYPE_UNSIGNED (TREE_TYPE (s));
+      if (negating)
+	s = fold_build1 (NEGATE_EXPR, TREE_TYPE (s), s);
+      s = fold_convert (diff_type, s);
+      if (negating)
+	s = fold_build1 (NEGATE_EXPR, diff_type, s);
+      s = force_gimple_operand_gsi (gsi, s, true, NULL_TREE,
+				    true, GSI_SAME_STMT);
+
+      /* Determine the range, avoiding possible unsigned->signed overflow.  */
+      negating = !up && TYPE_UNSIGNED (iter_type);
+      expr = fold_build2 (MINUS_EXPR, plus_type,
+			  fold_convert (plus_type, negating ? b : e),
+			  fold_convert (plus_type, negating ? e : b));
+      expr = fold_convert (diff_type, expr);
+      if (negating)
+	expr = fold_build1 (NEGATE_EXPR, diff_type, expr);
+      tree range = force_gimple_operand_gsi
+	(gsi, expr, true, NULL_TREE, true, GSI_SAME_STMT);
+
+      /* Determine number of iterations.  */
+      expr = fold_build2 (MINUS_EXPR, diff_type, range, dir);
+      expr = fold_build2 (PLUS_EXPR, diff_type, expr, s);
+      expr = fold_build2 (TRUNC_DIV_EXPR, diff_type, expr, s);
+
+      tree iters = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
+					     true, GSI_SAME_STMT);
+
+      counts[ix].base = b;
+      counts[ix].iters = iters;
+      counts[ix].step = s;
+
+      total = fold_build2 (MULT_EXPR, bound_type, total,
+			   fold_convert (bound_type, iters));
+    }
+
+  return total;
+}
+
+/* Emit initializers for collapsed loop members.  IVAR is the outer
+   loop iteration variable, from which collapsed loop iteration values
+   are calculated.  COUNTS array has been initialized by
+   expand_oacc_collapse_init.  */
+
+static void
+expand_oacc_collapse_vars (const struct omp_for_data *fd,
+			   gimple_stmt_iterator *gsi,
+			   const oacc_collapse *counts, tree ivar)
+{
+  tree ivar_type = TREE_TYPE (ivar);
+
+  /* The most rapidly changing iteration variable is the innermost
+     one.  */
+  for (int ix = fd->collapse; ix--;)
+    {
+      const omp_for_data_loop *loop = &fd->loops[ix];
+      const oacc_collapse *collapse = &counts[ix];
+      tree iter_type = TREE_TYPE (loop->v);
+      tree diff_type = TREE_TYPE (collapse->step);
+      tree plus_type = iter_type;
+      enum tree_code plus_code = PLUS_EXPR;
+      tree expr;
+
+      if (POINTER_TYPE_P (iter_type))
+	{
+	  plus_code = POINTER_PLUS_EXPR;
+	  plus_type = sizetype;
+	}
+
+      expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar,
+			  fold_convert (ivar_type, collapse->iters));
+      expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
+			  collapse->step);
+      expr = fold_build2 (plus_code, iter_type, collapse->base,
+			  fold_convert (plus_type, expr));
+      expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE,
+				       true, GSI_SAME_STMT);
+      gassign *ass = gimple_build_assign (loop->v, expr);
+      gsi_insert_before (gsi, ass, GSI_SAME_STMT);
+
+      if (ix)
+	{
+	  expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar,
+			      fold_convert (ivar_type, collapse->iters));
+	  ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	}
+    }
+}
+
 /* Helper function for expand_omp_{for_*,simd}.  If this is the outermost
    of the combined collapse > 1 loop constructs, generate code like:
@@ -8406,10 +8784,6 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   tree *counts = NULL;
   tree n1, n2, step;

-  gcc_checking_assert ((gimple_omp_for_kind (fd->for_stmt)
-			!= GF_OMP_FOR_KIND_OACC_LOOP)
-		       || !inner_stmt);
-
   itype = type = TREE_TYPE (fd->loop.v);
   if (POINTER_TYPE_P (type))
     itype = signed_type_for (type);

@@ -8502,10 +8876,6 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_TEAMS);
       threadid = builtin_decl_explicit (BUILT_IN_OMP_GET_TEAM_NUM);
       break;
-    case GF_OMP_FOR_KIND_OACC_LOOP:
-      nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
-      threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
-      break;
     default:
       gcc_unreachable ();
     }

@@ -8732,10 +9102,7 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
     {
       t = gimple_omp_return_lhs (gsi_stmt (gsi));
-      if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
-	gcc_checking_assert (t == NULL_TREE);
-      else
-	gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
+      gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
     }

   gsi_remove (&gsi, true);

@@ -8873,10 +9240,6 @@ expand_omp_for_static_chunk (struct omp_region *region,
   tree *counts = NULL;
   tree n1, n2, step;

-  gcc_checking_assert ((gimple_omp_for_kind (fd->for_stmt)
-			!= GF_OMP_FOR_KIND_OACC_LOOP)
-		       || !inner_stmt);
-
   itype = type = TREE_TYPE (fd->loop.v);
   if (POINTER_TYPE_P (type))
     itype = signed_type_for (type);

@@ -8973,10 +9336,6 @@ expand_omp_for_static_chunk (struct omp_region *region,
       nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_TEAMS);
       threadid = builtin_decl_explicit (BUILT_IN_OMP_GET_TEAM_NUM);
       break;
-    case GF_OMP_FOR_KIND_OACC_LOOP:
-      nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
-      threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
-      break;
     default:
       gcc_unreachable ();
     }

@@ -9236,10 +9595,7 @@ expand_omp_for_static_chunk (struct omp_region *region,
   if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
     {
       t = gimple_omp_return_lhs (gsi_stmt (gsi));
-      if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
-	gcc_checking_assert (t == NULL_TREE);
-      else
-	gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
+      gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
     }

   gsi_remove (&gsi, true);

@@ -10289,6 +10645,410 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
     }
 }

+/* A subroutine of expand_omp_for.  Generate code for an OpenACC
+   partitioned loop.  The lowering here is abstracted, in that the
+   loop parameters are passed through internal functions, which are
+   further lowered by oacc_device_lower, once we get to the target
+   compiler.  The loop is of the form:
+
+   for (V = B; V LTGT E; V += S) {BODY}
+
+   where LTGT is < or >.  We may have a specified chunking size, CHUNKING
+   (constant 0 for no chunking) and we will have a GWV partitioning
+   mask, specifying dimensions over which the loop is to be
+   partitioned (see note below).  We generate code that looks like:
+
+   <entry_bb> [incoming FALL->body, BRANCH->exit]
+     typedef signedintify (typeof (V)) T;  // underlying signed integral type
+     T range = E - B;
+     T chunk_no = 0;
+     T DIR = LTGT == '<' ? +1 : -1;
+     T chunk_max = GOACC_LOOP_CHUNK (dir, range, S, CHUNK_SIZE, GWV);
+     T step = GOACC_LOOP_STEP (dir, range, S, CHUNK_SIZE, GWV);
+
+   <head_bb> [created by splitting end of entry_bb]
+     T offset = GOACC_LOOP_OFFSET (dir, range, S, CHUNK_SIZE, GWV, chunk_no);
+     T bound = GOACC_LOOP_BOUND (dir, range, S, CHUNK_SIZE, GWV, offset);
+     if (!(offset LTGT bound)) goto bottom_bb;
+
+   <body_bb> [incoming]
+     V = B + offset;
+     {BODY}
+
+   <cont_bb> [incoming, may == body_bb FALL->exit_bb, BRANCH->body_bb]
+     offset += step;
+     if (offset LTGT bound) goto body_bb; [*]
+
+   <bottom_bb> [created by splitting start of exit_bb] insert BRANCH->head_bb
+     chunk_no++;
+     if (chunk_no < chunk_max) goto head_bb;
+
+   <exit_bb> [incoming]
+     V = B + ((range -/+ 1) / S +/- 1) * S [*]
+
+   [*] Needed if V live at end of loop.
+
+   Note: CHUNKING & GWV mask are specified explicitly here.  This is a
+   transition, and will be specified by a more general mechanism shortly.
+ */
+
+static void
+expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
+{
+  tree v = fd->loop.v;
+  enum tree_code cond_code = fd->loop.cond_code;
+  enum tree_code plus_code = PLUS_EXPR;
+
+  tree chunk_size = integer_minus_one_node;
+  tree gwv = integer_zero_node;
+  tree iter_type = TREE_TYPE (v);
+  tree diff_type = iter_type;
+  tree plus_type = iter_type;
+  struct oacc_collapse *counts = NULL;
+
+  gcc_checking_assert (gimple_omp_for_kind (fd->for_stmt)
+		       == GF_OMP_FOR_KIND_OACC_LOOP);
+  gcc_assert (!gimple_omp_for_combined_into_p (fd->for_stmt));
+  gcc_assert (cond_code == LT_EXPR || cond_code == GT_EXPR);
+
+  if (POINTER_TYPE_P (iter_type))
+    {
+      plus_code = POINTER_PLUS_EXPR;
+      plus_type = sizetype;
+    }
+  if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
+    diff_type = signed_type_for (diff_type);
+
+  basic_block entry_bb = region->entry; /* BB ending in OMP_FOR */
+  basic_block exit_bb = region->exit; /* BB ending in OMP_RETURN */
+  basic_block cont_bb = region->cont; /* BB ending in OMP_CONTINUE */
+  basic_block bottom_bb = NULL;
+
+  /* entry_bb has two successors; the branch edge is to the exit
+     block, fallthrough edge to body.  */
+  gcc_assert (EDGE_COUNT (entry_bb->succs) == 2
+	      && BRANCH_EDGE (entry_bb)->dest == exit_bb);
+
+  /* If cont_bb non-NULL, it has 2 successors.  The branch successor is
+     body_bb, or a block whose only successor is body_bb.  Its
+     fallthrough successor is the final block (same as the branch
+     successor of the entry_bb).  */
+  if (cont_bb)
+    {
+      basic_block body_bb = FALLTHRU_EDGE (entry_bb)->dest;
+      basic_block bed = BRANCH_EDGE (cont_bb)->dest;
+
+      gcc_assert (FALLTHRU_EDGE (cont_bb)->dest == exit_bb);
+      gcc_assert (bed == body_bb || single_succ_edge (bed)->dest == body_bb);
+    }
+  else
+    gcc_assert (!gimple_in_ssa_p (cfun));
+
+  /* The exit block only has entry_bb and cont_bb as predecessors.  */
+  gcc_assert (EDGE_COUNT (exit_bb->preds) == 1 + (cont_bb != NULL));
+
+  tree chunk_no;
+  tree chunk_max = NULL_TREE;
+  tree bound, offset;
+  tree step = create_tmp_var (diff_type, ".step");
+  bool up = cond_code == LT_EXPR;
+  tree dir = build_int_cst (diff_type, up ? +1 : -1);
+  bool chunking = !gimple_in_ssa_p (cfun);
+  bool negating;
+
+  /* SSA instances.  */
+  tree offset_incr = NULL_TREE;
+  tree offset_init = NULL_TREE;
+
+  gimple_stmt_iterator gsi;
+  gassign *ass;
+  gcall *call;
+  gimple *stmt;
+  tree expr;
+  location_t loc;
+  edge split, be, fte;
+
+  /* Split the end of entry_bb to create head_bb.  */
+  split = split_block (entry_bb, last_stmt (entry_bb));
+  basic_block head_bb = split->dest;
+  entry_bb = split->src;
+
+  /* Chunk setup goes at end of entry_bb, replacing the omp_for.  */
+  gsi = gsi_last_bb (entry_bb);
+  gomp_for *for_stmt = as_a <gomp_for *> (gsi_stmt (gsi));
+  loc = gimple_location (for_stmt);
+
+  if (gimple_in_ssa_p (cfun))
+    {
+      offset_init = gimple_omp_for_index (for_stmt, 0);
+      gcc_assert (integer_zerop (fd->loop.n1));
+      /* The SSA parallelizer does gang parallelism.  */
+      gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
+    }
+
+  if (fd->collapse > 1)
+    {
+      counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
+      tree total = expand_oacc_collapse_init (fd, &gsi, counts,
+					      TREE_TYPE (fd->loop.n2));
+
+      if (SSA_VAR_P (fd->loop.n2))
+	{
+	  total = force_gimple_operand_gsi (&gsi, total, false, NULL_TREE,
+					    true, GSI_SAME_STMT);
+	  ass = gimple_build_assign (fd->loop.n2, total);
+	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+	}
+    }
+
+  tree b = fd->loop.n1;
+  tree e = fd->loop.n2;
+  tree s = fd->loop.step;
+
+  b = force_gimple_operand_gsi (&gsi, b, true, NULL_TREE, true, GSI_SAME_STMT);
+  e = force_gimple_operand_gsi (&gsi, e, true, NULL_TREE, true, GSI_SAME_STMT);
+
+  /* Convert the step, avoiding possible unsigned->signed overflow.  */
+  negating = !up && TYPE_UNSIGNED (TREE_TYPE (s));
+  if (negating)
+    s = fold_build1 (NEGATE_EXPR, TREE_TYPE (s), s);
+  s = fold_convert (diff_type, s);
+  if (negating)
+    s = fold_build1 (NEGATE_EXPR, diff_type, s);
+  s = force_gimple_operand_gsi (&gsi, s, true, NULL_TREE, true, GSI_SAME_STMT);
+
+  if (!chunking)
+    chunk_size = integer_zero_node;
+  expr = fold_convert (diff_type, chunk_size);
+  chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
+					 NULL_TREE, true, GSI_SAME_STMT);
+
+  /* Determine the range, avoiding possible unsigned->signed overflow.  */
+  negating = !up && TYPE_UNSIGNED (iter_type);
+  expr = fold_build2 (MINUS_EXPR, plus_type,
+		      fold_convert (plus_type, negating ? b : e),
+		      fold_convert (plus_type, negating ? e : b));
+  expr = fold_convert (diff_type, expr);
+  if (negating)
+    expr = fold_build1 (NEGATE_EXPR, diff_type, expr);
+  tree range = force_gimple_operand_gsi (&gsi, expr, true,
+					 NULL_TREE, true, GSI_SAME_STMT);
+
+  chunk_no = build_int_cst (diff_type, 0);
+  if (chunking)
+    {
+      gcc_assert (!gimple_in_ssa_p (cfun));
+
+      expr = chunk_no;
+      chunk_max = create_tmp_var (diff_type, ".chunk_max");
+      chunk_no = create_tmp_var (diff_type, ".chunk_no");
+
+      ass = gimple_build_assign (chunk_no, expr);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+      call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+					 build_int_cst (integer_type_node,
+							IFN_GOACC_LOOP_CHUNKS),
+					 dir, range, s, chunk_size, gwv);
+      gimple_call_set_lhs (call, chunk_max);
+      gimple_set_location (call, loc);
+      gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+    }
+  else
+    chunk_size = chunk_no;
+
+  call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+				     build_int_cst (integer_type_node,
+						    IFN_GOACC_LOOP_STEP),
+				     dir, range, s, chunk_size, gwv);
+  gimple_call_set_lhs (call, step);
+  gimple_set_location (call, loc);
+  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+  /* Remove the GIMPLE_OMP_FOR.  */
+  gsi_remove (&gsi, true);
+
+  /* Fixup edges from head_bb.  */
+  be = BRANCH_EDGE (head_bb);
+  fte = FALLTHRU_EDGE (head_bb);
+  be->flags |= EDGE_FALSE_VALUE;
+  fte->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+  basic_block body_bb = fte->dest;
+
+  if (gimple_in_ssa_p (cfun))
+    {
+      gsi = gsi_last_bb (cont_bb);
+      gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
+
+      offset = gimple_omp_continue_control_use (cont_stmt);
+      offset_incr = gimple_omp_continue_control_def (cont_stmt);
+    }
+  else
+    {
+      offset = create_tmp_var (diff_type, ".offset");
+      offset_init = offset_incr = offset;
+    }
+  bound = create_tmp_var (TREE_TYPE (offset), ".bound");
+
+  /* Loop offset & bound go into head_bb.  */
+  gsi = gsi_start_bb (head_bb);
+
+  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+				     build_int_cst (integer_type_node,
+						    IFN_GOACC_LOOP_OFFSET),
+				     dir, range, s,
+				     chunk_size, gwv, chunk_no);
+  gimple_call_set_lhs (call, offset_init);
+  gimple_set_location (call, loc);
+  gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
+
+  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+				     build_int_cst (integer_type_node,
+						    IFN_GOACC_LOOP_BOUND),
+				     dir, range, s,
+				     chunk_size, gwv, offset_init);
+  gimple_call_set_lhs (call, bound);
+  gimple_set_location (call, loc);
+  gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
+
+  expr = build2 (cond_code, boolean_type_node, offset_init, bound);
+  gsi_insert_after (&gsi, gimple_build_cond_empty (expr),
+		    GSI_CONTINUE_LINKING);
+
+  /* V assignment goes into body_bb.  */
+  if (!gimple_in_ssa_p (cfun))
+    {
+      gsi = gsi_start_bb (body_bb);
+
+      expr = build2 (plus_code, iter_type, b,
+		     fold_convert (plus_type, offset));
+      expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+				       true, GSI_SAME_STMT);
+      ass = gimple_build_assign (v, expr);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+      if (fd->collapse > 1)
+	expand_oacc_collapse_vars (fd, &gsi, counts, v);
+    }
+
+  /* Loop increment goes into cont_bb.  If this is not a loop, we
+     will have spawned threads as if it was, and each one will
+     execute one iteration.  The specification is not explicit about
+     whether such constructs are ill-formed or not, and they can
+     occur, especially when noreturn routines are involved.  */
+  if (cont_bb)
+    {
+      gsi = gsi_last_bb (cont_bb);
+      gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
+      loc = gimple_location (cont_stmt);
+
+      /* Increment offset.  */
+      if (gimple_in_ssa_p (cfun))
+	expr = build2 (plus_code, iter_type, offset,
+		       fold_convert (plus_type, step));
+      else
+	expr = build2 (PLUS_EXPR, diff_type, offset, step);
+      expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+				       true, GSI_SAME_STMT);
+      ass = gimple_build_assign (offset_incr, expr);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+      expr = build2 (cond_code, boolean_type_node, offset_incr, bound);
+      gsi_insert_before (&gsi, gimple_build_cond_empty (expr), GSI_SAME_STMT);
+
+      /* Remove the GIMPLE_OMP_CONTINUE.  */
+      gsi_remove (&gsi, true);
+
+      /* Fixup edges from cont_bb.  */
+      be = BRANCH_EDGE (cont_bb);
+      fte = FALLTHRU_EDGE (cont_bb);
+      be->flags |= EDGE_TRUE_VALUE;
+      fte->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+
+      if (chunking)
+	{
+	  /* Split the beginning of exit_bb to make bottom_bb.  We
+	     need to insert a nop at the start, because splitting is
+	     after a stmt, not before.  */
+	  gsi = gsi_start_bb (exit_bb);
+	  stmt = gimple_build_nop ();
+	  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+	  split = split_block (exit_bb, stmt);
+	  bottom_bb = split->src;
+	  exit_bb = split->dest;
+	  gsi = gsi_last_bb (bottom_bb);
+
+	  /* Chunk increment and test go into bottom_bb.  */
+	  expr = build2 (PLUS_EXPR, diff_type, chunk_no,
+			 build_int_cst (diff_type, 1));
+	  ass = gimple_build_assign (chunk_no, expr);
+	  gsi_insert_after (&gsi, ass, GSI_CONTINUE_LINKING);
+
+	  /* Chunk test at end of bottom_bb.  */
+	  expr = build2 (LT_EXPR, boolean_type_node, chunk_no, chunk_max);
+	  gsi_insert_after (&gsi, gimple_build_cond_empty (expr),
+			    GSI_CONTINUE_LINKING);
+
+	  /* Fixup edges from bottom_bb.  */
+	  split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+	  make_edge (bottom_bb, head_bb, EDGE_TRUE_VALUE);
+	}
+    }
+
+  gsi = gsi_last_bb (exit_bb);
+  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
+  loc = gimple_location (gsi_stmt (gsi));
+
+  if (!gimple_in_ssa_p (cfun))
+    {
+      /* Insert the final value of V, in case it is live.  This is the
+	 value for the only thread that survives past the join.  */
+      expr = fold_build2 (MINUS_EXPR, diff_type, range, dir);
+      expr = fold_build2 (PLUS_EXPR, diff_type, expr, s);
+      expr = fold_build2 (TRUNC_DIV_EXPR, diff_type, expr, s);
+      expr = fold_build2 (MULT_EXPR, diff_type, expr, s);
+      expr = build2 (plus_code, iter_type, b, fold_convert (plus_type, expr));
+      expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+				       true, GSI_SAME_STMT);
+      ass = gimple_build_assign (v, expr);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+    }
+
+  /* Remove the OMP_RETURN.  */
+  gsi_remove (&gsi, true);
+
+  if (cont_bb)
+    {
+      /* We now have one or two nested loops.  Update the loop
+	 structures.  */
+      struct loop *parent = entry_bb->loop_father;
+      struct loop *body = body_bb->loop_father;
+
+      if (chunking)
+	{
+	  struct loop *chunk_loop = alloc_loop ();
+	  chunk_loop->header = head_bb;
+	  chunk_loop->latch = bottom_bb;
+	  add_loop (chunk_loop, parent);
+	  parent = chunk_loop;
+	}
+      else if (parent != body)
+	{
+	  gcc_assert (body->header == body_bb);
+	  gcc_assert (body->latch == cont_bb
+		      || single_pred (body->latch) == cont_bb);
+	  parent = NULL;
+	}
+
+      if (parent)
+	{
+	  struct loop *body_loop = alloc_loop ();
+	  body_loop->header = body_bb;
+	  body_loop->latch = cont_bb;
+	  add_loop (body_loop, parent);
+	}
+    }
+}
+
 /* Expand the OMP loop defined by REGION.  */

 static void
@@ -10324,6 +11084,11 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
     expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_CILKFOR)
     expand_cilk_for (region, &fd);
+  else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
+    {
+      gcc_assert (!inner_stmt);
+      expand_oacc_for (region, &fd);
+    }
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_TASKLOOP)
     {
       if (gimple_omp_for_combined_into_p (fd.for_stmt))
@@ -13521,6 +14286,7 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   gomp_for *stmt = as_a <gomp_for *> (gsi_stmt (*gsi_p));
   gbind *new_stmt;
   gimple_seq omp_for_body, body, dlist;
+  gimple_seq oacc_head = NULL, oacc_tail = NULL;
   size_t i;

   push_gimplify_context ();
@@ -13629,6 +14395,16 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   /* Once lowered, extract the bounds and clauses.  */
   extract_omp_for_data (stmt, &fd, NULL);

+  if (is_gimple_omp_oacc (ctx->stmt)
+      && !ctx_in_oacc_kernels_region (ctx))
+    lower_oacc_head_tail (gimple_location (stmt),
+			  gimple_omp_for_clauses (stmt),
+			  &oacc_head, &oacc_tail, ctx);
+
+  /* Add OpenACC partitioning markers just before the loop.  */
+  if (oacc_head)
+    gimple_seq_add_seq (&body, oacc_head);
+
   lower_omp_for_lastprivate (&fd, &body, &dlist, ctx);

   if (gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_FOR)
@@ -13662,6 +14438,11 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   /* Region exit marker goes at the end of the loop body.  */
   gimple_seq_add_stmt (&body, gimple_build_omp_return (fd.have_nowait));
   maybe_add_implicit_barrier_cancel (ctx, &body);
+
+  /* Add OpenACC joining and reduction markers just after the loop.  */
+  if (oacc_tail)
+    gimple_seq_add_seq (&body, oacc_tail);
+
   pop_gimplify_context (new_stmt);

   gimple_bind_append_vars (new_stmt, ctx->block_vars);
-- 
2.30.2
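
[Editor's note, not part of the patch]  For reference, lower_oacc_head_tail
brackets the loop body in matched head and tail sequences that
oacc_device_lower later consumes in the target compiler.  A sketch of the
result for a loop partitioned over two levels, e.g. gang vector
(illustrative pseudo-GIMPLE; .data_dep is the temporary created above,
<tag> stands for the OLF_* partitioning mask, and the trailing integer
operands are the level counts passed as TOFOLLOW):

  .data_dep = 0;
  .data_dep = UNIQUE (OACC_HEAD_MARK, .data_dep, 2, <tag>);
  .data_dep = UNIQUE (OACC_FORK, .data_dep, -1);      /* outer (gang) fork */
  .data_dep = UNIQUE (OACC_HEAD_MARK, .data_dep, 1);
  .data_dep = UNIQUE (OACC_FORK, .data_dep, -1);      /* inner (vector) fork */
  .data_dep = UNIQUE (OACC_HEAD_MARK, .data_dep);     /* end of head */
  ... loop body ...
  .data_dep = UNIQUE (OACC_TAIL_MARK, .data_dep, 2);
  .data_dep = UNIQUE (OACC_JOIN, .data_dep, -1);      /* inner (vector) join */
  .data_dep = UNIQUE (OACC_TAIL_MARK, .data_dep, 1);
  .data_dep = UNIQUE (OACC_JOIN, .data_dep, -1);      /* outer (gang) join */
  .data_dep = UNIQUE (OACC_TAIL_MARK, .data_dep);     /* end of tail */

Each IFN_UNIQUE call takes .data_dep as an operand and redefines it as its
lhs, so the markers form a dependence chain that keeps them ordered and
alive until oacc_device_lower processes them.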
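Similarly, a hypothetical compile-only testcase (not included in this
patch; clause spellings and dg-error strings merely mirror the diagnostics
added in scan_omp_for and check_oacc_kernel_gwv above):

  /* { dg-do compile } */

  void
  f (float *x)
  {
  #pragma acc parallel
    {
      /* Operands on partitioning clauses are rejected inside a
	 parallel region.  */
  #pragma acc loop gang(8) /* { dg-error "argument not permitted" } */
      for (int i = 0; i < 32; i++)
	x[i] += 1.0f;
    }

  #pragma acc kernels
    {
      /* seq cannot be combined with other loop specifiers.  */
  #pragma acc loop seq gang /* { dg-error "overrides other OpenACC loop specifiers" } */
      for (int i = 0; i < 32; i++)
	x[i] += 2.0f;
    }
  }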