+2017-02-09 Nathan Sidwell <nathan@codesourcery.com>
+ Chung-Lin Tang <cltang@codesourcery.com>
+
+ * gimplify.c (gimplify_scan_omp_clauses): No special handling for
+ OMP_CLAUSE_TILE.
+ (gimplify_adjust_omp_clauses): Don't delete TILE.
+ (gimplify_omp_for): Deal with TILE.
+ * internal-fn.c (expand_GOACC_TILE): New function.
+ * internal-fn.def (GOACC_DIM_POS): Comment may be overly conservative.
+ (GOACC_TILE): New.
+ * omp-expand.c (struct oacc_collapse): Add tile and outer fields.
+ (expand_oacc_collapse_init): Add LOC paramter. Initialize tile
+ element fields.
+ (expand_oacc_collapse_vars): Add INNER parm, adjust for tiling,
+ avoid DIV for outermost collapse var.
+ (expand_oacc_for): Insert tile element loop as needed. Adjust.
+ Remove out of date comments, fix whitespace.
+ * omp-general.c (omp_extract_for_data): Deal with tiling.
+ * omp-general.h (enum oacc_loop_flags): Add OLF_TILE flag,
+ adjust OLF_DIM_BASE value.
+ (struct omp_for_data): Add tiling field.
+ * omp-low.c (scan_sharing_clauses): Allow OMP_CLAUSE_TILE.
+ (lower_oacc_head_mark): Add OLF_TILE as appropriate. Ensure 2 levels
+ for auto loops. Remove default auto determining, moved to
+ oacc_loop_fixed_partitions.
+ * omp-offload.c (struct oacc_loop): Change 'ifns' to vector of call
+ stmts, add e_mask field.
+ (oacc_dim_call): New function, abstracted out from oacc_thread_numbers.
+ (oacc_thread_numbers): Use oacc_dim_call.
+ (oacc_xform_tile): New.
+ (new_oacc_loop_raw): Initialize e_mask, adjust for ifns vector.
+ (finish_oacc_loop): Adjust for ifns vector.
+ (oacc_loop_discover_walk): Append loop abstraction sites to list,
+ add case for GOACC_TILE fns.
+ (oacc_loop_xform_loop): Delete.
+ (oacc_loop_process): Iterate over call list directly, and add
+ handling for GOACC_TILE fns.
+ (oacc_loop_fixed_partitions): Determine default auto, deal with TILE,
+ dump partitioning.
+ (oacc_loop_auto_partitions): Add outer_assign parm. Assign all but
+ vector partitioning to outer loops. Assign 2 partitions to loops
+ when available. Add TILE handling.
+ (oacc_loop_partition): Adjust oacc_loop_auto_partitions call.
+ (execite_oacc_device_lower): Process GOACC_TILE fns, ignore unknown specs.
+ * tree-nested.c (convert_nonlocal_omp_clauses): Allow OMP_CLAUSE_TILE.
+ * tree.c (omp_clause_num_ops): Adjust TILE ops.
+ * tree.h (OMP_CLAUSE_TILE_ITERVAR, OMP_CLAUSE_TILE_COUNT): New.
+
2017-02-09 Gerald Pfeifer <gerald@pfeifer.com>
* configure.ac (ACX_BUGURL): Update.
+2016-02-09 Nathan Sidwell <nathan@codesourcery.com>
+ Chung-Lin Tang <cltang@codesourcery.com>
+
+ * c-parser.c (c_parser_omp_clause_collapse): Disallow tile.
+ (c_parser_oacc_clause_tile): Disallow collapse. Fix parsing and
+ semantic checking.
+ * c-parser.c (c_parser_omp_for_loop): Accept tiling constructs.
+
2017-02-07 Richard Biener <rguenther@suse.de>
* gimple-parser.c (c_parser_gimple_expr_list): Simplify.
location_t loc;
check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse");
+ check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile");
loc = c_parser_peek_token (parser)->location;
if (c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
c_parser_oacc_clause_tile (c_parser *parser, tree list)
{
tree c, expr = error_mark_node;
- location_t loc, expr_loc;
+ location_t loc;
tree tile = NULL_TREE;
check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile");
+ check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse");
loc = c_parser_peek_token (parser)->location;
if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
do
{
+ if (tile && !c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
+ return list;
+
if (c_parser_next_token_is (parser, CPP_MULT)
&& (c_parser_peek_2nd_token (parser)->type == CPP_COMMA
|| c_parser_peek_2nd_token (parser)->type == CPP_CLOSE_PAREN))
{
c_parser_consume_token (parser);
- expr = integer_minus_one_node;
+ expr = integer_zero_node;
}
else
{
- expr_loc = c_parser_peek_token (parser)->location;
+ location_t expr_loc = c_parser_peek_token (parser)->location;
c_expr cexpr = c_parser_expr_no_commas (parser, NULL);
cexpr = convert_lvalue_to_rvalue (expr_loc, cexpr, false, true);
expr = cexpr.value;
return list;
}
- if (!INTEGRAL_TYPE_P (TREE_TYPE (expr)))
- {
- c_parser_error (parser, "%<tile%> value must be integral");
- return list;
- }
-
expr = c_fully_fold (expr, false, NULL);
- /* Attempt to statically determine when expr isn't positive. */
- c = fold_build2_loc (expr_loc, LE_EXPR, boolean_type_node, expr,
- build_int_cst (TREE_TYPE (expr), 0));
- protected_set_expr_location (c, expr_loc);
- if (c == boolean_true_node)
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
+ || !tree_fits_shwi_p (expr)
+ || tree_to_shwi (expr) <= 0)
{
- warning_at (expr_loc, 0,"%<tile%> value must be positive");
- expr = integer_one_node;
+ error_at (expr_loc, "%<tile%> argument needs positive"
+ " integral constant");
+ expr = integer_zero_node;
}
}
tile = tree_cons (NULL_TREE, expr, tile);
- if (c_parser_next_token_is (parser, CPP_COMMA))
- c_parser_consume_token (parser);
}
while (c_parser_next_token_is_not (parser, CPP_CLOSE_PAREN));
bool fail = false, open_brace_parsed = false;
int i, collapse = 1, ordered = 0, count, nbraces = 0;
location_t for_loc;
+ bool tiling = false;
vec<tree, va_gc> *for_block = make_tree_vector ();
for (cl = clauses; cl; cl = OMP_CLAUSE_CHAIN (cl))
if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_COLLAPSE)
collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (cl));
+ else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_TILE)
+ {
+ tiling = true;
+ collapse = list_length (OMP_CLAUSE_TILE_LIST (cl));
+ }
else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_ORDERED
&& OMP_CLAUSE_ORDERED_EXPR (cl))
{
pc = &OMP_CLAUSE_CHAIN (*pc);
}
- gcc_assert (collapse >= 1 && ordered >= 0);
+ gcc_assert (tiling || (collapse >= 1 && ordered >= 0));
count = ordered ? ordered : collapse;
declv = make_tree_vec (count);
+2016-02-09 Nathan Sidwell <nathan@codesourcery.com>
+ Chung-Lin Tang <cltang@codesourcery.com>
+
+ * parser.c (cp_parser_oacc_clause_tile): Disallow collapse. Fix
+ parsing. Parse constant expression. Remove semantic checking.
+ (cp_parser_omp_clause_collapse): Disallow tile.
+ (cp_parser_omp_for_loop): Deal with tile clause. Don't emit a parse
+ error about missing for after already emitting one. Use more
+ conventional for idiom for unbounded loop.
+ * pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_TILE.
+ * semantics.c (finish_omp_clauses): Correct TILE semantic check.
+ (finish_omp_for): Deal with tile clause.
+
2017-02-07 Nathan Sidwell <nathan@acm.org>
* method.c (synthesized_method_base_walk): New. Broken out of ...
tree c, expr = error_mark_node;
tree tile = NULL_TREE;
+ /* Collapse and tile are mutually exclusive. (The spec doesn't say
+ so, but the spec authors never considered such a case and have
+ differing opinions on what it might mean, including 'not
+ allowed'.) */
check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile", clause_loc);
+ check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse",
+ clause_loc);
if (!cp_parser_require (parser, CPP_OPEN_PAREN, RT_OPEN_PAREN))
return list;
do
{
+ if (tile && !cp_parser_require (parser, CPP_COMMA, RT_COMMA))
+ return list;
+
if (cp_lexer_next_token_is (parser->lexer, CPP_MULT)
&& (cp_lexer_nth_token_is (parser->lexer, 2, CPP_COMMA)
|| cp_lexer_nth_token_is (parser->lexer, 2, CPP_CLOSE_PAREN)))
{
cp_lexer_consume_token (parser->lexer);
- expr = integer_minus_one_node;
+ expr = integer_zero_node;
}
else
- expr = cp_parser_assignment_expression (parser, NULL, false, false);
-
- if (expr == error_mark_node)
- return list;
+ expr = cp_parser_constant_expression (parser);
tile = tree_cons (NULL_TREE, expr, tile);
-
- if (cp_lexer_next_token_is (parser->lexer, CPP_COMMA))
- cp_lexer_consume_token (parser->lexer);
}
while (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN));
}
check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse", location);
+ check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile", location);
c = build_omp_clause (loc, OMP_CLAUSE_COLLAPSE);
OMP_CLAUSE_CHAIN (c) = list;
OMP_CLAUSE_COLLAPSE_EXPR (c) = num;
int i, collapse = 1, ordered = 0, count, nbraces = 0;
vec<tree, va_gc> *for_block = make_tree_vector ();
auto_vec<tree, 4> orig_inits;
+ bool tiling = false;
for (cl = clauses; cl; cl = OMP_CLAUSE_CHAIN (cl))
if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_COLLAPSE)
collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (cl));
+ else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_TILE)
+ {
+ tiling = true;
+ collapse = list_length (OMP_CLAUSE_TILE_LIST (cl));
+ }
else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_ORDERED
&& OMP_CLAUSE_ORDERED_EXPR (cl))
{
pc = &OMP_CLAUSE_CHAIN (*pc);
}
- gcc_assert (collapse >= 1 && ordered >= 0);
+ gcc_assert (tiling || (collapse >= 1 && ordered >= 0));
count = ordered ? ordered : collapse;
declv = make_tree_vec (count);
if (code != CILK_FOR
&& !cp_lexer_next_token_is_keyword (parser->lexer, RID_FOR))
{
- cp_parser_error (parser, "for statement expected");
+ if (!collapse_err)
+ cp_parser_error (parser, "for statement expected");
return NULL;
}
if (code == CILK_FOR
&& !cp_lexer_next_token_is_keyword (parser->lexer, RID_CILK_FOR))
{
- cp_parser_error (parser, "_Cilk_for statement expected");
+ if (!collapse_err)
+ cp_parser_error (parser, "_Cilk_for statement expected");
return NULL;
}
loc = cp_lexer_consume_token (parser->lexer)->location;
nested. Hopefully the final version clarifies this.
For now handle (multiple) {'s and empty statements. */
cp_parser_parse_tentatively (parser);
- do
+ for (;;)
{
if (cp_lexer_next_token_is_keyword (parser->lexer, RID_FOR))
break;
else
{
loc = cp_lexer_peek_token (parser->lexer)->location;
- error_at (loc, "not enough collapsed for loops");
+ error_at (loc, "not enough for loops to collapse");
collapse_err = true;
cp_parser_abort_tentative_parse (parser);
declv = NULL_TREE;
break;
}
}
- while (1);
if (declv)
{
= tsubst_omp_clause_decl (OMP_CLAUSE_DECL (oc), args, complain,
in_decl);
break;
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE_IF:
case OMP_CLAUSE_NUM_THREADS:
case OMP_CLAUSE_SCHEDULE:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
break;
- case OMP_CLAUSE_TILE:
- {
- tree lnc, loc;
- for (lnc = OMP_CLAUSE_TILE_LIST (nc),
- loc = OMP_CLAUSE_TILE_LIST (oc);
- loc;
- loc = TREE_CHAIN (loc), lnc = TREE_CHAIN (lnc))
- {
- TREE_VALUE (lnc) = tsubst_expr (TREE_VALUE (loc), args,
- complain, in_decl, false);
- }
- }
- break;
default:
gcc_unreachable ();
}
else if (!type_dependent_expression_p (t)
&& !INTEGRAL_TYPE_P (TREE_TYPE (t)))
{
- error ("%<tile%> value must be integral");
+ error_at (OMP_CLAUSE_LOCATION (c),
+ "%<tile%> argument needs integral type");
remove = true;
}
else
t = mark_rvalue_use (t);
if (!processing_template_decl)
{
+ /* Zero is used to indicate '*', we permit you
+ to get there via an ICE of value zero. */
t = maybe_constant_value (t);
- if (TREE_CODE (t) == INTEGER_CST
- && tree_int_cst_sgn (t) != 1
- && t != integer_minus_one_node)
+ if (!tree_fits_shwi_p (t)
+ || tree_to_shwi (t) < 0)
{
- warning_at (OMP_CLAUSE_LOCATION (c), 0,
- "%<tile%> value must be positive");
- t = integer_one_node;
+ error_at (OMP_CLAUSE_LOCATION (c),
+ "%<tile%> argument needs positive "
+ "integral constant");
+ remove = true;
}
}
t = fold_build_cleanup_point_expr (TREE_TYPE (t), t);
gcc_assert (TREE_VEC_LENGTH (declv) == TREE_VEC_LENGTH (incrv));
if (TREE_VEC_LENGTH (declv) > 1)
{
- tree c = omp_find_clause (clauses, OMP_CLAUSE_COLLAPSE);
+ tree c;
+
+ c = omp_find_clause (clauses, OMP_CLAUSE_TILE);
if (c)
- collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
- if (collapse != TREE_VEC_LENGTH (declv))
- ordered = TREE_VEC_LENGTH (declv);
+ collapse = list_length (OMP_CLAUSE_TILE_LIST (c));
+ else
+ {
+ c = omp_find_clause (clauses, OMP_CLAUSE_COLLAPSE);
+ if (c)
+ collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
+ if (collapse != TREE_VEC_LENGTH (declv))
+ ordered = TREE_VEC_LENGTH (declv);
+ }
}
for (i = 0; i < TREE_VEC_LENGTH (declv); i++)
{
+2017-02-09 Cesar Philippidis <cesar@codesourcery.com>
+ Joseph Myers <joseph@codesourcery.com>
+
+ * openmp.c (resolve_omp_clauses): Error on directives
+ containing both tile and collapse clauses.
+ (resolve_oacc_loop_blocks): Represent '*' tile arguments as zero.
+ * trans-openmp.c (gfc_trans_omp_do): Lower tiled loops like
+ collapsed loops.
+
2017-02-07 Steven G. Kargl <kargl@gcc.gnu.org>
* trans-types.c (gfc_get_int_kind_from_width_isofortranen): Choose
if (omp_clauses->wait_list)
for (el = omp_clauses->wait_list; el; el = el->next)
resolve_scalar_int_expr (el->expr, "WAIT");
+ if (omp_clauses->collapse && omp_clauses->tile_list)
+ gfc_error ("Incompatible use of TILE and COLLAPSE at %L", &code->loc);
if (omp_clauses->depend_source && code->op != EXEC_OMP_ORDERED)
gfc_error ("SOURCE dependence type only allowed "
"on ORDERED directive at %L", &code->loc);
if (el->expr == NULL)
{
/* NULL expressions are used to represent '*' arguments.
- Convert those to a -1 expressions. */
+ Convert those to a 0 expressions. */
el->expr = gfc_get_constant_expr (BT_INTEGER,
gfc_default_integer_kind,
&code->loc);
- mpz_set_si (el->expr->value.integer, -1);
+ mpz_set_si (el->expr->value.integer, 0);
}
else
{
dovar_init *di;
unsigned ix;
vec<tree, va_heap, vl_embed> *saved_doacross_steps = doacross_steps;
+ gfc_expr_list *tile = do_clauses ? do_clauses->tile_list : clauses->tile_list;
+
+ /* Both collapsed and tiled loops are lowered the same way. In
+ OpenACC, those clauses are not compatible, so prioritize the tile
+ clause, if present. */
+ if (tile)
+ {
+ collapse = 0;
+ for (gfc_expr_list *el = tile; el; el = el->next)
+ collapse++;
+ }
doacross_steps = NULL;
if (clauses->orderedc)
remove = true;
break;
- case OMP_CLAUSE_TILE:
- for (tree list = OMP_CLAUSE_TILE_LIST (c); !remove && list;
- list = TREE_CHAIN (list))
- {
- if (gimplify_expr (&TREE_VALUE (list), pre_p, NULL,
- is_gimple_val, fb_rvalue) == GS_ERROR)
- remove = true;
- }
- break;
-
case OMP_CLAUSE_NOWAIT:
case OMP_CLAUSE_ORDERED:
case OMP_CLAUSE_UNTIED:
case OMP_CLAUSE_COLLAPSE:
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_VECTOR:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
- break;
-
case OMP_CLAUSE_TILE:
- /* We're not yet making use of the information provided by OpenACC
- tile clauses. Discard these here, to simplify later middle end
- processing. */
- remove = true;
break;
default:
(OMP_FOR_INIT (for_stmt))
* 2);
}
- int collapse = 1;
+ int collapse = 1, tile = 0;
c = omp_find_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_COLLAPSE);
if (c)
collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
+ c = omp_find_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_TILE);
+ if (c)
+ tile = list_length (OMP_CLAUSE_TILE_LIST (c));
for (i = 0; i < TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)); i++)
{
t = TREE_VEC_ELT (OMP_FOR_INIT (for_stmt), i);
OMP_CLAUSE_LINEAR_STEP (c2) = OMP_CLAUSE_LINEAR_STEP (c);
}
- if ((var != decl || collapse > 1) && orig_for_stmt == for_stmt)
+ if ((var != decl || collapse > 1 || tile) && orig_for_stmt == for_stmt)
{
for (c = OMP_FOR_CLAUSES (for_stmt); c ; c = OMP_CLAUSE_CHAIN (c))
if (((OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE
gcc_unreachable ();
}
+/* This is expanded by oacc_device_lower pass. */
+
+static void
+expand_GOACC_TILE (internal_fn, gcall *)
+{
+ gcc_unreachable ();
+}
+
/* Set errno to EDOM. */
static void
dimension. DIM_POS is pure (and not const) so that it isn't
thought to clobber memory and can be gcse'd within a single
parallel region, but not across FORK/JOIN boundaries. They take a
- single INTEGER_CST argument. */
+ single INTEGER_CST argument. This might be overly conservative. */
DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
/* OpenACC reduction abstraction. See internal-fn.h for usage. */
DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL)
+/* Openacc tile abstraction. Describes the spans of the element loop.
+ GOACC_TILE (num-loops, loop-no, tile-arg, tile-mask, element-mask). */
+DEF_INTERNAL_FN (GOACC_TILE, ECF_NOTHROW | ECF_LEAF, NULL)
+
/* Set errno to EDOM, if GCC knows how to do that directly for the
current target. */
DEF_INTERNAL_FN (SET_EDOM, ECF_LEAF | ECF_NOTHROW, NULL)
{
tree base; /* Base value. */
tree iters; /* Number of steps. */
- tree step; /* step size. */
+ tree step; /* Step size. */
+ tree tile; /* Tile increment (if tiled). */
+ tree outer; /* Tile iterator var. */
};
/* Helper for expand_oacc_for. Determine collapsed loop information.
static tree
expand_oacc_collapse_init (const struct omp_for_data *fd,
gimple_stmt_iterator *gsi,
- oacc_collapse *counts, tree bound_type)
+ oacc_collapse *counts, tree bound_type,
+ location_t loc)
{
+ tree tiling = fd->tiling;
tree total = build_int_cst (bound_type, 1);
int ix;
gcc_assert (integer_onep (fd->loop.step));
gcc_assert (integer_zerop (fd->loop.n1));
- for (ix = 0; ix != fd->collapse; ix++)
+ /* When tiling, the first operand of the tile clause applies to the
+ innermost loop, and we work outwards from there. Seems
+ backwards, but whatever. */
+ for (ix = fd->collapse; ix--;)
{
const omp_for_data_loop *loop = &fd->loops[ix];
if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
diff_type = signed_type_for (diff_type);
+ if (tiling)
+ {
+ tree num = build_int_cst (integer_type_node, fd->collapse);
+ tree loop_no = build_int_cst (integer_type_node, ix);
+ tree tile = TREE_VALUE (tiling);
+ gcall *call
+ = gimple_build_call_internal (IFN_GOACC_TILE, 5, num, loop_no, tile,
+ /* gwv-outer=*/integer_zero_node,
+ /* gwv-inner=*/integer_zero_node);
+
+ counts[ix].outer = create_tmp_var (iter_type, ".outer");
+ counts[ix].tile = create_tmp_var (diff_type, ".tile");
+ gimple_call_set_lhs (call, counts[ix].tile);
+ gimple_set_location (call, loc);
+ gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+ tiling = TREE_CHAIN (tiling);
+ }
+ else
+ {
+ counts[ix].tile = NULL;
+ counts[ix].outer = loop->v;
+ }
+
tree b = loop->n1;
tree e = loop->n2;
tree s = loop->step;
return total;
}
-/* Emit initializers for collapsed loop members. IVAR is the outer
+/* Emit initializers for collapsed loop members. INNER is true if
+ this is for the element loop of a TILE. IVAR is the outer
loop iteration variable, from which collapsed loop iteration values
are calculated. COUNTS array has been initialized by
expand_oacc_collapse_inits. */
static void
-expand_oacc_collapse_vars (const struct omp_for_data *fd,
+expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
gimple_stmt_iterator *gsi,
const oacc_collapse *counts, tree ivar)
{
{
const omp_for_data_loop *loop = &fd->loops[ix];
const oacc_collapse *collapse = &counts[ix];
- tree iter_type = TREE_TYPE (loop->v);
+ tree v = inner ? loop->v : collapse->outer;
+ tree iter_type = TREE_TYPE (v);
tree diff_type = TREE_TYPE (collapse->step);
tree plus_type = iter_type;
enum tree_code plus_code = PLUS_EXPR;
plus_type = sizetype;
}
- expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar,
- fold_convert (ivar_type, collapse->iters));
+ expr = ivar;
+ if (ix)
+ {
+ tree mod = fold_convert (ivar_type, collapse->iters);
+ ivar = fold_build2 (TRUNC_DIV_EXPR, ivar_type, expr, mod);
+ expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, expr, mod);
+ ivar = force_gimple_operand_gsi (gsi, ivar, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ }
+
expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
collapse->step);
- expr = fold_build2 (plus_code, iter_type, collapse->base,
+ expr = fold_build2 (plus_code, iter_type,
+ inner ? collapse->outer : collapse->base,
fold_convert (plus_type, expr));
expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE,
true, GSI_SAME_STMT);
- gassign *ass = gimple_build_assign (loop->v, expr);
+ gassign *ass = gimple_build_assign (v, expr);
gsi_insert_before (gsi, ass, GSI_SAME_STMT);
-
- if (ix)
- {
- expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar,
- fold_convert (ivar_type, collapse->iters));
- ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
- true, GSI_SAME_STMT);
- }
}
}
where LTGT is < or >. We may have a specified chunking size, CHUNKING
(constant 0 for no chunking) and we will have a GWV partitioning
mask, specifying dimensions over which the loop is to be
- partitioned (see note below). We generate code that looks like:
+ partitioned (see note below). We generate code that looks like
+ (this ignores tiling):
<entry_bb> [incoming FALL->body, BRANCH->exit]
typedef signedintify (typeof (V)) T; // underlying signed integral type
<exit_bb> [incoming]
V = B + ((range -/+ 1) / S +/- 1) * S [*]
- [*] Needed if V live at end of loop
-
- Note: CHUNKING & GWV mask are specified explicitly here. This is a
- transition, and will be specified by a more general mechanism shortly.
- */
+ [*] Needed if V live at end of loop. */
static void
expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
tree step = create_tmp_var (diff_type, ".step");
bool up = cond_code == LT_EXPR;
tree dir = build_int_cst (diff_type, up ? +1 : -1);
- bool chunking = !gimple_in_ssa_p (cfun);;
+ bool chunking = !gimple_in_ssa_p (cfun);
bool negating;
+ /* Tiling vars. */
+ tree tile_size = NULL_TREE;
+ tree element_s = NULL_TREE;
+ tree e_bound = NULL_TREE, e_offset = NULL_TREE, e_step = NULL_TREE;
+ basic_block elem_body_bb = NULL;
+ basic_block elem_cont_bb = NULL;
+
/* SSA instances. */
tree offset_incr = NULL_TREE;
tree offset_init = NULL_TREE;
gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
}
- if (fd->collapse > 1)
+ if (fd->collapse > 1 || fd->tiling)
{
+ gcc_assert (!gimple_in_ssa_p (cfun) && up);
counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
tree total = expand_oacc_collapse_init (fd, &gsi, counts,
- TREE_TYPE (fd->loop.n2));
+ TREE_TYPE (fd->loop.n2), loc);
if (SSA_VAR_P (fd->loop.n2))
{
ass = gimple_build_assign (fd->loop.n2, total);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
}
-
}
tree b = fd->loop.n1;
expr = fold_convert (diff_type, chunk_size);
chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
NULL_TREE, true, GSI_SAME_STMT);
+
+ if (fd->tiling)
+ {
+ /* Determine the tile size and element step,
+ modify the outer loop step size. */
+ tile_size = create_tmp_var (diff_type, ".tile_size");
+ expr = build_int_cst (diff_type, 1);
+ for (int ix = 0; ix < fd->collapse; ix++)
+ expr = fold_build2 (MULT_EXPR, diff_type, counts[ix].tile, expr);
+ expr = force_gimple_operand_gsi (&gsi, expr, true,
+ NULL_TREE, true, GSI_SAME_STMT);
+ ass = gimple_build_assign (tile_size, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ element_s = create_tmp_var (diff_type, ".element_s");
+ ass = gimple_build_assign (element_s, s);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ expr = fold_build2 (MULT_EXPR, diff_type, s, tile_size);
+ s = force_gimple_operand_gsi (&gsi, expr, true,
+ NULL_TREE, true, GSI_SAME_STMT);
+ }
+
/* Determine the range, avoiding possible unsigned->signed overflow. */
negating = !up && TYPE_UNSIGNED (iter_type);
expr = fold_build2 (MINUS_EXPR, plus_type,
true, GSI_SAME_STMT);
ass = gimple_build_assign (v, expr);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
- if (fd->collapse > 1)
- expand_oacc_collapse_vars (fd, &gsi, counts, v);
+
+ if (fd->collapse > 1 || fd->tiling)
+ expand_oacc_collapse_vars (fd, false, &gsi, counts, v);
+
+ if (fd->tiling)
+ {
+ /* Determine the range of the element loop -- usually simply
+ the tile_size, but could be smaller if the final
+ iteration of the outer loop is a partial tile. */
+ tree e_range = create_tmp_var (diff_type, ".e_range");
+
+ expr = build2 (MIN_EXPR, diff_type,
+ build2 (MINUS_EXPR, diff_type, bound, offset),
+ build2 (MULT_EXPR, diff_type, tile_size,
+ element_s));
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (e_range, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ /* Determine bound, offset & step of inner loop. */
+ e_bound = create_tmp_var (diff_type, ".e_bound");
+ e_offset = create_tmp_var (diff_type, ".e_offset");
+ e_step = create_tmp_var (diff_type, ".e_step");
+
+ /* Mark these as element loops. */
+ tree t, e_gwv = integer_minus_one_node;
+ tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */
+
+ t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+ element_s, chunk, e_gwv, chunk);
+ gimple_call_set_lhs (call, e_offset);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+ element_s, chunk, e_gwv, e_offset);
+ gimple_call_set_lhs (call, e_bound);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
+ element_s, chunk, e_gwv);
+ gimple_call_set_lhs (call, e_step);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ /* Add test and split block. */
+ expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+ stmt = gimple_build_cond_empty (expr);
+ gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+ split = split_block (body_bb, stmt);
+ elem_body_bb = split->dest;
+ if (cont_bb == body_bb)
+ cont_bb = elem_body_bb;
+ body_bb = split->src;
+
+ split->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+ /* Initialize the user's loop vars. */
+ gsi = gsi_start_bb (elem_body_bb);
+ expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset);
+ }
}
/* Loop increment goes into cont_bb. If this is not a loop, we
gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
loc = gimple_location (cont_stmt);
+ if (fd->tiling)
+ {
+ /* Insert element loop increment and test. */
+ expr = build2 (PLUS_EXPR, diff_type, e_offset, e_step);
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (e_offset, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+ expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+
+ stmt = gimple_build_cond_empty (expr);
+ gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+ split = split_block (cont_bb, stmt);
+ elem_cont_bb = split->src;
+ cont_bb = split->dest;
+
+ split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+ make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE);
+
+ make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE);
+
+ gsi = gsi_for_stmt (cont_stmt);
+ }
+
/* Increment offset. */
if (gimple_in_ssa_p (cfun))
- expr= build2 (plus_code, iter_type, offset,
- fold_convert (plus_type, step));
+ expr = build2 (plus_code, iter_type, offset,
+ fold_convert (plus_type, step));
else
expr = build2 (PLUS_EXPR, diff_type, offset, step);
expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
if (cont_bb)
{
- /* We now have one or two nested loops. Update the loop
+ /* We now have one, two or three nested loops. Update the loop
structures. */
struct loop *parent = entry_bb->loop_father;
struct loop *body = body_bb->loop_father;
body_loop->header = body_bb;
body_loop->latch = cont_bb;
add_loop (body_loop, parent);
+
+ if (fd->tiling)
+ {
+ /* Insert tiling's element loop. */
+ struct loop *inner_loop = alloc_loop ();
+ inner_loop->header = elem_body_bb;
+ inner_loop->latch = elem_cont_bb;
+ add_loop (inner_loop, body_loop);
+ }
}
}
}
fd->for_stmt = for_stmt;
fd->pre = NULL;
- if (gimple_omp_for_collapse (for_stmt) > 1)
- fd->loops = loops;
- else
- fd->loops = &fd->loop;
-
fd->have_nowait = distribute || simd;
fd->have_ordered = false;
+ fd->tiling = NULL_TREE;
fd->collapse = 1;
fd->ordered = 0;
fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
collapse_count = &OMP_CLAUSE_COLLAPSE_COUNT (t);
}
break;
+ case OMP_CLAUSE_TILE:
+ fd->tiling = OMP_CLAUSE_TILE_LIST (t);
+ fd->collapse = list_length (fd->tiling);
+ gcc_assert (fd->collapse);
+ collapse_iter = &OMP_CLAUSE_TILE_ITERVAR (t);
+ collapse_count = &OMP_CLAUSE_TILE_COUNT (t);
+ break;
default:
break;
}
+
+ if (fd->collapse > 1 || fd->tiling)
+ fd->loops = loops;
+ else
+ fd->loops = &fd->loop;
+
if (fd->ordered && fd->collapse == 1 && loops != NULL)
{
fd->loops = loops;
fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
gcc_assert (fd->chunk_size == NULL);
}
- gcc_assert (fd->collapse == 1 || collapse_iter != NULL);
+ gcc_assert ((fd->collapse == 1 && !fd->tiling) || collapse_iter != NULL);
if (taskloop)
fd->sched_kind = OMP_CLAUSE_SCHEDULE_RUNTIME;
if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
int cnt = fd->ordered ? fd->ordered : fd->collapse;
for (i = 0; i < cnt; i++)
{
- if (i == 0 && fd->collapse == 1 && (fd->ordered == 0 || loops == NULL))
+ if (i == 0
+ && fd->collapse == 1
+ && !fd->tiling
+ && (fd->ordered == 0 || loops == NULL))
loop = &fd->loop;
else if (loops != NULL)
loop = loops + i;
|| (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC
&& !fd->have_ordered))
{
- if (fd->collapse == 1)
+ if (fd->collapse == 1 && !fd->tiling)
iter_type = TREE_TYPE (loop->v);
else if (i == 0
|| TYPE_PRECISION (iter_type)
*collapse_count = create_tmp_var (iter_type, ".count");
}
- if (fd->collapse > 1 || (fd->ordered && loops))
+ if (fd->collapse > 1 || fd->tiling || (fd->ordered && loops))
{
fd->loop.v = *collapse_iter;
fd->loop.n1 = build_int_cst (TREE_TYPE (fd->loop.v), 0);
OLF_AUTO = 1u << 1, /* Compiler chooses axes. */
OLF_INDEPENDENT = 1u << 2, /* Iterations are known independent. */
OLF_GANG_STATIC = 1u << 3, /* Gang partitioning is static (has op). */
-
+ OLF_TILE = 1u << 4, /* Tiled loop. */
+
/* Explicitly specified loop axes. */
- OLF_DIM_BASE = 4,
+ OLF_DIM_BASE = 5,
OLF_DIM_GANG = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
OLF_DIM_WORKER = 1u << (OLF_DIM_BASE + GOMP_DIM_WORKER),
OLF_DIM_VECTOR = 1u << (OLF_DIM_BASE + GOMP_DIM_VECTOR),
tree chunk_size;
gomp_for *for_stmt;
tree pre, iter_type;
- int collapse;
+ tree tiling; /* Tiling values (if non null). */
+ int collapse; /* Collapsed loops, 1 for a non-collapsed loop. */
int ordered;
bool have_nowait, have_ordered, simd_schedule;
unsigned char sched_modifiers;
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE__SIMT_:
break;
install_var_local (decl, ctx);
break;
- case OMP_CLAUSE_TILE:
case OMP_CLAUSE__CACHE_:
default:
gcc_unreachable ();
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE__GRIDDIM_:
case OMP_CLAUSE__SIMT_:
break;
- case OMP_CLAUSE_TILE:
case OMP_CLAUSE__CACHE_:
default:
gcc_unreachable ();
tag |= OLF_INDEPENDENT;
break;
+ case OMP_CLAUSE_TILE:
+ tag |= OLF_TILE;
+ break;
+
default:
continue;
}
if (!tgt || is_oacc_parallel (tgt))
tag |= OLF_INDEPENDENT;
- /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO. */
- if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE)
- | OLF_SEQ)))
- tag |= OLF_AUTO;
+ if (tag & OLF_TILE)
+ /* Tiling could use all 3 levels. */
+ levels = 3;
+ else
+ {
+ /* A loop lacking SEQ, GANG, WORKER and/or VECTOR could be AUTO.
+ Ensure at least one level, or 2 for possible auto
+ partitioning */
+ bool maybe_auto = !(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
+ << OLF_DIM_BASE) | OLF_SEQ));
- /* Ensure at least one level. */
- if (!levels)
- levels++;
+ if (levels < 1u + maybe_auto)
+ levels = 1u + maybe_auto;
+ }
args.quick_push (build_int_cst (integer_type_node, levels));
args.quick_push (build_int_cst (integer_type_node, tag));
tree routine; /* Pseudo-loop enclosing a routine. */
unsigned mask; /* Partitioning mask. */
+ unsigned e_mask; /* Partitioning of element loops (when tiling). */
unsigned inner; /* Partitioning of inner loops. */
unsigned flags; /* Partitioning flags. */
- unsigned ifns; /* Contained loop abstraction functions. */
+ vec<gcall *> ifns; /* Contained loop abstraction functions. */
tree chunk_size; /* Chunk size. */
gcall *head_end; /* Final marker of head sequence. */
};
}
}
+/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
+ axis DIM. Return a tmp var holding the result. */
+
+static tree
+oacc_dim_call (bool pos, int dim, gimple_seq *seq)
+{
+ tree arg = build_int_cst (unsigned_type_node, dim);
+ tree size = create_tmp_var (integer_type_node);
+ enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
+ gimple *call = gimple_build_call_internal (fn, 1, arg);
+
+ gimple_call_set_lhs (call, size);
+ gimple_seq_add_stmt (seq, call);
+
+ return size;
+}
+
/* Find the number of threads (POS = false), or thread number (POS =
true) for an OpenACC region partitioned as MASK. Setup code
required for the calculation is added to SEQ. */
for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
if (GOMP_DIM_MASK (ix) & mask)
{
- tree arg = build_int_cst (unsigned_type_node, ix);
-
if (res)
{
/* We had an outer index, so scale that by the size of
this dimension. */
- tree n = create_tmp_var (integer_type_node);
- gimple *call
- = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg);
-
- gimple_call_set_lhs (call, n);
- gimple_seq_add_stmt (seq, call);
+ tree n = oacc_dim_call (false, ix, seq);
res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
}
if (pos)
{
/* Determine index in this dimension. */
- tree id = create_tmp_var (integer_type_node);
- gimple *call = gimple_build_call_internal
- (IFN_GOACC_DIM_POS, 1, arg);
-
- gimple_call_set_lhs (call, id);
- gimple_seq_add_stmt (seq, call);
+ tree id = oacc_dim_call (true, ix, seq);
if (res)
res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
else
gsi_replace_with_seq (&gsi, seq, true);
}
+/* Transform a GOACC_TILE call. Determines the element loop span for
+ the specified loop of the nest. This is 1 if we're not tiling.
+
+ GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
+
+static void
+oacc_xform_tile (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
+ /* Inner loops have higher loop_nos. */
+ unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
+ tree tile_size = gimple_call_arg (call, 2);
+ unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
+ tree lhs = gimple_call_lhs (call);
+ tree type = TREE_TYPE (lhs);
+ gimple_seq seq = NULL;
+ tree span = build_int_cst (type, 1);
+
+ gcc_assert (!(e_mask
+ & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+ | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
+ push_gimplify_context (!seen_error ());
+
+#ifndef ACCEL_COMPILER
+ /* Partitioning disabled on host compilers. */
+ e_mask = 0;
+#endif
+ if (!e_mask)
+ /* Not paritioning. */
+ span = integer_one_node;
+ else if (!integer_zerop (tile_size))
+ /* User explicitly specified size. */
+ span = tile_size;
+ else
+ {
+ /* Pick a size based on the paritioning of the element loop and
+ the number of loop nests. */
+ tree first_size = NULL_TREE;
+ tree second_size = NULL_TREE;
+
+ if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
+ if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
+
+ if (!first_size)
+ {
+ first_size = second_size;
+ second_size = NULL_TREE;
+ }
+
+ if (loop_no + 1 == collapse)
+ {
+ span = first_size;
+ if (!loop_no && second_size)
+ span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
+ span, second_size);
+ }
+ else if (loop_no + 2 == collapse)
+ span = second_size;
+ else
+ span = NULL_TREE;
+
+ if (!span)
+ /* There's no obvious element size for this loop. Options
+ are 1, first_size or some non-unity constant (32 is my
+ favourite). We should gather some statistics. */
+ span = first_size;
+ }
+
+ span = fold_convert (type, span);
+ gimplify_assign (lhs, span, &seq);
+
+ pop_gimplify_context (NULL);
+
+ gsi_replace_with_seq (&gsi, seq, true);
+}
+
/* Default partitioned and minimum partitioned dimensions. */
static int oacc_default_dims[GOMP_DIM_MAX];
memset (loop->tails, 0, sizeof (loop->tails));
loop->routine = NULL_TREE;
- loop->mask = loop->flags = loop->inner = 0;
- loop->ifns = 0;
+ loop->mask = loop->e_mask = loop->flags = loop->inner = 0;
loop->chunk_size = 0;
loop->head_end = NULL;
finish_oacc_loop (oacc_loop *loop)
{
/* If the loop has been collapsed, don't partition it. */
- if (!loop->ifns)
+ if (loop->ifns.is_empty ())
loop->mask = loop->flags = 0;
return loop->parent;
}
break;
case IFN_GOACC_LOOP:
- /* Count the goacc loop abstraction fns, to determine if the
- loop was collapsed already. */
- loop->ifns++;
+ case IFN_GOACC_TILE:
+ /* Record the abstraction function, so we can manipulate it
+ later. */
+ loop->ifns.safe_push (call);
break;
case IFN_UNIQUE:
}
}
-/* Transform the IFN_GOACC_LOOP internal functions by providing the
- determined partitioning mask and chunking argument. END_MARKER
- points at the end IFN_HEAD_TAIL call intgroducing the loop. IFNS
- is the number of IFN_GOACC_LOOP calls for the loop. MASK_ARG is
- the replacement partitioning mask and CHUNK_ARG is the replacement
- chunking arg. */
-
-static void
-oacc_loop_xform_loop (gcall *end_marker, unsigned ifns,
- tree mask_arg, tree chunk_arg)
-{
- gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);
-
- gcc_checking_assert (ifns);
- for (;;)
- {
- for (; !gsi_end_p (gsi); gsi_next (&gsi))
- {
- gimple *stmt = gsi_stmt (gsi);
-
- if (!is_gimple_call (stmt))
- continue;
-
- gcall *call = as_a <gcall *> (stmt);
-
- if (!gimple_call_internal_p (call))
- continue;
-
- if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
- continue;
-
- *gimple_call_arg_ptr (call, 5) = mask_arg;
- *gimple_call_arg_ptr (call, 4) = chunk_arg;
- ifns--;
- if (!ifns)
- return;
- }
-
- /* The LOOP_BOUND ifn could be in the single successor
- block. */
- basic_block bb = single_succ (gsi_bb (gsi));
- gsi = gsi_start_bb (bb);
- }
-}
-
/* Process the discovered OpenACC loops, setting the correct
partitioning level etc. */
if (loop->mask && !loop->routine)
{
int ix;
- unsigned mask = loop->mask;
- unsigned dim = GOMP_DIM_GANG;
- tree mask_arg = build_int_cst (unsigned_type_node, mask);
+ tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
+ tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
tree chunk_arg = loop->chunk_size;
+ gcall *call;
+
+ for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
+ switch (gimple_call_internal_fn (call))
+ {
+ case IFN_GOACC_LOOP:
+ {
+ bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
+ gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
+ if (!is_e)
+ gimple_call_set_arg (call, 4, chunk_arg);
+ }
+ break;
- oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg);
+ case IFN_GOACC_TILE:
+ gimple_call_set_arg (call, 3, mask_arg);
+ gimple_call_set_arg (call, 4, e_mask_arg);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ unsigned dim = GOMP_DIM_GANG;
+ unsigned mask = loop->mask | loop->e_mask;
for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
{
while (!(GOMP_DIM_MASK (dim) & mask))
{
bool auto_par = (loop->flags & OLF_AUTO) != 0;
bool seq_par = (loop->flags & OLF_SEQ) != 0;
-
+ bool tiling = (loop->flags & OLF_TILE) != 0;
+
this_mask = ((loop->flags >> OLF_DIM_BASE)
& (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
+ /* Apply auto partitioning if this is a non-partitioned regular
+ loop, or (no more than) single axis tiled loop. */
+ bool maybe_auto
+ = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
+
if ((this_mask != 0) + auto_par + seq_par > 1)
{
if (noisy)
? "%<seq%> overrides other OpenACC loop specifiers"
: "%<auto%> conflicts with other OpenACC loop "
"specifiers");
- auto_par = false;
+ maybe_auto = false;
loop->flags &= ~OLF_AUTO;
if (seq_par)
{
this_mask = 0;
}
}
- if (auto_par && (loop->flags & OLF_INDEPENDENT))
- mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+
+ if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
+ {
+ loop->flags |= OLF_AUTO;
+ mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+ }
}
if (this_mask & outer_mask)
{
const oacc_loop *outer;
for (outer = loop->parent; outer; outer = outer->parent)
- if (outer->mask & this_mask)
+ if ((outer->mask | outer->e_mask) & this_mask)
break;
if (noisy)
}
}
- loop->mask = this_mask;
mask_all |= this_mask;
+ if (loop->flags & OLF_TILE)
+ {
+ /* When tiling, vector goes to the element loop, and failing
+ that we put worker there. The std doesn't contemplate
+ specifying all three. We choose to put worker and vector on
+ the element loops in that case. */
+ unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+ if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
+ this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
+ loop->e_mask = this_e_mask;
+ this_mask ^= this_e_mask;
+ }
+
+ loop->mask = this_mask;
+
+ if (dump_file)
+ fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
+ LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+ loop->mask, loop->e_mask);
+
if (loop->child)
{
- loop->inner = oacc_loop_fixed_partitions (loop->child,
- outer_mask | this_mask);
+ unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
+ loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
mask_all |= loop->inner;
}
/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
OUTER_MASK is the partitioning this loop is contained within.
+ OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
Return the cumulative partitioning used by this loop, siblings and
children. */
static unsigned
-oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
+ bool outer_assign)
{
bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
bool noisy = true;
+ bool tiling = loop->flags & OLF_TILE;
#ifdef ACCEL_COMPILER
/* When device_type is supported, we want the device compiler to be
noisy = false;
#endif
- if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+ if (assign && (!outer_assign | loop->inner))
{
- /* Allocate the outermost loop at the outermost available
- level. */
- unsigned this_mask = outer_mask + 1;
+ /* Allocate outermost and non-innermost loops at the outermost
+ non-innermost available level. */
+ unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
+
+ /* Find the first outermost available partition. */
+ while (this_mask <= outer_mask)
+ this_mask <<= 1;
+
+ /* Grab two axes if tiling, and we've not assigned anything */
+ if (tiling && !(loop->mask | loop->e_mask))
+ this_mask |= this_mask << 1;
+
+ /* Prohibit the innermost partitioning at the moment. */
+ this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
+
+ /* Don't use any dimension explicitly claimed by an inner loop. */
+ this_mask &= ~loop->inner;
+
+ if (tiling && !loop->e_mask)
+ {
+ /* If we got two axes, allocate the inner one to the element
+ loop. */
+ loop->e_mask = this_mask & (this_mask << 1);
+ this_mask ^= loop->e_mask;
+ }
- if (!(this_mask & loop->inner))
- loop->mask = this_mask;
+ loop->mask |= this_mask;
}
if (loop->child)
{
- unsigned child_mask = outer_mask | loop->mask;
-
- if (loop->mask || assign)
- child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
-
- loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
+ unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
+ loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
+ outer_assign | assign);
}
- if (assign && !loop->mask)
+ if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
{
- /* Allocate the loop at the innermost available level. */
+ /* Allocate the loop at the innermost available level. Note
+ that we do this even if we already assigned this loop the
+ outermost available level above. That way we'll partition
+ this along 2 axes, if they are available. */
unsigned this_mask = 0;
/* Determine the outermost partitioning used within this loop. */
/* And avoid picking one use by an outer loop. */
this_mask &= ~outer_mask;
- if (!this_mask && noisy)
- warning_at (loop->loc, 0,
- "insufficient partitioning available to parallelize loop");
+ /* If tiling and we failed completely above, grab the next one
+ too. Making sure it doesn't hit an outer loop. */
+ if (tiling)
+ {
+ this_mask &= ~(loop->e_mask | loop->mask);
+ unsigned tile_mask = ((this_mask >> 1)
+ & ~(outer_mask | loop->e_mask | loop->mask));
+
+ if (tile_mask || loop->mask)
+ {
+ loop->e_mask |= this_mask;
+ this_mask = tile_mask;
+ }
+ if (!loop->e_mask && noisy)
+ warning_at (loop->loc, 0,
+ "insufficient partitioning available"
+ " to parallelize element loop");
+ }
- loop->mask = this_mask;
+ loop->mask |= this_mask;
+ if (!loop->mask && noisy)
+ warning_at (loop->loc, 0,
+ "insufficient partitioning available"
+ " to parallelize%s loop", tiling ? " tile" : "");
}
if (assign && dump_file)
- fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+ fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
- loop->mask);
+ loop->mask, loop->e_mask);
unsigned inner_mask = 0;
if (loop->sibling)
- inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+ inner_mask |= oacc_loop_auto_partitions (loop->sibling,
+ outer_mask, outer_assign);
- inner_mask |= loop->inner | loop->mask;
+ inner_mask |= loop->inner | loop->mask | loop->e_mask;
return inner_mask;
}
if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
{
mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
- mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
+ mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
}
return mask_all;
}
{
default: break;
+ case IFN_GOACC_TILE:
+ oacc_xform_tile (call);
+ rescan = true;
+ break;
+
case IFN_GOACC_LOOP:
oacc_xform_loop (call);
rescan = true;
switch (kind)
{
default:
- gcc_unreachable ();
+ break;
case IFN_UNIQUE_OACC_FORK:
case IFN_UNIQUE_OACC_JOIN:
+2017-02-09 Nathan Sidwell <nathan@codesourcery.com>
+ Cesar Philippidis <cesar@codesourcery.com>
+ Joseph Myers <joseph@codesourcery.com>
+ Chung-Lin Tang <cltang@codesourcery.com>
+
+ * c-c++-common/goacc/combined-directives.c: Remove xfail.
+ * c-c++-common/goacc/loop-auto-1.c: Adjust and add additional case.
+ * c-c++-common/goacc/loop-auto-2.c: New.
+ * c-c++-common/goacc/tile.c: Include stdbool, fix expected errors.
+ * c-c++-common/goacc/tile-2.c: New.
+ * g++.dg/goacc/template.C: Test tile subst. Adjust erroneous uses.
+ * g++.dg/goacc/tile-1.C: New, check tile subst.
+ * gcc.dg/goacc/loop-processing-1.c: Adjust dg-final pattern.
+ * gfortran.dg/goacc/combined-directives.f90: Remove xfail.
+ * gfortran.dg/goacc/tile-1.f90: New test.
+ * gfortran.dg/goacc/tile-2.f90: New test.
+ * gfortran.dg/goacc/tile-lowering.f95: New test.
+
2017-02-09 Richard Biener <rguenther@suse.de>
PR tree-optimization/69823
// { dg-final { scan-tree-dump-times "acc loop vector" 2 "gimple" } }
// { dg-final { scan-tree-dump-times "acc loop seq" 2 "gimple" } }
// { dg-final { scan-tree-dump-times "acc loop auto" 2 "gimple" } }
-// XFAILed: OpenACC tile clauses are discarded during gimplification.
-// { dg-final { scan-tree-dump-times "acc loop tile.2, 3" 2 "gimple" { xfail *-*-* } } }
+// { dg-final { scan-tree-dump-times "acc loop tile.2, 3" 2 "gimple" } }
// { dg-final { scan-tree-dump-times "acc loop independent private.i" 2 "gimple" } }
// { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } }
for (int kx = 0; kx < 10; kx++) {}
}
}
+
+#pragma acc loop auto
+ for (int ix = 0; ix < 10; ix++)
+ {
+#pragma acc loop auto
+ for (int jx = 0; jx < 10; jx++)
+ {
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+ for (int kx = 0; kx < 10; kx++)
+ {
+#pragma acc loop auto
+ for (int lx = 0; lx < 10; lx++) {}
+ }
+ }
+ }
}
}
#pragma acc loop auto
for (int ix = 0; ix < 10; ix++) {}
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
for (int ix = 0; ix < 10; ix++)
{
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
for (int jx = 0; jx < 10; jx++) {}
}
}
--- /dev/null
+
+// Tile parititioning
+
+void Ok ()
+{
+#pragma acc parallel num_gangs (10) num_workers(32) vector_length(32)
+ {
+
+#pragma acc loop tile(*) gang vector
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+
+#pragma acc loop tile(*)
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+
+#pragma acc loop tile(*) gang
+ for (int ix = 0; ix < 10; ix++)
+ {
+ #pragma acc loop vector
+ for (int jx = 0; jx < 10; jx++)
+ ;
+ }
+
+#pragma acc loop tile(*)
+ for (int ix = 0; ix < 10; ix++)
+ {
+ #pragma acc loop vector
+ for (int jx = 0; jx < 10; jx++)
+ ;
+ }
+
+#pragma acc loop gang
+ for (int jx = 0; jx < 10; jx++)
+ {
+#pragma acc loop tile(*) vector
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+
+#pragma acc loop tile(*)
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+ }
+
+#pragma acc loop tile(*) worker
+ for (int ix = 0; ix < 10; ix++)
+ {
+ #pragma acc loop vector
+ for (int jx = 0; jx < 10; jx++)
+ ;
+ }
+ }
+}
+
+void Bad ()
+{
+#pragma acc parallel num_gangs (10) num_workers(32) vector_length(32)
+ {
+
+#pragma acc loop tile(*) gang vector /* { dg-message "containing loop" } */
+ for (int ix = 0; ix < 10; ix++)
+ {
+#pragma acc loop vector /* { dg-error "uses same" } */
+ for (int jx = 0; jx < 10; jx++)
+ ;
+ }
+
+#pragma acc loop tile(*) gang vector
+ for (int ix = 0; ix < 10; ix++)
+ {
+ #pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+ for (int jx = 0; jx < 10; jx++)
+ ;
+ }
+
+#pragma acc loop tile(*) auto /* { dg-warning "insufficient partitioning" } */
+ for (int ix = 0; ix < 10; ix++)
+ {
+ #pragma acc loop worker
+ for (int jx = 0; jx < 10; jx++)
+ ;
+ }
+
+#pragma acc loop worker /* { dg-message "containing loop" } */
+ for (int jx = 0; jx < 10; jx++)
+ {
+#pragma acc loop tile(*) gang vector /* { dg-error "incorrectly nested" } */
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+
+#pragma acc loop tile(*) vector /* { dg-warning "insufficient partitioning" } */
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+
+#pragma acc loop tile(*) /* { dg-warning "insufficient partitioning" } */
+ for (int ix = 0; ix < 10; ix++)
+ {
+ }
+ }
+ }
+}
--- /dev/null
+int main ()
+{
+#pragma acc parallel
+ {
+#pragma acc loop tile (*,*)
+ for (int ix = 0; ix < 30; ix++)
+ ; /* { dg-error "not enough" } */
+
+#pragma acc loop tile (*,*)
+ for (int ix = 0; ix < 30; ix++)
+ for (int jx = 0; jx < ix; jx++) /* { dg-error "condition expression" } */
+ ;
+
+#pragma acc loop tile (*)
+ for (int ix = 0; ix < 30; ix++)
+ for (int jx = 0; jx < ix; jx++) /* OK */
+ ;
+
+ }
+ return 0;
+}
+#include <stdbool.h>
+
int
main ()
{
- int i, *a, b;
+ int i, j, k, *a, b;
#pragma acc parallel loop tile (10)
for (i = 0; i < 100; i++)
#pragma acc parallel loop tile (10, *)
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ ;
-#pragma acc parallel loop tile (10, *, i)
+#pragma acc parallel loop tile (10, *, i) // { dg-error "" }
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ for (k = 0; k < 100; k++)
+ ;
#pragma acc parallel loop tile // { dg-error "expected '\\\('" }
for (i = 0; i < 100; i++)
for (i = 0; i < 100; i++)
;
-#pragma acc parallel loop tile (1.1) // { dg-error "'tile' value must be integral" }
+#pragma acc parallel loop tile (1.1) // { dg-error "'tile' argument needs" }
for (i = 0; i < 100; i++)
;
-#pragma acc parallel loop tile (-3) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile (-3) // { dg-error "'tile' argument needs" }
for (i = 0; i < 100; i++)
;
-#pragma acc parallel loop tile (10,-3) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile (10,-3) // { dg-error "'tile' argument needs" }
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ ;
-#pragma acc parallel loop tile (-100,10,5) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile (-100,10,5) // { dg-error "'tile' argument needs" }
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ for (k = 0; k < 100; k++)
+ ;
-#pragma acc parallel loop tile (1,2.0,true) // { dg-error "" }
+#pragma acc parallel loop tile (1,true)
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ ;
-#pragma acc parallel loop tile (*a, 1)
+#pragma acc parallel loop tile (*a, 1) // { dg-error "" }
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ ;
-#pragma acc parallel loop tile (1, *a, b)
+#pragma acc parallel loop tile (1, b) // { dg-error "" }
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ ;
-#pragma acc parallel loop tile (b, 1, *a)
+#pragma acc parallel loop tile (b, 1) // { dg-error "" }
for (i = 0; i < 100; i++)
- ;
+ for (j = 0; j < 100; j++)
+ ;
return 0;
}
void par (void)
{
- int i, j;
+ int i, j, k;
#pragma acc parallel
{
for (j = 1; j < 10; j++)
{ }
}
-#pragma acc loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc loop tile(-2) // { dg-error "'tile' argument needs" }
for (i = 1; i < 10; i++)
{ }
-#pragma acc loop tile(i)
+#pragma acc loop tile(i) // { dg-error "" }
for (i = 1; i < 10; i++)
{ }
#pragma acc loop tile(2, 2, 1)
for (i = 1; i < 3; i++)
{
for (j = 4; j < 6; j++)
- { }
+ for (k = 0; k< 100; k++);
}
#pragma acc loop tile(2, 2)
for (i = 1; i < 5; i+=2)
{
- for (j = i + 1; j < 7; j+=i)
+ for (j = i + 1; j < 7; j+=i) // { dg-error "initializer expression" }
{ }
}
#pragma acc loop vector tile(*)
for (j = 1; j < 10; j++)
{ }
}
-#pragma acc parallel loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile(-2) // { dg-error "'tile' argument needs" }
for (i = 1; i < 10; i++)
{ }
-#pragma acc parallel loop tile(i)
+#pragma acc parallel loop tile(i) // { dg-error "" }
for (i = 1; i < 10; i++)
{ }
#pragma acc parallel loop tile(2, 2, 1)
for (i = 1; i < 3; i++)
- {
- for (j = 4; j < 6; j++)
- { }
- }
+ for (j = 4; j < 6; j++)
+ for (int k = 1 ; k < 2; k++)
+ ;
#pragma acc parallel loop tile(2, 2)
for (i = 1; i < 5; i+=2)
- {
- for (j = i + 1; j < 7; j++)
- { }
- }
+ for (j = i + 1; j < 7; j++) // { dg-error "initializer expression" }
+ { }
#pragma acc parallel loop vector tile(*)
for (i = 0; i < 10; i++)
{ }
#pragma acc loop tile(*, 1)
for (i = 0; i < 10; i++)
{
- for (j = 0; j < 10; i++)
+ for (j = 0; j < 10; i++) /* { dg-error "increment expression" } */
{ }
}
-#pragma acc loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc loop tile(-2) // { dg-error "'tile' argument needs" }
for (i = 0; i < 10; i++)
{ }
-#pragma acc loop tile(i)
+#pragma acc loop tile(i) // { dg-error "" }
for (i = 0; i < 10; i++)
{ }
#pragma acc loop tile(2, 2, 1)
for (i = 2; i < 4; i++)
- for (i = 4; i < 6; i++)
+ for (j = 4; j < 6; j++)
+ for (int k = 4; k < 6; k++)
{ }
#pragma acc loop tile(2, 2)
for (i = 1; i < 5; i+=2)
- for (j = i+1; j < 7; i++)
+ for (j = i+1; j < 7; j++) /* { dg-error "initializer expression" } */
{ }
#pragma acc loop vector tile(*)
for (i = 0; i < 10; i++)
for (j = 1; j < 10; j++)
{ }
}
-#pragma acc kernels loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc kernels loop tile(-2) // { dg-error "'tile' argument needs" }
for (i = 1; i < 10; i++)
{ }
-#pragma acc kernels loop tile(i)
+#pragma acc kernels loop tile(i) // { dg-error "" }
for (i = 1; i < 10; i++)
{ }
#pragma acc kernels loop tile(2, 2, 1)
for (i = 1; i < 3; i++)
- {
- for (j = 4; j < 6; j++)
- { }
- }
+ for (j = 4; j < 6; j++)
+ for (int k = 1; k < 7; k++)
+ ;
#pragma acc kernels loop tile(2, 2)
for (i = 1; i < 5; i++)
{
- for (j = i + 1; j < 7; j += i)
+ for (j = i + 1; j < 7; j += i) /* { dg-error "initializer expression" } */
{ }
}
#pragma acc kernels loop vector tile(*)
return val * 2;
}
-template<typename T> T
+template<typename T, int I> T
oacc_parallel_copy (T a)
{
T b = 0;
for (int j = 0; j < 5; j++)
b = a;
-#pragma acc loop auto tile (a, 3)
+#pragma acc loop auto tile (I, 3)
for (int i = 0; i < a; i++)
for (int j = 0; j < 5; j++)
b = a;
int
main ()
{
- int b = oacc_parallel_copy<int> (5);
+ int b = oacc_parallel_copy<int, 4> (5);
int c = oacc_kernels_copy<int> (5);
return b + c;
--- /dev/null
+/* of tile erroneously clobbered the template, resulting
+ in missing errors and other fun. */
+
+template <int I>
+void Foo ()
+{
+#pragma acc parallel loop tile(I) // { dg-error "" }
+ for (int ix = 0; ix < 10; ix++)
+ ;
+}
+
+int main ()
+{
+ Foo<1> (); // OK
+ Foo<-1> (); // error
+}
}
}
-/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 14\(1\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 20\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 20\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(4\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 6\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);} "oaccdevlow" } } */
+/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 24\(1\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccdevlow" } } */
! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. vector" 2 "gimple" } }
! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. seq" 2 "gimple" } }
! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. auto" 2 "gimple" } }
-! XFAILed: OpenACC tile clauses are discarded during gimplification.
-! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" { xfail *-*-* } } }
+! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" } }
! { dg-final { scan-tree-dump-times "acc loop private.i. independent" 2 "gimple" } }
! { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } }
! { dg-final { scan-tree-dump-times "omp target oacc_\[^ \]+ map.force_tofrom:y" 2 "gimple" } }
--- /dev/null
+subroutine parloop
+ integer, parameter :: n = 100
+ integer i, j, k, a
+
+ !$acc parallel loop tile(10)
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(*)
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(10, *)
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc parallel loop tile(10, *, i) ! { dg-error "" }
+ do i = 1, n
+ do j = 1, n
+ do k = 1, n
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile ! { dg-error "Unclassifiable" }
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile() ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(,1) ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(,,) ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(1.1) ! { dg-error "requires a scalar INTEGER" }
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(-3) ! { dg-warning "must be positive" }
+ do i = 1, n
+ end do
+
+ !$acc parallel loop tile(10, -3) ! { dg-warning "must be positive" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc parallel loop tile(-100, 10, 5) ! { dg-warning "must be positive" }
+ do i = 1, n
+ do j = 1, n
+ do k = 1, n
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile(10, .true.) ! { dg-error "requires a scalar" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc parallel loop tile(1, a) ! { dg-error "constant expression" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc parallel loop tile(a, 1) ! { dg-error "constant expression" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc parallel loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+end subroutine parloop
+
+subroutine par
+ integer, parameter :: n = 100
+ integer i, j, k
+
+ !$acc parallel
+ !$acc loop tile ! { dg-error "Unclassifiable" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile() ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile(1)
+ do i = 1, n
+ end do
+
+ !$acc loop tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop tile(2)
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc loop tile(-2) ! { dg-warning "must be positive" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile(i) ! { dg-error "constant expression" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile(2, 2, 1)
+ do i = 1, n
+ do j = 1, n
+ do k = 1, n
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile(2, 2)
+ do i = 1, n
+ do j = i+1, n, j ! { dg-error "rectangular iteration space" }
+ end do
+ end do
+
+ !$acc loop vector tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop worker tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop gang tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop vector gang tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop vector worker tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop gang worker tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+ !$acc end parallel
+end subroutine par
+
+subroutine kern
+ integer, parameter :: n = 100
+ integer i, j, k
+
+ !$acc kernels
+ !$acc loop tile ! { dg-error "Unclassifiable" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile() ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile(1)
+ do i = 1, n
+ end do
+
+ !$acc loop tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop tile(2)
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc loop tile(-2) ! { dg-warning "must be positive" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile(i) ! { dg-error "constant expression" }
+ do i = 1, n
+ end do
+
+ !$acc loop tile(2, 2, 1)
+ do i = 1, n
+ do j = 1, n
+ do k = 1, n
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile(2, 2)
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc loop vector tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop worker tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop gang tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop vector gang tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop vector worker tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop gang worker tile(*)
+ do i = 1, n
+ end do
+
+ !$acc loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+ !$acc end kernels
+end subroutine kern
+
+subroutine kernsloop
+ integer, parameter :: n = 100
+ integer i, j, k, a
+
+ !$acc kernels loop tile(10)
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(*)
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(10, *)
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc kernels loop tile(10, *, i) ! { dg-error "" }
+ do i = 1, n
+ do j = 1, n
+ do k = 1, n
+ end do
+ end do
+ end do
+
+ !$acc kernels loop tile ! { dg-error "Unclassifiable" }
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile() ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(,1) ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(,,) ! { dg-error "Syntax error" }
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(1.1) ! { dg-error "requires a scalar INTEGER" }
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(-3) ! { dg-warning "must be positive" }
+ do i = 1, n
+ end do
+
+ !$acc kernels loop tile(10, -3) ! { dg-warning "must be positive" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc kernels loop tile(-100, 10, 5) ! { dg-warning "must be positive" }
+ do i = 1, n
+ do j = 1, n
+ do k = 1, n
+ end do
+ end do
+ end do
+
+ !$acc kernels loop tile(10, .true.) ! { dg-error "requires a scalar" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc kernels loop tile(1, a) ! { dg-error "constant expression" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc kernels loop tile(a, 1) ! { dg-error "constant expression" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+
+ !$acc kernels loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+ do i = 1, n
+ do j = 1, n
+ end do
+ end do
+end subroutine kernsloop
--- /dev/null
+subroutine par
+ integer ix, jx
+
+ !$acc parallel
+ !$acc loop tile (*,*) ! { dg-error "not enough DO loops for tiled" }
+ do ix = 1, 30
+ end do
+
+ !$acc loop tile (*,*)
+ do ix = 1, 30
+ do jx = 1, ix ! { dg-error "tiled loops don.t form rectangular" }
+ end do
+ end do
+
+ !$acc loop tile (*)
+ do ix = 1, 30
+ do jx = 1, ix
+ end do
+ end do
+ !$acc end parallel
+end subroutine par
--- /dev/null
+! { dg-do compile }
+! { dg-additional-options "-fdump-tree-original" }
+
+subroutine par
+ integer i, j, k
+
+ !$acc parallel
+ !$acc loop tile (1)
+ do i = 1, 10
+ end do
+
+ !$acc loop tile (*)
+ do i = 1, 10
+ end do
+
+ !$acc loop tile (1,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (*,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (1,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (*,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (1,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc loop tile (*,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc loop tile (1,*,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc loop tile (1,2,*)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+ !$acc end parallel
+end subroutine par
+
+subroutine kerns
+ integer i, j, k
+
+ !$acc kernels
+ !$acc loop tile (1)
+ do i = 1, 10
+ end do
+
+ !$acc loop tile (*)
+ do i = 1, 10
+ end do
+
+ !$acc loop tile (1,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (*,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (1,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (*,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc loop tile (1,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc loop tile (*,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc loop tile (1,*,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc loop tile (1,2,*)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+ !$acc end kernels
+end subroutine kerns
+
+subroutine parloop
+ integer i, j, k
+
+ !$acc parallel loop tile (1)
+ do i = 1, 10
+ end do
+
+ !$acc parallel loop tile (*)
+ do i = 1, 10
+ end do
+
+ !$acc parallel loop tile (1,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc parallel loop tile (*,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc parallel loop tile (1,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc parallel loop tile (*,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc parallel loop tile (1,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile (*,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile (1,*,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc parallel loop tile (1,2,*)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+end subroutine parloop
+
+subroutine kernloop
+ integer i, j, k
+
+ !$acc kernels loop tile (1)
+ do i = 1, 10
+ end do
+
+ !$acc kernels loop tile (*)
+ do i = 1, 10
+ end do
+
+ !$acc kernels loop tile (1,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc kernels loop tile (*,2)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc kernels loop tile (1,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc kernels loop tile (*,*)
+ do i = 1, 10
+ do j = 1, 10
+ end do
+ end do
+
+ !$acc kernels loop tile (1,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc kernels loop tile (*,2,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc kernels loop tile (1,*,3)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+
+ !$acc kernels loop tile (1,2,*)
+ do i = 1, 10
+ do j = 1, 10
+ do k = 1, 10
+ end do
+ end do
+ end do
+end subroutine kernloop
+
+
+! { dg-final { scan-tree-dump-times "tile\\(1\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 2\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0, 2\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0, 0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 2, 3\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0, 2, 3\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 0, 3\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 2, 0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "for \\(" 88 "original" } }
+! { dg-final { scan-tree-dump-times "while \\(" 0 "original" } }
case OMP_CLAUSE_DEFAULT:
case OMP_CLAUSE_COPYIN:
case OMP_CLAUSE_COLLAPSE:
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE_UNTIED:
case OMP_CLAUSE_MERGEABLE:
case OMP_CLAUSE_PROC_BIND:
case OMP_CLAUSE_AUTO:
break;
- /* OpenACC tile clauses are discarded during gimplification. */
- case OMP_CLAUSE_TILE:
/* The following clause belongs to the OpenACC cache directive, which
is discarded during gimplification. */
case OMP_CLAUSE__CACHE_:
case OMP_CLAUSE_DEFAULT:
case OMP_CLAUSE_COPYIN:
case OMP_CLAUSE_COLLAPSE:
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE_UNTIED:
case OMP_CLAUSE_MERGEABLE:
case OMP_CLAUSE_PROC_BIND:
case OMP_CLAUSE_AUTO:
break;
- /* OpenACC tile clauses are discarded during gimplification. */
- case OMP_CLAUSE_TILE:
/* The following clause belongs to the OpenACC cache directive, which
is discarded during gimplification. */
case OMP_CLAUSE__CACHE_:
1, /* OMP_CLAUSE_NUM_GANGS */
1, /* OMP_CLAUSE_NUM_WORKERS */
1, /* OMP_CLAUSE_VECTOR_LENGTH */
- 1, /* OMP_CLAUSE_TILE */
+ 3, /* OMP_CLAUSE_TILE */
2, /* OMP_CLAUSE__GRIDDIM_ */
};
#define OMP_CLAUSE_TILE_LIST(NODE) \
OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 0)
+#define OMP_CLAUSE_TILE_ITERVAR(NODE) \
+ OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 1)
+#define OMP_CLAUSE_TILE_COUNT(NODE) \
+ OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 2)
#define OMP_CLAUSE__GRIDDIM__DIMENSION(NODE) \
(OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE__GRIDDIM_)\
+2017-02-09 Nathan Sidwell <nathan@codesourcery.com>
+ Chung-Lin Tang <cltang@codesourcery.com>
+
+ * testsuite/libgomp.oacc-c-c++-common/tile-1.c: New.
+ * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust and
+ add additional case.
+ * testsuite/libgomp.oacc-c-c++-common/vprop.c: XFAIL under
+ "openacc_nvidia_accel_selected".
+ * libgomp.oacc-fortran/nested-function-1.f90 (test2):
+ Add num_workers(8) clause.
+
2017-02-08 John David Anglin <danglin@gcc.gnu.org>
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Skip on
ary[ix] = place ();
}
- return check (ary, size, 0, 0, 1);
+ return check (ary, size, 0, 1, 1);
}
int vector_2 (int *ary, int size)
ary[ix + jx * 64] = place ();
}
+ return check (ary, size, 1, 1, 1);
+}
+
+int gang_4 (int *ary, int size)
+{
+ clear (ary, size);
+
+#pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop auto
+ for (int jx = 0; jx < size; jx++)
+ ary[jx] = place ();
+ }
+
return check (ary, size, 1, 0, 1);
}
-#define N (32*32*32)
+#define N (32*32*32*2)
int main ()
{
int ondev = 0;
return 1;
if (gang_3 (ary, N))
return 1;
+ if (gang_4 (ary, N))
+ return 1;
return 0;
}
--- /dev/null
+/* This code uses nvptx inline assembly guarded with acc_on_device, which is
+ not optimized away at -O0, and then confuses the target assembler.
+ { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
+
+/* { dg-additional-options "-fopenacc-dim=32" } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+static int check (const int *ary, int size, int gp, int wp, int vp)
+{
+ int exit = 0;
+ int ix;
+ int gangs[32], workers[32], vectors[32];
+
+ for (ix = 0; ix < 32; ix++)
+ gangs[ix] = workers[ix] = vectors[ix] = 0;
+
+ for (ix = 0; ix < size; ix++)
+ {
+ vectors[ary[ix] & 0xff]++;
+ workers[(ary[ix] >> 8) & 0xff]++;
+ gangs[(ary[ix] >> 16) & 0xff]++;
+ }
+
+ for (ix = 0; ix < 32; ix++)
+ {
+ if (gp)
+ {
+ int expect = gangs[0];
+ if (gangs[ix] != expect)
+ {
+ exit = 1;
+ printf ("gang %d not used %d times\n", ix, expect);
+ }
+ }
+ else if (ix && gangs[ix])
+ {
+ exit = 1;
+ printf ("gang %d unexpectedly used\n", ix);
+ }
+
+ if (wp)
+ {
+ int expect = workers[0];
+ if (workers[ix] != expect)
+ {
+ exit = 1;
+ printf ("worker %d not used %d times\n", ix, expect);
+ }
+ }
+ else if (ix && workers[ix])
+ {
+ exit = 1;
+ printf ("worker %d unexpectedly used\n", ix);
+ }
+
+ if (vp)
+ {
+ int expect = vectors[0];
+ if (vectors[ix] != expect)
+ {
+ exit = 1;
+ printf ("vector %d not used %d times\n", ix, expect);
+ }
+ }
+ else if (ix && vectors[ix])
+ {
+ exit = 1;
+ printf ("vector %d unexpectedly used\n", ix);
+ }
+
+ }
+ return exit;
+}
+
+#pragma acc routine seq
+static int __attribute__((noinline)) place ()
+{
+ int r = 0;
+
+ if (acc_on_device (acc_device_nvidia))
+ {
+ int g = 0, w = 0, v = 0;
+
+ __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+ __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+ __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+ r = (g << 16) | (w << 8) | v;
+ }
+ return r;
+}
+
+static void clear (int *ary, int size)
+{
+ int ix;
+
+ for (ix = 0; ix < size; ix++)
+ ary[ix] = -1;
+}
+
+int gang_vector_1 (int *ary, int size)
+{
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(128) gang vector
+ for (int jx = 0; jx < size; jx++)
+ ary[jx] = place ();
+ }
+
+ return check (ary, size, 1, 0, 1);
+}
+
+int gang_vector_2a (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(64, 64) gang vector
+ for (int jx = 0; jx < size / 256; jx++)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx * 256 + ix] = place ();
+ }
+
+ return check (ary, size, 1, 0, 1);
+}
+
+int gang_vector_2b (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(64, 64) gang vector
+ for (int jx = 0; jx < size; jx += 256)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx + ix] = place ();
+ }
+
+ return check (ary, size, 1, 0, 1);
+}
+
+int worker_vector_2a (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(64, 64) worker vector
+ for (int jx = 0; jx < size / 256; jx++)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx * 256 + ix] = place ();
+ }
+
+ return check (ary, size, 0, 1, 1);
+}
+
+int worker_vector_2b (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(64, 64) worker vector
+ for (int jx = 0; jx < size; jx += 256)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx + ix] = place ();
+ }
+
+ return check (ary, size, 0, 1, 1);
+}
+
+int gang_worker_vector_2a (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(32, 32)
+ for (int jx = 0; jx < size / 256; jx++)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx * 256 + ix] = place ();
+ }
+
+ return check (ary, size, 1, 1, 1);
+}
+
+int gang_worker_vector_2b (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(32, 32)
+ for (int jx = 0; jx < size; jx += 256)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx + ix] = place ();
+ }
+
+ return check (ary, size, 1, 1, 1);
+}
+
+int gang_worker_vector_star_2a (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(*, *)
+ for (int jx = 0; jx < size / 256; jx++)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx * 256 + ix] = place ();
+ }
+
+ return check (ary, size, 1, 1, 1);
+}
+
+int gang_worker_vector_star_2b (int *ary, int size)
+{
+ if (size % 256)
+ return 1;
+
+ clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+ {
+#pragma acc loop tile(*, *)
+ for (int jx = 0; jx < size; jx +=256)
+ for (int ix = 0; ix < 256; ix++)
+ ary[jx + ix] = place ();
+ }
+
+ return check (ary, size, 1, 1, 1);
+}
+
+#define N (32*32*32*8)
+int main ()
+{
+ int ondev = 0;
+
+#pragma acc parallel copy(ondev)
+ {
+ ondev = acc_on_device (acc_device_not_host);
+ }
+ if (!ondev)
+ return 0;
+
+ int ary[N];
+ if (gang_vector_1 (ary, N))
+ return 1;
+ if (gang_vector_2a (ary, N))
+ return 1;
+ if (worker_vector_2a (ary, N))
+ return 1;
+ if (gang_worker_vector_2a (ary, N))
+ return 1;
+ if (gang_worker_vector_star_2a (ary, N))
+ return 1;
+ if (gang_vector_2b (ary, N))
+ return 1;
+ if (worker_vector_2b (ary, N))
+ return 1;
+ if (gang_worker_vector_2b (ary, N))
+ return 1;
+ if (gang_worker_vector_star_2b (ary, N))
+ return 1;
+ return 0;
+}
+/* { dg-do run } */
+/* { dg-xfail-run-if "PR78266" { openacc_nvidia_accel_selected } { "*" } { "" } } */
+
#include <assert.h>
#define test(type) \
subroutine test2
integer :: a(3,3,3), k, kk, kkk, l, ll, lll
a = 0
- !$acc parallel
+ !$acc parallel num_workers(8)
! Use "gang(static:1)" here and below to effectively turn gang-redundant
! execution mode into something like gang-single.
!$acc loop gang(static:1) collapse(1)