From 5acef69f9d3d9f3c537b5e5157519edf02f86c4d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 9 Jul 2020 12:07:17 +0200 Subject: [PATCH] openmp: Optimize triangular loop logical iterator to actual iterators computation using search for quadratic equation root(s) This patch implements the optimized logical to actual iterators computation for triangular loops. I have a rough implementation using integers, but this one uses floating point. There is a small problem that -fopenmp programs aren't linked with -lm, so it does it only if the hw has sqrt optab (and uses ifn rather than __builtin_sqrt because it obviously doesn't need errno handling etc.). Do you think it is ok this way, or should I use the integral computation using inlined isqrt (we have inequation of the form start >= x * t10 + t11 * (((x - 1) * x) / 2) where t10 and t11 are signed long long values and start unsigned long long, and the division by 2 actually is a problem for accuracy in some cases, so if we do it in integral, we need to do actually long long t12 = 2 * t10 - t11; unsigned long long t13 = t12 * t12 + start * 8 * t11; unsigned long long isqrt_ = isqrtull (t13); long long x = (((long long) isqrt_ - t12) / t11) >> 1; with careful overflow checking on all the computations before isqrtull (and on overflows use the fallback implementation). 2020-07-09 Jakub Jelinek * omp-general.h (struct omp_for_data): Add min_inner_iterations and factor members. * omp-general.c (omp_extract_for_data): Initialize them and remember them in OMP_CLAUSE_COLLAPSE_COUNT if needed and restore from there. * omp-expand.c (expand_omp_for_init_counts): Fix up computation of counts[fd->last_nonrect] if fd->loop.n2 is INTEGER_CST. (expand_omp_for_init_vars): For fd->first_nonrect + 1 == fd->last_nonrect loops with for now INTEGER_CST fd->loop.n2 find quadratic equation roots instead of using fallback method when possible. * testsuite/libgomp.c/loop-19.c: New test. * testsuite/libgomp.c/loop-20.c: New test. --- gcc/omp-expand.c | 211 +++++++++++++++++++++++++- gcc/omp-general.c | 23 ++- gcc/omp-general.h | 7 + libgomp/testsuite/libgomp.c/loop-19.c | 86 +++++++++++ libgomp/testsuite/libgomp.c/loop-20.c | 84 ++++++++++ 5 files changed, 404 insertions(+), 7 deletions(-) create mode 100644 libgomp/testsuite/libgomp.c/loop-19.c create mode 100644 libgomp/testsuite/libgomp.c/loop-20.c diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index b1349d8f088..c3b8820e213 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -2137,7 +2137,7 @@ expand_omp_for_init_counts (struct omp_for_data *fd, gimple_stmt_iterator *gsi, int non_rect_referenced = 0, non_rect = 0; for (i = 0; i < fd->collapse; i++) { - if ((i < fd->first_nonrect || fd->last_nonrect) + if ((i < fd->first_nonrect || i > fd->last_nonrect) && !integer_zerop (counts[i])) t = fold_build2 (TRUNC_DIV_EXPR, type, t, counts[i]); if (fd->loops[i].non_rect_referenced) @@ -2249,17 +2249,208 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi, t = tem; if (i == fd->last_nonrect) { - /* Fallback implementation. Evaluate the loops in between - (inclusive) fd->first_nonrect and fd->last_nonrect at - runtime unsing temporaries instead of the original iteration - variables, in the body just bump the counter and compare - with the desired value. */ t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false, GSI_CONTINUE_LINKING); tree stopval = t; tree idx = create_tmp_reg (type, ".count"); expand_omp_build_assign (gsi, idx, build_zero_cst (type), true); + basic_block bb_triang = NULL; + if (fd->first_nonrect + 1 == fd->last_nonrect + /* For now. */ + && TREE_CODE (fd->loop.n2) == INTEGER_CST + && (optab_handler (sqrt_optab, TYPE_MODE (double_type_node)) + != CODE_FOR_nothing)) + { + tree itype = TREE_TYPE (fd->loops[i].v); + tree min_inner_iterations = fd->min_inner_iterations; + tree factor = fd->factor; + gcond *cond_stmt + = gimple_build_cond (NE_EXPR, factor, + build_zero_cst (TREE_TYPE (factor)), + NULL_TREE, NULL_TREE); + gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING); + edge e = split_block (gsi_bb (*gsi), cond_stmt); + basic_block bb0 = e->src; + e->flags = EDGE_TRUE_VALUE; + e->probability = profile_probability::likely (); + *gsi = gsi_after_labels (e->dest); + tree slltype = long_long_integer_type_node; + tree ulltype = long_long_unsigned_type_node; + tree stopvalull = fold_convert (ulltype, stopval); + stopvalull + = force_gimple_operand_gsi (gsi, stopvalull, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + min_inner_iterations + = fold_convert (slltype, min_inner_iterations); + min_inner_iterations + = force_gimple_operand_gsi (gsi, min_inner_iterations, true, + NULL_TREE, false, + GSI_CONTINUE_LINKING); + factor = fold_convert (slltype, factor); + factor + = force_gimple_operand_gsi (gsi, factor, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + tree min_inner_iterationsd + = fold_build1 (FLOAT_EXPR, double_type_node, + min_inner_iterations); + min_inner_iterationsd + = force_gimple_operand_gsi (gsi, min_inner_iterationsd, true, + NULL_TREE, false, + GSI_CONTINUE_LINKING); + tree factord = fold_build1 (FLOAT_EXPR, double_type_node, + factor); + factord = force_gimple_operand_gsi (gsi, factord, true, + NULL_TREE, false, + GSI_CONTINUE_LINKING); + tree stopvald = fold_build1 (FLOAT_EXPR, double_type_node, + stopvalull); + stopvald = force_gimple_operand_gsi (gsi, stopvald, true, + NULL_TREE, false, + GSI_CONTINUE_LINKING); + /* Temporarily disable flag_rounding_math, values will be + decimal numbers divided by 2 and worst case imprecisions + due to too large values ought to be caught later by the + checks for fallback. */ + int save_flag_rounding_math = flag_rounding_math; + flag_rounding_math = 0; + t = fold_build2 (RDIV_EXPR, double_type_node, factord, + build_real (double_type_node, dconst2)); + tree t3 = fold_build2 (MINUS_EXPR, double_type_node, + min_inner_iterationsd, t); + t3 = force_gimple_operand_gsi (gsi, t3, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + t = fold_build2 (MULT_EXPR, double_type_node, factord, + build_real (double_type_node, dconst2)); + t = fold_build2 (MULT_EXPR, double_type_node, t, stopvald); + t = fold_build2 (PLUS_EXPR, double_type_node, t, + fold_build2 (MULT_EXPR, double_type_node, + t3, t3)); + flag_rounding_math = save_flag_rounding_math; + t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + cond_stmt + = gimple_build_cond (LT_EXPR, t, + build_zero_cst (double_type_node), + NULL_TREE, NULL_TREE); + gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING); + e = split_block (gsi_bb (*gsi), cond_stmt); + basic_block bb1 = e->src; + e->flags = EDGE_FALSE_VALUE; + e->probability = profile_probability::very_likely (); + *gsi = gsi_after_labels (e->dest); + gcall *call = gimple_build_call_internal (IFN_SQRT, 1, t); + tree sqrtr = create_tmp_var (double_type_node); + gimple_call_set_lhs (call, sqrtr); + gsi_insert_after (gsi, call, GSI_CONTINUE_LINKING); + t = fold_build2 (MINUS_EXPR, double_type_node, sqrtr, t3); + t = fold_build2 (RDIV_EXPR, double_type_node, t, factord); + t = fold_build1 (FIX_TRUNC_EXPR, ulltype, t); + tree c = create_tmp_var (ulltype); + tree d = create_tmp_var (ulltype); + expand_omp_build_assign (gsi, c, t, true); + t = fold_build2 (MINUS_EXPR, ulltype, c, + build_one_cst (ulltype)); + t = fold_build2 (MULT_EXPR, ulltype, c, t); + t = fold_build2 (RSHIFT_EXPR, ulltype, t, integer_one_node); + t = fold_build2 (MULT_EXPR, ulltype, fd->factor, t); + tree t2 = fold_build2 (MULT_EXPR, ulltype, c, + fd->min_inner_iterations); + t = fold_build2 (PLUS_EXPR, ulltype, t, t2); + expand_omp_build_assign (gsi, d, t, true); + t = fold_build2 (MULT_EXPR, ulltype, fd->factor, c); + t = fold_build2 (PLUS_EXPR, ulltype, + t, fd->min_inner_iterations); + t2 = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + cond_stmt = gimple_build_cond (GE_EXPR, stopvalull, d, + NULL_TREE, NULL_TREE); + gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING); + e = split_block (gsi_bb (*gsi), cond_stmt); + basic_block bb2 = e->src; + e->flags = EDGE_TRUE_VALUE; + e->probability = profile_probability::very_likely (); + *gsi = gsi_after_labels (e->dest); + t = fold_build2 (PLUS_EXPR, ulltype, d, t2); + t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + cond_stmt = gimple_build_cond (GE_EXPR, stopvalull, t, + NULL_TREE, NULL_TREE); + gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING); + e = split_block (gsi_bb (*gsi), cond_stmt); + basic_block bb3 = e->src; + e->flags = EDGE_FALSE_VALUE; + e->probability = profile_probability::very_likely (); + *gsi = gsi_after_labels (e->dest); + t = fold_convert (itype, c); + t = fold_build2 (MULT_EXPR, itype, t, fd->loops[i - 1].step); + t = fold_build2 (PLUS_EXPR, itype, fd->loops[i - 1].n1, t); + t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + expand_omp_build_assign (gsi, fd->loops[i - 1].v, t, true); + t2 = fold_build2 (MINUS_EXPR, ulltype, stopvalull, d); + t2 = fold_convert (itype, t2); + t2 = fold_build2 (MULT_EXPR, itype, t2, fd->loops[i].step); + t2 = fold_build2 (PLUS_EXPR, itype, t2, fd->loops[i].n1); + if (fd->loops[i].m1) + { + t = fold_build2 (MULT_EXPR, itype, t, fd->loops[i].m1); + t2 = fold_build2 (PLUS_EXPR, itype, t2, t); + } + expand_omp_build_assign (gsi, fd->loops[i].v, t2, true); + e = split_block (gsi_bb (*gsi), gsi_stmt (*gsi)); + bb_triang = e->src; + *gsi = gsi_after_labels (e->dest); + remove_edge (e); + e = make_edge (bb1, gsi_bb (*gsi), EDGE_TRUE_VALUE); + e->probability = profile_probability::very_unlikely (); + e = make_edge (bb2, gsi_bb (*gsi), EDGE_FALSE_VALUE); + e->probability = profile_probability::very_unlikely (); + e = make_edge (bb3, gsi_bb (*gsi), EDGE_TRUE_VALUE); + e->probability = profile_probability::very_unlikely (); + + basic_block bb4 = create_empty_bb (bb0); + add_bb_to_loop (bb4, bb0->loop_father); + e = make_edge (bb0, bb4, EDGE_FALSE_VALUE); + e->probability = profile_probability::unlikely (); + make_edge (bb4, gsi_bb (*gsi), EDGE_FALLTHRU); + set_immediate_dominator (CDI_DOMINATORS, bb4, bb0); + set_immediate_dominator (CDI_DOMINATORS, gsi_bb (*gsi), bb0); + gimple_stmt_iterator gsi2 = gsi_after_labels (bb4); + t2 = fold_build2 (TRUNC_DIV_EXPR, type, + counts[i], counts[i - 1]); + t2 = force_gimple_operand_gsi (&gsi2, t2, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + t = fold_build2 (TRUNC_MOD_EXPR, type, stopval, t2); + t2 = fold_build2 (TRUNC_DIV_EXPR, type, stopval, t2); + t = fold_convert (itype, t); + t2 = fold_convert (itype, t2); + t = fold_build2 (MULT_EXPR, itype, t, + fold_convert (itype, fd->loops[i].step)); + t = fold_build2 (PLUS_EXPR, itype, fd->loops[i].n1, t); + t2 = fold_build2 (MULT_EXPR, itype, t2, + fold_convert (itype, fd->loops[i - 1].step)); + t2 = fold_build2 (PLUS_EXPR, itype, fd->loops[i - 1].n1, t2); + t2 = force_gimple_operand_gsi (&gsi2, t2, false, NULL_TREE, + false, GSI_CONTINUE_LINKING); + stmt = gimple_build_assign (fd->loops[i - 1].v, t2); + gsi_insert_after (&gsi2, stmt, GSI_CONTINUE_LINKING); + if (fd->loops[i].m1) + { + t2 = fold_build2 (MULT_EXPR, itype, fd->loops[i].m1, + fd->loops[i - 1].v); + t = fold_build2 (PLUS_EXPR, itype, t, t2); + } + t = force_gimple_operand_gsi (&gsi2, t, false, NULL_TREE, + false, GSI_CONTINUE_LINKING); + stmt = gimple_build_assign (fd->loops[i].v, t); + gsi_insert_after (&gsi2, stmt, GSI_CONTINUE_LINKING); + } + /* Fallback implementation. Evaluate the loops in between + (inclusive) fd->first_nonrect and fd->last_nonrect at + runtime unsing temporaries instead of the original iteration + variables, in the body just bump the counter and compare + with the desired value. */ gimple_stmt_iterator gsi2 = *gsi; basic_block entry_bb = gsi_bb (gsi2); edge e = split_block (entry_bb, gsi_stmt (gsi2)); @@ -2455,6 +2646,14 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi, *gsi = gsi_last_bb (gsi_bb (*gsi)); else gsi_prev (gsi); + if (bb_triang) + { + e = split_block (gsi_bb (*gsi), gsi_stmt (*gsi)); + make_edge (bb_triang, e->dest, EDGE_FALLTHRU); + *gsi = gsi_after_labels (e->dest); + if (!gsi_end_p (*gsi)) + gsi_insert_before (gsi, gimple_build_nop (), GSI_NEW_STMT); + } } else { diff --git a/gcc/omp-general.c b/gcc/omp-general.c index 2a47466f897..c6878cfec66 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -212,6 +212,8 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, fd->sched_modifiers = 0; fd->chunk_size = NULL_TREE; fd->simd_schedule = false; + fd->min_inner_iterations = NULL_TREE; + fd->factor = NULL_TREE; collapse_iter = NULL; collapse_count = NULL; @@ -653,6 +655,8 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, else t2 = fold_build2 (TRUNC_DIV_EXPR, itype, t2, step); t2 = fold_convert (llutype, t2); + fd->min_inner_iterations = t; + fd->factor = t2; t = fold_build2 (MULT_EXPR, llutype, t, single_nonrect_count); tree t3 = fold_build2 (MINUS_EXPR, llutype, @@ -707,7 +711,17 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, if (collapse_count && *collapse_count == NULL) { if (count) - *collapse_count = fold_convert_loc (loc, iter_type, count); + { + *collapse_count = fold_convert_loc (loc, iter_type, count); + if (fd->min_inner_iterations && fd->factor) + { + t = make_tree_vec (3); + TREE_VEC_ELT (t, 0) = *collapse_count; + TREE_VEC_ELT (t, 1) = fd->min_inner_iterations; + TREE_VEC_ELT (t, 2) = fd->factor; + *collapse_count = t; + } + } else *collapse_count = create_tmp_var (iter_type, ".count"); } @@ -717,6 +731,13 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, fd->loop.v = *collapse_iter; fd->loop.n1 = build_int_cst (TREE_TYPE (fd->loop.v), 0); fd->loop.n2 = *collapse_count; + if (TREE_CODE (fd->loop.n2) == TREE_VEC) + { + gcc_assert (fd->non_rect); + fd->min_inner_iterations = TREE_VEC_ELT (fd->loop.n2, 1); + fd->factor = TREE_VEC_ELT (fd->loop.n2, 2); + fd->loop.n2 = TREE_VEC_ELT (fd->loop.n2, 0); + } fd->loop.step = build_int_cst (TREE_TYPE (fd->loop.v), 1); fd->loop.m1 = NULL_TREE; fd->loop.m2 = NULL_TREE; diff --git a/gcc/omp-general.h b/gcc/omp-general.h index a76396577b9..ec0f2a4becb 100644 --- a/gcc/omp-general.h +++ b/gcc/omp-general.h @@ -78,6 +78,13 @@ struct omp_for_data unsigned char sched_modifiers; enum omp_clause_schedule_kind sched_kind; struct omp_for_data_loop *loops; + /* The following are relevant only for non-rectangular loops + where only a single loop depends on an outer loop iterator. */ + tree min_inner_iterations; /* Number of iterations of the inner + loop with either the first or last + outer iterator, depending on which + results in fewer iterations. */ + tree factor; /* (m2 - m1) * outer_step / inner_step. */ }; #define OACC_FN_ATTRIB "oacc function" diff --git a/libgomp/testsuite/libgomp.c/loop-19.c b/libgomp/testsuite/libgomp.c/loop-19.c new file mode 100644 index 00000000000..732ebb04153 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/loop-19.c @@ -0,0 +1,86 @@ +/* { dg-do run } */ + +extern void abort (void); + +int x, i, j; +volatile int a, b, c, d, e, f, g, h; +int k[16][67]; + +int +main () +{ + int niters; + for (i = 0; i < 16; i++) + for (j = i * 2 + 1; j < 4 * i + 3; j++) + k[i][j] = 1; + a = 0; b = 16; c = 1; d = 2; e = 1; f = 4; g = 3; h = 1; + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = 0; i < 16; i++) + for (j = i * 2 + 1; j < 4 * i + 3; j++) + { + if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 3 + i * 4 || k[i][j] != 1) + abort (); + k[i][j]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 16 || j != 63 || x != 15422 || niters != 272) + abort (); + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = a; i < b; i += c) + for (j = d * i + e; j < g + i * f; j += h) + { + if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 3 + i * 4 || k[i][j] != 2) + abort (); + k[i][j]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 16 || j != 63 || x != 15422 || niters != 272) + abort (); + for (i = 0; i < 16; i++) + for (j = i * 2 + 1; j < 4 * i + 3; j++) + if (k[i][j] == 3) + k[i][j] = 0; + else + abort (); + for (i = 0; i < 16; i++) + for (j = i * 2 + 1; j < 2 * i + 7; j++) + k[i][j] = 1; + a = 0; b = 16; c = 1; d = 2; e = 1; f = 2; g = 7; h = 1; + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = 0; i < 16; i++) + for (j = i * 2 + 1; j < 2 * i + 7; j++) + { + if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 7 + i * 2 || k[i][j] != 1) + abort (); + k[i][j]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 16 || j != 37 || x != 15396 || niters != 96) + abort (); + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = a; i < b; i += c) + for (j = d * i + e; j < g + i * f; j += h) + { + if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 7 + i * 2 || k[i][j] != 2) + abort (); + k[i][j]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 16 || j != 37 || x != 15396 || niters != 96) + abort (); + for (i = 0; i < 16; i++) + for (j = i * 2 + 1; j < 2 * i + 7; j++) + if (k[i][j] == 3) + k[i][j] = 0; + else + abort (); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/loop-20.c b/libgomp/testsuite/libgomp.c/loop-20.c new file mode 100644 index 00000000000..c3265ac0470 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/loop-20.c @@ -0,0 +1,84 @@ +/* { dg-do run } */ + +extern void abort (void); + +unsigned long long int x, i, j; +volatile unsigned long long int a, b, c, d, e, f, g, h; +int k[4][206]; + +int +main () +{ + long long int niters; + for (j = ~0ULL / 2 - 32; j < ((~0ULL / 2) + 6); j++) + k[0][j - ~0ULL / 2 + 64] = 1; + a = 1; b = 2; c = 1; d = 0; e = ~0ULL / 2 - 32; f = ((~0ULL / 2) + 6); g = 0; h = 1; + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = 1; i < 2; i++) + for (j = ~0ULL / 2 - 32; j < i * ((~0ULL / 2) + 6); j++) + { + if (i != 1 || j < ~0ULL / 2 - 32 || j >= ((~0ULL / 2) + 6) || k[0][j - ~0ULL / 2 + 64] != 1) + abort (); + k[0][j - ~0ULL / 2 + 64]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 2 || j != ((~0ULL / 2) + 6) || x != 1028 || niters != 38) + abort (); + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = a; i < b; i += c) + for (j = d * i + e; j < g + i * f; j += h) + { + if (i != 1 || j < ~0ULL / 2 - 32 || j >= ((~0ULL / 2) + 6) || k[0][j - ~0ULL / 2 + 64] != 2) + abort (); + k[0][j - ~0ULL / 2 + 64]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 2 || j != ((~0ULL / 2) + 6) || x != 1028 || niters != 38) + abort (); + for (j = ~0ULL / 2 - 32; j < ((~0ULL / 2) + 6); j++) + if (k[0][j - ~0ULL / 2 + 64] == 3) + k[0][j - ~0ULL / 2 + 64] = 0; + else + abort (); + for (i = 1; i < 4; i++) + for (j = 64ULL * i; j < i * 32ULL + 110; j++) + k[i][j] = 1; + a = 1; b = 4; c = 1; d = 64ULL; e = 0; f = 32ULL; g = 110ULL; h = 1; + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = 1; i < 4; i++) + for (j = 64ULL * i; j < i * 32ULL + 110; j++) + { + if (i < 1 || i >= 4 || j < 64ULL * i || j >= i * 32ULL + 110 || k[i][j] != 1) + abort (); + k[i][j]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 4 || j != 206 || x != 3277 || niters != 138) + abort (); + niters = 0; i = -100; j = -100; x = -100; + #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters) + for (i = a; i < b; i += c) + for (j = d * i + e; j < g + i * f; j += h) + { + if (i < 1 || i >= 4 || j < 64ULL * i || j >= i * 32ULL + 110 || k[i][j] != 2) + abort (); + k[i][j]++; + x = i * 1024 + (j & 1023); + niters++; + } + if (i != 4 || j != 206 || x != 3277 || niters != 138) + abort (); + for (i = 1; i < 4; i++) + for (j = 64ULL * i; j < i * 32ULL + 110; j++) + if (k[i][j] == 3) + k[i][j] = 0; + else + abort (); + return 0; +} -- 2.30.2