From a851ce04f7050dd82aa8344e7b68ee8319fb7b6f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 17 Nov 2017 13:15:34 +0000 Subject: [PATCH] re PR fortran/83017 (DO CONCURRENT not parallelizing) 2017-11-17 Richard Biener PR tree-optimization/83017 * tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread. (gen_parallel_loop): Properly count iterations. (parallelize_loops): Handle loop->can_be_parallel independent of flag_loop_parallelize_all. Make static profitability test match the runtime one. * params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New. * invoke.texi (parloops-min-per-thread): Document. * gcc.dg/autopar/pr49960.c: Adjust. From-SVN: r254867 --- gcc/ChangeLog | 11 +++++++++++ gcc/doc/invoke.texi | 6 ++++++ gcc/params.def | 6 ++++++ gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.dg/autopar/pr49960.c | 16 ++++++++++------ gcc/tree-parloops.c | 23 ++++++++--------------- 6 files changed, 46 insertions(+), 21 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 07f2aa957b1..030a13ca282 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2017-11-17 Richard Biener + + PR tree-optimization/83017 + * tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread. + (gen_parallel_loop): Properly count iterations. + (parallelize_loops): Handle loop->can_be_parallel independent + of flag_loop_parallelize_all. Make static profitability test match + the runtime one. + * params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New. + * invoke.texi (parloops-min-per-thread): Document. + 2017-11-17 Vineet Gupta * config/arc/linux.h: GLIBC_DYNAMIC_LINKER update per glibc diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 1e2b869885b..e18fa545fd2 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -10816,6 +10816,12 @@ is 0. Schedule type of omp schedule for loops parallelized by parloops (static, dynamic, guided, auto, runtime). The default is static. 
+@item parloops-min-per-thread +The minimum number of iterations per thread of an innermost parallelized +loop for which the parallelized variant is preferred over the single threaded +one. The default is 100. Note that for a parallelized loop nest the +minimum number of iterations of the outermost loop per thread is two. + @item max-ssa-name-query-depth Maximum depth of recursion when querying properties of SSA names in things like fold routines. One level of recursion corresponds to following a diff --git a/gcc/params.def b/gcc/params.def index 8881f4c403a..89915d4fc7f 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -1240,6 +1240,12 @@ DEFPARAMENUM5 (PARAM_PARLOOPS_SCHEDULE, static, static, dynamic, guided, auto, runtime) +DEFPARAM (PARAM_PARLOOPS_MIN_PER_THREAD, + "parloops-min-per-thread", + "Minimum number of iterations per thread of an innermost " + "parallelized loop.", + 100, 2, 0) + DEFPARAM (PARAM_MAX_SSA_NAME_QUERY_DEPTH, "max-ssa-name-query-depth", "Maximum recursion depth allowed when querying a property of an" diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 16990330ea4..2238dae00f7 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2017-11-17 Richard Biener + + PR tree-optimization/83017 + * gcc.dg/autopar/pr49960.c: Adjust. + 2017-11-17 Sudakshina Das * gcc.target/arm/armv8_2-fp16-move-1.c: Edit vmov scan-assembler diff --git a/gcc/testsuite/gcc.dg/autopar/pr49960.c b/gcc/testsuite/gcc.dg/autopar/pr49960.c index e3fb04d99c1..447169d991a 100644 --- a/gcc/testsuite/gcc.dg/autopar/pr49960.c +++ b/gcc/testsuite/gcc.dg/autopar/pr49960.c @@ -7,7 +7,8 @@ #define MA 400 int T[MA][MB],A[MA][NA],B[MB][NA]; -void MRTRBR(int MA_1, int NA_1, int MB_1) +void __attribute__((noinline)) +MRTRBR(int MA_1, int NA_1, int MB_1) { int i,j, t,k; @@ -21,7 +22,7 @@ void MRTRBR(int MA_1, int NA_1, int MB_1) /* The outer most loop is not parallel because for different k's there is write-write dependency for T[i][j]. 
*/ - /* The two inner loops don't get parallelized due to low number of + /* The innermost loop doesn't get parallelized due to low number of iterations. */ for (k = 3; k < NA_1; k++) @@ -38,7 +39,10 @@ void main () for (i = 3; i < MA; i++) for (j = 3; j < MB; j++) - T[i][j] = (i>j?i:j); + { + __asm__ volatile ("" : : : "memory"); + T[i][j] = (i>j?i:j); + } MRTRBR (MA,NA,MB); @@ -48,7 +52,7 @@ void main () } -/* Check that the outer most loop doesn't get parallelized (thus no loop gets parallelized) */ +/* Check that the outer most loop doesn't get parallelized. */ -/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 0 "parloops2" } } */ -/* { dg-final { scan-tree-dump-times "loopfn" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops2" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_GOMP_parallel" 1 "optimized" } } */ diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c index cfc143f3c02..643ab5719ae 100644 --- a/gcc/tree-parloops.c +++ b/gcc/tree-parloops.c @@ -184,7 +184,7 @@ parloop /* Minimal number of iterations of a loop that should be executed in each thread. */ -#define MIN_PER_THREAD 100 +#define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD) /* Element of the hashtable, representing a reduction in the current loop. */ @@ -2336,7 +2336,7 @@ gen_parallel_loop (struct loop *loop, gcc_checking_assert (n_threads != 0); many_iterations_cond = fold_build2 (GE_EXPR, boolean_type_node, - nit, build_int_cst (type, m_p_thread * n_threads)); + nit, build_int_cst (type, m_p_thread * n_threads - 1)); many_iterations_cond = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, @@ -3299,15 +3299,6 @@ parallelize_loops (bool oacc_kernels_p) fprintf (dump_file, "loop %d is innermost\n",loop->num); } - /* If we use autopar in graphite pass, we use its marked dependency - checking results. 
*/ - if (flag_loop_parallelize_all && !loop->can_be_parallel) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "loop is not parallel according to graphite\n"); - continue; - } - if (!single_dom_exit (loop)) { @@ -3325,15 +3316,17 @@ parallelize_loops (bool oacc_kernels_p) || loop_has_vector_phi_nodes (loop)) continue; - estimated = estimated_stmt_executions_int (loop); + estimated = estimated_loop_iterations_int (loop); if (estimated == -1) - estimated = likely_max_stmt_executions_int (loop); + estimated = get_likely_max_loop_iterations_int (loop); /* FIXME: Bypass this check as graphite doesn't update the count and frequency correctly now. */ if (!flag_loop_parallelize_all && !oacc_kernels_p && ((estimated != -1 - && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) + && (estimated + < ((HOST_WIDE_INT) n_threads + * (loop->inner ? 2 : MIN_PER_THREAD) - 1))) /* Do not bother with loops in cold areas. */ || optimize_loop_nest_for_size_p (loop))) continue; @@ -3347,7 +3340,7 @@ parallelize_loops (bool oacc_kernels_p) if (loop_has_phi_with_address_arg (loop)) continue; - if (!flag_loop_parallelize_all + if (!loop->can_be_parallel && !loop_parallel_p (loop, &parloop_obstack)) continue; -- 2.30.2