From: Jakub Jelinek Date: Wed, 22 Jun 2011 20:39:25 +0000 (+0200) Subject: re PR libgomp/49490 (suboptimal load balancing in loops) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=fb79f500af1b4d9ee5685e8934726b5e2ea4edd5;p=gcc.git re PR libgomp/49490 (suboptimal load balancing in loops) PR libgomp/49490 * omp-low.c (expand_omp_for_static_nochunk): Only use n ceil/ nthreads size for the first n % nthreads threads in the team instead of all threads except for the last few ones which get less work or none at all. * iter.c (gomp_iter_static_next): For chunk size 0 only use n ceil/ nthreads size for the first n % nthreads threads in the team instead of all threads except for the last few ones which get less work or none at all. * iter_ull.c (gomp_iter_ull_static_next): Likewise. * env.c (parse_schedule): If OMP_SCHEDULE doesn't have chunk argument, set run_sched_modifier to 0 for static resp. 1 for other kinds. If chunk argument is 0 and not static, set value to 1. From-SVN: r175315 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5b5964db18a..d4ee63b825d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,12 @@ 2011-06-22 Jakub Jelinek + PR libgomp/49490 + * omp-low.c (expand_omp_for_static_nochunk): Only + use n ceil/ nthreads size for the first + n % nthreads threads in the team instead of + all threads except for the last few ones which + get less work or none at all. + PR debug/49496 * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Ignore debug uses. diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 944a40bce97..05646bf8833 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -3,7 +3,7 @@ marshalling to implement data sharing and copying clauses. Contributed by Diego Novillo - Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 + Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of GCC. @@ -4108,9 +4108,14 @@ expand_omp_for_generic (struct omp_region *region, else n = (adj + N2 - N1) / STEP; q = n / nthreads; - q += (q * nthreads != n); - s0 = q * threadid; - e0 = min(s0 + q, n); + tt = n % nthreads; + if (threadid < tt) goto L3; else goto L4; + L3: + tt = 0; + q = q + 1; + L4: + s0 = q * threadid + tt; + e0 = s0 + q; V = s0 * STEP + N1; if (s0 >= e0) goto L2; else goto L0; L0: @@ -4126,12 +4131,14 @@ static void expand_omp_for_static_nochunk (struct omp_region *region, struct omp_for_data *fd) { - tree n, q, s0, e0, e, t, nthreads, threadid; + tree n, q, s0, e0, e, t, tt, nthreads, threadid; tree type, itype, vmain, vback; - basic_block entry_bb, exit_bb, seq_start_bb, body_bb, cont_bb; + basic_block entry_bb, second_bb, third_bb, exit_bb, seq_start_bb; + basic_block body_bb, cont_bb; basic_block fin_bb; gimple_stmt_iterator gsi; gimple stmt; + edge ep; itype = type = TREE_TYPE (fd->loop.v); if (POINTER_TYPE_P (type)) @@ -4185,19 +4192,39 @@ expand_omp_for_static_nochunk (struct omp_region *region, t = fold_convert (itype, t); n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); + q = create_tmp_var (itype, "q"); t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads); - q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); + t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT); + gsi_insert_before (&gsi, gimple_build_assign (q, t), GSI_SAME_STMT); + + tt = create_tmp_var (itype, "tt"); + t = fold_build2 (TRUNC_MOD_EXPR, itype, n, nthreads); + t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT); + gsi_insert_before (&gsi, gimple_build_assign (tt, t), GSI_SAME_STMT); - t = fold_build2 (MULT_EXPR, itype, q, nthreads); - t = fold_build2 (NE_EXPR, itype, t, n); - t = fold_build2 (PLUS_EXPR, itype, q, t); - q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); + t = build2 (LT_EXPR, boolean_type_node, threadid, tt); + stmt = gimple_build_cond_empty (t); + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + + second_bb = split_block (entry_bb, stmt)->dest; + gsi = gsi_last_bb (second_bb); + gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR); + + gsi_insert_before (&gsi, gimple_build_assign (tt, build_int_cst (itype, 0)), + GSI_SAME_STMT); + stmt = gimple_build_assign_with_ops (PLUS_EXPR, q, q, + build_int_cst (itype, 1)); + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + + third_bb = split_block (second_bb, stmt)->dest; + gsi = gsi_last_bb (third_bb); + gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR); t = build2 (MULT_EXPR, itype, q, threadid); + t = build2 (PLUS_EXPR, itype, t, tt); s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); t = fold_build2 (PLUS_EXPR, itype, s0, q); - t = fold_build2 (MIN_EXPR, itype, t, n); e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); t = build2 (GE_EXPR, boolean_type_node, s0, e0); @@ -4263,13 +4290,20 @@ expand_omp_for_static_nochunk (struct omp_region *region, gsi_remove (&gsi, true); /* Connect all the blocks. */ - find_edge (entry_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE; - find_edge (entry_bb, fin_bb)->flags = EDGE_TRUE_VALUE; + ep = make_edge (entry_bb, third_bb, EDGE_FALSE_VALUE); + ep->probability = REG_BR_PROB_BASE / 4 * 3; + ep = find_edge (entry_bb, second_bb); + ep->flags = EDGE_TRUE_VALUE; + ep->probability = REG_BR_PROB_BASE / 4; + find_edge (third_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE; + find_edge (third_bb, fin_bb)->flags = EDGE_TRUE_VALUE; find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE; find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE; - set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, second_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, third_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, third_bb); set_immediate_dominator (CDI_DOMINATORS, body_bb, recompute_dominator (CDI_DOMINATORS, body_bb)); set_immediate_dominator (CDI_DOMINATORS, fin_bb, diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 7d3fe227c5e..1d93273b919 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,17 @@ +2011-06-22 Jakub Jelinek + + PR libgomp/49490 + * iter.c (gomp_iter_static_next): For chunk size 0 + only use n ceil/ nthreads size for the first + n % nthreads threads in the team instead of + all threads except for the last few ones which + get less work or none at all. + * iter_ull.c (gomp_iter_ull_static_next): Likewise. + * env.c (parse_schedule): If OMP_SCHEDULE doesn't have + chunk argument, set run_sched_modifier to 0 for static + resp. 1 for other kinds. If chunk argument is 0 + and not static, set value to 1. + 2011-05-19 Jakub Jelinek PR c++/49043 diff --git a/libgomp/env.c b/libgomp/env.c index 92fa8c37497..0ca9a1c9e00 100644 --- a/libgomp/env.c +++ b/libgomp/env.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 +/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. Contributed by Richard Henderson . @@ -108,7 +108,11 @@ parse_schedule (void) while (isspace ((unsigned char) *env)) ++env; if (*env == '\0') - return; + { + gomp_global_icv.run_sched_modifier + = gomp_global_icv.run_sched_var != GFS_STATIC; + return; + } if (*env++ != ',') goto unknown; while (isspace ((unsigned char) *env)) @@ -129,6 +133,8 @@ parse_schedule (void) if ((int)value != value) goto invalid; + if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC) + value = 1; gomp_global_icv.run_sched_modifier = value; return; diff --git a/libgomp/iter.c b/libgomp/iter.c index 9ec4dbd2252..cd9484a1ea4 100644 --- a/libgomp/iter.c +++ b/libgomp/iter.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. +/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc. Contributed by Richard Henderson . This file is part of the GNU OpenMP Library (libgomp). @@ -59,7 +59,7 @@ gomp_iter_static_next (long *pstart, long *pend) trip through the outer loop. */ if (ws->chunk_size == 0) { - unsigned long n, q, i; + unsigned long n, q, i, t; unsigned long s0, e0; long s, e; @@ -74,11 +74,14 @@ gomp_iter_static_next (long *pstart, long *pend) /* Compute the "zero-based" start and end points. That is, as if the loop began at zero and incremented by one. */ q = n / nthreads; - q += (q * nthreads != n); - s0 = q * i; + t = n % nthreads; + if (i < t) + { + t = 0; + q++; + } + s0 = q * i + t; e0 = s0 + q; - if (e0 > n) - e0 = n; /* Notice when no iterations allocated for this thread. */ if (s0 >= e0) diff --git a/libgomp/iter_ull.c b/libgomp/iter_ull.c index 1754e6333c2..a393920b55e 100644 --- a/libgomp/iter_ull.c +++ b/libgomp/iter_ull.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. +/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc. Contributed by Richard Henderson . This file is part of the GNU OpenMP Library (libgomp). @@ -60,7 +60,7 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend) trip through the outer loop. */ if (ws->chunk_size_ull == 0) { - gomp_ull n, q, i, s0, e0, s, e; + gomp_ull n, q, i, t, s0, e0, s, e; if (thr->ts.static_trip > 0) return 1; @@ -75,11 +75,14 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend) /* Compute the "zero-based" start and end points. That is, as if the loop began at zero and incremented by one. */ q = n / nthreads; - q += (q * nthreads != n); - s0 = q * i; + t = n % nthreads; + if (i < t) + { + t = 0; + q++; + } + s0 = q * i + t; e0 = s0 + q; - if (e0 > n) - e0 = n; /* Notice when no iterations allocated for this thread. */ if (s0 >= e0)