re PR libgomp/49490 (suboptimal load balancing in loops)

author Jakub Jelinek <jakub@redhat.com>
Wed, 22 Jun 2011 20:39:25 +0000 (22:39 +0200)
committer Jakub Jelinek <jakub@gcc.gnu.org>
Wed, 22 Jun 2011 20:39:25 +0000 (22:39 +0200)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 5b5964db18a33f22ab44cb99a7dbce0fb9ccd3d5..d4ee63b825df15852eb0c93d1a5cb7be675460be 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,12 @@
  2011-06-22  Jakub Jelinek  <jakub@redhat.com>
  
+       PR libgomp/49490
+       * omp-low.c (expand_omp_for_static_nochunk): Only
+       use n ceil/ nthreads size for the first
+       n % nthreads threads in the team instead of
+       all threads except for the last few ones which
+       get less work or none at all.
+
         PR debug/49496
         * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Ignore debug
         uses.
diff --git a/gcc/omp-low.c b/gcc/omp-low.c

index 944a40bce979a884ff710366e5887bad9c85a28d..05646bf8833ceb521d799be6c891b4a631d2bb50 100644 (file)
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -3,7 +3,7 @@
     marshalling to implement data sharing and copying clauses.
     Contributed by Diego Novillo <dnovillo@redhat.com>
  
-   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
+   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
     Free Software Foundation, Inc.
  
  This file is part of GCC.
@@ -4108,9 +4108,14 @@ expand_omp_for_generic (struct omp_region *region,
         else
           n = (adj + N2 - N1) / STEP;
         q = n / nthreads;
-       q += (q * nthreads != n);
-       s0 = q * threadid;
-       e0 = min(s0 + q, n);
+       tt = n % nthreads;
+       if (threadid < tt) goto L3; else goto L4;
+    L3:
+       tt = 0;
+       q = q + 1;
+    L4:
+       s0 = q * threadid + tt;
+       e0 = s0 + q;
         V = s0 * STEP + N1;
         if (s0 >= e0) goto L2; else goto L0;
      L0:
@@ -4126,12 +4131,14 @@ static void
  expand_omp_for_static_nochunk (struct omp_region *region,
                                struct omp_for_data *fd)
  {
-  tree n, q, s0, e0, e, t, nthreads, threadid;
+  tree n, q, s0, e0, e, t, tt, nthreads, threadid;
    tree type, itype, vmain, vback;
-  basic_block entry_bb, exit_bb, seq_start_bb, body_bb, cont_bb;
+  basic_block entry_bb, second_bb, third_bb, exit_bb, seq_start_bb;
+  basic_block body_bb, cont_bb;
    basic_block fin_bb;
    gimple_stmt_iterator gsi;
    gimple stmt;
+  edge ep;
  
    itype = type = TREE_TYPE (fd->loop.v);
    if (POINTER_TYPE_P (type))
@@ -4185,19 +4192,39 @@ expand_omp_for_static_nochunk (struct omp_region *region,
    t = fold_convert (itype, t);
    n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
  
+  q = create_tmp_var (itype, "q");
    t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads);
-  q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
+  t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
+  gsi_insert_before (&gsi, gimple_build_assign (q, t), GSI_SAME_STMT);
+
+  tt = create_tmp_var (itype, "tt");
+  t = fold_build2 (TRUNC_MOD_EXPR, itype, n, nthreads);
+  t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
+  gsi_insert_before (&gsi, gimple_build_assign (tt, t), GSI_SAME_STMT);
  
-  t = fold_build2 (MULT_EXPR, itype, q, nthreads);
-  t = fold_build2 (NE_EXPR, itype, t, n);
-  t = fold_build2 (PLUS_EXPR, itype, q, t);
-  q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
+  t = build2 (LT_EXPR, boolean_type_node, threadid, tt);
+  stmt = gimple_build_cond_empty (t);
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+
+  second_bb = split_block (entry_bb, stmt)->dest;
+  gsi = gsi_last_bb (second_bb);
+  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
+
+  gsi_insert_before (&gsi, gimple_build_assign (tt, build_int_cst (itype, 0)),
+                    GSI_SAME_STMT);
+  stmt = gimple_build_assign_with_ops (PLUS_EXPR, q, q,
+                                      build_int_cst (itype, 1));
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+
+  third_bb = split_block (second_bb, stmt)->dest;
+  gsi = gsi_last_bb (third_bb);
+  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
  
    t = build2 (MULT_EXPR, itype, q, threadid);
+  t = build2 (PLUS_EXPR, itype, t, tt);
    s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
  
    t = fold_build2 (PLUS_EXPR, itype, s0, q);
-  t = fold_build2 (MIN_EXPR, itype, t, n);
    e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
  
    t = build2 (GE_EXPR, boolean_type_node, s0, e0);
@@ -4263,13 +4290,20 @@ expand_omp_for_static_nochunk (struct omp_region *region,
    gsi_remove (&gsi, true);
  
    /* Connect all the blocks.  */
-  find_edge (entry_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
-  find_edge (entry_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
+  ep = make_edge (entry_bb, third_bb, EDGE_FALSE_VALUE);
+  ep->probability = REG_BR_PROB_BASE / 4 * 3;
+  ep = find_edge (entry_bb, second_bb);
+  ep->flags = EDGE_TRUE_VALUE;
+  ep->probability = REG_BR_PROB_BASE / 4;
+  find_edge (third_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
+  find_edge (third_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
  
    find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE;
    find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE;
  
-  set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, second_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, third_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, third_bb);
    set_immediate_dominator (CDI_DOMINATORS, body_bb,
                            recompute_dominator (CDI_DOMINATORS, body_bb));
    set_immediate_dominator (CDI_DOMINATORS, fin_bb,
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog

index 7d3fe227c5ea7b7476ba8f419eaef31aaf066350..1d93273b919042e3797e634353363064e7d3afa6 100644 (file)
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,17 @@
+2011-06-22  Jakub Jelinek  <jakub@redhat.com>
+
+       PR libgomp/49490
+       * iter.c (gomp_iter_static_next): For chunk size 0
+       only use n ceil/ nthreads size for the first
+       n % nthreads threads in the team instead of
+       all threads except for the last few ones which
+       get less work or none at all.
+       * iter_ull.c (gomp_iter_ull_static_next): Likewise.
+       * env.c (parse_schedule): If OMP_SCHEDULE doesn't have
+       chunk argument, set run_sched_modifier to 0 for static
+       resp. 1 for other kinds.  If chunk argument is 0
+       and not static, set value to 1.
+
  2011-05-19  Jakub Jelinek  <jakub@redhat.com>
  
         PR c++/49043
diff --git a/libgomp/env.c b/libgomp/env.c

index 92fa8c3749794880c1998a2175113e20d3afa011..0ca9a1c9e0060433e9d19b245a95870a6c7f1416 100644 (file)
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
+/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
     Free Software Foundation, Inc.
     Contributed by Richard Henderson <rth@redhat.com>.
  
@@ -108,7 +108,11 @@ parse_schedule (void)
    while (isspace ((unsigned char) *env))
      ++env;
    if (*env == '\0')
-    return;
+    {
+      gomp_global_icv.run_sched_modifier
+       = gomp_global_icv.run_sched_var != GFS_STATIC;
+      return;
+    }
    if (*env++ != ',')
      goto unknown;
    while (isspace ((unsigned char) *env))
@@ -129,6 +133,8 @@ parse_schedule (void)
    if ((int)value != value)
      goto invalid;
  
+  if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
+    value = 1;
    gomp_global_icv.run_sched_modifier = value;
    return;
  
diff --git a/libgomp/iter.c b/libgomp/iter.c

index 9ec4dbd2252e32eca0bcccc86661d7c32ffcb7d4..cd9484a1ea4cb6593bb93aa54f9db6c5ce8caf51 100644 (file)
--- a/libgomp/iter.c
+++ b/libgomp/iter.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
     Contributed by Richard Henderson <rth@redhat.com>.
  
     This file is part of the GNU OpenMP Library (libgomp).
@@ -59,7 +59,7 @@ gomp_iter_static_next (long *pstart, long *pend)
       trip through the outer loop.  */
    if (ws->chunk_size == 0)
      {
-      unsigned long n, q, i;
+      unsigned long n, q, i, t;
        unsigned long s0, e0;
        long s, e;
  
@@ -74,11 +74,14 @@ gomp_iter_static_next (long *pstart, long *pend)
        /* Compute the "zero-based" start and end points.  That is, as
           if the loop began at zero and incremented by one.  */
        q = n / nthreads;
-      q += (q * nthreads != n);
-      s0 = q * i;
+      t = n % nthreads;
+      if (i < t)
+       {
+         t = 0;
+         q++;
+       }
+      s0 = q * i + t;
        e0 = s0 + q;
-      if (e0 > n)
-        e0 = n;
  
        /* Notice when no iterations allocated for this thread.  */
        if (s0 >= e0)
diff --git a/libgomp/iter_ull.c b/libgomp/iter_ull.c

index 1754e6333c2d5f4b0266bfeba840b119eba3481c..a393920b55e48f84a9b48a7522fc0177225e0264 100644 (file)
--- a/libgomp/iter_ull.c
+++ b/libgomp/iter_ull.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
     Contributed by Richard Henderson <rth@redhat.com>.
  
     This file is part of the GNU OpenMP Library (libgomp).
@@ -60,7 +60,7 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
       trip through the outer loop.  */
    if (ws->chunk_size_ull == 0)
      {
-      gomp_ull n, q, i, s0, e0, s, e;
+      gomp_ull n, q, i, t, s0, e0, s, e;
  
        if (thr->ts.static_trip > 0)
         return 1;
@@ -75,11 +75,14 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
        /* Compute the "zero-based" start and end points.  That is, as
          if the loop began at zero and incremented by one.  */
        q = n / nthreads;
-      q += (q * nthreads != n);
-      s0 = q * i;
+      t = n % nthreads;
+      if (i < t)
+       {
+         t = 0;
+         q++;
+       }
+      s0 = q * i + t;
        e0 = s0 + q;
-      if (e0 > n)
-       e0 = n;
  
        /* Notice when no iterations allocated for this thread.  */
        if (s0 >= e0)
author	Jakub Jelinek <jakub@redhat.com>
	Wed, 22 Jun 2011 20:39:25 +0000 (22:39 +0200)
committer	Jakub Jelinek <jakub@gcc.gnu.org>
	Wed, 22 Jun 2011 20:39:25 +0000 (22:39 +0200)
gcc/ChangeLog
gcc/omp-low.c
libgomp/ChangeLog
libgomp/env.c
libgomp/iter.c
libgomp/iter_ull.c