Patchwork Change omp for static non-chunk computation (PR libgomp/49490)

login
register
mail settings
Submitter Jakub Jelinek
Date June 22, 2011, 6:13 p.m.
Message ID <20110622181334.GO16443@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/101531/
State New
Headers show

Comments

Jakub Jelinek - June 22, 2011, 6:13 p.m.
Hi!

As has been reported, we don't divide the work for schedule(static)
loops very well.  E.g. for 33 iterations with 8 threads, we give
5 iterations to the first 6 threads, 3 iterations to the 7th thread
and 0 iterations to the last thread in the team.
The reason for that is the 
       q = n / nthreads;
       q += (q * nthreads != n);
       s0 = q * i;
       e0 = s0 + q;
       if (e0 > n)
         e0 = n; 
the computation of the start (s0)/end (e0), where i is the 0-based
thread id, n is the number of iterations that need to be divided between the
threads, and nthreads is the number of threads in the team.

The following patch instead implements:
       q = n / nthreads;
       t = n % nthreads;
       if (i < t)
        {
          t = 0;
          q++;
        }
       s0 = q * i + t;
       e0 = s0 + q;
At least on x86_64/i686 using
       q = n / nthreads;
       t = n % nthreads;
instead of
       q = n / nthreads;
       t = n - (q * nthreads);
results in much better generated code, because the division
computes both quotient and remainder, while the second form
isn't optimized that way (a missed optimization?).
Anyway, couldn't figure out how to implement this without a
conditional jump (I'd say the likely case is that n % nthreads
is zero).  With this for 33 iterations and 8 threads
we divide it as 5 in the first thread and 4 in all other threads.

Richard, do you agree with this?

Bootstrapped/regtested on x86_64-linux and i686-linux.

2011-06-22  Jakub Jelinek  <jakub@redhat.com>

	PR libgomp/49490
	* omp-low.c (expand_omp_for_static_nochunk): Only
	use n ceil/ nthreads size for the first
	n % nthreads threads in the team instead of
	all threads except for the last few ones which
	get less work or none at all.

	* iter.c (gomp_iter_static_next): For chunk size 0
	only use n ceil/ nthreads size for the first
	n % nthreads threads in the team instead of
	all threads except for the last few ones which
	get less work or none at all.
	* iter_ull.c (gomp_iter_ull_static_next): Likewise.
	* env.c (parse_schedule): If OMP_SCHEDULE doesn't have
	chunk argument, set run_sched_modifier to 0 for static
	resp. 1 for other kinds.  If chunk argument is 0
	and not static, set value to 1.


	Jakub
Richard Henderson - June 22, 2011, 8:01 p.m.
On 06/22/2011 11:13 AM, Jakub Jelinek wrote:
> Richard, do you agree with this?
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux.
> 
> 2011-06-22  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR libgomp/49490
> 	* omp-low.c (expand_omp_for_static_nochunk): Only
> 	use n ceil/ nthreads size for the first
> 	n % nthreads threads in the team instead of
> 	all threads except for the last few ones which
> 	get less work or none at all.
> 
> 	* iter.c (gomp_iter_static_next): For chunk size 0
> 	only use n ceil/ nthreads size for the first
> 	n % nthreads threads in the team instead of
> 	all threads except for the last few ones which
> 	get less work or none at all.
> 	* iter_ull.c (gomp_iter_ull_static_next): Likewise.
> 	* env.c (parse_schedule): If OMP_SCHEDULE doesn't have
> 	chunk argument, set run_sched_modifier to 0 for static
> 	resp. 1 for other kinds.  If chunk argument is 0
> 	and not static, set value to 1.

Looks good.


r~

Patch

--- gcc/omp-low.c.jj	2011-06-22 10:16:56.000000000 +0200
+++ gcc/omp-low.c	2011-06-22 15:56:14.000000000 +0200
@@ -3,7 +3,7 @@ 
    marshalling to implement data sharing and copying clauses.
    Contributed by Diego Novillo <dnovillo@redhat.com>
 
-   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
+   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
    Free Software Foundation, Inc.
 
 This file is part of GCC.
@@ -4108,9 +4108,14 @@  expand_omp_for_generic (struct omp_regio
 	else
 	  n = (adj + N2 - N1) / STEP;
 	q = n / nthreads;
-	q += (q * nthreads != n);
-	s0 = q * threadid;
-	e0 = min(s0 + q, n);
+	tt = n % nthreads;
+	if (threadid < tt) goto L3; else goto L4;
+    L3:
+	tt = 0;
+	q = q + 1;
+    L4:
+	s0 = q * threadid + tt;
+	e0 = s0 + q;
 	V = s0 * STEP + N1;
 	if (s0 >= e0) goto L2; else goto L0;
     L0:
@@ -4126,12 +4131,14 @@  static void
 expand_omp_for_static_nochunk (struct omp_region *region,
 			       struct omp_for_data *fd)
 {
-  tree n, q, s0, e0, e, t, nthreads, threadid;
+  tree n, q, s0, e0, e, t, tt, nthreads, threadid;
   tree type, itype, vmain, vback;
-  basic_block entry_bb, exit_bb, seq_start_bb, body_bb, cont_bb;
+  basic_block entry_bb, second_bb, third_bb, exit_bb, seq_start_bb;
+  basic_block body_bb, cont_bb;
   basic_block fin_bb;
   gimple_stmt_iterator gsi;
   gimple stmt;
+  edge ep;
 
   itype = type = TREE_TYPE (fd->loop.v);
   if (POINTER_TYPE_P (type))
@@ -4185,19 +4192,39 @@  expand_omp_for_static_nochunk (struct om
   t = fold_convert (itype, t);
   n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
 
+  q = create_tmp_var (itype, "q");
   t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads);
-  q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
+  t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
+  gsi_insert_before (&gsi, gimple_build_assign (q, t), GSI_SAME_STMT);
+
+  tt = create_tmp_var (itype, "tt");
+  t = fold_build2 (TRUNC_MOD_EXPR, itype, n, nthreads);
+  t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
+  gsi_insert_before (&gsi, gimple_build_assign (tt, t), GSI_SAME_STMT);
 
-  t = fold_build2 (MULT_EXPR, itype, q, nthreads);
-  t = fold_build2 (NE_EXPR, itype, t, n);
-  t = fold_build2 (PLUS_EXPR, itype, q, t);
-  q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
+  t = build2 (LT_EXPR, boolean_type_node, threadid, tt);
+  stmt = gimple_build_cond_empty (t);
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+
+  second_bb = split_block (entry_bb, stmt)->dest;
+  gsi = gsi_last_bb (second_bb);
+  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
+
+  gsi_insert_before (&gsi, gimple_build_assign (tt, build_int_cst (itype, 0)),
+		     GSI_SAME_STMT);
+  stmt = gimple_build_assign_with_ops (PLUS_EXPR, q, q,
+				       build_int_cst (itype, 1));
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+
+  third_bb = split_block (second_bb, stmt)->dest;
+  gsi = gsi_last_bb (third_bb);
+  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
 
   t = build2 (MULT_EXPR, itype, q, threadid);
+  t = build2 (PLUS_EXPR, itype, t, tt);
   s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
 
   t = fold_build2 (PLUS_EXPR, itype, s0, q);
-  t = fold_build2 (MIN_EXPR, itype, t, n);
   e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
 
   t = build2 (GE_EXPR, boolean_type_node, s0, e0);
@@ -4263,13 +4290,20 @@  expand_omp_for_static_nochunk (struct om
   gsi_remove (&gsi, true);
 
   /* Connect all the blocks.  */
-  find_edge (entry_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
-  find_edge (entry_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
+  ep = make_edge (entry_bb, third_bb, EDGE_FALSE_VALUE);
+  ep->probability = REG_BR_PROB_BASE / 4 * 3;
+  ep = find_edge (entry_bb, second_bb);
+  ep->flags = EDGE_TRUE_VALUE;
+  ep->probability = REG_BR_PROB_BASE / 4;
+  find_edge (third_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
+  find_edge (third_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
 
   find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE;
   find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE;
 
-  set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, second_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, third_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, third_bb);
   set_immediate_dominator (CDI_DOMINATORS, body_bb,
 			   recompute_dominator (CDI_DOMINATORS, body_bb));
   set_immediate_dominator (CDI_DOMINATORS, fin_bb,
--- libgomp/iter.c.jj	2009-04-14 16:33:07.000000000 +0200
+++ libgomp/iter.c	2011-06-22 13:42:47.000000000 +0200
@@ -1,4 +1,4 @@ 
-/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>.
 
    This file is part of the GNU OpenMP Library (libgomp).
@@ -59,7 +59,7 @@  gomp_iter_static_next (long *pstart, lon
      trip through the outer loop.  */
   if (ws->chunk_size == 0)
     {
-      unsigned long n, q, i;
+      unsigned long n, q, i, t;
       unsigned long s0, e0;
       long s, e;
 
@@ -74,11 +74,14 @@  gomp_iter_static_next (long *pstart, lon
       /* Compute the "zero-based" start and end points.  That is, as
          if the loop began at zero and incremented by one.  */
       q = n / nthreads;
-      q += (q * nthreads != n);
-      s0 = q * i;
+      t = n % nthreads;
+      if (i < t)
+	{
+	  t = 0;
+	  q++;
+	}
+      s0 = q * i + t;
       e0 = s0 + q;
-      if (e0 > n)
-        e0 = n;
 
       /* Notice when no iterations allocated for this thread.  */
       if (s0 >= e0)
--- libgomp/iter_ull.c.jj	2009-04-14 16:33:07.000000000 +0200
+++ libgomp/iter_ull.c	2011-06-22 13:43:09.000000000 +0200
@@ -1,4 +1,4 @@ 
-/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>.
 
    This file is part of the GNU OpenMP Library (libgomp).
@@ -60,7 +60,7 @@  gomp_iter_ull_static_next (gomp_ull *pst
      trip through the outer loop.  */
   if (ws->chunk_size_ull == 0)
     {
-      gomp_ull n, q, i, s0, e0, s, e;
+      gomp_ull n, q, i, t, s0, e0, s, e;
 
       if (thr->ts.static_trip > 0)
 	return 1;
@@ -75,11 +75,14 @@  gomp_iter_ull_static_next (gomp_ull *pst
       /* Compute the "zero-based" start and end points.  That is, as
 	 if the loop began at zero and incremented by one.  */
       q = n / nthreads;
-      q += (q * nthreads != n);
-      s0 = q * i;
+      t = n % nthreads;
+      if (i < t)
+	{
+	  t = 0;
+	  q++;
+	}
+      s0 = q * i + t;
       e0 = s0 + q;
-      if (e0 > n)
-	e0 = n;
 
       /* Notice when no iterations allocated for this thread.  */
       if (s0 >= e0)
--- libgomp/env.c.jj	2010-12-06 08:08:32.000000000 +0100
+++ libgomp/env.c	2011-06-22 16:14:56.000000000 +0200
@@ -1,4 +1,4 @@ 
-/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
+/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
    Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>.
 
@@ -108,7 +108,11 @@  parse_schedule (void)
   while (isspace ((unsigned char) *env))
     ++env;
   if (*env == '\0')
-    return;
+    {
+      gomp_global_icv.run_sched_modifier
+	= gomp_global_icv.run_sched_var != GFS_STATIC;
+      return;
+    }
   if (*env++ != ',')
     goto unknown;
   while (isspace ((unsigned char) *env))
@@ -129,6 +133,8 @@  parse_schedule (void)
   if ((int)value != value)
     goto invalid;
 
+  if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
+    value = 1;
   gomp_global_icv.run_sched_modifier = value;
   return;