diff mbox

libgomp: Introduce gomp_thread::spare_team

Message ID 1436271477-16941-1-git-send-email-sebastian.huber@embedded-brains.de
State New
Headers show

Commit Message

Sebastian Huber July 7, 2015, 12:17 p.m. UTC
Try to re-use the previous team to avoid the use of malloc() and free()
in the normal case where the number of threads is the same.  Avoid
superfluous destruction and initialization of team synchronization
objects.

Using the microbenchmark posted here

https://gcc.gnu.org/ml/gcc-patches/2008-03/msg00930.html

shows an improvement in the parallel bench test case (target
x86_64-unknown-linux-gnu, median out of 9 test runs, iteration count
increased to 200000).

Before the patch:

parallel bench 11.2284 seconds

After the patch:

parallel bench 10.7575 seconds

libgomp/ChangeLog
2015-07-07  Sebastian Huber  <sebastian.huber@embedded-brains.de>

	* libgomp.h (gomp_thread): Add spare_team field.
	* team.c (gomp_thread_start): Initialize spare team for non-TLS
	targets.
	(gomp_new_team): Use spare team if possible.
	(free_team): Destroy more team objects.
	(gomp_free_thread): Free spare team if necessary.
	(free_non_nested_team): New.
	(gomp_team_end): Move some team object destructions to
	free_team().  Use free_non_nested_team().
---
 libgomp/libgomp.h |  3 +++
 libgomp/team.c    | 63 ++++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 45 insertions(+), 21 deletions(-)

Comments

Sebastian Huber July 13, 2015, 6:32 a.m. UTC | #1
Ping.

On 07/07/15 14:17, Sebastian Huber wrote:
> Try to re-use the previous team to avoid the use of malloc() and free()
> in the normal case where number of threads is the same.  Avoid
> superfluous destruction and initialization of team synchronization
> objects.
>
> Using the microbenchmark posted here
>
> https://gcc.gnu.org/ml/gcc-patches/2008-03/msg00930.html
>
> shows an improvement in the parallel bench test case (target
> x86_64-unknown-linux-gnu, median out of 9 test runs, iteration count
> increased to 200000).
>
> Before the patch:
>
> parallel bench 11.2284 seconds
>
> After the patch:
>
> parallel bench 10.7575 seconds
>
> libgomp/ChangeLog
> 2015-07-07  Sebastian Huber  <sebastian.huber@embedded-brains.de>
>
> 	* libgomp.h (gomp_thread): Add spare_team field.
> 	* team.c (gomp_thread_start): Initialize spare team for non-TLS
> 	targets.
> 	(gomp_new_team): Use spare team if possible.
> 	(free_team): Destroy more team objects.
> 	(gomp_free_thread): Free spare team if necessary.
> 	(free_non_nested_team): New.
> 	(gomp_team_end): Move some team object destructions to
> 	free_team().  Use free_non_nested_team().
> ---
>   libgomp/libgomp.h |  3 +++
>   libgomp/team.c    | 63 ++++++++++++++++++++++++++++++++++++-------------------
>   2 files changed, 45 insertions(+), 21 deletions(-)
>
> diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
> index 5ed0f78..563c1e2 100644
> --- a/libgomp/libgomp.h
> +++ b/libgomp/libgomp.h
> @@ -448,6 +448,9 @@ struct gomp_thread
>   
>     /* User pthread thread pool */
>     struct gomp_thread_pool *thread_pool;
> +
> +  /* Spare team ready for re-use in gomp_new_team()  */
> +  struct gomp_team *spare_team;
>   };
>   
>   
> diff --git a/libgomp/team.c b/libgomp/team.c
> index b98b233..cc19eb0 100644
> --- a/libgomp/team.c
> +++ b/libgomp/team.c
> @@ -77,6 +77,7 @@ gomp_thread_start (void *xdata)
>     struct gomp_thread local_thr;
>     thr = &local_thr;
>     pthread_setspecific (gomp_tls_key, thr);
> +  thr->spare_team = NULL;
>   #endif
>     gomp_sem_init (&thr->release, 0);
>   
> @@ -140,19 +141,35 @@ gomp_thread_start (void *xdata)
>   struct gomp_team *
>   gomp_new_team (unsigned nthreads)
>   {
> +  struct gomp_thread *thr = gomp_thread ();
> +  struct gomp_team *spare_team = thr->spare_team;
>     struct gomp_team *team;
> -  size_t size;
>     int i;
>   
> -  size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
> -				      + sizeof (team->implicit_task[0]));
> -  team = gomp_malloc (size);
> +  if (spare_team && spare_team->nthreads == nthreads)
> +    {
> +      thr->spare_team = NULL;
> +      team = spare_team;
> +    }
> +  else
> +    {
> +      size_t extra = sizeof (team->ordered_release[0])
> +                     + sizeof (team->implicit_task[0]);
> +      team = gomp_malloc (sizeof (*team) + nthreads * extra);
> +
> +#ifndef HAVE_SYNC_BUILTINS
> +      gomp_mutex_init (&team->work_share_list_free_lock);
> +#endif
> +      gomp_barrier_init (&team->barrier, nthreads);
> +      gomp_sem_init (&team->master_release, 0);
> +      gomp_mutex_init (&team->task_lock);
> +
> +      team->nthreads = nthreads;
> +    }
>   
>     team->work_share_chunk = 8;
>   #ifdef HAVE_SYNC_BUILTINS
>     team->single_count = 0;
> -#else
> -  gomp_mutex_init (&team->work_share_list_free_lock);
>   #endif
>     team->work_shares_to_free = &team->work_shares[0];
>     gomp_init_work_share (&team->work_shares[0], false, nthreads);
> @@ -163,14 +180,9 @@ gomp_new_team (unsigned nthreads)
>       team->work_shares[i].next_free = &team->work_shares[i + 1];
>     team->work_shares[i].next_free = NULL;
>   
> -  team->nthreads = nthreads;
> -  gomp_barrier_init (&team->barrier, nthreads);
> -
> -  gomp_sem_init (&team->master_release, 0);
>     team->ordered_release = (void *) &team->implicit_task[nthreads];
>     team->ordered_release[0] = &team->master_release;
>   
> -  gomp_mutex_init (&team->task_lock);
>     team->task_queue = NULL;
>     team->task_count = 0;
>     team->task_queued_count = 0;
> @@ -187,6 +199,10 @@ gomp_new_team (unsigned nthreads)
>   static void
>   free_team (struct gomp_team *team)
>   {
> +  gomp_sem_destroy (&team->master_release);
> +#ifndef HAVE_SYNC_BUILTINS
> +  gomp_mutex_destroy (&team->work_share_list_free_lock);
> +#endif
>     gomp_barrier_destroy (&team->barrier);
>     gomp_mutex_destroy (&team->task_lock);
>     free (team);
> @@ -225,6 +241,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
>   {
>     struct gomp_thread *thr = gomp_thread ();
>     struct gomp_thread_pool *pool = thr->thread_pool;
> +  if (thr->spare_team)
> +    free_team (thr->spare_team);
>     if (pool)
>       {
>         if (pool->threads_used > 0)
> @@ -835,6 +853,18 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
>       free (affinity_thr);
>   }
>   
> +static void
> +free_non_nested_team (struct gomp_team *team, struct gomp_thread *thr)
> +{
> +  struct gomp_thread_pool *pool = thr->thread_pool;
> +  if (pool->last_team)
> +    {
> +      if (thr->spare_team)
> +	free_team (thr->spare_team);
> +      thr->spare_team = pool->last_team;
> +    }
> +  pool->last_team = team;
> +}
>   
>   /* Terminate the current team.  This is only to be called by the master
>      thread.  We assume that we must wait for the other threads.  */
> @@ -894,21 +924,12 @@ gomp_team_end (void)
>   	}
>         while (ws != NULL);
>       }
> -  gomp_sem_destroy (&team->master_release);
> -#ifndef HAVE_SYNC_BUILTINS
> -  gomp_mutex_destroy (&team->work_share_list_free_lock);
> -#endif
>   
>     if (__builtin_expect (thr->ts.team != NULL, 0)
>         || __builtin_expect (team->nthreads == 1, 0))
>       free_team (team);
>     else
> -    {
> -      struct gomp_thread_pool *pool = thr->thread_pool;
> -      if (pool->last_team)
> -	free_team (pool->last_team);
> -      pool->last_team = team;
> -    }
> +    free_non_nested_team (team, thr);
>   }
>   
>
Jakub Jelinek July 13, 2015, 7:17 a.m. UTC | #2
On Mon, Jul 13, 2015 at 08:32:33AM +0200, Sebastian Huber wrote:
> Ping.

Space in gomp_thread is precious, that is a TLS variable, and you want
to handle the non-nested case only anyway, so why don't you
just try to use
if (thr->thread_pool)
  {
    struct gomp_team *last_team = thr->thread_pool->last_team;
    if (last_team && last_team->nthreads == nthreads)
      {
	team = last_team;
	thr->thread_pool->last_team = NULL;
      }
  }
?

The move of mutex, barrier and sem inits/destroy is possible (on Linux
likely not measurable due to destroys being nops and inits being very cheap,
but on other OSes it might be).

	Jakub
Sebastian Huber July 13, 2015, 7:26 a.m. UTC | #3
On 13/07/15 09:17, Jakub Jelinek wrote:
> On Mon, Jul 13, 2015 at 08:32:33AM +0200, Sebastian Huber wrote:
>> Ping.
> Space in gomp_thread is precious, that is a TLS variable, and you want
> to only handle the non-nested case only anyway, so why don't you
> just try to use
> if (thr->thread_pool)
>    {
> >      struct gomp_team *last_team = thr->thread_pool->last_team;
>      if (last_team && last_team->nthreads == nthreads)
>        {
> 	team = last_team;
> 	thr->thread_pool->last_team = NULL;
>        }
>    }
> ?

I started with this variant, but it needs one more if to check the 
thread pool. Since space seems to be more important, I will adjust the 
patch.
Jakub Jelinek July 13, 2015, 7:35 a.m. UTC | #4
On Mon, Jul 13, 2015 at 09:26:10AM +0200, Sebastian Huber wrote:
> 
> 
> On 13/07/15 09:17, Jakub Jelinek wrote:
> >On Mon, Jul 13, 2015 at 08:32:33AM +0200, Sebastian Huber wrote:
> >>Ping.
> >Space in gomp_thread is precious, that is a TLS variable, and you want
> >to only handle the non-nested case only anyway, so why don't you
> >just try to use
> >if (thr->thread_pool)
> >   {
> >     struct gomp_team *last_team = thr->thread_pool->last_team;
> >     if (last_team && last_team->nthreads == nthreads)
> >       {
> >	team = last_team;
> >	thr->thread_pool->last_team = NULL;
> >       }
> >   }
> >?
> 
> I started with this variant, but it needs one more if to check the thread
> pool. Since space seems to be more important, I will adjust the patch.

I guess you can use if (__builtin_expect (thr->thread_pool != NULL, 1))
the first GOMP_parallel etc. in a thread is slower anyway.
It also needs a check that thr->ts.team == NULL.

	Jakub
diff mbox

Patch

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 5ed0f78..563c1e2 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -448,6 +448,9 @@  struct gomp_thread
 
   /* User pthread thread pool */
   struct gomp_thread_pool *thread_pool;
+
+  /* Spare team ready for re-use in gomp_new_team ().  */
+  struct gomp_team *spare_team;
 };
 
 
diff --git a/libgomp/team.c b/libgomp/team.c
index b98b233..cc19eb0 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -77,6 +77,7 @@  gomp_thread_start (void *xdata)
   struct gomp_thread local_thr;
   thr = &local_thr;
   pthread_setspecific (gomp_tls_key, thr);
+  thr->spare_team = NULL;
 #endif
   gomp_sem_init (&thr->release, 0);
 
@@ -140,19 +141,35 @@  gomp_thread_start (void *xdata)
 struct gomp_team *
 gomp_new_team (unsigned nthreads)
 {
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *spare_team = thr->spare_team;
   struct gomp_team *team;
-  size_t size;
   int i;
 
-  size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
-				      + sizeof (team->implicit_task[0]));
-  team = gomp_malloc (size);
+  if (spare_team && spare_team->nthreads == nthreads)
+    {
+      thr->spare_team = NULL;
+      team = spare_team;
+    }
+  else
+    {
+      size_t extra = sizeof (team->ordered_release[0])
+                     + sizeof (team->implicit_task[0]);
+      team = gomp_malloc (sizeof (*team) + nthreads * extra);
+
+#ifndef HAVE_SYNC_BUILTINS
+      gomp_mutex_init (&team->work_share_list_free_lock);
+#endif
+      gomp_barrier_init (&team->barrier, nthreads);
+      gomp_sem_init (&team->master_release, 0);
+      gomp_mutex_init (&team->task_lock);
+
+      team->nthreads = nthreads;
+    }
 
   team->work_share_chunk = 8;
 #ifdef HAVE_SYNC_BUILTINS
   team->single_count = 0;
-#else
-  gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
   team->work_shares_to_free = &team->work_shares[0];
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
@@ -163,14 +180,9 @@  gomp_new_team (unsigned nthreads)
     team->work_shares[i].next_free = &team->work_shares[i + 1];
   team->work_shares[i].next_free = NULL;
 
-  team->nthreads = nthreads;
-  gomp_barrier_init (&team->barrier, nthreads);
-
-  gomp_sem_init (&team->master_release, 0);
   team->ordered_release = (void *) &team->implicit_task[nthreads];
   team->ordered_release[0] = &team->master_release;
 
-  gomp_mutex_init (&team->task_lock);
   team->task_queue = NULL;
   team->task_count = 0;
   team->task_queued_count = 0;
@@ -187,6 +199,10 @@  gomp_new_team (unsigned nthreads)
 static void
 free_team (struct gomp_team *team)
 {
+  gomp_sem_destroy (&team->master_release);
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_destroy (&team->work_share_list_free_lock);
+#endif
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
   free (team);
@@ -225,6 +241,8 @@  gomp_free_thread (void *arg __attribute__((unused)))
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_thread_pool *pool = thr->thread_pool;
+  if (thr->spare_team)
+    free_team (thr->spare_team);
   if (pool)
     {
       if (pool->threads_used > 0)
@@ -835,6 +853,18 @@  gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
     free (affinity_thr);
 }
 
+static void
+free_non_nested_team (struct gomp_team *team, struct gomp_thread *thr)
+{
+  struct gomp_thread_pool *pool = thr->thread_pool;
+  if (pool->last_team)
+    {
+      if (thr->spare_team)
+	free_team (thr->spare_team);
+      thr->spare_team = pool->last_team;
+    }
+  pool->last_team = team;
+}
 
 /* Terminate the current team.  This is only to be called by the master
    thread.  We assume that we must wait for the other threads.  */
@@ -894,21 +924,12 @@  gomp_team_end (void)
 	}
       while (ws != NULL);
     }
-  gomp_sem_destroy (&team->master_release);
-#ifndef HAVE_SYNC_BUILTINS
-  gomp_mutex_destroy (&team->work_share_list_free_lock);
-#endif
 
   if (__builtin_expect (thr->ts.team != NULL, 0)
       || __builtin_expect (team->nthreads == 1, 0))
     free_team (team);
   else
-    {
-      struct gomp_thread_pool *pool = thr->thread_pool;
-      if (pool->last_team)
-	free_team (pool->last_team);
-      pool->last_team = team;
-    }
+    free_non_nested_team (team, thr);
 }