Tidy up locking for libgomp OpenACC entry points
diff mbox

Message ID 20150422194243.115f26fa@octopus
State New
Headers show

Commit Message

Julian Brown April 22, 2015, 6:42 p.m. UTC
Hi,

This patch is an attempt to fix some potential race conditions with
accesses to shared data structures from multiple concurrent threads in
libgomp's OpenACC entry points. The main change is to move locking out
of lookup_host and lookup_dev in oacc-mem.c and into their callers
(which can then hold the locks for the whole operation that they are
performing).

Also missing locking has been added for gomp_acc_insert_pointer.

Tests look OK (with offloading to NVidia PTX).

OK? (For the gomp4 branch, maybe, if trunk's not suitable at the
moment?)

Thanks,

Julian

ChangeLog

    libgomp/
    * oacc-mem.c (lookup_host): Remove locking from function. Note
    locking requirement for caller in function comment.
    (lookup_dev): Likewise.
    (acc_free, acc_deviceptr, acc_hostptr, acc_is_present)
    (acc_map_data, acc_unmap_data, present_create_copy, delete_copyout)
    (update_dev_host, gomp_acc_insert_pointer, gomp_acc_remove_pointer):
    Add locking.

Comments

Jakub Jelinek April 23, 2015, 8:32 a.m. UTC | #1
On Wed, Apr 22, 2015 at 07:42:43PM +0100, Julian Brown wrote:
> @@ -120,25 +116,32 @@ acc_free (void *d)
>  {
>    splay_tree_key k;
>    struct goacc_thread *thr = goacc_thread ();
> +  struct gomp_device_descr *acc_dev = thr->dev;

IMHO you want to move this line after:

>  
>    if (!d)
>      return;
>  
>    assert (thr && thr->dev);

the assert.  Supposedly also the thr = line after the return; line,
no need to do it if returning early.  As gcc now defaults to -std=gnu11
for C, libgomp is written in C11 and you don't need to limit to C89 (and
even back then, it was compiled by gcc and thus you could use GNU
extensions, such as mixed declarations and code).

Otherwise, I have no problem with this going to trunk, if Thomas is ok with
it.

	Jakub
Thomas Schwinge April 23, 2015, 4:41 p.m. UTC | #2
Hi!

On Wed, 22 Apr 2015 19:42:43 +0100, Julian Brown <julian@codesourcery.com> wrote:
> This patch is an attempt to fix some potential race conditions with
> accesses to shared data structures from multiple concurrent threads in
> libgomp's OpenACC entry points. The main change is to move locking out
> of lookup_host and lookup_dev in oacc-mem.c and into their callers
> (which can then hold the locks for the whole operation that they are
> performing).

Yeah, that makes sense I guess.

> Also missing locking has been added for gomp_acc_insert_pointer.
> 
> Tests look OK (with offloading to NVidia PTX).

How did you test to get some confidence in the locking being sufficient?

I just did a very cursory review, but it looks good with a few review
comments below.

Going further (separate patch?), a few more comments:

Is it OK that oacc-init.c:cached_base_dev is accessed without locking?

Generally, we have to keep in mind that the same device may be accessed
in parallel through both OpenACC and OpenMP interfaces.  For this, for
example, in oacc-init.c, even though acc_device_lock is held, is it OK to
call gomp_init_device(D) without D->lock being locked?  (Compare to
target.c code.)

Please document what exactly oacc-init.c:acc_device_lock is to guard.
I'm not sure I'm understanding this correctly.

Should oacc-init.c:acc_shutdown_1 release goacc_thread_lock before any
gomp_fatal calls?  (That seems to be the general policy in libgomp.)


> --- a/libgomp/oacc-mem.c
> +++ b/libgomp/oacc-mem.c

> @@ -120,25 +116,32 @@ acc_free (void *d)
>  {
>    splay_tree_key k;
>    struct goacc_thread *thr = goacc_thread ();
> +  struct gomp_device_descr *acc_dev = thr->dev;
>  
>    if (!d)
>      return;
>  
>    assert (thr && thr->dev);
>  
> +  gomp_mutex_lock (&acc_dev->lock);
> +
>    /* We don't have to call lazy open here, as the ptr value must have
>       been returned by acc_malloc.  It's not permitted to pass NULL in
>       (unless you got that null from acc_malloc).  */
> -  if ((k = lookup_dev (thr->dev->openacc.data_environ, d, 1)))
> -   {
> -     void *offset;
> +  if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1)))
> +    {
> +      void *offset;
> +
> +      offset = d - k->tgt->tgt_start + k->tgt_offset;
>  
> -     offset = d - k->tgt->tgt_start + k->tgt_offset;
> +      gomp_mutex_unlock (&acc_dev->lock);
>  
> -     acc_unmap_data ((void *)(k->host_start + offset));
> -   }
> +      acc_unmap_data ((void *)(k->host_start + offset));
> +    }
> +  else
> +    gomp_mutex_unlock (&acc_dev->lock);

Does it make sense to make the unlock unconditional, and move the
acc_unmap_data after it, guarded by »if (k)«?

> -  thr->dev->free_func (thr->dev->target_id, d);
> +  acc_dev->free_func (acc_dev->target_id, d);
>  }
>  
>  void
> @@ -178,16 +181,24 @@ acc_deviceptr (void *h)
>    goacc_lazy_initialize ();
>  
>    struct goacc_thread *thr = goacc_thread ();
> +  struct gomp_device_descr *dev = thr->dev;
> +
> +  gomp_mutex_lock (&dev->lock);
>  
> -  n = lookup_host (thr->dev, h, 1);
> +  n = lookup_host (dev, h, 1);
>  
>    if (!n)
> -    return NULL;
> +    {
> +      gomp_mutex_unlock (&dev->lock);
> +      return NULL;
> +    }
>  
>    offset = h - n->host_start;
>  
>    d = n->tgt->tgt_start + n->tgt_offset + offset;
>  
> +  gomp_mutex_unlock (&dev->lock);
> +
>    return d;
>  }

Do we need to retain the lock while working with n?  If not, the unlock
could be placed right after the lookup_host, unconditionally.  I'm
confused -- it's commonly being done (retained) in target.c code, but not
in the tgt_fn lookup in target.c:GOMP_target.

> @@ -204,16 +215,24 @@ acc_hostptr (void *d)
>    goacc_lazy_initialize ();
>  
>    struct goacc_thread *thr = goacc_thread ();
> +  struct gomp_device_descr *acc_dev = thr->dev;
>  
> -  n = lookup_dev (thr->dev->openacc.data_environ, d, 1);
> +  gomp_mutex_lock (&acc_dev->lock);
> +
> +  n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
>  
>    if (!n)
> -    return NULL;
> +    {
> +      gomp_mutex_unlock (&acc_dev->lock);
> +      return NULL;
> +    }
>  
>    offset = d - n->tgt->tgt_start + n->tgt_offset;
>  
>    h = n->host_start + offset;
>  
> +  gomp_mutex_unlock (&acc_dev->lock);
> +
>    return h;
>  }

Similar, and also for other code later on.

> @@ -439,25 +494,35 @@ delete_copyout (unsigned f, void *h, size_t s)

>    if (f & FLAG_COPYOUT)
>      acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
>  
> +  gomp_mutex_unlock (&acc_dev->lock);
> +
>    acc_unmap_data (h);
>  
>    acc_dev->free_func (acc_dev->target_id, d);

Should external functions (dev2host_func) be called without any locks
held (to avoid issues with any call-backs); so move the unlocking before
that?


Grüße,
 Thomas

Patch
diff mbox

commit 983e08e46be24380a52095851cd9c6eb481eb47c
Author: Julian Brown <julian@codesourcery.com>
Date:   Tue Apr 21 12:42:17 2015 -0700

    More locking in oacc-mem.c

diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 89ef5fc..d53af4b 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -35,7 +35,8 @@ 
 #include <stdint.h>
 #include <assert.h>
 
-/* Return block containing [H->S), or NULL if not contained.  */
+/* Return block containing [H->S), or NULL if not contained.  The device lock
+   for DEV must be locked on entry, and remains locked on exit.  */
 
 static splay_tree_key
 lookup_host (struct gomp_device_descr *dev, void *h, size_t s)
@@ -46,9 +47,7 @@  lookup_host (struct gomp_device_descr *dev, void *h, size_t s)
   node.host_start = (uintptr_t) h;
   node.host_end = (uintptr_t) h + s;
 
-  gomp_mutex_lock (&dev->lock);
   key = splay_tree_lookup (&dev->mem_map, &node);
-  gomp_mutex_unlock (&dev->lock);
 
   return key;
 }
@@ -56,7 +55,8 @@  lookup_host (struct gomp_device_descr *dev, void *h, size_t s)
 /* Return block containing [D->S), or NULL if not contained.
    The list isn't ordered by device address, so we have to iterate
    over the whole array.  This is not expected to be a common
-   operation.  */
+   operation.  The device lock associated with TGT must be locked on entry, and
+   remains locked on exit.  */
 
 static splay_tree_key
 lookup_dev (struct target_mem_desc *tgt, void *d, size_t s)
@@ -67,16 +67,12 @@  lookup_dev (struct target_mem_desc *tgt, void *d, size_t s)
   if (!tgt)
     return NULL;
 
-  gomp_mutex_lock (&tgt->device_descr->lock);
-
   for (t = tgt; t != NULL; t = t->prev)
     {
       if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s)
         break;
     }
 
-  gomp_mutex_unlock (&tgt->device_descr->lock);
-
   if (!t)
     return NULL;
 
@@ -120,25 +116,32 @@  acc_free (void *d)
 {
   splay_tree_key k;
   struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
 
   if (!d)
     return;
 
   assert (thr && thr->dev);
 
+  gomp_mutex_lock (&acc_dev->lock);
+
   /* We don't have to call lazy open here, as the ptr value must have
      been returned by acc_malloc.  It's not permitted to pass NULL in
      (unless you got that null from acc_malloc).  */
-  if ((k = lookup_dev (thr->dev->openacc.data_environ, d, 1)))
-   {
-     void *offset;
+  if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1)))
+    {
+      void *offset;
+
+      offset = d - k->tgt->tgt_start + k->tgt_offset;
 
-     offset = d - k->tgt->tgt_start + k->tgt_offset;
+      gomp_mutex_unlock (&acc_dev->lock);
 
-     acc_unmap_data ((void *)(k->host_start + offset));
-   }
+      acc_unmap_data ((void *)(k->host_start + offset));
+    }
+  else
+    gomp_mutex_unlock (&acc_dev->lock);
 
-  thr->dev->free_func (thr->dev->target_id, d);
+  acc_dev->free_func (acc_dev->target_id, d);
 }
 
 void
@@ -178,16 +181,24 @@  acc_deviceptr (void *h)
   goacc_lazy_initialize ();
 
   struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *dev = thr->dev;
+
+  gomp_mutex_lock (&dev->lock);
 
-  n = lookup_host (thr->dev, h, 1);
+  n = lookup_host (dev, h, 1);
 
   if (!n)
-    return NULL;
+    {
+      gomp_mutex_unlock (&dev->lock);
+      return NULL;
+    }
 
   offset = h - n->host_start;
 
   d = n->tgt->tgt_start + n->tgt_offset + offset;
 
+  gomp_mutex_unlock (&dev->lock);
+
   return d;
 }
 
@@ -204,16 +215,24 @@  acc_hostptr (void *d)
   goacc_lazy_initialize ();
 
   struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
 
-  n = lookup_dev (thr->dev->openacc.data_environ, d, 1);
+  gomp_mutex_lock (&acc_dev->lock);
+
+  n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
 
   if (!n)
-    return NULL;
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      return NULL;
+    }
 
   offset = d - n->tgt->tgt_start + n->tgt_offset;
 
   h = n->host_start + offset;
 
+  gomp_mutex_unlock (&acc_dev->lock);
+
   return h;
 }
 
@@ -232,6 +251,8 @@  acc_is_present (void *h, size_t s)
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  gomp_mutex_lock (&acc_dev->lock);
+
   n = lookup_host (acc_dev, h, s);
 
   if (n && ((uintptr_t)h < n->host_start
@@ -239,6 +260,8 @@  acc_is_present (void *h, size_t s)
 	    || s > n->host_end - n->host_start))
     n = NULL;
 
+  gomp_mutex_unlock (&acc_dev->lock);
+
   return n != NULL;
 }
 
@@ -274,20 +297,32 @@  acc_map_data (void *h, void *d, size_t s)
 	gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
                     (void *)h, (int)s, (void *)d, (int)s);
 
+      gomp_mutex_lock (&acc_dev->lock);
+
       if (lookup_host (acc_dev, h, s))
-	gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h,
-		    (int)s);
+        {
+	  gomp_mutex_unlock (&acc_dev->lock);
+	  gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h,
+		      (int)s);
+	}
 
       if (lookup_dev (thr->dev->openacc.data_environ, d, s))
-	gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d,
-		    (int)s);
+        {
+	  gomp_mutex_unlock (&acc_dev->lock);
+	  gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d,
+		      (int)s);
+	}
+
+      gomp_mutex_unlock (&acc_dev->lock);
 
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
 			   &kinds, true, false);
     }
 
+  gomp_mutex_lock (&acc_dev->lock);
   tgt->prev = acc_dev->openacc.data_environ;
   acc_dev->openacc.data_environ = tgt;
+  gomp_mutex_unlock (&acc_dev->lock);
 }
 
 void
@@ -299,17 +334,26 @@  acc_unmap_data (void *h)
   /* No need to call lazy open, as the address must have been mapped.  */
 
   size_t host_size;
+
+  gomp_mutex_lock (&acc_dev->lock);
+
   splay_tree_key n = lookup_host (acc_dev, h, 1);
   struct target_mem_desc *t;
 
   if (!n)
-    gomp_fatal ("%p is not a mapped block", (void *)h);
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("%p is not a mapped block", (void *)h);
+    }
 
   host_size = n->host_end - n->host_start;
 
   if (n->host_start != (uintptr_t) h)
-    gomp_fatal ("[%p,%d] surrounds1 %p",
-		(void *) n->host_start, (int) host_size, (void *) h);
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("[%p,%d] surrounds %p",
+		  (void *) n->host_start, (int) host_size, (void *) h);
+    }
 
   t = n->tgt;
 
@@ -323,8 +367,6 @@  acc_unmap_data (void *h)
       t->tgt_end = 0;
       t->to_free = 0;
 
-      gomp_mutex_lock (&acc_dev->lock);
-
       for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
 	   tp = t, t = t->prev)
 	if (n->tgt == t)
@@ -336,10 +378,10 @@  acc_unmap_data (void *h)
 
 	    break;
 	  }
-
-      gomp_mutex_unlock (&acc_dev->lock);
     }
 
+  gomp_mutex_unlock (&acc_dev->lock);
+
   gomp_unmap_vars (t, true);
 }
 
@@ -361,6 +403,8 @@  present_create_copy (unsigned f, void *h, size_t s)
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  gomp_mutex_lock (&acc_dev->lock);
+
   n = lookup_host (acc_dev, h, s);
   if (n)
     {
@@ -368,13 +412,22 @@  present_create_copy (unsigned f, void *h, size_t s)
       d = (void *) (n->tgt->tgt_start + n->tgt_offset);
 
       if (!(f & FLAG_PRESENT))
-        gomp_fatal ("[%p,+%d] already mapped to [%p,+%d]",
-            (void *)h, (int)s, (void *)d, (int)s);
+        {
+	  gomp_mutex_unlock (&acc_dev->lock);
+          gomp_fatal ("[%p,+%d] already mapped to [%p,+%d]",
+        	      (void *)h, (int)s, (void *)d, (int)s);
+	}
       if ((h + s) > (void *)n->host_end)
-        gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
+	{
+	  gomp_mutex_unlock (&acc_dev->lock);
+	  gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
+	}
+
+      gomp_mutex_unlock (&acc_dev->lock);
     }
   else if (!(f & FLAG_CREATE))
     {
+      gomp_mutex_unlock (&acc_dev->lock);
       gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
     }
   else
@@ -389,6 +442,8 @@  present_create_copy (unsigned f, void *h, size_t s)
       else
 	kinds = GOMP_MAP_ALLOC;
 
+      gomp_mutex_unlock (&acc_dev->lock);
+
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
 			   false);
 
@@ -439,25 +494,35 @@  delete_copyout (unsigned f, void *h, size_t s)
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  gomp_mutex_lock (&acc_dev->lock);
+
   n = lookup_host (acc_dev, h, s);
 
   /* No need to call lazy open, as the data must already have been
      mapped.  */
 
   if (!n)
-    gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
+    }
 
   d = (void *) (n->tgt->tgt_start + n->tgt_offset);
 
   host_size = n->host_end - n->host_start;
 
   if (n->host_start != (uintptr_t) h || host_size != s)
-    gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
-		(void *) n->host_start, (int) host_size, (void *) h, (int) s);
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
+		  (void *) n->host_start, (int) host_size, (void *) h, (int) s);
+    }
 
   if (f & FLAG_COPYOUT)
     acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
 
+  gomp_mutex_unlock (&acc_dev->lock);
+
   acc_unmap_data (h);
 
   acc_dev->free_func (acc_dev->target_id, d);
@@ -482,20 +547,27 @@  update_dev_host (int is_dev, void *h, size_t s)
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  gomp_mutex_lock (&acc_dev->lock);
+
   n = lookup_host (acc_dev, h, s);
 
   /* No need to call lazy open, as the data must already have been
      mapped.  */
 
   if (!n)
-    gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
+    }
 
   d = (void *) (n->tgt->tgt_start + n->tgt_offset);
 
+  gomp_mutex_unlock (&acc_dev->lock);
+
   if (is_dev)
     acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
   else
-    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);  
 }
 
 void
@@ -522,8 +594,11 @@  gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
 		       NULL, sizes, kinds, true, false);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
+
+  gomp_mutex_lock (&acc_dev->lock);
   tgt->prev = acc_dev->openacc.data_environ;
   acc_dev->openacc.data_environ = tgt;
+  gomp_mutex_unlock (&acc_dev->lock);
 }
 
 void
@@ -535,10 +610,15 @@  gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
   struct target_mem_desc *t;
   int minrefs = (mapnum == 1) ? 2 : 3;
 
+  gomp_mutex_lock (&acc_dev->lock);
+
   n = lookup_host (acc_dev, h, 1);
 
   if (!n)
-    gomp_fatal ("%p is not a mapped block", (void *)h);
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("%p is not a mapped block", (void *)h);
+    }
 
   gomp_debug (0, "  %s: restore mappings\n", __FUNCTION__);
 
@@ -546,8 +626,6 @@  gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
 
   struct target_mem_desc *tp;
 
-  gomp_mutex_lock (&acc_dev->lock);
-
   if (t->refcount == minrefs)
     {
       /* This is the last reference, so pull the descriptor off the