Fix OpenACC shutdown and PTX image unloading (PR65904)
diff mbox

Message ID 20150507113758.0423ce0d@octopus
State New
Headers show

Commit Message

Julian Brown May 7, 2015, 10:37 a.m. UTC
On Wed, 6 May 2015 10:32:56 +0200
Thomas Schwinge <thomas@codesourcery.com> wrote:

> Hi!
> 
> On Fri, 1 May 2015 10:47:19 +0100, Julian Brown
> <julian@codesourcery.com> wrote:
> > The patch also fixes a thinko that was revealed in image unloading
> > in the NVPTX backend. Tested for libgomp with PTX offloading.
> 
> Confirming that both nvptx (PR65904 fixed) and (emulated) intelmic (no
> changes) testing look fine.

Thanks for testing!

> By the way, do we need to lock ptx_devices in
> libgomp/plugin/plugin-nvptx.c:nvptx_attach_host_thread_to_device and
> libgomp/plugin/plugin-nvptx.c:GOMP_OFFLOAD_openacc_create_thread_data?
> (ptx_dev_lock?  If yes, its definition as well as instantiated_devices
> should be moved to a more appropriate place, probably?)

Probably yes (though I'm not sure what you mean about moving the
instantiated_devices and ptx_dev_lock to a more appropriate place?).

> Also, several
> accesses to instantiated_devices are not locked by ptx_dev_lock but
> should be, from my cursory review.

I'm not sure about that.

> > --- a/libgomp/target.c
> > +++ b/libgomp/target.c
> > @@ -797,32 +797,79 @@ GOMP_offload_register (void *host_table, enum
> > offload_target_type target_type, gomp_mutex_unlock (&register_lock);
> >  }
> >  
> > -/* This function should be called from every offload image while
> > unloading.
> > -   It gets the descriptor of the host func and var tables
> > HOST_TABLE, TYPE of
> > -   the target, and TARGET_DATA needed by target plugin.  */
> > +/* DEVICEP should be locked on entry, and remains locked on exit.
> > */
> 
> (I'm not a native speaker, but would use what I consider to be more
> explicit/precise language: »must be locked« instead of »should be«.
> I'll be happy to learn should they mean the same thing?)

I've changed the wording in a couple of comments.

> >  
> > -void
> > -GOMP_offload_unregister (void *host_table, enum
> > offload_target_type target_type,
> > -			 void *target_data)
> > +static void
> > +gomp_deoffload_image_from_device (struct gomp_device_descr
> > *devicep,
> > +				  void *host_table, void
> > *target_data) {
> 
> > +/* This function should be called from every offload image while
> > unloading.
> 
> s%from%for%, I think?  (And, s%should%must%, again?)

No, this really is "from" -- this comment wasn't actually added by my
patch, just moved. I'm also not sure about "should" in this instance --
unloading an image is already a corner-case, and maybe there are
circumstances in which it'd be impossible for some given object to call
the function?

> > +   It gets the descriptor of the host func and var tables
> > HOST_TABLE, TYPE of
> > +   the target, and TARGET_DATA needed by target plugin.  */
> > +
> > +void
> > +GOMP_offload_unregister (void *host_table, enum
> > offload_target_type target_type,
> > +			 void *target_data)
> > +{
> 
> > -/* Free address mapping tables.  MM must be locked on entry, and
> > remains locked
> > -   on return.  */
> > +/* Free address mapping tables for an active device DEVICEP.  This
> > includes
> > +   both mapped offload functions/variables, and mapped user data
> > regions.
> > +   To be used before shutting a device down: subsequently
> > reinitialising the
> > +   device will repopulate the offload image mappings.  */
> >  
> >  attribute_hidden void
> > -gomp_free_memmap (struct splay_tree_s *mem_map)
> > +gomp_free_memmap (struct gomp_device_descr *devicep)
> >  {
> > +  int i;
> > +  struct splay_tree_s *mem_map = &devicep->mem_map;
> > +
> > +  assert (devicep->is_initialized);
> > +
> > +  gomp_mutex_lock (&devicep->lock);
> 
> Need to lock before first access to *devicep?

Fixed.

> > +  
> > +  /* Unmap offload images that are registered to this device.  */
> > +  for (i = 0; i < num_offload_images; i++)
> > +    {
> > +      struct offload_image_descr *image = &offload_images[i];
> 
> Need to take register_lock when accessing offload_images?

This too. Retested for libgomp/NVPTX.

OK for trunk now?

Thanks,

Julian

ChangeLog

    PR libgomp/65904

    libgomp/
    * libgomp.h (gomp_free_memmap): Update prototype.
    * oacc-init.c (acc_shutdown_1): Pass device descriptor to
    gomp_free_memmap. Don't lock device around call.
    * target.c (gomp_map_vars): Initialise tgt->array to NULL before
    early exit.
    (GOMP_offload_unregister): Split out and call...
    (gomp_deoffload_image_from_device): This new function.
    (gomp_free_memmap): Call gomp_deoffload_image_from_device.
    * plugin/nvptx.c (struct ptx_image_data): Add ord, fn_descs fields.
    (nvptx_init): Tweak comment.
    (nvptx_attach_host_thread_to_device): Add locking with ptx_dev_lock
    around ptx_devices accesses.
    (GOMP_OFFLOAD_load_image): Populate new ptx_image_data fields.
    (GOMP_OFFLOAD_unload_image): Switch to ORD'th device before freeing
    images, and use fn_descs field from ptx_image_data instead of
    incorrectly using a pointer derived from target_data.
    (GOMP_OFFLOAD_openacc_create_thread_data): Add locking around
    ptx_devices accesses.

Patch
diff mbox

Index: libgomp/target.c
===================================================================
--- libgomp/target.c	(revision 222861)
+++ libgomp/target.c	(working copy)
@@ -178,6 +178,7 @@  gomp_map_vars (struct gomp_device_descr
   tgt->list_count = mapnum;
   tgt->refcount = 1;
   tgt->device_descr = devicep;
+  tgt->array = NULL;
 
   if (mapnum == 0)
     return tgt;
@@ -275,7 +276,6 @@  gomp_map_vars (struct gomp_device_descr
   if (is_target)
     tgt_size = mapnum * sizeof (void *);
 
-  tgt->array = NULL;
   if (not_found_cnt)
     {
       tgt->array = gomp_malloc (not_found_cnt * sizeof (*tgt->array));
@@ -797,32 +797,79 @@  GOMP_offload_register (void *host_table,
   gomp_mutex_unlock (&register_lock);
 }
 
-/* This function should be called from every offload image while unloading.
-   It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
-   the target, and TARGET_DATA needed by target plugin.  */
+/* DEVICEP must be locked on entry, and remains locked on exit.  */
 
-void
-GOMP_offload_unregister (void *host_table, enum offload_target_type target_type,
-			 void *target_data)
+static void
+gomp_deoffload_image_from_device (struct gomp_device_descr *devicep,
+				  void *host_table, void *target_data)
 {
   void **host_func_table = ((void ***) host_table)[0];
   void **host_funcs_end  = ((void ***) host_table)[1];
   void **host_var_table  = ((void ***) host_table)[2];
   void **host_vars_end   = ((void ***) host_table)[3];
-  int i;
 
   /* The func table contains only addresses, the var table contains addresses
      and corresponding sizes.  */
   int num_funcs = host_funcs_end - host_func_table;
   int num_vars  = (host_vars_end - host_var_table) / 2;
+  int j;
+
+  devicep->unload_image_func (devicep->target_id, target_data);
+
+  /* Remove mapping from splay tree.  */
+  struct splay_tree_key_s k;
+  splay_tree_key node = NULL;
+  if (num_funcs > 0)
+    {
+      k.host_start = (uintptr_t) host_func_table[0];
+      k.host_end = k.host_start + 1;
+      node = splay_tree_lookup (&devicep->mem_map, &k);
+    }
+  else if (num_vars > 0)
+    {
+      k.host_start = (uintptr_t) host_var_table[0];
+      k.host_end = k.host_start + (uintptr_t) host_var_table[1];
+      node = splay_tree_lookup (&devicep->mem_map, &k);
+    }
+
+  for (j = 0; j < num_funcs; j++)
+    {
+      k.host_start = (uintptr_t) host_func_table[j];
+      k.host_end = k.host_start + 1;
+      splay_tree_remove (&devicep->mem_map, &k);
+    }
+
+  for (j = 0; j < num_vars; j++)
+    {
+      k.host_start = (uintptr_t) host_var_table[j * 2];
+      k.host_end = k.host_start + (uintptr_t) host_var_table[j * 2 + 1];
+      splay_tree_remove (&devicep->mem_map, &k);
+    }
+
+  if (node)
+    {
+      free (node->tgt);
+      free (node);
+    }
+}
+
+/* This function should be called from every offload image while unloading.
+   It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
+   the target, and TARGET_DATA needed by target plugin.  */
+
+void
+GOMP_offload_unregister (void *host_table, enum offload_target_type target_type,
+			 void *target_data)
+{
+  int i;
 
   gomp_mutex_lock (&register_lock);
 
   /* Unload image from all initialized devices.  */
   for (i = 0; i < num_devices; i++)
     {
-      int j;
       struct gomp_device_descr *devicep = &devices[i];
+      
       gomp_mutex_lock (&devicep->lock);
       if (devicep->type != target_type || !devicep->is_initialized)
 	{
@@ -830,44 +877,7 @@  GOMP_offload_unregister (void *host_tabl
 	  continue;
 	}
 
-      devicep->unload_image_func (devicep->target_id, target_data);
-
-      /* Remove mapping from splay tree.  */
-      struct splay_tree_key_s k;
-      splay_tree_key node = NULL;
-      if (num_funcs > 0)
-	{
-	  k.host_start = (uintptr_t) host_func_table[0];
-	  k.host_end = k.host_start + 1;
-	  node = splay_tree_lookup (&devicep->mem_map, &k);
-	}
-      else if (num_vars > 0)
-	{
-	  k.host_start = (uintptr_t) host_var_table[0];
-	  k.host_end = k.host_start + (uintptr_t) host_var_table[1];
-	  node = splay_tree_lookup (&devicep->mem_map, &k);
-	}
-
-      for (j = 0; j < num_funcs; j++)
-	{
-	  k.host_start = (uintptr_t) host_func_table[j];
-	  k.host_end = k.host_start + 1;
-	  splay_tree_remove (&devicep->mem_map, &k);
-	}
-
-      for (j = 0; j < num_vars; j++)
-	{
-	  k.host_start = (uintptr_t) host_var_table[j * 2];
-	  k.host_end = k.host_start + (uintptr_t) host_var_table[j * 2 + 1];
-	  splay_tree_remove (&devicep->mem_map, &k);
-	}
-
-      if (node)
-	{
-	  free (node->tgt);
-	  free (node);
-	}
-
+      gomp_deoffload_image_from_device (devicep, host_table, target_data);
       gomp_mutex_unlock (&devicep->lock);
     }
 
@@ -903,20 +913,54 @@  gomp_init_device (struct gomp_device_des
   devicep->is_initialized = true;
 }
 
-/* Free address mapping tables.  MM must be locked on entry, and remains locked
-   on return.  */
+/* Free address mapping tables for an active device DEVICEP.  This includes
+   both mapped offload functions/variables, and mapped user data regions.
+   To be used before shutting a device down: subsequently reinitialising the
+   device will repopulate the offload image mappings.  */
 
 attribute_hidden void
-gomp_free_memmap (struct splay_tree_s *mem_map)
+gomp_free_memmap (struct gomp_device_descr *devicep)
 {
+  int i;
+  struct splay_tree_s *mem_map = &devicep->mem_map;
+
+  gomp_mutex_lock (&devicep->lock);
+
+  assert (devicep->is_initialized);
+
+  gomp_mutex_lock (&register_lock);
+
+  /* Unmap offload images that are registered to this device.  */
+  for (i = 0; i < num_offload_images; i++)
+    {
+      struct offload_image_descr *image = &offload_images[i];
+      if (image->type == devicep->type)
+        {
+	  void *host_table = image->host_table;
+	  void *target_data = image->target_data;
+
+	  gomp_deoffload_image_from_device (devicep, host_table, target_data);
+	}
+    }
+
+  gomp_mutex_unlock (&register_lock);
+
+  /* Clear other mappings.  */
   while (mem_map->root)
     {
       struct target_mem_desc *tgt = mem_map->root->key.tgt;
 
       splay_tree_remove (mem_map, &mem_map->root->key);
-      free (tgt->array);
+
+      if (tgt->list_count == 0)
+	assert (!tgt->array);
+      else
+	free (tgt->array);
+
       free (tgt);
     }
+
+  gomp_mutex_unlock (&devicep->lock);
 }
 
 /* This function de-initializes the target device, specified by DEVICEP.
Index: libgomp/libgomp.h
===================================================================
--- libgomp/libgomp.h	(revision 222861)
+++ libgomp/libgomp.h	(working copy)
@@ -777,7 +777,7 @@  extern struct target_mem_desc *gomp_map_
 extern void gomp_copy_from_async (struct target_mem_desc *);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
 extern void gomp_init_device (struct gomp_device_descr *);
-extern void gomp_free_memmap (struct splay_tree_s *);
+extern void gomp_free_memmap (struct gomp_device_descr *);
 extern void gomp_fini_device (struct gomp_device_descr *);
 
 /* work.c */
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c	(revision 222861)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -334,8 +334,10 @@  struct ptx_event
 
 struct ptx_image_data
 {
+  int ord;
   void *target_data;
   CUmodule module;
+  struct targ_fn_descriptor *fn_descs;
   struct ptx_image_data *next;
 };
 
@@ -589,7 +591,7 @@  select_stream_for_async (int async, pthr
 }
 
 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
-   should be locked on entry and remains locked on exit.  */
+   must be locked on entry and remains locked on exit.  */
 static bool
 nvptx_init (void)
 {
@@ -643,12 +645,17 @@  nvptx_attach_host_thread_to_device (int
     {
       CUcontext old_ctx;
 
+      pthread_mutex_lock (&ptx_dev_lock);
+
       ptx_dev = ptx_devices[n];
       assert (ptx_dev);
 
       r = cuCtxGetCurrent (&thd_ctx);
       if (r != CUDA_SUCCESS)
-        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+	{
+	  pthread_mutex_unlock (&ptx_dev_lock);
+          GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+	}
 
       /* We don't necessarily have a current context (e.g. if it has been
          destroyed.  Pop it if we do though.  */
@@ -656,10 +663,14 @@  nvptx_attach_host_thread_to_device (int
 	{
 	  r = cuCtxPopCurrent (&old_ctx);
 	  if (r != CUDA_SUCCESS)
-            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+	    {
+	      pthread_mutex_unlock (&ptx_dev_lock);
+	      GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+	    }
 	}
 
       r = cuCtxPushCurrent (ptx_dev->ctx);
+      pthread_mutex_unlock (&ptx_dev_lock);
       if (r != CUDA_SUCCESS)
         GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
     }
@@ -1625,13 +1636,6 @@  GOMP_OFFLOAD_load_image (int ord, void *
 
   link_ptx (&module, img_header[0]);
 
-  pthread_mutex_lock (&ptx_image_lock);
-  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
-  new_image->target_data = target_data;
-  new_image->module = module;
-  new_image->next = ptx_images;
-  ptx_images = new_image;
-  pthread_mutex_unlock (&ptx_image_lock);
 
   /* The mkoffload utility emits a table of pointers/integers at the start of
      each offload image:
@@ -1652,8 +1656,21 @@  GOMP_OFFLOAD_load_image (int ord, void *
 
   *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
 				      * (fn_entries + var_entries));
-  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
-				 * fn_entries);
+  if (fn_entries > 0)
+    targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
+				   * fn_entries);
+  else
+    targ_fns = NULL;
+
+  pthread_mutex_lock (&ptx_image_lock);
+  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
+  new_image->ord = ord;
+  new_image->target_data = target_data;
+  new_image->module = module;
+  new_image->fn_descs = targ_fns;
+  new_image->next = ptx_images;
+  ptx_images = new_image;
+  pthread_mutex_unlock (&ptx_image_lock);
 
   for (i = 0; i < fn_entries; i++)
     {
@@ -1687,23 +1704,22 @@  GOMP_OFFLOAD_load_image (int ord, void *
 }
 
 void
-GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
+GOMP_OFFLOAD_unload_image (int ord, void *target_data)
 {
-  void **img_header = (void **) target_data;
-  struct targ_fn_descriptor *targ_fns
-    = (struct targ_fn_descriptor *) img_header[0];
   struct ptx_image_data *image, *prev = NULL, *newhd = NULL;
 
-  free (targ_fns);
+  nvptx_attach_host_thread_to_device (ord);
 
   pthread_mutex_lock (&ptx_image_lock);
   for (image = ptx_images; image != NULL;)
     {
       struct ptx_image_data *next = image->next;
 
-      if (image->target_data == target_data)
+      if (image->ord == ord && image->target_data == target_data)
 	{
 	  cuModuleUnload (image->module);
+	  if (image->fn_descs)
+	    free (image->fn_descs);
 	  free (image);
 	  if (prev)
 	    prev->next = next;
@@ -1833,13 +1849,18 @@  GOMP_OFFLOAD_openacc_create_thread_data
   CUresult r;
   CUcontext thd_ctx;
 
+  pthread_mutex_lock (&ptx_dev_lock);
+
   ptx_dev = ptx_devices[ord];
 
   assert (ptx_dev);
 
   r = cuCtxGetCurrent (&thd_ctx);
   if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+    {
+      pthread_mutex_unlock (&ptx_dev_lock);
+      GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+    }
 
   assert (ptx_dev->ctx);
 
@@ -1847,12 +1868,17 @@  GOMP_OFFLOAD_openacc_create_thread_data
     {
       r = cuCtxPushCurrent (ptx_dev->ctx);
       if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+	{
+	  pthread_mutex_unlock (&ptx_dev_lock);
+	  GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+	}
     }
 
   nvthd->current_stream = ptx_dev->null_stream;
   nvthd->ptx_dev = ptx_dev;
 
+  pthread_mutex_unlock (&ptx_dev_lock);
+
   return (void *) nvthd;
 }
 
Index: libgomp/oacc-init.c
===================================================================
--- libgomp/oacc-init.c	(revision 222862)
+++ libgomp/oacc-init.c	(working copy)
@@ -244,9 +244,7 @@  acc_shutdown_1 (acc_device_t d)
 
       if (walk->dev)
 	{
-	  gomp_mutex_lock (&walk->dev->lock);
-	  gomp_free_memmap (&walk->dev->mem_map);
-	  gomp_mutex_unlock (&walk->dev->lock);
+	  gomp_free_memmap (walk->dev);
 
 	  walk->dev = NULL;
 	  walk->base_dev = NULL;