diff mbox

[gomp4,1/3] OpenACC 2.0 support for libgomp - OpenACC runtime, NVidia PTX/CUDA plugin

Message ID 87bnowgoue.fsf@kepler.schwinge.homeip.net
State New
Headers show

Commit Message

Thomas Schwinge Oct. 28, 2014, 4:07 p.m. UTC
Hi!

Following the noble goal of code re-use, we had been using <sys/queue.h>
for a standard C linked list implementation.  However, we found that
elderly (but still sufficient to build GCC) glibc releases contain a
variant of <sys/queue.h> that pre-dates a 2006 upstream glibc update to a
more recent upstream BSD version of that file, and so is missing certain
interfaces that we were using.  Instead of conditionally re-implementing
those, in r216803 I committed a patch to remove the SLIST_* usage, and
instead do things manually:

commit ba8916f6bc1dd93d8b6dc92f3d84aec49b68dea9
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Tue Oct 28 15:57:37 2014 +0000

    libgomp: Don't use <sys/queue.h>'s SLIST_*.
    
    Some of the interfaces are "too new".
    
    	libgomp/
    	* oacc-init.c: Don't use <sys/queue.h>'s SLIST_*.
    	* plugin-nvptx.c: Likewise.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@216803 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp |   5 ++
 libgomp/oacc-init.c    |  23 ++++-----
 libgomp/plugin-nvptx.c | 138 +++++++++++++++++++++++++++++--------------------
 3 files changed, 96 insertions(+), 70 deletions(-)



Grüße,
 Thomas
diff mbox

Patch

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index 5363068..fda1cbc 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,3 +1,8 @@ 
+2014-10-28  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* oacc-init.c: Don't use <sys/queue.h>'s SLIST_*.
+	* plugin-nvptx.c: Likewise.
+
 2014-10-23  Thomas Schwinge  <thomas@codesourcery.com>
 
 	* testsuite/libgomp.oacc-c/reduction-initial-1.c: New file.
diff --git libgomp/oacc-init.c libgomp/oacc-init.c
index f797f89..ffa9ad8 100644
--- libgomp/oacc-init.c
+++ libgomp/oacc-init.c
@@ -31,7 +31,6 @@ 
 #include <stdlib.h>
 #include <strings.h>
 #include <stdbool.h>
-#include <sys/queue.h>
 #include <stdio.h>
 
 gomp_mutex_t acc_device_lock;
@@ -55,11 +54,11 @@  static __thread int handle_num = -1;
 struct ACC_context {
   struct memmap_t *ACC_memmap;
   void *ACC_handle;
-  SLIST_ENTRY(ACC_context) next;
+
+  struct ACC_context *next;
 };
 
-static SLIST_HEAD(_ACC_contexts, ACC_context) _ACC_contexts;
-static struct _ACC_contexts *ACC_contexts;
+static struct ACC_context *ACC_contexts;
 
 static struct gomp_device_descr const *dispatchers[_ACC_device_hwm] = { 0 };
 
@@ -198,7 +197,7 @@  lazy_open (int ord)
   ACC_handle = ACC_dev->openacc.open_device_func (ord);
   handle_num = ord;
 
-  SLIST_FOREACH(acc_ctx, ACC_contexts, next)
+  for (acc_ctx = ACC_contexts; acc_ctx != NULL; acc_ctx = acc_ctx->next)
     {
       if (acc_ctx->ACC_handle == ACC_handle)
         {
@@ -220,7 +219,8 @@  lazy_open (int ord)
   if (!ACC_memmap->mem_map.is_initialized)
     gomp_init_tables (ACC_dev, &ACC_memmap->mem_map);
 
-  SLIST_INSERT_HEAD(ACC_contexts, acc_ctx, next);
+  acc_ctx->next = ACC_contexts;
+  ACC_contexts = acc_ctx;
 }
 
 /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
@@ -259,12 +259,10 @@  _acc_shutdown (acc_device_t d)
 
   close_handle ();
 
-  while (SLIST_FIRST(ACC_contexts) != NULL)
+  while (ACC_contexts != NULL)
     {
-      struct ACC_context *c;
-
-      c = SLIST_FIRST(ACC_contexts);
-      SLIST_REMOVE_HEAD(ACC_contexts, next);
+      struct ACC_context *c = ACC_contexts;
+      ACC_contexts = ACC_contexts->next;
       free (c);
     }
 
@@ -467,8 +465,7 @@  ACC_runtime_initialize (void)
 {
   gomp_mutex_init (&acc_device_lock);
 
-  ACC_contexts = &_ACC_contexts;
-  SLIST_INIT (ACC_contexts);
+  ACC_contexts = NULL;
 }
 
 /* Compiler helper functions */
diff --git libgomp/plugin-nvptx.c libgomp/plugin-nvptx.c
index f193229..33f868a 100644
--- libgomp/plugin-nvptx.c
+++ libgomp/plugin-nvptx.c
@@ -40,7 +40,6 @@ 
 #include "libgomp-plugin.h"
 
 #include <cuda.h>
-#include <sys/queue.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
@@ -149,11 +148,9 @@  struct PTX_stream
   void *h_prev;
   void *h_tail;
 
-  SLIST_ENTRY(PTX_stream) next;
+  struct PTX_stream *next;
 };
 
-SLIST_HEAD(PTX_streams, PTX_stream);
-
 /* Each thread may select a stream (also specific to a device/context).  */
 static __thread struct PTX_stream *current_stream;
 
@@ -293,7 +290,7 @@  struct PTX_device
   /* All non-null streams associated with this device (actually context),
      either created implicitly or passed in from the user (via
      acc_set_cuda_stream).  */
-  struct PTX_streams active_streams;
+  struct PTX_stream *active_streams;
   struct {
     struct PTX_stream **arr;
     int size;
@@ -306,12 +303,12 @@  struct PTX_device
   bool concur;
   int  mode;
   bool mkern;
-  SLIST_ENTRY(PTX_device) next;
+
+  struct PTX_device *next;
 };
 
 static __thread struct PTX_device *PTX_dev;
-static SLIST_HEAD(_PTX_devices, PTX_device) _PTX_devices;
-static struct _PTX_devices *PTX_devices;
+static struct PTX_device *PTX_devices;
 
 enum PTX_event_type
 {
@@ -327,12 +324,12 @@  struct PTX_event
   int type;
   void *addr;
   int ord;
-  SLIST_ENTRY(PTX_event) next;
+
+  struct PTX_event *next;
 };
 
 static gomp_mutex_t PTX_event_lock;
-static SLIST_HEAD(_PTX_events, PTX_event) _PTX_events;
-static struct _PTX_events *PTX_events;
+static struct PTX_event *PTX_events;
 
 #define _XSTR(s) _STR(s)
 #define _STR(s) #s
@@ -417,7 +414,7 @@  init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
   map_init (null_stream);
   ptx_dev->null_stream = null_stream;
   
-  SLIST_INIT (&ptx_dev->active_streams);
+  ptx_dev->active_streams = NULL;
   GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);
   
   if (concurrency < 1)
@@ -437,13 +434,13 @@  init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
 static void
 fini_streams_for_device (struct PTX_device *ptx_dev)
 {
-  struct PTX_stream *s;
   free (ptx_dev->async_streams.arr);
   
-  while (!SLIST_EMPTY (&ptx_dev->active_streams))
+  while (ptx_dev->active_streams != NULL)
     {
-      s = SLIST_FIRST (&ptx_dev->active_streams);
-      SLIST_REMOVE_HEAD (&ptx_dev->active_streams, next);
+      struct PTX_stream *s = ptx_dev->active_streams;
+      ptx_dev->active_streams = ptx_dev->active_streams->next;
+
       cuStreamDestroy (s->stream);
       map_fini (s);
       free (s);
@@ -535,7 +532,8 @@  select_stream_for_async (int async, pthread_t thread, bool create,
 	  s->h = NULL;
 	  map_init (s);
 	  
-	  SLIST_INSERT_HEAD (&ptx_dev->active_streams, s, next);
+	  s->next = ptx_dev->active_streams;
+	  ptx_dev->active_streams = s;
 	  ptx_dev->async_streams.arr[async] = s;
 	}
 
@@ -593,11 +591,8 @@  PTX_init (void)
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r));
 
-  PTX_devices = &_PTX_devices;
-  PTX_events = &_PTX_events;
-
-  SLIST_INIT(PTX_devices);
-  SLIST_INIT(PTX_events);
+  PTX_devices = NULL;
+  PTX_events = NULL;
 
   GOMP_PLUGIN_mutex_init (&PTX_event_lock);
 
@@ -625,7 +620,9 @@  PTX_open_device (int n)
     {
       struct PTX_device *ptx_device;
 
-      SLIST_FOREACH(ptx_device, PTX_devices, next)
+      for (ptx_device = PTX_devices;
+	   ptx_device != NULL;
+	   ptx_device = ptx_device->next)
         {
           if (ptx_device->ord == n)
             {
@@ -653,7 +650,8 @@  PTX_open_device (int n)
   PTX_dev->dev = dev;
   PTX_dev->ctx_shared = false;
 
-  SLIST_INSERT_HEAD(PTX_devices, PTX_dev, next);
+  PTX_dev->next = PTX_devices;
+  PTX_devices = PTX_dev;
 
   r = cuCtxGetCurrent (&PTX_dev->ctx);
   if (r != CUDA_SUCCESS)
@@ -729,7 +727,15 @@  PTX_close_device (void *h __attribute__((unused)))
 	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
     }
 
-  SLIST_REMOVE(PTX_devices, PTX_dev, PTX_device, next);
+  if (PTX_devices == PTX_dev)
+    PTX_devices = PTX_devices->next;
+  else
+    {
+      struct PTX_device* d = PTX_devices;
+      while (d->next != PTX_dev)
+	d = d->next;
+      d->next = d->next->next;
+    }
   free (PTX_dev);
 
   PTX_dev = NULL;
@@ -920,60 +926,67 @@  link_ptx (CUmodule *module, char *ptx_code)
 static void
 event_gc (bool memmap_lockable)
 {
-  struct PTX_event *ptx_event;
+  struct PTX_event *ptx_event = PTX_events;
 
   GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
 
-  for (ptx_event = SLIST_FIRST (PTX_events); ptx_event;)
+  while (ptx_event != NULL)
     {
       CUresult r;
-      struct PTX_event *next = SLIST_NEXT (ptx_event, next);
+      struct PTX_event *e = ptx_event;
 
-      if (ptx_event->ord != PTX_dev->ord)
-        goto next_event;
+      ptx_event = ptx_event->next;
 
-      r = cuEventQuery (*ptx_event->evt);
+      if (e->ord != PTX_dev->ord)
+	continue;
+
+      r = cuEventQuery (*e->evt);
       if (r == CUDA_SUCCESS)
-        {
-          CUevent *te;
+	{
+	  CUevent *te;
 
-          te = ptx_event->evt;
+	  te = e->evt;
 
-	  switch (ptx_event->type)
+	  switch (e->type)
 	    {
 	    case PTX_EVT_MEM:
 	    case PTX_EVT_SYNC:
 	      break;
 	    
 	    case PTX_EVT_KNL:
-              map_pop (ptx_event->addr);
+	      map_pop (e->addr);
 	      break;
 
 	    case PTX_EVT_ASYNC_CLEANUP:
-              {
-	        /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
+	      {
+		/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
 		   memory-map splay tree lock for the current device, so we
 		   can't call it when one of our callers has already claimed
 		   the lock.  In that case, just delay the GC for this event
-		   until later.  */
-	        if (!memmap_lockable)
-		  goto next_event;
+		   until later.	 */
+		if (!memmap_lockable)
+		  continue;
 
-		GOMP_PLUGIN_async_unmap_vars (ptx_event->addr);
-              }
+		GOMP_PLUGIN_async_unmap_vars (e->addr);
+	      }
 	      break;
 	    }
 
-          cuEventDestroy (*te);
-          free ((void *)te);
+	  cuEventDestroy (*te);
+	  free ((void *)te);
 
-          SLIST_REMOVE (PTX_events, ptx_event, PTX_event, next);
+	  if (PTX_events == e)
+	    PTX_events = PTX_events->next;
+	  else
+	    {
+	      struct PTX_event *e_ = PTX_events;
+	      while (e_->next != e)
+		e_ = e_->next;
+	      e_->next = e_->next->next;
+	    }
 
-          free (ptx_event);
-        }
-
-    next_event:
-      ptx_event = next;
+	  free (e);
+	}
     }
 
   GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
@@ -995,7 +1008,8 @@  event_add (enum PTX_event_type type, CUevent *e, void *h)
 
   GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
 
-  SLIST_INSERT_HEAD(PTX_events, ptx_event, next);
+  ptx_event->next = PTX_events;
+  PTX_events = ptx_event;
 
   GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
 }
@@ -1316,7 +1330,7 @@  PTX_async_test_all (void)
 
   GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
 
-  SLIST_FOREACH (s, &PTX_dev->active_streams, next)
+  for (s = PTX_dev->active_streams; s != NULL; s = s->next)
     {
       if ((s->multithreaded || pthread_equal (s->host_thread, self))
 	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
@@ -1400,7 +1414,7 @@  PTX_wait_all (void)
 
   /* Wait for active streams initiated by this thread (or by multiple threads)
      to complete.  */
-  SLIST_FOREACH (s, &PTX_dev->active_streams, next)
+  for (s = PTX_dev->active_streams; s != NULL; s = s->next)
     {
       if (s->multithreaded || pthread_equal (s->host_thread, self))
         {
@@ -1443,7 +1457,9 @@  PTX_wait_all_async (int async)
 
   GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
 
-  SLIST_FOREACH (other_stream, &PTX_dev->active_streams, next)
+  for (other_stream = PTX_dev->active_streams;
+       other_stream != NULL;
+       other_stream = other_stream->next)
     {
       if (!other_stream->multithreaded
 	  && !pthread_equal (other_stream->host_thread, self))
@@ -1524,8 +1540,16 @@  PTX_set_cuda_stream (int async, void *stream)
   
   if (oldstream)
     {
-      SLIST_REMOVE (&PTX_dev->active_streams, oldstream, PTX_stream, next);
-      
+      if (PTX_dev->active_streams == oldstream)
+	PTX_dev->active_streams = PTX_dev->active_streams->next;
+      else
+	{
+	  struct PTX_stream *s = PTX_dev->active_streams;
+	  while (s->next != oldstream)
+	    s = s->next;
+	  s->next = s->next->next;
+	}
+
       cuStreamDestroy (oldstream->stream);
       map_fini (oldstream);
       free (oldstream);