diff mbox

[gomp4.1] depend nowait support for target {update,{enter,exit} data}

Message ID 20151019194754.GB1855@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Ilya Verbin Oct. 19, 2015, 7:47 p.m. UTC
On Thu, Oct 15, 2015 at 16:01:56 +0200, Jakub Jelinek wrote:
> >    void *fn_addr = gomp_get_target_fn_addr (devicep, fn);
> >  
> > +  if (flags & GOMP_TARGET_FLAG_NOWAIT)
> > +    {
> > +      gomp_create_target_task (devicep, fn_addr, mapnum, hostaddrs, sizes,
> > +			       kinds, flags, depend);
> > +      return;
> > +    }
> 
> But this is not ok.  You need to do this far earlier, already before the
> if (depend != NULL) code in GOMP_target_41.  And, I think you should just
> not pass fn_addr, but fn itself.
> 
> > @@ -1636,34 +1657,58 @@ void
> >  gomp_target_task_fn (void *data)
> >  {
> >    struct gomp_target_task *ttask = (struct gomp_target_task *) data;
> > +  struct gomp_device_descr *devicep = ttask->devicep;
> > +
> >    if (ttask->fn != NULL)
> >      {
> > -      /* GOMP_target_41 */
> > +      if (devicep == NULL
> > +	  || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
> > +	{
> > +	  /* FIXME: Save host fn addr into gomp_target_task?  */
> > +	  gomp_target_fallback_firstprivate (NULL, ttask->mapnum,
> 
> If you pass above fn instead of fn_addr, ttask->fn is what you want
> to pass to gomp_target_fallback_firstprivate here and remove the FIXME.
> 
> > +					     ttask->hostaddrs, ttask->sizes,
> > +					     ttask->kinds);
> > +	  return;
> > +	}
> > +
> > +      struct target_mem_desc *tgt_vars
> > +	= gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
> > +			 ttask->sizes, ttask->kinds, true,
> > +			 GOMP_MAP_VARS_TARGET);
> > +      devicep->async_run_func (devicep->target_id, ttask->fn,
> > +			       (void *) tgt_vars->tgt_start, data);
> 
> You need to void *fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn);
> first obviously, and pass fn_addr.
> 
> > +
> > +      /* FIXME: TMP example of checking for completion.
> > +	 Alternatively the plugin can set some completion flag in ttask.  */
> > +      while (!devicep->async_is_completed_func (devicep->target_id, data))
> > +	{
> > +	  fprintf (stderr, "-");
> > +	  usleep (100000);
> > +	}
> 
> This obviously doesn't belong here.
> 
> >    if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
> > diff --git a/libgomp/testsuite/libgomp.c/target-tmp.c b/libgomp/testsuite/libgomp.c/target-tmp.c
> > new file mode 100644
> > index 0000000..23a739c
> > --- /dev/null
> > +++ b/libgomp/testsuite/libgomp.c/target-tmp.c
> > @@ -0,0 +1,40 @@
> > +#include <stdio.h>
> > +#include <unistd.h>
> > +
> > +#pragma omp declare target
> > +void foo (int n)
> > +{
> > +  printf ("Start tgt %d\n", n);
> > +  usleep (5000000);
> 
> 5s is too long.  Not to mention that not sure if PTX can do printf
> and especially usleep.
> 
> > diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
> > index 26ac6fe..c843710 100644
> > --- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
> > +++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
> ...
> > +/* Set of asynchronously running target tasks.  */
> > +static std::set<const void *> *async_tasks;
> > +
> >  /* Thread-safe registration of the main image.  */
> >  static pthread_once_t main_image_is_registered = PTHREAD_ONCE_INIT;
> >  
> > +/* Mutex for protecting async_tasks.  */
> > +static pthread_mutex_t async_tasks_lock = PTHREAD_MUTEX_INITIALIZER;
> > +
> >  static VarDesc vd_host2tgt = {
> >    { 1, 1 },		      /* dst, src			      */
> >    { 1, 0 },		      /* in, out			      */
> > @@ -156,6 +163,8 @@ init (void)
> >  
> >  out:
> >    address_table = new ImgDevAddrMap;
> > +  async_tasks = new std::set<const void *>;
> > +  pthread_mutex_init (&async_tasks_lock, NULL);
> 
> PTHREAD_MUTEX_INITIALIZER should already initialize the lock.
> But, do you really need async_tasks and the lock?  Better store
> something into some plugin's owned field in target_task struct and
> let the plugin callback be passed address of that field rather than the
> whole target_task?

So, here is what I have for now.  Attached target-29.c testcase works fine with
MIC emul, however I don't know how to (and where) properly check for completion
of async execution on target.  And, similarly, where to do unmapping after that?
Do we need a callback from plugin to libgomp (as far as I understood, PTX
runtime supports this, but HSA doesn't), or libgomp will just check for
ttask->is_completed in task.c?

 

Thanks,
  -- Ilya
diff mbox

Patch

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 9c8b1fb..e707c80 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -430,6 +430,7 @@  struct gomp_target_task
   size_t *sizes;
   unsigned short *kinds;
   unsigned int flags;
+  bool is_completed;
   void *hostaddrs[];
 };
 
@@ -877,6 +878,7 @@  struct gomp_device_descr
   void *(*host2dev_func) (int, void *, const void *, size_t);
   void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
+  void (*async_run_func) (int, void *, void *, bool *);
 
   /* Splay tree containing information about mapped memory regions.  */
   struct splay_tree_s mem_map;
diff --git a/libgomp/target.c b/libgomp/target.c
index d4c0cef..a136fb9 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1339,6 +1339,14 @@  GOMP_target_41 (int device, void (*fn) (void *), size_t mapnum,
 {
   struct gomp_device_descr *devicep = resolve_device (device);
 
+  /* FIXME: Check for thr->ts.team && !thr->task->final_task ?  */
+  if (flags & GOMP_TARGET_FLAG_NOWAIT)
+    {
+      gomp_create_target_task (devicep, fn, mapnum, hostaddrs, sizes, kinds,
+			       flags, depend);
+      return;
+    }
+
   /* If there are depend clauses, but nowait is not present,
      block the parent task until the dependencies are resolved
      and then just continue with the rest of the function as if it
@@ -1650,34 +1658,56 @@  void
 gomp_target_task_fn (void *data)
 {
   struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_device_descr *devicep = ttask->devicep;
+
   if (ttask->fn != NULL)
     {
-      /* GOMP_target_41 */
+      if (devicep == NULL
+	  || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	{
+	  gomp_target_fallback_firstprivate (ttask->fn, ttask->mapnum,
+					     ttask->hostaddrs, ttask->sizes,
+					     ttask->kinds);
+	  return;
+	}
+
+      void *fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn);
+      struct target_mem_desc *tgt_vars
+	= gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
+			 ttask->sizes, ttask->kinds, true,
+			 GOMP_MAP_VARS_TARGET);
+      devicep->async_run_func (devicep->target_id, fn_addr,
+			       (void *) tgt_vars->tgt_start,
+			       &ttask->is_completed);
+
+      /* FIXME: Move the task into some sleeping state, remove this loop from
+	 here.  */
+      while (!ttask->is_completed);
+      return;
     }
-  else if (ttask->devicep == NULL
-	   || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  else if (devicep == NULL
+	   || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     return;
 
   size_t i;
   if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)
-    gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+    gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
 		 ttask->kinds, true);
   else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
     for (i = 0; i < ttask->mapnum; i++)
       if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
 	{
-	  gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
-			 &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
-			 &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+	  gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i],
+			 NULL, &ttask->sizes[i], &ttask->kinds[i], true,
+			 GOMP_MAP_VARS_ENTER_DATA);
 	  i += ttask->sizes[i];
 	}
       else
-	gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
-		       &ttask->sizes[i], &ttask->kinds[i],
-		       true, GOMP_MAP_VARS_ENTER_DATA);
+	gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+		       &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
   else
-    gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
-		    ttask->sizes, ttask->kinds);
+    gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+		    ttask->kinds);
 }
 
 void
@@ -2122,6 +2152,7 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
+      DLSYM (async_run);
       DLSYM (dev2dev);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
diff --git a/libgomp/task.c b/libgomp/task.c
index 1246c6a..7e1e3ca 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -480,6 +480,7 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
   task->kind = GOMP_TASK_WAITING;
   task->in_tied_task = parent->in_tied_task;
   task->taskgroup = taskgroup;
+  /* FIXME: Segmentation fault here, if there are no dependencies.  */
   ttask = (struct gomp_target_task *) &task->depend[(uintptr_t) depend[0]];
   ttask->devicep = devicep;
   ttask->fn = fn;
@@ -490,6 +491,7 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
   ttask->kinds = (unsigned short *) &ttask->sizes[mapnum];
   memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short));
   ttask->flags = flags;
+  ttask->is_completed = false;
   task->fn = gomp_target_task_fn;
   task->fn_data = ttask;
   task->final_task = 0;
diff --git a/libgomp/testsuite/libgomp.c/target-29.c b/libgomp/testsuite/libgomp.c/target-29.c
new file mode 100644
index 0000000..c532797
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-29.c
@@ -0,0 +1,50 @@ 
+#include <stdlib.h>
+#include <unistd.h>
+
+int main ()
+{
+  int a = 0, b = 0, c = 0, d[7];
+
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task depend(out: d[0])
+      a = 2;
+
+    #pragma omp target enter data nowait map(to: a,b,c) depend(in: d[0]) depend(out: d[1])
+
+    #pragma omp target nowait map(alloc: a) depend(in: d[1]) depend(out: d[2])
+      a++;
+
+    #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[3])
+    {
+      usleep (1000);
+      b = 4;
+    }
+
+    #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[4])
+    {
+      usleep (5000);
+      b = 5;
+    }
+
+    #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[5])
+    {
+      usleep (5000);
+      c = 6;
+    }
+
+    #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[6])
+    {
+      usleep (1000);
+      c = 7;
+    }
+
+    #pragma omp target exit data map(always,from: a,b,c) depend(in: d[5], d[6])
+  }
+
+  if (a != 3 || (b != 4 && b != 5) || (c != 6 && c != 7))
+    abort ();
+
+  return 0;
+}
diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 26ac6fe..91c52ec 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -192,11 +192,23 @@  GOMP_OFFLOAD_get_num_devices (void)
 
 static void
 offload (const char *file, uint64_t line, int device, const char *name,
-	 int num_vars, VarDesc *vars, VarDesc2 *vars2)
+	 int num_vars, VarDesc *vars, VarDesc2 *vars2, const void **async_data)
 {
   OFFLOAD ofld = __offload_target_acquire1 (&device, file, line);
   if (ofld)
-    __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL);
+    {
+      if (async_data == NULL)
+	__offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL,
+			    NULL);
+      else
+	{
+	  OffloadFlags flags;
+	  flags.flags = 0;
+	  flags.bits.omp_async = 1;
+	  __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL,
+			      async_data, 0, NULL, flags, NULL);
+	}
+    }
   else
     {
       fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line);
@@ -218,7 +230,7 @@  GOMP_OFFLOAD_init_device (int device)
   TRACE ("");
   pthread_once (&main_image_is_registered, register_main_image);
   offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0,
-	   NULL, NULL);
+	   NULL, NULL, NULL);
 }
 
 extern "C" void
@@ -240,7 +252,7 @@  get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
   VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   int table_size = num_funcs + 2 * num_vars;
   if (table_size > 0)
@@ -254,7 +266,7 @@  get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
       VarDesc2 vd2g = { "table", 0 };
 
       offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1,
-	       &vd2, &vd2g);
+	       &vd2, &vd2g, NULL);
     }
 }
 
@@ -401,8 +413,8 @@  GOMP_OFFLOAD_alloc (int device, size_t size)
   vd1[1].size = sizeof (void *);
   VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g);
-
+  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g,
+	   NULL);
   return tgt_ptr;
 }
 
@@ -416,7 +428,8 @@  GOMP_OFFLOAD_free (int device, void *tgt_ptr)
   vd1.size = sizeof (void *);
   VarDesc2 vd1g = { "tgt_ptr", 0 };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g);
+  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g,
+	   NULL);
 }
 
 extern "C" void *
@@ -435,7 +448,7 @@  GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_host2tgt;
   vd2.ptr = (void *) host_ptr;
@@ -443,7 +456,7 @@  GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1,
-	   &vd2, &vd2g);
+	   &vd2, &vd2g, NULL);
 
   return tgt_ptr;
 }
@@ -464,7 +477,7 @@  GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_tgt2host;
   vd2.ptr = (void *) host_ptr;
@@ -472,7 +485,7 @@  GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1,
-	   &vd2, &vd2g);
+	   &vd2, &vd2g, NULL);
 
   return host_ptr;
 }
@@ -495,22 +508,43 @@  GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const void *src_ptr,
   VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1,
-	   vd1g);
+	   vd1g, NULL);
 
   return dst_ptr;
 }
 
 extern "C" void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+			bool *async_data)
+{
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
+	 tgt_fn, tgt_vars, async_data);
+
+  VarDesc vd[2] = { vd_host2tgt, vd_host2tgt };
+  vd[0].ptr = &tgt_fn;
+  vd[0].size = sizeof (void *);
+  vd[1].ptr = &tgt_vars;
+  vd[1].size = sizeof (void *);
+
+  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL,
+	   (const void **) async_data);
+}
+
+extern "C" void
 GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
 {
-  TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars);
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars);
 
-  VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt };
-  vd1[0].ptr = &tgt_fn;
-  vd1[0].size = sizeof (void *);
-  vd1[1].ptr = &tgt_vars;
-  vd1[1].size = sizeof (void *);
-  VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } };
+  GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL);
+}
+
+/* Called by liboffloadmic when asynchronous function is completed.  */
+
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *async_data)
+{
+  TRACE ("(async_data = %p)", async_data);
 
-  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g);
+  bool *ttask_is_completed = (bool *) async_data;
+  *ttask_is_completed = true;
 }
diff --git a/liboffloadmic/runtime/offload_host.cpp b/liboffloadmic/runtime/offload_host.cpp
index 66c2a01..fe6ec44 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -64,6 +64,9 @@  static void __offload_fini_library(void);
 #define GET_OFFLOAD_NUMBER(timer_data) \
     timer_data? timer_data->offload_number : 0
 
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *);
+
 extern "C" {
 #ifdef TARGET_WINNT
 // Windows does not support imports from libraries without actually
@@ -2508,7 +2511,7 @@  extern "C" {
         const void *info
     )
     {
-	/* TODO: Call callback function, pass info.  */
+	__gomp_offload_intelmic_async_completed (info);
     }
 }