diff mbox

[gomp4.5] depend nowait support for target

Message ID 20151111165222.GL5675@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Nov. 11, 2015, 4:52 p.m. UTC
Hi!

On Mon, Oct 19, 2015 at 10:47:54PM +0300, Ilya Verbin wrote:
> So, here is what I have for now.  Attached target-29.c testcase works fine with
> MIC emul, however I don't know how to (and where) properly check for completion
> of async execution on target.  And, similarly, where to do unmapping after that?
> Do we need a callback from plugin to libgomp (as far as I understood, PTX
> runtime supports this, but HSA doesn't), or libgomp will just check for
> ttask->is_completed in task.c?

Here is the patch updated to have a task.c defined function that the plugin
can call upon completion of async offloading exection.
The testsuite coverage will need to improve, the testcase is wrong
(contains data races - if you want to test parallel running of two target
regions that both touch the same var, I'd say best would be to use
#pragma omp atomic and or in 4 in one case and 1 in another case, then
test if result is 5 (and similarly for the other var).
Also, with the usleeps Alex Monakov will be unhappy because PTX newlib does
not have it, but we'll need to find some solution for that.

Another thing to work on beyond testsuite coverage (it is desirable to test
nowait target tasks (both depend and without depend) being awaited in all
the various waiting spots, i.e. end of parallel, barrier, taskwait, end of
taskgroup, or if (0) task with depend clause waiting on that.

Also, I wonder what to do if #pragma omp target nowait is used outside of
(host) parallel - when team is NULL.  All the tasking code in that case just
executes tasks undeferred, which is fine for all but target nowait - there
it is I'd say useful to be able to run a single host thread concurrently
with some async offloading tasks.  So, I wonder if in that case,
if we encounter target nowait with team == NULL, should not just create a
dummy non-active (nthreads == 1) team, as if there was #pragma omp parallel
if (0) starting above it and ending at program's end.  In OpenMP, the
program's initial thread is implicitly surrounded by inactive parallel, so
this isn't anything against the OpenMP execution model.  But we'd need to
free the team somewhere in a destructor.

Can you please try to cleanup the liboffloadmic side of this, so that
a callback instead of hardcoded __gomp_offload_intelmic_async_completed call
is used?  Can you make sure it works on XeonPhi non-emulated too?

I'll keep working on the testcase coverage and on the team == NULL case.

The patch is on top of gomp-4_5-branch - needs Aldy's priority_queue stuff.



	Jakub
diff mbox

Patch

--- liboffloadmic/runtime/offload_host.cpp.jj	2015-11-05 11:31:05.013916598 +0100
+++ liboffloadmic/runtime/offload_host.cpp	2015-11-10 12:58:55.090951303 +0100
@@ -64,6 +64,9 @@  static void __offload_fini_library(void)
 #define GET_OFFLOAD_NUMBER(timer_data) \
     timer_data? timer_data->offload_number : 0
 
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *);
+
 extern "C" {
 #ifdef TARGET_WINNT
 // Windows does not support imports from libraries without actually
@@ -2507,7 +2510,7 @@  extern "C" {
         const void *info
     )
     {
-	/* TODO: Call callback function, pass info.  */
+	__gomp_offload_intelmic_async_completed (info);
     }
 }
 
--- liboffloadmic/plugin/libgomp-plugin-intelmic.cpp.jj	2015-10-14 10:24:10.922194230 +0200
+++ liboffloadmic/plugin/libgomp-plugin-intelmic.cpp	2015-11-11 15:48:55.428967827 +0100
@@ -192,11 +192,23 @@  GOMP_OFFLOAD_get_num_devices (void)
 
 static void
 offload (const char *file, uint64_t line, int device, const char *name,
-	 int num_vars, VarDesc *vars, VarDesc2 *vars2)
+	 int num_vars, VarDesc *vars, VarDesc2 *vars2, const void **async_data)
 {
   OFFLOAD ofld = __offload_target_acquire1 (&device, file, line);
   if (ofld)
-    __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL);
+    {
+      if (async_data == NULL)
+	__offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL,
+			    NULL);
+      else
+	{
+	  OffloadFlags flags;
+	  flags.flags = 0;
+	  flags.bits.omp_async = 1;
+	  __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL,
+			      async_data, 0, NULL, flags, NULL);
+	}
+    }
   else
     {
       fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line);
@@ -218,7 +230,7 @@  GOMP_OFFLOAD_init_device (int device)
   TRACE ("");
   pthread_once (&main_image_is_registered, register_main_image);
   offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0,
-	   NULL, NULL);
+	   NULL, NULL, NULL);
 }
 
 extern "C" void
@@ -240,7 +252,7 @@  get_target_table (int device, int &num_f
   VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   int table_size = num_funcs + 2 * num_vars;
   if (table_size > 0)
@@ -254,7 +266,7 @@  get_target_table (int device, int &num_f
       VarDesc2 vd2g = { "table", 0 };
 
       offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1,
-	       &vd2, &vd2g);
+	       &vd2, &vd2g, NULL);
     }
 }
 
@@ -401,8 +413,8 @@  GOMP_OFFLOAD_alloc (int device, size_t s
   vd1[1].size = sizeof (void *);
   VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g);
-
+  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g,
+	   NULL);
   return tgt_ptr;
 }
 
@@ -416,7 +428,8 @@  GOMP_OFFLOAD_free (int device, void *tgt
   vd1.size = sizeof (void *);
   VarDesc2 vd1g = { "tgt_ptr", 0 };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g);
+  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g,
+	   NULL);
 }
 
 extern "C" void *
@@ -435,7 +448,7 @@  GOMP_OFFLOAD_host2dev (int device, void
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_host2tgt;
   vd2.ptr = (void *) host_ptr;
@@ -443,7 +456,7 @@  GOMP_OFFLOAD_host2dev (int device, void
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1,
-	   &vd2, &vd2g);
+	   &vd2, &vd2g, NULL);
 
   return tgt_ptr;
 }
@@ -464,7 +477,7 @@  GOMP_OFFLOAD_dev2host (int device, void
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_tgt2host;
   vd2.ptr = (void *) host_ptr;
@@ -472,7 +485,7 @@  GOMP_OFFLOAD_dev2host (int device, void
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1,
-	   &vd2, &vd2g);
+	   &vd2, &vd2g, NULL);
 
   return host_ptr;
 }
@@ -495,22 +508,42 @@  GOMP_OFFLOAD_dev2dev (int device, void *
   VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1,
-	   vd1g);
+	   vd1g, NULL);
 
   return dst_ptr;
 }
 
 extern "C" void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+			void *async_data)
+{
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
+	 tgt_fn, tgt_vars, async_data);
+
+  VarDesc vd[2] = { vd_host2tgt, vd_host2tgt };
+  vd[0].ptr = &tgt_fn;
+  vd[0].size = sizeof (void *);
+  vd[1].ptr = &tgt_vars;
+  vd[1].size = sizeof (void *);
+
+  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL,
+	   (const void **) async_data);
+}
+
+extern "C" void
 GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
 {
-  TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars);
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars);
 
-  VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt };
-  vd1[0].ptr = &tgt_fn;
-  vd1[0].size = sizeof (void *);
-  vd1[1].ptr = &tgt_vars;
-  vd1[1].size = sizeof (void *);
-  VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } };
+  GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL);
+}
+
+/* Called by liboffloadmic when asynchronous function is completed.  */
+
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *async_data)
+{
+  TRACE ("(async_data = %p)", async_data);
 
-  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g);
+  GOMP_PLUGIN_target_task_completion ((void *) async_data);
 }
--- libgomp/libgomp-plugin.h.jj	2015-10-14 10:24:10.000000000 +0200
+++ libgomp/libgomp-plugin.h	2015-11-11 15:48:16.875505434 +0100
@@ -59,10 +59,20 @@  struct addr_pair
   uintptr_t end;
 };
 
+/* Various state of OpenMP async offloading tasks.  */
+enum gomp_target_task_state
+{
+  GOMP_TARGET_TASK_DATA,
+  GOMP_TARGET_TASK_BEFORE_MAP,
+  GOMP_TARGET_TASK_FALLBACK,
+  GOMP_TARGET_TASK_RUNNING
+};
+
 /* Miscellaneous functions.  */
 extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc));
 extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc));
 extern void *GOMP_PLUGIN_realloc (void *, size_t);
+void GOMP_PLUGIN_target_task_completion (void *);
 
 extern void GOMP_PLUGIN_debug (int, const char *, ...)
 	__attribute__ ((format (printf, 2, 3)));
--- libgomp/testsuite/libgomp.c/target-32.c.jj	2015-11-10 12:58:55.087951346 +0100
+++ libgomp/testsuite/libgomp.c/target-32.c	2015-11-10 12:58:55.087951346 +0100
@@ -0,0 +1,50 @@ 
+#include <stdlib.h>
+#include <unistd.h>
+
+int main ()
+{
+  int a = 0, b = 0, c = 0, d[7];
+
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task depend(out: d[0])
+      a = 2;
+
+    #pragma omp target enter data nowait map(to: a,b,c) depend(in: d[0]) depend(out: d[1])
+
+    #pragma omp target nowait map(alloc: a) depend(in: d[1]) depend(out: d[2])
+      a++;
+
+    #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[3])
+    {
+      usleep (1000);
+      b = 4;
+    }
+
+    #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[4])
+    {
+      usleep (5000);
+      b = 5;
+    }
+
+    #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[5])
+    {
+      usleep (5000);
+      c = 6;
+    }
+
+    #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[6])
+    {
+      usleep (1000);
+      c = 7;
+    }
+
+    #pragma omp target exit data map(always,from: a,b,c) depend(in: d[5], d[6])
+  }
+
+  if (a != 3 || (b != 4 && b != 5) || (c != 6 && c != 7))
+    abort ();
+
+  return 0;
+}
--- libgomp/target.c.jj	2015-11-09 11:14:37.325239961 +0100
+++ libgomp/target.c	2015-11-11 16:05:52.550784608 +0100
@@ -1348,17 +1348,7 @@  GOMP_target (int device, void (*fn) (voi
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
 		     GOMP_MAP_VARS_TARGET);
-  struct gomp_thread old_thr, *thr = gomp_thread ();
-  old_thr = *thr;
-  memset (thr, '\0', sizeof (*thr));
-  if (gomp_places_list)
-    {
-      thr->place = old_thr.place;
-      thr->ts.place_partition_len = gomp_places_list_len;
-    }
   devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
-  gomp_free_thread (thr);
-  *thr = old_thr;
   gomp_unmap_vars (tgt_vars, true);
 }
 
@@ -1387,10 +1377,23 @@  GOMP_target_ext (int device, void (*fn)
   (void) num_teams;
   (void) thread_limit;
 
-  /* If there are depend clauses, but nowait is not present,
-     block the parent task until the dependencies are resolved
-     and then just continue with the rest of the function as if it
-     is a merged task.  */
+  if (flags & GOMP_TARGET_FLAG_NOWAIT)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->ts.team
+	  && !thr->task->final_task)
+	{
+	  gomp_create_target_task (devicep, fn, mapnum, hostaddrs,
+				   sizes, kinds, flags, depend,
+				   GOMP_TARGET_TASK_BEFORE_MAP);
+	  return;
+	}
+    }
+
+  /* If there are depend clauses, but nowait is not present
+     (or we are in a final task), block the parent task until the
+     dependencies are resolved and then just continue with the rest
+     of the function as if it is a merged task.  */
   if (depend != NULL)
     {
       struct gomp_thread *thr = gomp_thread ();
@@ -1410,17 +1413,7 @@  GOMP_target_ext (int device, void (*fn)
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
 		     GOMP_MAP_VARS_TARGET);
-  struct gomp_thread old_thr, *thr = gomp_thread ();
-  old_thr = *thr;
-  memset (thr, '\0', sizeof (*thr));
-  if (gomp_places_list)
-    {
-      thr->place = old_thr.place;
-      thr->ts.place_partition_len = gomp_places_list_len;
-    }
   devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
-  gomp_free_thread (thr);
-  *thr = old_thr;
   gomp_unmap_vars (tgt_vars, true);
 }
 
@@ -1527,23 +1520,25 @@  GOMP_target_update_ext (int device, size
 	      && thr->ts.team
 	      && !thr->task->final_task)
 	    {
-	      gomp_create_target_task (devicep, (void (*) (void *)) NULL,
-				       mapnum, hostaddrs, sizes, kinds,
-				       flags | GOMP_TARGET_FLAG_UPDATE,
-				       depend);
-	      return;
+	      if (gomp_create_target_task (devicep, (void (*) (void *)) NULL,
+					   mapnum, hostaddrs, sizes, kinds,
+					   flags | GOMP_TARGET_FLAG_UPDATE,
+					   depend, GOMP_TARGET_TASK_DATA))
+		return;
 	    }
+	  else
+	    {
+	      struct gomp_team *team = thr->ts.team;
+	      /* If parallel or taskgroup has been cancelled, don't start new
+		 tasks.  */
+	      if (team
+		  && (gomp_team_barrier_cancelled (&team->barrier)
+		      || (thr->task->taskgroup
+			  && thr->task->taskgroup->cancelled)))
+		return;
 
-	  struct gomp_team *team = thr->ts.team;
-	  /* If parallel or taskgroup has been cancelled, don't start new
-	     tasks.  */
-	  if (team
-	      && (gomp_team_barrier_cancelled (&team->barrier)
-		  || (thr->task->taskgroup
-		      && thr->task->taskgroup->cancelled)))
-	    return;
-
-	  gomp_task_maybe_wait_for_dependencies (depend);
+	      gomp_task_maybe_wait_for_dependencies (depend);
+	    }
 	}
     }
 
@@ -1647,22 +1642,25 @@  GOMP_target_enter_exit_data (int device,
 	      && thr->ts.team
 	      && !thr->task->final_task)
 	    {
-	      gomp_create_target_task (devicep, (void (*) (void *)) NULL,
-				       mapnum, hostaddrs, sizes, kinds,
-				       flags, depend);
-	      return;
+	      if (gomp_create_target_task (devicep, (void (*) (void *)) NULL,
+					   mapnum, hostaddrs, sizes, kinds,
+					   flags, depend,
+					   GOMP_TARGET_TASK_DATA))
+		return;
 	    }
+	  else
+	    {
+	      struct gomp_team *team = thr->ts.team;
+	      /* If parallel or taskgroup has been cancelled, don't start new
+		 tasks.  */
+	      if (team
+		  && (gomp_team_barrier_cancelled (&team->barrier)
+		      || (thr->task->taskgroup
+			  && thr->task->taskgroup->cancelled)))
+		return;
 
-	  struct gomp_team *team = thr->ts.team;
-	  /* If parallel or taskgroup has been cancelled, don't start new
-	     tasks.  */
-	  if (team
-	      && (gomp_team_barrier_cancelled (&team->barrier)
-		  || (thr->task->taskgroup
-		      && thr->task->taskgroup->cancelled)))
-	    return;
-
-	  gomp_task_maybe_wait_for_dependencies (depend);
+	      gomp_task_maybe_wait_for_dependencies (depend);
+	    }
 	}
     }
 
@@ -1694,38 +1692,65 @@  GOMP_target_enter_exit_data (int device,
     gomp_exit_data (devicep, mapnum, hostaddrs, sizes, kinds);
 }
 
-void
+bool
 gomp_target_task_fn (void *data)
 {
   struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_device_descr *devicep = ttask->devicep;
+
   if (ttask->fn != NULL)
     {
-      /* GOMP_target_ext */
-    }
-  else if (ttask->devicep == NULL
-	   || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
-    return;
+      if (devicep == NULL
+	  || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	{
+	  ttask->state = GOMP_TARGET_TASK_FALLBACK;
+	  gomp_target_fallback_firstprivate (ttask->fn, ttask->mapnum,
+					     ttask->hostaddrs, ttask->sizes,
+					     ttask->kinds);
+	  return false;
+	}
+
+      if (ttask->state == GOMP_TARGET_TASK_RUNNING)
+	{
+	  gomp_unmap_vars (ttask->tgt, true);
+	  return false;
+	}
+
+      void *fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn);
+      ttask->tgt
+	= gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
+			 ttask->sizes, ttask->kinds, true,
+			 GOMP_MAP_VARS_TARGET);
+      ttask->state = GOMP_TARGET_TASK_RUNNING;
+
+      devicep->async_run_func (devicep->target_id, fn_addr,
+			       (void *) ttask->tgt->tgt_start, (void *) ttask);
+      return true;
+    }
+  else if (devicep == NULL
+	   || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return false;
 
   size_t i;
   if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)
-    gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+    gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
 		 ttask->kinds, true);
   else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
     for (i = 0; i < ttask->mapnum; i++)
       if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
 	{
-	  gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
-			 &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
-			 &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+	  gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i],
+			 NULL, &ttask->sizes[i], &ttask->kinds[i], true,
+			 GOMP_MAP_VARS_ENTER_DATA);
 	  i += ttask->sizes[i];
 	}
       else
-	gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
-		       &ttask->sizes[i], &ttask->kinds[i],
-		       true, GOMP_MAP_VARS_ENTER_DATA);
+	gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+		       &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
   else
-    gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
-		    ttask->sizes, ttask->kinds);
+    gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+		    ttask->kinds);
+  return false;
 }
 
 void
@@ -2170,6 +2195,7 @@  gomp_load_plugin_for_device (struct gomp
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
+      DLSYM (async_run);
       DLSYM (dev2dev);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
--- libgomp/libgomp.map.jj	2015-10-26 15:38:20.000000000 +0100
+++ libgomp/libgomp.map	2015-11-11 16:15:23.807818735 +0100
@@ -406,3 +406,8 @@  GOMP_PLUGIN_1.0 {
 	GOMP_PLUGIN_async_unmap_vars;
 	GOMP_PLUGIN_acc_thread;
 };
+
+GOMP_PLUGIN_1.1 {
+  global:
+	GOMP_PLUGIN_target_task_completion;
+} GOMP_PLUGIN_1.0;
--- libgomp/task.c.jj	2015-11-09 11:14:37.332239862 +0100
+++ libgomp/task.c	2015-11-11 16:42:43.749898088 +0100
@@ -482,11 +482,12 @@  ialias (GOMP_taskgroup_end)
 
 /* Called for nowait target tasks.  */
 
-void
+bool
 gomp_create_target_task (struct gomp_device_descr *devicep,
 			 void (*fn) (void *), size_t mapnum, void **hostaddrs,
 			 size_t *sizes, unsigned short *kinds,
-			 unsigned int flags, void **depend)
+			 unsigned int flags, void **depend,
+			 enum gomp_target_task_state state)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -495,7 +496,7 @@  gomp_create_target_task (struct gomp_dev
   if (team
       && (gomp_team_barrier_cancelled (&team->barrier)
 	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
-    return;
+    return true;
 
   struct gomp_target_task *ttask;
   struct gomp_task *task;
@@ -503,19 +504,44 @@  gomp_create_target_task (struct gomp_dev
   struct gomp_taskgroup *taskgroup = parent->taskgroup;
   bool do_wake;
   size_t depend_size = 0;
+  uintptr_t depend_cnt = 0;
+  size_t tgt_align = 0, tgt_size = 0;
 
   if (depend != NULL)
-    depend_size = ((uintptr_t) depend[0]
-		   * sizeof (struct gomp_task_depend_entry));
+    {
+      depend_cnt = (uintptr_t) depend[0];
+      depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
+    }
+  if (fn)
+    {
+      /* GOMP_MAP_FIRSTPRIVATE need to be copied first, as they are
+	 firstprivate on the target task.  */
+      size_t i;
+      for (i = 0; i < mapnum; i++)
+	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+	  {
+	    size_t align = (size_t) 1 << (kinds[i] >> 8);
+	    if (tgt_align < align)
+	      tgt_align = align;
+	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	    tgt_size += sizes[i];
+	  }
+      if (tgt_align)
+	tgt_size += tgt_align - 1;
+      else
+	tgt_size = 0;
+    }
+
   task = gomp_malloc (sizeof (*task) + depend_size
 		      + sizeof (*ttask)
 		      + mapnum * (sizeof (void *) + sizeof (size_t)
-				  + sizeof (unsigned short)));
+				  + sizeof (unsigned short))
+		      + tgt_size);
   gomp_init_task (task, parent, gomp_icv (false));
   task->kind = GOMP_TASK_WAITING;
   task->in_tied_task = parent->in_tied_task;
   task->taskgroup = taskgroup;
-  ttask = (struct gomp_target_task *) &task->depend[(uintptr_t) depend[0]];
+  ttask = (struct gomp_target_task *) &task->depend[depend_cnt];
   ttask->devicep = devicep;
   ttask->fn = fn;
   ttask->mapnum = mapnum;
@@ -524,8 +550,29 @@  gomp_create_target_task (struct gomp_dev
   memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t));
   ttask->kinds = (unsigned short *) &ttask->sizes[mapnum];
   memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short));
+  if (tgt_align)
+    {
+      char *tgt = (char *) &ttask->kinds[mapnum];
+      size_t i;
+      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+      if (al)
+	tgt += tgt_align - al;
+      tgt_size = 0;
+      for (i = 0; i < mapnum; i++)
+	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+	  {
+	    size_t align = (size_t) 1 << (kinds[i] >> 8);
+	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	    memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]); 
+	    ttask->hostaddrs[i] = tgt + tgt_size;
+	    tgt_size = tgt_size + sizes[i];
+	  }
+    }
   ttask->flags = flags;
-  task->fn = gomp_target_task_fn;
+  ttask->state = state;
+  ttask->task = task;
+  ttask->team = team;
+  task->fn = NULL;
   task->fn_data = ttask;
   task->final_task = 0;
   gomp_mutex_lock (&team->task_lock);
@@ -536,19 +583,26 @@  gomp_create_target_task (struct gomp_dev
       gomp_mutex_unlock (&team->task_lock);
       gomp_finish_task (task);
       free (task);
-      return;
+      return true;
     }
-  if (taskgroup)
-    taskgroup->num_children++;
   if (depend_size)
     {
       gomp_task_handle_depend (task, parent, depend);
       if (task->num_dependees)
 	{
 	  gomp_mutex_unlock (&team->task_lock);
-	  return;
+	  return true;
 	}
     }
+  if (state == GOMP_TARGET_TASK_DATA)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      gomp_finish_task (task);
+      free (task);
+      return false;
+    }
+  if (taskgroup)
+    taskgroup->num_children++;
   priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
 			 PRIORITY_INSERT_BEGIN,
 			 /*adjust_parent_depends_on=*/false,
@@ -570,6 +624,95 @@  gomp_create_target_task (struct gomp_dev
   gomp_mutex_unlock (&team->task_lock);
   if (do_wake)
     gomp_team_barrier_wake (&team->barrier, 1);
+  return true;
+}
+
+static void inline
+priority_queue_move_task_first (enum priority_queue_type type,
+				struct priority_queue *head,
+				struct gomp_task *task)
+{
+#if _LIBGOMP_CHECKING_
+  if (!priority_queue_task_in_queue_p (type, head, task))
+    gomp_fatal ("Attempt to move first missing task %p", task);
+#endif
+  struct priority_list *list;
+  if (priority_queue_multi_p (head))
+    {
+      list = priority_queue_lookup_priority (head, task->priority);
+#if _LIBGOMP_CHECKING_
+      if (!list)
+	gomp_fatal ("Unable to find priority %d", task->priority);
+#endif
+    }
+  else
+    list = &head->l;
+  priority_list_remove (list, task_to_priority_node (type, task), 0);
+  priority_list_insert (type, list, task, task->priority,
+			PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN,
+			task->parent_depends_on);
+}
+
+/* Signal that a target task TTASK has completed the asynchronously
+   running phase and should be requeued as a task to handle the
+   variable unmapping.  */
+
+void
+GOMP_PLUGIN_target_task_completion (void *data)
+{
+  struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_task *task = ttask->task;
+  struct gomp_team *team = ttask->team;
+
+  gomp_mutex_lock (&team->task_lock);
+  struct gomp_task *parent = task->parent;
+  if (parent)
+    priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue,
+				    task);
+
+  struct gomp_taskgroup *taskgroup = task->taskgroup;
+  if (taskgroup)
+    priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+				    task);
+
+  priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
+			 PRIORITY_INSERT_BEGIN, false,
+			 task->parent_depends_on);
+  task->kind = GOMP_TASK_WAITING;
+  if (parent && parent->taskwait)
+    {
+      if (parent->taskwait->in_taskwait)
+	{
+	  /* One more task has had its dependencies met.
+	     Inform any waiters.  */
+	  parent->taskwait->in_taskwait = false;
+	  gomp_sem_post (&parent->taskwait->taskwait_sem);
+	}
+      else if (parent->taskwait->in_depend_wait)
+	{
+	  /* One more task has had its dependencies met.
+	     Inform any waiters.  */
+	  parent->taskwait->in_depend_wait = false;
+	  gomp_sem_post (&parent->taskwait->taskwait_sem);
+	}
+    }
+  if (taskgroup && taskgroup->in_taskgroup_wait)
+    {
+      /* One more task has had its dependencies met.
+	 Inform any waiters.  */
+      taskgroup->in_taskgroup_wait = false;
+      gomp_sem_post (&taskgroup->taskgroup_sem);
+    }
+
+  ++team->task_queued_count;
+  gomp_team_barrier_set_task_pending (&team->barrier);
+  /* I'm afraid this can't be done after releasing team->task_lock,
+     as gomp_target_task_completion is run from unrelated thread and
+     therefore in between gomp_mutex_unlock and gomp_team_barrier_wake
+     the team could be gone already.  */
+  if (team->nthreads > team->task_running_count)
+    gomp_team_barrier_wake (&team->barrier, 1);
+  gomp_mutex_unlock (&team->task_lock);
 }
 
 /* Given a parent_depends_on task in LIST, move it to the front of its
@@ -1041,7 +1184,20 @@  gomp_barrier_handle_tasks (gomp_barrier_
       if (child_task)
 	{
 	  thr->task = child_task;
-	  child_task->fn (child_task->fn_data);
+	  if (__builtin_expect (child_task->fn == NULL, 0))
+	    {
+	      if (gomp_target_task_fn (child_task->fn_data))
+		{
+		  thr->task = task;
+		  gomp_mutex_lock (&team->task_lock);
+		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+		  team->task_running_count--;
+		  child_task = NULL;
+		  continue;
+		}
+	    }
+	  else
+	    child_task->fn (child_task->fn_data);
 	  thr->task = task;
 	}
       else
@@ -1170,7 +1326,19 @@  GOMP_taskwait (void)
       if (child_task)
 	{
 	  thr->task = child_task;
-	  child_task->fn (child_task->fn_data);
+	  if (__builtin_expect (child_task->fn == NULL, 0))
+	    {
+	      if (gomp_target_task_fn (child_task->fn_data))
+		{
+		  thr->task = task;
+		  gomp_mutex_lock (&team->task_lock);
+		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+		  child_task = NULL;
+		  continue;
+		}
+	    }
+	  else
+	    child_task->fn (child_task->fn_data);
 	  thr->task = task;
 	}
       else
@@ -1342,7 +1510,19 @@  gomp_task_maybe_wait_for_dependencies (v
       if (child_task)
 	{
 	  thr->task = child_task;
-	  child_task->fn (child_task->fn_data);
+	  if (__builtin_expect (child_task->fn == NULL, 0))
+	    {
+	      if (gomp_target_task_fn (child_task->fn_data))
+		{
+		  thr->task = task;
+		  gomp_mutex_lock (&team->task_lock);
+		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+		  child_task = NULL;
+		  continue;
+		}
+	    }
+	  else
+	    child_task->fn (child_task->fn_data);
 	  thr->task = task;
 	}
       else
@@ -1450,8 +1630,8 @@  GOMP_taskgroup_end (void)
 		= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
 					    PQ_TEAM, &team->task_queue,
 					    &unused);
-            }
-          else
+	    }
+	  else
 	    {
 	      gomp_mutex_unlock (&team->task_lock);
 	      if (to_free)
@@ -1506,7 +1686,19 @@  GOMP_taskgroup_end (void)
       if (child_task)
 	{
 	  thr->task = child_task;
-	  child_task->fn (child_task->fn_data);
+	  if (__builtin_expect (child_task->fn == NULL, 0))
+	    {
+	      if (gomp_target_task_fn (child_task->fn_data))
+		{
+		  thr->task = task;
+		  gomp_mutex_lock (&team->task_lock);
+		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+		  child_task = NULL;
+		  continue;
+		}
+	    }
+	  else
+	    child_task->fn (child_task->fn_data);
 	  thr->task = task;
 	}
       else
--- libgomp/priority_queue.c.jj	2015-11-09 11:15:33.000000000 +0100
+++ libgomp/priority_queue.c	2015-11-10 17:52:33.769414428 +0100
@@ -85,7 +85,7 @@  priority_queue_task_in_queue_p (enum pri
    order.  LIST is a priority list of type TYPE.
 
    The expected order is that GOMP_TASK_WAITING tasks come before
-   GOMP_TASK_TIED ones.
+   GOMP_TASK_TIED/GOMP_TASK_ASYNC_RUNNING ones.
 
    If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING
    tasks come before !parent_depends_on WAITING tasks.  This is only
@@ -104,7 +104,7 @@  priority_list_verify (enum priority_queu
       struct gomp_task *t = priority_node_to_task (type, p);
       if (seen_tied && t->kind == GOMP_TASK_WAITING)
 	gomp_fatal ("priority_queue_verify: WAITING task after TIED");
-      if (t->kind == GOMP_TASK_TIED)
+      if (t->kind >= GOMP_TASK_TIED)
 	seen_tied = true;
       else if (check_deps && t->kind == GOMP_TASK_WAITING)
 	{
--- libgomp/libgomp.h.jj	2015-11-09 11:14:37.326239947 +0100
+++ libgomp/libgomp.h	2015-11-11 10:46:40.143794155 +0100
@@ -373,7 +373,12 @@  enum gomp_task_kind
   /* Task created by GOMP_task and waiting to be run.  */
   GOMP_TASK_WAITING,
   /* Task currently executing or scheduled and about to execute.  */
-  GOMP_TASK_TIED
+  GOMP_TASK_TIED,
+  /* Used for target tasks that have vars mapped and async run started,
+     but not yet completed.  Once that completes, they will be readded
+     into the queues as GOMP_TASK_WAITING in order to perform the var
+     unmapping.  */
+  GOMP_TASK_ASYNC_RUNNING
 };
 
 struct gomp_task_depend_entry
@@ -453,6 +458,8 @@  struct gomp_task
   struct gomp_task_depend_entry depend[];
 };
 
+/* This structure describes a single #pragma omp taskgroup.  */
+
 struct gomp_taskgroup
 {
   struct gomp_taskgroup *prev;
@@ -464,6 +471,8 @@  struct gomp_taskgroup
   size_t num_children;
 };
 
+/* This structure describes a target task.  */
+
 struct gomp_target_task
 {
   struct gomp_device_descr *devicep;
@@ -472,6 +481,10 @@  struct gomp_target_task
   size_t *sizes;
   unsigned short *kinds;
   unsigned int flags;
+  enum gomp_target_task_state state;
+  struct target_mem_desc *tgt;
+  struct gomp_task *task;
+  struct gomp_team *team;
   void *hostaddrs[];
 };
 
@@ -723,10 +736,10 @@  extern void gomp_init_task (struct gomp_
 extern void gomp_end_task (void);
 extern void gomp_barrier_handle_tasks (gomp_barrier_state_t);
 extern void gomp_task_maybe_wait_for_dependencies (void **);
-extern void gomp_create_target_task (struct gomp_device_descr *,
+extern bool gomp_create_target_task (struct gomp_device_descr *,
 				     void (*) (void *), size_t, void **,
 				     size_t *, unsigned short *, unsigned int,
-				     void **);
+				     void **, enum gomp_target_task_state);
 
 static void inline
 gomp_finish_task (struct gomp_task *task)
@@ -747,7 +760,7 @@  extern void gomp_free_thread (void *);
 
 extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
-extern void gomp_target_task_fn (void *);
+extern bool gomp_target_task_fn (void *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
@@ -901,6 +914,7 @@  struct gomp_device_descr
   void *(*host2dev_func) (int, void *, const void *, size_t);
   void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
+  void (*async_run_func) (int, void *, void *, void *);
 
   /* Splay tree containing information about mapped memory regions.  */
   struct splay_tree_s mem_map;