diff mbox

[hsa,merge,02/10] Modifications to libgomp proper

Message ID 20160113173925.431228371@virgil.suse.cz
State New
Headers show

Commit Message

Martin Jambor Jan. 13, 2016, 5:39 p.m. UTC
Hi,

The patch below contains all changes to libgomp files except for the
hsa plugin (which is in the following patch).

The patch is a re-post of
https://gcc.gnu.org/ml/gcc-patches/2015-12/msg01288.html but we have
incorporated a number of requests from the feedback.  From the
subsequent communications with Jakub, I have the feeling he is fine
with the changes.  But perhaps he or someone else would like to have
one more look.

Thanks,

Martin


2016-01-13  Martin Jambor  <mjambor@suse.cz>

include/
	* gomp-constants.h (GOMP_DEVICE_HSA): New macro.
	(GOMP_VERSION_HSA): Likewise.
	(GOMP_TARGET_ARG_DEVICE_MASK): Likewise.
	(GOMP_TARGET_ARG_DEVICE_ALL): Likewise.
	(GOMP_TARGET_ARG_SUBSEQUENT_PARAM): Likewise.
	(GOMP_TARGET_ARG_ID_MASK): Likewise.
	(GOMP_TARGET_ARG_NUM_TEAMS): Likewise.
	(GOMP_TARGET_ARG_THREAD_LIMIT): Likewise.
	(GOMP_TARGET_ARG_VALUE_SHIFT): Likewise.
	(GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES): Likewise.

libgomp/
	* libgomp-plugin.h (offload_target_type): New element
	OFFLOAD_TARGET_TYPE_HSA.
	* libgomp.h (gomp_target_task): New fields firstprivate_copies and
	args.
	(bool gomp_create_target_task): Updated.
	(gomp_device_descr): Extra parameter of run_func and async_run_func,
	new field can_run_func.
	* libgomp_g.h (GOMP_target_ext): Update prototype.
	* oacc-host.c (host_run): Added a new parameter args.
	* target.c (calculate_firstprivate_requirements): New function.
	(copy_firstprivate_data): Likewise.
	(gomp_target_fallback_firstprivate): Use them.
	(gomp_target_unshare_firstprivate): New function.
	(gomp_get_target_fn_addr): Allow returning NULL for shared memory
	devices.
	(GOMP_target): Do host fallback for all shared memory devices.  Do not
	pass any args to plugins.
	(GOMP_target_ext): Introduce device-specific argument parameter args.
	Allow host fallback if device shares memory.  Do not remap data if
	device has shared memory.
	(gomp_target_task_fn): Likewise.  Also treat shared memory devices
	like host fallback for mappings.
	(GOMP_target_data): Treat shared memory devices like host fallback.
	(GOMP_target_data_ext): Likewise.
	(GOMP_target_update): Likewise.
	(GOMP_target_update_ext): Likewise.  Also pass NULL as args to
	gomp_create_target_task.
	(GOMP_target_enter_exit_data): Likewise.
	(omp_target_alloc): Treat shared memory devices like host fallback.
	(omp_target_free): Likewise.
	(omp_target_is_present): Likewise.
	(omp_target_memcpy): Likewise.
	(omp_target_memcpy_rect): Likewise.
	(omp_target_associate_ptr): Likewise.
	(gomp_load_plugin_for_device): Also load can_run.
	* task.c (GOMP_PLUGIN_target_task_completion): Free
	firstprivate_copies.
	(gomp_create_target_task): Accept new argument args and store it to
	ttask.

liboffloadmic/plugin
	* libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_async_run): New unused
	parameter.
	(GOMP_OFFLOAD_run): Likewise.
diff mbox

Patch

diff --git a/include/gomp-constants.h b/include/gomp-constants.h
index dffd631..a8e7723 100644
--- a/include/gomp-constants.h
+++ b/include/gomp-constants.h
@@ -176,6 +176,7 @@  enum gomp_map_kind
 #define GOMP_DEVICE_NOT_HOST		4
 #define GOMP_DEVICE_NVIDIA_PTX		5
 #define GOMP_DEVICE_INTEL_MIC		6
+#define GOMP_DEVICE_HSA			7
 
 #define GOMP_DEVICE_ICV			-1
 #define GOMP_DEVICE_HOST_FALLBACK	-2
@@ -201,6 +202,7 @@  enum gomp_map_kind
 #define GOMP_VERSION	0
 #define GOMP_VERSION_NVIDIA_PTX 1
 #define GOMP_VERSION_INTEL_MIC 0
+#define GOMP_VERSION_HSA 0
 
 #define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
 #define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
@@ -228,4 +230,30 @@  enum gomp_map_kind
 #define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & 0xffff)
 #define GOMP_LAUNCH_OP_MAX 0xffff
 
+/* Bitmask to apply in order to find out the intended device of a target
+   argument.  */
+#define GOMP_TARGET_ARG_DEVICE_MASK		((1 << 7) - 1)
+/* The target argument is significant for all devices.  */
+#define GOMP_TARGET_ARG_DEVICE_ALL		0
+
+/* Flag set when the subsequent element in the device-specific argument
+   values.  */
+#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM	(1 << 7)
+
+/* Bitmask to apply to a target argument to find out the value identifier.  */
+#define GOMP_TARGET_ARG_ID_MASK			(((1 << 8) - 1) << 8)
+/* Target argument index of NUM_TEAMS.  */
+#define GOMP_TARGET_ARG_NUM_TEAMS		(1 << 8)
+/* Target argument index of THREAD_LIMIT.  */
+#define GOMP_TARGET_ARG_THREAD_LIMIT		(2 << 8)
+
+/* If the value is directly embeded in target argument, it should be a 16-bit
+   at most and shifted by this many bits.  */
+#define GOMP_TARGET_ARG_VALUE_SHIFT		16
+
+/* HSA specific data structures.  */
+
+/* Identifiers of device-specific target arguments.  */
+#define GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES	(1 << 8)
+
 #endif
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 64035e4..53f9248 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -48,7 +48,8 @@  enum offload_target_type
   OFFLOAD_TARGET_TYPE_HOST = 2,
   /* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed.  */
   OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5,
-  OFFLOAD_TARGET_TYPE_INTEL_MIC = 6
+  OFFLOAD_TARGET_TYPE_INTEL_MIC = 6,
+  OFFLOAD_TARGET_TYPE_HSA = 7
 };
 
 /* Auxiliary struct, used for transferring pairs of addresses from plugin
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 6ddde56..7108a6d 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -496,6 +496,10 @@  struct gomp_target_task
   struct target_mem_desc *tgt;
   struct gomp_task *task;
   struct gomp_team *team;
+  /* Copies of firstprivate mapped data for shared memory accelerators.  */
+  void *firstprivate_copies;
+  /* Device-specific target arguments.  */
+  void **args;
   void *hostaddrs[];
 };
 
@@ -750,7 +754,8 @@  extern void gomp_task_maybe_wait_for_dependencies (void **);
 extern bool gomp_create_target_task (struct gomp_device_descr *,
 				     void (*) (void *), size_t, void **,
 				     size_t *, unsigned short *, unsigned int,
-				     void **, enum gomp_target_task_state);
+				     void **, void **,
+				     enum gomp_target_task_state);
 
 static void inline
 gomp_finish_task (struct gomp_task *task)
@@ -937,8 +942,9 @@  struct gomp_device_descr
   void *(*dev2host_func) (int, void *, const void *, size_t);
   void *(*host2dev_func) (int, void *, const void *, size_t);
   void *(*dev2dev_func) (int, void *, const void *, size_t);
-  void (*run_func) (int, void *, void *);
-  void (*async_run_func) (int, void *, void *, void *);
+  bool (*can_run_func) (void *);
+  void (*run_func) (int, void *, void *, void **);
+  void (*async_run_func) (int, void *, void *, void **, void *);
 
   /* Splay tree containing information about mapped memory regions.  */
   struct splay_tree_s mem_map;
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index 6229ca0..24eebb6 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -278,8 +278,7 @@  extern void GOMP_single_copy_end (void *);
 extern void GOMP_target (int, void (*) (void *), const void *,
 			 size_t, void **, size_t *, unsigned char *);
 extern void GOMP_target_ext (int, void (*) (void *), size_t, void **, size_t *,
-			     unsigned short *, unsigned int, void **,
-			     int, int);
+			     unsigned short *, unsigned int, void **, void **);
 extern void GOMP_target_data (int, const void *,
 			      size_t, void **, size_t *, unsigned char *);
 extern void GOMP_target_data_ext (int, size_t, void **, size_t *,
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 0760e44..1e760f6 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -123,7 +123,8 @@  host_host2dev (int n __attribute__ ((unused)),
 }
 
 static void
-host_run (int n __attribute__ ((unused)), void *fn_ptr, void *vars)
+host_run (int n __attribute__ ((unused)), void *fn_ptr, void *vars,
+	  void **args __attribute__((unused)))
 {
   void (*fn)(void *) = (void (*)(void *)) fn_ptr;
 
diff --git a/libgomp/target.c b/libgomp/target.c
index bea5822..f1f5849 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1329,44 +1329,90 @@  gomp_target_fallback (void (*fn) (void *), void **hostaddrs)
   *thr = old_thr;
 }
 
-/* Host fallback with firstprivate map-type handling.  */
+/* Calculate alignment and size requirements of a private copy of data shared
+   as GOMP_MAP_FIRSTPRIVATE and store them to TGT_ALIGN and TGT_SIZE.  */
 
-static void
-gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
-				   void **hostaddrs, size_t *sizes,
-				   unsigned short *kinds)
+static inline void
+calculate_firstprivate_requirements (size_t mapnum, size_t *sizes,
+				     unsigned short *kinds, size_t *tgt_align,
+				     size_t *tgt_size)
 {
-  size_t i, tgt_align = 0, tgt_size = 0;
-  char *tgt = NULL;
+  size_t i;
+  for (i = 0; i < mapnum; i++)
+    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+      {
+	size_t align = (size_t) 1 << (kinds[i] >> 8);
+	if (*tgt_align < align)
+	  *tgt_align = align;
+	*tgt_size = (*tgt_size + align - 1) & ~(align - 1);
+	*tgt_size += sizes[i];
+      }
+}
+
+/* Copy data shared as GOMP_MAP_FIRSTPRIVATE to DST.  */
+
+static inline void
+copy_firstprivate_data (char *tgt, size_t mapnum, void **hostaddrs,
+			size_t *sizes, unsigned short *kinds, size_t tgt_align,
+			size_t tgt_size)
+{
+  uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+  if (al)
+    tgt += tgt_align - al;
+  tgt_size = 0;
+  size_t i;
   for (i = 0; i < mapnum; i++)
     if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
       {
 	size_t align = (size_t) 1 << (kinds[i] >> 8);
-	if (tgt_align < align)
-	  tgt_align = align;
 	tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	tgt_size += sizes[i];
+	memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+	hostaddrs[i] = tgt + tgt_size;
+	tgt_size = tgt_size + sizes[i];
       }
+}
+
+/* Host fallback with firstprivate map-type handling.  */
+
+static void
+gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
+				   void **hostaddrs, size_t *sizes,
+				   unsigned short *kinds)
+{
+  size_t tgt_align = 0, tgt_size = 0;
+  calculate_firstprivate_requirements (mapnum, sizes, kinds, &tgt_align,
+				       &tgt_size);
   if (tgt_align)
     {
-      tgt = gomp_alloca (tgt_size + tgt_align - 1);
-      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
-      if (al)
-	tgt += tgt_align - al;
-      tgt_size = 0;
-      for (i = 0; i < mapnum; i++)
-	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
-	  {
-	    size_t align = (size_t) 1 << (kinds[i] >> 8);
-	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	    memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
-	    hostaddrs[i] = tgt + tgt_size;
-	    tgt_size = tgt_size + sizes[i];
-	  }
+      char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+      copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds, tgt_align,
+			      tgt_size);
     }
   gomp_target_fallback (fn, hostaddrs);
 }
 
+/* Handle firstprivate map-type for shared memory devices and the host
+   fallback.  Return the pointer of firstprivate copies which has to be freed
+   after use.  */
+
+static void *
+gomp_target_unshare_firstprivate (size_t mapnum, void **hostaddrs,
+				  size_t *sizes, unsigned short *kinds)
+{
+  size_t tgt_align = 0, tgt_size = 0;
+  char *tgt = NULL;
+
+  calculate_firstprivate_requirements (mapnum, sizes, kinds, &tgt_align,
+				       &tgt_size);
+  if (tgt_align)
+    {
+      tgt = gomp_malloc (tgt_size + tgt_align - 1);
+      copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds, tgt_align,
+			      tgt_size);
+    }
+  return tgt;
+}
+
 /* Helper function of GOMP_target{,_ext} routines.  */
 
 static void *
@@ -1390,7 +1436,12 @@  gomp_get_target_fn_addr (struct gomp_device_descr *devicep,
       splay_tree_key tgt_fn = splay_tree_lookup (&devicep->mem_map, &k);
       gomp_mutex_unlock (&devicep->lock);
       if (tgt_fn == NULL)
-	gomp_fatal ("Target function wasn't mapped");
+	{
+	  if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+	    return NULL;
+	  else
+	    gomp_fatal ("Target function wasn't mapped");
+	}
 
       return (void *) tgt_fn->tgt_offset;
     }
@@ -1416,13 +1467,16 @@  GOMP_target (int device, void (*fn) (void *), const void *unused,
   void *fn_addr;
   if (devicep == NULL
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      /* All shared memory devices should use the GOMP_target_ext function.  */
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM
       || !(fn_addr = gomp_get_target_fn_addr (devicep, fn)))
     return gomp_target_fallback (fn, hostaddrs);
 
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
 		     GOMP_MAP_VARS_TARGET);
-  devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
+  devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start,
+		     NULL);
   gomp_unmap_vars (tgt_vars, true);
 }
 
@@ -1430,6 +1484,15 @@  GOMP_target (int device, void (*fn) (void *), const void *unused,
    and several arguments have been added:
    FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h.
    DEPEND is array of dependencies, see GOMP_task for details.
+
+   ARGS is a pointer to an array consisting of a variable number of both
+   device-independent and device-specific arguments, which can take one two
+   elements where the first specifies for which device it is intended, the type
+   and optionally also the value.  If the value is not present in the first
+   one, the whole second element the actual value.  The last element of the
+   array is a single NULL.  Among the device independent can be for example
+   NUM_TEAMS and THREAD_LIMIT.
+
    NUM_TEAMS is positive if GOMP_teams will be called in the body with
    that value, or 1 if teams construct is not present, or 0, if
    teams construct does not have num_teams clause and so the choice is
@@ -1443,14 +1506,10 @@  GOMP_target (int device, void (*fn) (void *), const void *unused,
 void
 GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
 		 void **hostaddrs, size_t *sizes, unsigned short *kinds,
-		 unsigned int flags, void **depend, int num_teams,
-		 int thread_limit)
+		 unsigned int flags, void **depend, void **args)
 {
   struct gomp_device_descr *devicep = resolve_device (device);
 
-  (void) num_teams;
-  (void) thread_limit;
-
   if (flags & GOMP_TARGET_FLAG_NOWAIT)
     {
       struct gomp_thread *thr = gomp_thread ();
@@ -1487,7 +1546,7 @@  GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
 	  && !thr->task->final_task)
 	{
 	  gomp_create_target_task (devicep, fn, mapnum, hostaddrs,
-				   sizes, kinds, flags, depend,
+				   sizes, kinds, flags, depend, args,
 				   GOMP_TARGET_TASK_BEFORE_MAP);
 	  return;
 	}
@@ -1507,17 +1566,30 @@  GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
   void *fn_addr;
   if (devicep == NULL
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-      || !(fn_addr = gomp_get_target_fn_addr (devicep, fn)))
+      || !(fn_addr = gomp_get_target_fn_addr (devicep, fn))
+      || (devicep->can_run_func && !devicep->can_run_func (fn_addr)))
     {
       gomp_target_fallback_firstprivate (fn, mapnum, hostaddrs, sizes, kinds);
       return;
     }
 
-  struct target_mem_desc *tgt_vars
-    = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		     GOMP_MAP_VARS_TARGET);
-  devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
-  gomp_unmap_vars (tgt_vars, true);
+  struct target_mem_desc *tgt_vars;
+  void *fpc = NULL;
+  if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+    {
+      fpc = gomp_target_unshare_firstprivate (mapnum, hostaddrs, sizes, kinds);
+      tgt_vars = NULL;
+    }
+  else
+    tgt_vars = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds,
+			      true, GOMP_MAP_VARS_TARGET);
+  devicep->run_func (devicep->target_id, fn_addr,
+		     tgt_vars ? (void *) tgt_vars->tgt_start : hostaddrs,
+		     args);
+  if (tgt_vars)
+    gomp_unmap_vars (tgt_vars, true);
+  else
+    free (fpc);
 }
 
 /* Host fallback for GOMP_target_data{,_ext} routines.  */
@@ -1547,7 +1619,8 @@  GOMP_target_data (int device, const void *unused, size_t mapnum,
   struct gomp_device_descr *devicep = resolve_device (device);
 
   if (devicep == NULL
-      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM))
     return gomp_target_data_fallback ();
 
   struct target_mem_desc *tgt
@@ -1565,7 +1638,8 @@  GOMP_target_data_ext (int device, size_t mapnum, void **hostaddrs,
   struct gomp_device_descr *devicep = resolve_device (device);
 
   if (devicep == NULL
-      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return gomp_target_data_fallback ();
 
   struct target_mem_desc *tgt
@@ -1595,7 +1669,8 @@  GOMP_target_update (int device, const void *unused, size_t mapnum,
   struct gomp_device_descr *devicep = resolve_device (device);
 
   if (devicep == NULL
-      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
   gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, false);
@@ -1626,7 +1701,7 @@  GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs,
 	      if (gomp_create_target_task (devicep, (void (*) (void *)) NULL,
 					   mapnum, hostaddrs, sizes, kinds,
 					   flags | GOMP_TARGET_FLAG_UPDATE,
-					   depend, GOMP_TARGET_TASK_DATA))
+					   depend, NULL, GOMP_TARGET_TASK_DATA))
 		return;
 	    }
 	  else
@@ -1646,7 +1721,8 @@  GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs,
     }
 
   if (devicep == NULL
-      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
   struct gomp_thread *thr = gomp_thread ();
@@ -1756,7 +1832,7 @@  GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
 	    {
 	      if (gomp_create_target_task (devicep, (void (*) (void *)) NULL,
 					   mapnum, hostaddrs, sizes, kinds,
-					   flags, depend,
+					   flags, depend, NULL,
 					   GOMP_TARGET_TASK_DATA))
 		return;
 	    }
@@ -1777,7 +1853,8 @@  GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
     }
 
   if (devicep == NULL
-      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
   struct gomp_thread *thr = gomp_thread ();
@@ -1815,7 +1892,8 @@  gomp_target_task_fn (void *data)
       void *fn_addr;
       if (devicep == NULL
 	  || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-	  || !(fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn)))
+	  || !(fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn))
+	  || (devicep->can_run_func && !devicep->can_run_func (fn_addr)))
 	{
 	  ttask->state = GOMP_TARGET_TASK_FALLBACK;
 	  gomp_target_fallback_firstprivate (ttask->fn, ttask->mapnum,
@@ -1826,22 +1904,36 @@  gomp_target_task_fn (void *data)
 
       if (ttask->state == GOMP_TARGET_TASK_FINISHED)
 	{
-	  gomp_unmap_vars (ttask->tgt, true);
+	  if (ttask->tgt)
+	    gomp_unmap_vars (ttask->tgt, true);
 	  return false;
 	}
 
-      ttask->tgt
-	= gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
-			 ttask->sizes, ttask->kinds, true,
-			 GOMP_MAP_VARS_TARGET);
+      void *actual_arguments;
+      if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+	{
+	  ttask->tgt = NULL;
+	  ttask->firstprivate_copies
+	    = gomp_target_unshare_firstprivate (ttask->mapnum, ttask->hostaddrs,
+						ttask->sizes, ttask->kinds);
+	  actual_arguments = ttask->hostaddrs;
+	}
+      else
+	{
+	  ttask->tgt = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs,
+				      NULL, ttask->sizes, ttask->kinds, true,
+				      GOMP_MAP_VARS_TARGET);
+	  actual_arguments = (void *) ttask->tgt->tgt_start;
+	}
       ttask->state = GOMP_TARGET_TASK_READY_TO_RUN;
 
-      devicep->async_run_func (devicep->target_id, fn_addr,
-			       (void *) ttask->tgt->tgt_start, (void *) ttask);
+      devicep->async_run_func (devicep->target_id, fn_addr, actual_arguments,
+			       ttask->args, (void *) ttask);
       return true;
     }
   else if (devicep == NULL
-	   || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	   || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+	   || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return false;
 
   size_t i;
@@ -1891,7 +1983,8 @@  omp_target_alloc (size_t size, int device_num)
   if (devicep == NULL)
     return NULL;
 
-  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return malloc (size);
 
   gomp_mutex_lock (&devicep->lock);
@@ -1919,7 +2012,8 @@  omp_target_free (void *device_ptr, int device_num)
   if (devicep == NULL)
     return;
 
-  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     {
       free (device_ptr);
       return;
@@ -1946,7 +2040,8 @@  omp_target_is_present (void *ptr, int device_num)
   if (devicep == NULL)
     return 0;
 
-  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return 1;
 
   gomp_mutex_lock (&devicep->lock);
@@ -1976,7 +2071,8 @@  omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
       if (dst_devicep == NULL)
 	return EINVAL;
 
-      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+	  || dst_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
 	dst_devicep = NULL;
     }
   if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
@@ -1988,7 +2084,8 @@  omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
       if (src_devicep == NULL)
 	return EINVAL;
 
-      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+	  || src_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
 	src_devicep = NULL;
     }
   if (src_devicep == NULL && dst_devicep == NULL)
@@ -2118,7 +2215,8 @@  omp_target_memcpy_rect (void *dst, void *src, size_t element_size,
       if (dst_devicep == NULL)
 	return EINVAL;
 
-      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+	  || dst_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
 	dst_devicep = NULL;
     }
   if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
@@ -2130,7 +2228,8 @@  omp_target_memcpy_rect (void *dst, void *src, size_t element_size,
       if (src_devicep == NULL)
 	return EINVAL;
 
-      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+	  || src_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
 	src_devicep = NULL;
     }
 
@@ -2166,7 +2265,8 @@  omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
   if (devicep == NULL)
     return EINVAL;
 
-  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return EINVAL;
 
   gomp_mutex_lock (&devicep->lock);
@@ -2309,6 +2409,7 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
     {
       DLSYM (run);
       DLSYM (async_run);
+      DLSYM_OPT (can_run, can_run);
       DLSYM (dev2dev);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
diff --git a/libgomp/task.c b/libgomp/task.c
index de9936e..90bdb1c 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -581,6 +581,7 @@  GOMP_PLUGIN_target_task_completion (void *data)
       gomp_mutex_unlock (&team->task_lock);
     }
   ttask->state = GOMP_TARGET_TASK_FINISHED;
+  free (ttask->firstprivate_copies);
   gomp_target_task_completion (team, task);
   gomp_mutex_unlock (&team->task_lock);
 }
@@ -593,7 +594,7 @@  bool
 gomp_create_target_task (struct gomp_device_descr *devicep,
 			 void (*fn) (void *), size_t mapnum, void **hostaddrs,
 			 size_t *sizes, unsigned short *kinds,
-			 unsigned int flags, void **depend,
+			 unsigned int flags, void **depend, void **args,
 			 enum gomp_target_task_state state)
 {
   struct gomp_thread *thr = gomp_thread ();
@@ -653,6 +654,7 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
   ttask->devicep = devicep;
   ttask->fn = fn;
   ttask->mapnum = mapnum;
+  ttask->args = args;
   memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *));
   ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum];
   memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t));
diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 68f7b2c..58ef595 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -528,7 +528,7 @@  GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const void *src_ptr,
 
 extern "C" void
 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
-			void *async_data)
+			void **, void *async_data)
 {
   TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
 	 tgt_fn, tgt_vars, async_data);
@@ -544,7 +544,7 @@  GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
 }
 
 extern "C" void
-GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
+GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars, void **)
 {
   TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars);