diff mbox

[hsa,2/10] Modifications to libgomp proper

Message ID 20151210175223.GH3534@virgil.suse.cz
State New
Headers show

Commit Message

Martin Jambor Dec. 10, 2015, 5:52 p.m. UTC
Hi,

thanks for the feedback.  I have incorporated most of it into the
branch (the diff is below) but  also have a few questions.

On Wed, Dec 09, 2015 at 12:35:36PM +0100, Jakub Jelinek wrote:
> On Mon, Dec 07, 2015 at 12:19:57PM +0100, Martin Jambor wrote:
> > +/* Flag set when the subsequent element in the device-specific argument
> > +   values.  */
> > +#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM	(1 << 7)
> > +
> > +/* Bitmask to apply to a target argument to find out the value identifier.  */
> > +#define GOMP_TARGET_ARG_ID_MASK			(((1 << 8) - 1) << 8)
> > +/* Target argument index of NUM_TEAMS.  */
> > +#define GOMP_TARGET_ARG_NUM_TEAMS		(1 << 8)
> > +/* Target argument index of THREAD_LIMIT.  */
> > +#define GOMP_TARGET_ARG_THREAD_LIMIT		(2 << 8)
> 
> I meant that these two would be just special, passed as the first two
> pointers in the array, without the markup.  Because, otherwise you either
> need to use GOMP_TARGET_ARG_SUBSEQUENT_PARAM for these always, or for 32-bit
> arches and for 64-bit ones shift often at runtime.  Having the markup even
> for them is perhaps cleaner, but less efficient, so if you really want to go
> that way, please make sure you handle it properly for 32-bit pointers
> architectures though.  num_teams or thread_limit could be > 32767 or >
> 65535.

I see, I prefer the clean approach, even if it is more work, this
interface looks like it is going to be extended in the future.  But I
am wondering whether embedding the value into the identifier element
is actually worth it.  The passed array is going to be a small local
variable and I wonder whether there is going to be any benefit in it
having two elements instead of four (or four instead of six for
gridified kernels), especially if it means introducing control flow on
the part of the caller.  But if you really want it that way, I will
implement that.

> 
> > -static void
> > -gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
> > -				   void **hostaddrs, size_t *sizes,
> > -				   unsigned short *kinds)
> > +static void *
> > +gomp_target_unshare_firstprivate (size_t mapnum, void **hostaddrs,
> > +				  size_t *sizes, unsigned short *kinds)
> >  {
> >    size_t i, tgt_align = 0, tgt_size = 0;
> >    char *tgt = NULL;
> > @@ -1281,7 +1282,7 @@ gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
> >        }
> >    if (tgt_align)
> >      {
> > -      tgt = gomp_alloca (tgt_size + tgt_align - 1);
> > +      tgt = gomp_malloc (tgt_size + tgt_align - 1);
> 
> I don't like using gomp_malloc here, either copy/paste the function, or
> create separate inline functions for the two loops, one for the first loop
> which returns you tgt_align and tgt_size, and another for the stuff after
> the allocation.  Then you can use those two inline functions to implement
> both gomp_target_fallback_firstprivate which will use alloca, and
> gomp_target_unshare_firstprivate which will use gomp_malloc instead.

OK, I did that.

> 
> > @@ -1356,6 +1377,11 @@ GOMP_target (int device, void (*fn) (void *), const void *unused,
> >     and several arguments have been added:
> >     FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h.
> >     DEPEND is array of dependencies, see GOMP_task for details.
> > +   ARGS is a pointer to an array consisting of NUM_TEAMS, THREAD_LIMIT and a
> > +   variable number of device-specific arguments, which always take two elements
> > +   where the first specifies the type and the second the actual value.  The
> > +   last element of the array is a single NULL.
> 
> Note, here you document NUM_TEAMS and THREAD_LIMIT as special values, not
> encoded.

I have changed the comment but will remember to do it again if
necessary after changing omp-low.c

> 
> > @@ -1473,6 +1508,7 @@ GOMP_target_data (int device, const void *unused, size_t mapnum,
> >    struct gomp_device_descr *devicep = resolve_device (device);
> >  
> >    if (devicep == NULL
> > +      || (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> >        || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
> 
> Would be nice to have some consistency in the order of capabilities checks.
> Usually you check SHARED_MEM after OPENMP_400, so perhaps do it this way
> here too.

Sure.

> 
> > @@ -1741,23 +1784,38 @@ gomp_target_task_fn (void *data)
> >  
> >        if (ttask->state == GOMP_TARGET_TASK_FINISHED)
> >  	{
> > -	  gomp_unmap_vars (ttask->tgt, true);
> > +	  if (ttask->tgt)
> > +	    gomp_unmap_vars (ttask->tgt, true);
> >  	  return false;
> >  	}
> 
> This doesn't make sense.  For the GOMP_OFFLOAD_CAP_SHARED_MEM case, unless
> you want to run the free (ttask->firstprivate_copies); as a task, you
> shouldn't be queing the target task again for further execution, instead
> it should just be handled like a finished task at that point.  The reason
> why for XeonPhi or PTX gomp_target_task_fn is run with
> GOMP_TARGET_TASK_FINISHED state is that gomp_unmap_vars can perform IO
> actions and doing it with the tasking lock held is highly undesirable.

OK, so if I understand this correctly, I should not be re-queing the
task in gomp_target_task_completion.  I have left the check to be
NULLness of ttask->tgt but can test the capability if you prefer (but
I at least hope the two options are equivalent).

> >  
> > -      void *fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn);
> > -      ttask->tgt
> > -	= gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
> > -			 ttask->sizes, ttask->kinds, true,
> > -			 GOMP_MAP_VARS_TARGET);
> > +      bool shared_mem;
> > +      if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> > +	{
> > +	  shared_mem = true;
> > +	  ttask->tgt = NULL;
> > +	  ttask->firstprivate_copies
> > +	    = gomp_target_unshare_firstprivate (ttask->mapnum, ttask->hostaddrs,
> > +						ttask->sizes, ttask->kinds);
> > +	}
> > +      else
> > +	{
> > +	  shared_mem = false;
> > +	  ttask->tgt = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs,
> > +				      NULL, ttask->sizes, ttask->kinds, true,
> > +				      GOMP_MAP_VARS_TARGET);
> > +	}
> >        ttask->state = GOMP_TARGET_TASK_READY_TO_RUN;
> >  
> >        devicep->async_run_func (devicep->target_id, fn_addr,
> > -			       (void *) ttask->tgt->tgt_start, (void *) ttask);
> > +			       shared_mem ? ttask->hostaddrs
> > +			       : (void *) ttask->tgt->tgt_start,
> > +			       ttask->args, (void *) ttask);
> 
> Replace the shared_mem bool variable with a void *arg or some other name and just
> set it to ttask->hostaddrs in the then case and ttask->tgt->tgt_start
> otherwise?

OK.

> 
> > --- a/libgomp/task.c
> > +++ b/libgomp/task.c
> > @@ -581,6 +581,7 @@ GOMP_PLUGIN_target_task_completion (void *data)
> >        gomp_mutex_unlock (&team->task_lock);
> >      }
> >    ttask->state = GOMP_TARGET_TASK_FINISHED;
> > +  free (ttask->firstprivate_copies);
> >    gomp_target_task_completion (team, task);
> >    gomp_mutex_unlock (&team->task_lock);
> >  }
> 
> So, this function should have a special case for the SHARED_MEM case, handle
> it closely to say how GOMP_taskgroup_end handles the finish_cancelled:
> case.  Just note that the target task is missing from certain queues at that
> point.

I'm afraid I need some help here.  I do not quite understand how is
finish_cancelled in GOMP_taskgroup_end similar, it seems to be doing
much more than freeing one pointer.  What is exactly the issue with
the above?

Nevertheless, after reading through bits of task.c again, I wonder
whether any copying (for both shared memory target and the host) in
gomp_target_task_fn is actually necessary because it seems to be also
done in gomp_create_target_task.  Does that not apply somehow?

In any event, I am going to apply the following to the branch.  Thanks
a lot for your comments,

Martin


2015-12-10  Martin Jambor  <mjambor@suse.cz>

	* target.c (calculate_firstprivate_requirements): New function.
	(copy_firstprivate_data): Likewise.
	(gomp_target_fallback_firstprivate): Use them.
	(gomp_target_unshare_firstprivate): Likewise.
	(GOMP_target_ext): Amended comment
	(GOMP_target_data): Changed order of device->capability testing.
	(gomp_target_task_fn): Do not check there is something to unmap in
	GOMP_TARGET_TASK_FINISHED case.  Remove ternary operator in an
	argument of async_run_func.

Test separating firstprivate copying.
---
 libgomp/target.c | 124 +++++++++++++++++++++++++++++++++++--------------------
 libgomp/task.c   |   9 ++--
 2 files changed, 85 insertions(+), 48 deletions(-)

Comments

Jakub Jelinek Dec. 11, 2015, 6:05 p.m. UTC | #1
On Thu, Dec 10, 2015 at 06:52:23PM +0100, Martin Jambor wrote:
> I see, I prefer the clean approach, even if it is more work, this
> interface looks like it is going to be extended in the future.  But I
> am wondering whether embedding the value into the identifier element
> is actually worth it.  The passed array is going to be a small local
> variable and I wonder whether there is going to be any benefit in it
> having two elements instead of four (or four instead of six for
> gridified kernels), especially if it means introducing control flow on
> the part of the caller.  But if you really want it that way, I will
> implement that.

I'm fine with implementing the two (num_threads and thread_limit) always
as separate argument, or perhaps what you could do is if the argument
is constant and fits into the signed 16 bits on 32-bit arches (or any
constant, that fits into 48 bits), use embedded argument (then there is no
extra runtime cost), and if it is variable and you'd need to shift,
put it as separate argument.

> OK, so if I understand this correctly, I should not be re-queing the
> task in gomp_target_task_completion.  I have left the check to be
> NULLness of ttask->tgt but can test the capability if you prefer (but
> I at least hope the two options are equivalent).

> > > --- a/libgomp/task.c
> > > +++ b/libgomp/task.c
> > > @@ -581,6 +581,7 @@ GOMP_PLUGIN_target_task_completion (void *data)
> > >        gomp_mutex_unlock (&team->task_lock);
> > >      }
> > >    ttask->state = GOMP_TARGET_TASK_FINISHED;
> > > +  free (ttask->firstprivate_copies);
> > >    gomp_target_task_completion (team, task);
> > >    gomp_mutex_unlock (&team->task_lock);
> > >  }
> > 
> > So, this function should have a special case for the SHARED_MEM case, handle
> > it closely to say how GOMP_taskgroup_end handles the finish_cancelled:
> > case.  Just note that the target task is missing from certain queues at that
> > point.
> 
> I'm afraid I need some help here.  I do not quite understand how is
> finish_cancelled in GOMP_taskgroup_end similar, it seems to be doing
> much more than freeing one pointer.  What is exactly the issue with
> the above?
> 
> Nevertheless, after reading through bits of task.c again, I wonder
> whether any copying (for both shared memory target and the host) in
> gomp_target_task_fn is actually necessary because it seems to be also
> done in gomp_create_target_task.  Does that not apply somehow?

The target task is scheduled for the first action as normal task, and the
scheduling of it already removes it from some of the queues (each task is
put into 1-3 queues), i.e. actions performed mostly by
gomp_task_run_pre.  Then the team task lock is unlocked and the task is run.
Finally, for normal tasks, gomp_task_run_post_handle_depend,
gomp_task_run_post_remove_parent, etc. is run.  Now, for async target tasks
that have something running on some other device at that point, we don't do
that, but instead make it GOMP_TASK_ASYNC_RUNNING.  And continue with other
stuff, until gomp_target_task_completion is run.
For non-shared mem that needs to readd the task again into the queues, so
that it will be scheduled again.  But you don't need that for shared mem
target tasks, they can just free the firstprivate_copies and finalize the
task.
At the time gomp_target_task_completion is called, the task is pretty much
in the same state as it is around the finish_cancelled:; label.
So instead of what the gomp_target_task_completion function does,
you would for SHARED_MEM do something like:
          size_t new_tasks
            = gomp_task_run_post_handle_depend (task, team);
          gomp_task_run_post_remove_parent (task);
          gomp_clear_parent (&task->children_queue);
          gomp_task_run_post_remove_taskgroup (task);
          team->task_count--;
	  do_wake = 0;
          if (new_tasks > 1)
            {
              do_wake = team->nthreads - team->task_running_count
                        - !task->in_tied_task;
              if (do_wake > new_tasks)
                do_wake = new_tasks;
            }
// Unlike other places, the following will be also run with the
// task_lock held, but I'm afraid there is nothing to do about it.
// See the comment in gomp_target_task_completion.
	  gomp_finish_task (task);
	  free (task);
	  if (do_wake)
	    gomp_team_barrier_wake (&team->barrier, do_wake);

	Jakub
diff mbox

Patch

diff --git a/libgomp/target.c b/libgomp/target.c
index b453c0c..886e727 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1261,43 +1261,47 @@  gomp_target_fallback (void (*fn) (void *), void **hostaddrs)
   *thr = old_thr;
 }
 
-/* Handle firstprivate map-type for shared memory devices and the host
-   fallback.  Return the pointer of firstprivate copies which has to be freed
-   after use.  */
+/* Calculate alignment and size requirements of a private copy of data shared
+   as GOMP_MAP_FIRSTPRIVATE and store them to TGT_ALIGN and TGT_SIZE.  */
 
-static void *
-gomp_target_unshare_firstprivate (size_t mapnum, void **hostaddrs,
-				  size_t *sizes, unsigned short *kinds)
+static inline void
+calculate_firstprivate_requirements (size_t mapnum, size_t *sizes,
+				     unsigned short *kinds, size_t *tgt_align,
+				     size_t *tgt_size)
 {
-  size_t i, tgt_align = 0, tgt_size = 0;
-  char *tgt = NULL;
+  size_t i;
+  for (i = 0; i < mapnum; i++)
+    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+      {
+	size_t align = (size_t) 1 << (kinds[i] >> 8);
+	if (*tgt_align < align)
+	  *tgt_align = align;
+	*tgt_size = (*tgt_size + align - 1) & ~(align - 1);
+	*tgt_size += sizes[i];
+      }
+}
+
+/* Copy data shared as GOMP_MAP_FIRSTPRIVATE to DST.  */
+
+static inline void
+copy_firstprivate_data (char *tgt, size_t mapnum, void **hostaddrs,
+			size_t *sizes, unsigned short *kinds, size_t tgt_align,
+			size_t tgt_size)
+{
+  uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+  if (al)
+    tgt += tgt_align - al;
+  tgt_size = 0;
+  size_t i;
   for (i = 0; i < mapnum; i++)
     if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
       {
 	size_t align = (size_t) 1 << (kinds[i] >> 8);
-	if (tgt_align < align)
-	  tgt_align = align;
 	tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	tgt_size += sizes[i];
+	memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+	hostaddrs[i] = tgt + tgt_size;
+	tgt_size = tgt_size + sizes[i];
       }
-  if (tgt_align)
-    {
-      tgt = gomp_malloc (tgt_size + tgt_align - 1);
-      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
-      if (al)
-	tgt += tgt_align - al;
-      tgt_size = 0;
-      for (i = 0; i < mapnum; i++)
-	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
-	  {
-	    size_t align = (size_t) 1 << (kinds[i] >> 8);
-	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	    memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
-	    hostaddrs[i] = tgt + tgt_size;
-	    tgt_size = tgt_size + sizes[i];
-	  }
-    }
-  return tgt;
 }
 
 /* Host fallback with firstprivate map-type handling.  */
@@ -1307,9 +1311,38 @@  gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
 				   void **hostaddrs, size_t *sizes,
 				   unsigned short *kinds)
 {
-  void *fpc = gomp_target_unshare_firstprivate (mapnum, hostaddrs, sizes, kinds);
+  size_t tgt_align = 0, tgt_size = 0;
+  calculate_firstprivate_requirements (mapnum, sizes, kinds, &tgt_align,
+				       &tgt_size);
+  if (tgt_align)
+    {
+      char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+      copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds, tgt_align,
+			      tgt_size);
+    }
   gomp_target_fallback (fn, hostaddrs);
-  free (fpc);
+}
+
+/* Handle firstprivate map-type for shared memory devices and the host
+   fallback.  Return the pointer of firstprivate copies which has to be freed
+   after use.  */
+
+static void *
+gomp_target_unshare_firstprivate (size_t mapnum, void **hostaddrs,
+				  size_t *sizes, unsigned short *kinds)
+{
+  size_t tgt_align = 0, tgt_size = 0;
+  char *tgt = NULL;
+
+  calculate_firstprivate_requirements (mapnum, sizes, kinds, &tgt_align,
+				       &tgt_size);
+  if (tgt_align)
+    {
+      tgt = gomp_malloc (tgt_size + tgt_align - 1);
+      copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds, tgt_align,
+			      tgt_size);
+    }
+  return tgt;
 }
 
 /* Helper function of GOMP_target{,_ext} routines.  */
@@ -1377,10 +1410,14 @@  GOMP_target (int device, void (*fn) (void *), const void *unused,
    and several arguments have been added:
    FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h.
    DEPEND is array of dependencies, see GOMP_task for details.
-   ARGS is a pointer to an array consisting of NUM_TEAMS, THREAD_LIMIT and a
-   variable number of device-specific arguments, which always take two elements
-   where the first specifies the type and the second the actual value.  The
-   last element of the array is a single NULL.
+
+   ARGS is a pointer to an array consisting of a variable number of both
+   device-independent and device-specific arguments, which can take one two
+   elements where the first specifies for which device it is intended, the type
+   and optionally also the value.  If the value is not present in the first
+   one, the whole second element the actual value.  The last element of the
+   array is a single NULL.  Among the device independent can be for example
+   NUM_TEAMS and THREAD_LIMIT.
 
    NUM_TEAMS is positive if GOMP_teams will be called in the body with
    that value, or 1 if teams construct is not present, or 0, if
@@ -1508,8 +1545,8 @@  GOMP_target_data (int device, const void *unused, size_t mapnum,
   struct gomp_device_descr *devicep = resolve_device (device);
 
   if (devicep == NULL
-      || (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM))
     return gomp_target_data_fallback ();
 
   struct target_mem_desc *tgt
@@ -1784,32 +1821,29 @@  gomp_target_task_fn (void *data)
 
       if (ttask->state == GOMP_TARGET_TASK_FINISHED)
 	{
-	  if (ttask->tgt)
-	    gomp_unmap_vars (ttask->tgt, true);
+	  gomp_unmap_vars (ttask->tgt, true);
 	  return false;
 	}
 
-      bool shared_mem;
+      void *actual_arguments;
       if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
 	{
-	  shared_mem = true;
 	  ttask->tgt = NULL;
 	  ttask->firstprivate_copies
 	    = gomp_target_unshare_firstprivate (ttask->mapnum, ttask->hostaddrs,
 						ttask->sizes, ttask->kinds);
+	  actual_arguments = ttask->hostaddrs;
 	}
       else
 	{
-	  shared_mem = false;
 	  ttask->tgt = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs,
 				      NULL, ttask->sizes, ttask->kinds, true,
 				      GOMP_MAP_VARS_TARGET);
+	  actual_arguments = (void *) ttask->tgt->tgt_start;
 	}
       ttask->state = GOMP_TARGET_TASK_READY_TO_RUN;
 
-      devicep->async_run_func (devicep->target_id, fn_addr,
-			       shared_mem ? ttask->hostaddrs
-			       : (void *) ttask->tgt->tgt_start,
+      devicep->async_run_func (devicep->target_id, fn_addr, actual_arguments,
 			       ttask->args, (void *) ttask);
       return true;
     }
diff --git a/libgomp/task.c b/libgomp/task.c
index aa6bd67..3c93bc6 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -524,9 +524,12 @@  gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task)
     priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
 				    task);
 
-  priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
-			 PRIORITY_INSERT_BEGIN, false,
-			 task->parent_depends_on);
+  struct gomp_target_task *ttask;
+  ttask = (struct gomp_target_task *) task->fn_data;
+  if (ttask->tgt)
+    priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
+			   PRIORITY_INSERT_BEGIN, false,
+			   task->parent_depends_on);
   task->kind = GOMP_TASK_WAITING;
   if (parent && parent->taskwait)
     {