diff mbox series

[4/4] openacc: Profiling-interface fixes for asynchronous operations

Message ID e3b3a91de89f057376d576b527399963d33cfb50.1624987598.git.julian@codesourcery.com
State New
Headers show
Series openacc: Async fixes | expand

Commit Message

Julian Brown June 29, 2021, 11:42 p.m. UTC
This patch fixes some problems with the OpenACC profiling interface when
used with asynchronous offload operations. The profiling operations
themselves are now launched asynchronously, as previously they measured
the wrong thing, and/or executed at the same time as the operation they
were supposed to be measuring.

A consequence of this change is that "enqueueing" profiling callbacks
are no longer predictably ordered with respect to the callbacks
relating to the execution of asynchronous operations themselves. The
acc_prof-parallel-1.c test is un-XFAILed and adjusted accordingly.

This patch was posted for the og9 branch here:

  https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg01024.html

Tested with offloading to AMD GCN. OK for mainline?

Thanks,

Julian

2021-06-29  Julian Brown  <julian@codesourcery.com>

libgomp/
	* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
	function immediately.
	* oacc-mem.c (goacc_enter_exit_data_internal): Call
	queue_async_prof_dispatch for asynchronous profile-event dispatches.
	* oacc-parallel.c (struct async_prof_callback_info,
	async_prof_dispatch, queue_async_prof_dispatch): New.
	(GOACC_parallel_keyed): Call queue_async_prof_dispatch for asynchronous
	profile-event dispatches.
	(GOACC_update): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
	(cb_compute_construct_start): Remove/fix TODO.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: Remove
	XFAIL.
	(cb_exit_data_start): Tweak expected state values.
	(cb_exit_data_end): Likewise.
	(cb_compute_construct_start): Remove/fix TODO.
	(cb_compute_construct_end): Don't do adjustments for
	acc_ev_enqueue_launch_start/acc_ev_enqueue_launch_end callbacks.
	(cb_compute_construct_end): Tweak expected state values.
	(cb_enqueue_launch_start, cb_enqueue_launch_end): Don't expect
	launch-enqueue operations to happen synchronously with respect to
	profiling events on async streams.
	(main): Tweak expected state values.
---
 libgomp/oacc-host.c                           |   5 +-
 libgomp/oacc-mem.c                            |  32 ++-
 libgomp/oacc-parallel.c                       | 190 ++++++++++++++----
 .../acc_prof-init-1.c                         |   5 +-
 .../acc_prof-parallel-1.c                     |  66 ++----
 5 files changed, 194 insertions(+), 104 deletions(-)
diff mbox series

Patch

diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index f3bbd2b9c61..1cbff4caace 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -204,10 +204,9 @@  host_openacc_async_dev2host (int ord __attribute__ ((unused)),
 static void
 host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
 				   __attribute__ ((unused)),
-				   void (*callback_fn)(void *)
-				   __attribute__ ((unused)),
-				   void *userptr __attribute__ ((unused)))
+				   void (*callback_fn)(void *), void *userptr)
 {
+  callback_fn (userptr);
 }
 
 static struct goacc_asyncqueue *
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 5988db0b886..f0bd907cf07 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -1317,6 +1317,12 @@  goacc_exit_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum,
   gomp_mutex_unlock (&acc_dev->lock);
 }
 
+struct async_prof_callback_info *
+queue_async_prof_dispatch (struct gomp_device_descr *devicep, goacc_aq aq,
+			   acc_prof_info *prof_info, acc_event_info *event_info,
+			   acc_api_info *api_info,
+			   struct async_prof_callback_info *prev_info);
+
 static void
 goacc_enter_exit_data_internal (int flags_m, size_t mapnum, void **hostaddrs,
 				size_t *sizes, unsigned short *kinds,
@@ -1327,6 +1333,7 @@  goacc_enter_exit_data_internal (int flags_m, size_t mapnum, void **hostaddrs,
 
   struct goacc_thread *thr;
   struct gomp_device_descr *acc_dev;
+  struct async_prof_callback_info *data_start_info = NULL;
 
   goacc_lazy_initialize ();
 
@@ -1382,9 +1389,19 @@  goacc_enter_exit_data_internal (int flags_m, size_t mapnum, void **hostaddrs,
       api_info.async_handle = NULL;
     }
 
+  goacc_aq aq = get_goacc_asyncqueue (async);
+
   if (profiling_p)
-    goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-			      &api_info);
+    {
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
 
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
@@ -1398,8 +1415,6 @@  goacc_enter_exit_data_internal (int flags_m, size_t mapnum, void **hostaddrs,
   if (num_waits)
     goacc_wait (async, num_waits, ap);
 
-  goacc_aq aq = get_goacc_asyncqueue (async);
-
   if (data_enter)
     goacc_enter_data_internal (acc_dev, mapnum, hostaddrs, sizes, kinds, aq);
   else
@@ -1411,8 +1426,13 @@  goacc_enter_exit_data_internal (int flags_m, size_t mapnum, void **hostaddrs,
       prof_info.event_type
 	= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
       enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
 
       thr->prof_info = NULL;
       thr->api_info = NULL;
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 83625ba8a8e..3cc9f31d23b 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -93,6 +93,62 @@  handle_ftn_pointers (size_t mapnum, void **hostaddrs, size_t *sizes,
 }
 
 
+struct async_prof_callback_info {
+  acc_prof_info prof_info;
+  acc_event_info event_info;
+  acc_api_info api_info;
+  struct async_prof_callback_info *start_info;
+};
+
+static void
+async_prof_dispatch (void *ptr)
+{
+  struct async_prof_callback_info *info
+    = (struct async_prof_callback_info *) ptr;
+
+  if (info->start_info)
+    {
+      /* The TOOL_INFO must be preserved from a start event to the
+	 corresponding end event.  Copy that here.  */
+      void *tool_info = info->start_info->event_info.other_event.tool_info;
+      info->event_info.other_event.tool_info = tool_info;
+    }
+
+  goacc_profiling_dispatch (&info->prof_info, &info->event_info,
+			    &info->api_info);
+
+  /* The async_prof_dispatch function is (so far) always used for start/end
+     profiling event pairs: the start and end parts are queued, then each is
+     dispatched (or the dispatches might be interleaved before the end part is
+     queued).
+     In any case, it's not safe to delete either info structure before the
+     whole bracketed event is complete.  */
+
+  if (info->start_info)
+    {
+      free (info->start_info);
+      free (info);
+    }
+}
+
+struct async_prof_callback_info *
+queue_async_prof_dispatch (struct gomp_device_descr *devicep, goacc_aq aq,
+			   acc_prof_info *prof_info, acc_event_info *event_info,
+			   acc_api_info *api_info,
+			   struct async_prof_callback_info *prev_info)
+{
+  struct async_prof_callback_info *info = malloc (sizeof (*info));
+
+  info->prof_info = *prof_info;
+  info->event_info = *event_info;
+  info->api_info = *api_info;
+  info->start_info = prev_info;
+
+  devicep->openacc.async.queue_callback_func (aq, async_prof_dispatch,
+					      (void *) info);
+  return info;
+}
+
 /* Launch a possibly offloaded function with FLAGS.  FN is the host fn
    address.  MAPNUM, HOSTADDRS, SIZES & KINDS  describe the memory
    blocks to be copied to/from the device.  Varadic arguments are
@@ -117,6 +173,8 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
   int async = GOMP_ASYNC_SYNC;
   unsigned dims[GOMP_DIM_MAX];
   unsigned tag;
+  struct async_prof_callback_info *comp_start_info = NULL,
+				  *data_start_info = NULL;
 
 #ifdef HAVE_INTTYPES_H
   gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
@@ -178,28 +236,9 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
       api_info.async_handle = NULL;
     }
 
-  if (profiling_p)
-    goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
-			      &api_info);
-
   handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
-  /* Host fallback if "if" clause is false or if the current device is set to
-     the host.  */
-  if (flags & GOACC_FLAG_HOST_FALLBACK)
-    {
-      prof_info.device_type = acc_device_host;
-      api_info.device_type = prof_info.device_type;
-      goacc_save_and_set_bind (acc_device_host);
-      fn (hostaddrs);
-      goacc_restore_bind ();
-      goto out_prof;
-    }
-  else if (acc_device_type (acc_dev->type) == acc_device_host)
-    {
-      fn (hostaddrs);
-      goto out_prof;
-    }
+  goacc_aq aq = NULL;
 
   /* Default: let the runtime choose.  */
   for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -233,11 +272,12 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 	    if (async == GOMP_LAUNCH_OP_MAX)
 	      async = va_arg (ap, unsigned);
 
-	    if (profiling_p)
-	      {
-		prof_info.async = async;
-		prof_info.async_queue = prof_info.async;
-	      }
+	    /* Set async number in profiling data, unless the device is the
+	       host or we're doing host fallback.  */
+	    if (profiling_p
+		&& !(flags & GOACC_FLAG_HOST_FALLBACK)
+		&& acc_device_type (acc_dev->type) != acc_device_host)
+	      prof_info.async = prof_info.async_queue = async;
 
 	    break;
 	  }
@@ -255,7 +295,38 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 	}
     }
   va_end (ap);
-  
+
+  aq = get_goacc_asyncqueue (async);
+
+  if (profiling_p)
+    {
+      if (aq)
+	comp_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &compute_construct_event_info,
+				       &api_info, NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				  &api_info);
+    }
+
+  /* Host fallback if "if" clause is false or if the current device is set to
+     the host.  */
+  if (flags & GOACC_FLAG_HOST_FALLBACK)
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+      goacc_save_and_set_bind (acc_device_host);
+      fn (hostaddrs);
+      goacc_restore_bind ();
+      goto out_prof;
+    }
+  else if (acc_device_type (acc_dev->type) == acc_device_host)
+    {
+      fn (hostaddrs);
+      goto out_prof;
+    }
+
   if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
     {
       k.host_start = (uintptr_t) fn;
@@ -284,12 +355,16 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 	= compute_construct_event_info.other_event.parent_construct;
       enter_exit_data_event_info.other_event.implicit = 1;
       enter_exit_data_event_info.other_event.tool_info = NULL;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
-  goacc_aq aq = get_goacc_asyncqueue (async);
-
   tgt = goacc_map_vars (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
 			true, 0);
   if (profiling_p)
@@ -297,8 +372,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
       prof_info.event_type = acc_ev_enter_data_end;
       enter_exit_data_event_info.other_event.event_type
 	= prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
@@ -317,8 +397,14 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
       prof_info.event_type = acc_ev_exit_data_start;
       enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
       enter_exit_data_event_info.other_event.tool_info = NULL;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
   /* If running synchronously (aq == NULL), this will unmap immediately.  */
@@ -328,8 +414,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
     {
       prof_info.event_type = acc_ev_exit_data_end;
       enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
     }
 
  out_prof:
@@ -338,8 +429,13 @@  GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
       prof_info.event_type = acc_ev_compute_construct_end;
       compute_construct_event_info.other_event.event_type
 	= prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
-				&api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &compute_construct_event_info, &api_info,
+				   comp_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				  &api_info);
 
       thr->prof_info = NULL;
       thr->api_info = NULL;
@@ -565,6 +661,8 @@  GOACC_update (int flags_m, size_t mapnum,
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
+  goacc_aq aq = NULL;
+  struct async_prof_callback_info *update_start_info = NULL;
 
   bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
 
@@ -614,7 +712,15 @@  GOACC_update (int flags_m, size_t mapnum,
     }
 
   if (profiling_p)
-    goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+    {
+      aq = get_goacc_asyncqueue (async);
+      if (aq)
+	update_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &update_event_info, &api_info, NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+    }
 
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
@@ -701,7 +807,11 @@  GOACC_update (int flags_m, size_t mapnum,
     {
       prof_info.event_type = acc_ev_update_end;
       update_event_info.other_event.event_type = prof_info.event_type;
-      goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info, &update_event_info,
+				   &api_info, update_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
 
       thr->prof_info = NULL;
       thr->api_info = NULL;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
index 7d05f482f46..72cf6305bcc 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -159,7 +159,10 @@  static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info
   assert (prof_info->device_type == acc_device_type);
   assert (prof_info->device_number == acc_device_num);
   assert (prof_info->thread_id == -1);
-  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
   assert (prof_info->async_queue == prof_info->async);
   assert (prof_info->src_file == NULL);
   assert (prof_info->func_name == NULL);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
index dc1807c6ce4..9c8af743aba 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -1,5 +1,3 @@ 
-/* { dg-xfail-run-if "Async profiling bug" { *-*-* } } */
-
 /* Test dispatch of events to callbacks.  */
 
 #undef NDEBUG
@@ -286,9 +284,9 @@  static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_
 {
   DEBUG_printf ("%s\n", __FUNCTION__);
 
-  assert (state == 7
+  assert (state == 5
 #if ASYNC_EXIT_DATA
-	  || state == 107
+	  || state == 105
 #endif
 	  );
   STATE_OP (state, ++);
@@ -342,9 +340,9 @@  static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_in
 {
   DEBUG_printf ("%s\n", __FUNCTION__);
 
-  assert (state == 8
+  assert (state == 6
 #if ASYNC_EXIT_DATA
-	  || state == 108
+	  || state == 106
 #endif
 	  );
   STATE_OP (state, ++);
@@ -428,7 +426,10 @@  static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info
   assert (prof_info->device_type == acc_device_type);
   assert (prof_info->device_number == acc_device_num);
   assert (prof_info->thread_id == -1);
-  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
   assert (prof_info->async_queue == prof_info->async);
   assert (prof_info->src_file == NULL);
   assert (prof_info->func_name == NULL);
@@ -469,9 +470,6 @@  static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
     {
       /* Compensate for the missing 'acc_ev_enter_data_end'.  */
       state += 1;
-      /* Compensate for the missing 'acc_ev_enqueue_launch_start' and
-	 'acc_ev_enqueue_launch_end'.  */
-      state += 2;
       /* Compensate for the missing 'acc_ev_exit_data_start' and
 	 'acc_ev_exit_data_end'.  */
       state += 2;
@@ -484,8 +482,8 @@  static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
       state += 2;
     }
 #endif
-  assert (state == 9
-	  || state == 109);
+  assert (state == 7
+	  || state == 107);
   STATE_OP (state, ++);
 
   assert (tool_info != NULL);
@@ -539,17 +537,6 @@  static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
 
   assert (acc_device_type != acc_device_host);
 
-  assert (state == 5
-	  || state == 105);
-  STATE_OP (state, ++);
-
-  assert (tool_info != NULL);
-  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
-  assert (tool_info->nested == NULL);
-  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
-  assert (tool_info->nested != NULL);
-  tool_info->nested->nested = NULL;
-
   assert (prof_info->event_type == acc_ev_enqueue_launch_start);
   assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
   assert (prof_info->version == _ACC_PROF_INFO_VERSION);
@@ -593,13 +580,6 @@  static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
   assert (api_info->device_handle == NULL);
   assert (api_info->context_handle == NULL);
   assert (api_info->async_handle == NULL);
-
-  tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
-  tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
-  tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
-  tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
-  tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
-  event_info->other_event.tool_info = tool_info->nested;
 }
 
 static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
@@ -608,19 +588,6 @@  static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
 
   assert (acc_device_type != acc_device_host);
 
-  assert (state == 6
-	  || state == 106);
-  STATE_OP (state, ++);
-
-  assert (tool_info != NULL);
-  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
-  assert (tool_info->nested != NULL);
-  assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
-  assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
-  assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
-  assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
-  assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
-
   assert (prof_info->event_type == acc_ev_enqueue_launch_end);
   assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
   assert (prof_info->version == _ACC_PROF_INFO_VERSION);
@@ -640,12 +607,7 @@  static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
   assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
   assert (event_info->launch_event.parent_construct == acc_construct_parallel);
   assert (event_info->launch_event.implicit == 1);
-  assert (event_info->launch_event.tool_info == tool_info->nested);
   assert (event_info->launch_event.kernel_name != NULL);
-  assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
-  assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
-  assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
-  assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
 
   if (acc_device_type == acc_device_host)
     assert (api_info->device_api == acc_device_api_none);
@@ -659,10 +621,6 @@  static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
   assert (api_info->device_handle == NULL);
   assert (api_info->context_handle == NULL);
   assert (api_info->async_handle == NULL);
-
-  free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
-  free (tool_info->nested);
-  tool_info->nested = NULL;
 }
 
 
@@ -711,7 +669,7 @@  int main()
     }
     assert (state_init == 4);
   }
-  assert (state == 10);
+  assert (state == 8);
 
   STATE_OP (state, = 100);
 
@@ -727,7 +685,7 @@  int main()
 #pragma acc wait
     assert (state_init == 104);
   }
-  assert (state == 110);
+  assert (state == 108);
 
   return 0;
 }