Message ID | 4f2750a1-9935-6629-b7fd-ce6280f902c0@mentor.com |
---|---|
State | New |
Headers | show |
Series | [OpenACC,2.5,libgomp] Add *_async versions of runtime library API functions | expand |
On 09/10/2018 08:04 AM, Chung-Lin Tang wrote: > GOACC_2.0 { > Index: libgomp/oacc-mem.c > =================================================================== > --- libgomp/oacc-mem.c (revision 264192) > +++ libgomp/oacc-mem.c (working copy) > @@ -153,8 +153,9 @@ acc_free (void *d) > gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); > } > > -void > -acc_memcpy_to_device (void *d, void *h, size_t s) > +static void > +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, > + const char *libfnname) This showed up oddly in the diff, but memcpy_tofrom_device is a new internal function that's not part of the public API. It's nice that you were able to merge the to/from functions together. I think this is safe in terms of backwards compatibility. > { > /* No need to call lazy open here, as the device pointer must have > been obtained from a routine that did that. */ > @@ -164,31 +165,49 @@ acc_free (void *d) > > if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) > { > - memmove (d, h, s); > + if (from) > + memmove (h, d, s); > + else > + memmove (d, h, s); > return; > } > > - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) > - gomp_fatal ("error in %s", __FUNCTION__); > + if (async > acc_async_sync) > + thr->dev->openacc.async_set_async_func (async); > + > + bool ret = (from > + ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) > + : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); > + > + if (async > acc_async_sync) > + thr->dev->openacc.async_set_async_func (acc_async_sync); > + > + if (!ret) > + gomp_fatal ("error in %s", libfnname); > } > > void > -acc_memcpy_from_device (void *h, void *d, size_t s) > +acc_memcpy_to_device (void *d, void *h, size_t s) > { > - /* No need to call lazy open here, as the device pointer must have > - been obtained from a routine that did that. 
*/ > - struct goacc_thread *thr = goacc_thread (); > + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); > +} > > - assert (thr && thr->dev); > +void > +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) > +{ > + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); > +} > > - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) > - { > - memmove (h, d, s); > - return; > - } > +void > +acc_memcpy_from_device (void *h, void *d, size_t s) > +{ > + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); > +} > > - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) > - gomp_fatal ("error in %s", __FUNCTION__); > +void > +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) > +{ > + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); > } > > /* Return the device pointer that corresponds to host data H. Or NULL > @@ -428,7 +447,7 @@ acc_unmap_data (void *h) > #define FLAG_COPY (1 << 2) > > static void * > -present_create_copy (unsigned f, void *h, size_t s) > +present_create_copy (unsigned f, void *h, size_t s, int async) Likewise, this is another internal function, so it shouldn't break anything. > { > void *d; > splay_tree_key n; > @@ -490,11 +509,17 @@ static void * > > gomp_mutex_unlock (&acc_dev->lock); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + > tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, > GOMP_MAP_VARS_OPENACC); > /* Initialize dynamic refcount. 
*/ > tgt->list[0].key->dynamic_refcount = 1; > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + > gomp_mutex_lock (&acc_dev->lock); > > d = tgt->to_free; > @@ -510,19 +535,32 @@ static void * > void * > acc_create (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); > } > > +void > +acc_create_async (void *h, size_t s, int async) > +{ > + present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); > +} > + > void * > acc_copyin (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, > + acc_async_sync); > } > > +void > +acc_copyin_async (void *h, size_t s, int async) > +{ > + present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); > +} > + > void * > acc_present_or_create (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); > } > > /* acc_pcreate is acc_present_or_create by a different name. */ > @@ -539,7 +577,8 @@ acc_pcreate (void *h, size_t s) > void * > acc_present_or_copyin (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, > + acc_async_sync); > } > > /* acc_pcopyin is acc_present_or_copyin by a different name. */ > @@ -557,7 +596,7 @@ acc_pcopyin (void *h, size_t s) > #define FLAG_FINALIZE (1 << 1) > > static void > -delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) > +delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) Ditto. 
> { > size_t host_size; > splay_tree_key n; > @@ -633,7 +672,13 @@ static void > } > > if (f & FLAG_COPYOUT) > - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); > + { > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + acc_dev->dev2host_func (acc_dev->target_id, h, d, s); > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + } > > gomp_remove_var (acc_dev, n); > } > @@ -644,41 +689,54 @@ static void > void > acc_delete (void *h , size_t s) > { > - delete_copyout (0, h, s, __FUNCTION__); > + delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); > } > > void > +acc_delete_async (void *h , size_t s, int async) > +{ > + delete_copyout (0, h, s, async, __FUNCTION__); > +} > + > +void > acc_delete_finalize (void *h , size_t s) > { > - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__); > } > > void > acc_delete_finalize_async (void *h , size_t s, int async) > { > - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__); > } > > void > acc_copyout (void *h, size_t s) > { > - delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); > + delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); > } > > void > +acc_copyout_async (void *h, size_t s, int async) > +{ > + delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); > +} > + > +void > acc_copyout_finalize (void *h, size_t s) > { > - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync, > + __FUNCTION__); > } > > void > acc_copyout_finalize_async (void *h, size_t s, int async) > { > - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__); > } > > static void > -update_dev_host (int is_dev, void *h, size_t s) > +update_dev_host (int 
is_dev, void *h, size_t s, int async) > { > splay_tree_key n; > void *d; > @@ -704,11 +762,17 @@ static void > d = (void *) (n->tgt->tgt_start + n->tgt_offset > + (uintptr_t) h - n->host_start); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + > if (is_dev) > acc_dev->host2dev_func (acc_dev->target_id, d, h, s); > else > acc_dev->dev2host_func (acc_dev->target_id, h, d, s); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + > gomp_mutex_unlock (&acc_dev->lock); > } > > @@ -715,16 +779,28 @@ static void > void > acc_update_device (void *h, size_t s) > { > - update_dev_host (1, h, s); > + update_dev_host (1, h, s, acc_async_sync); > } > > void > +acc_update_device_async (void *h, size_t s, int async) > +{ > + update_dev_host (1, h, s, async); > +} > + > +void > acc_update_self (void *h, size_t s) > { > - update_dev_host (0, h, s); > + update_dev_host (0, h, s, acc_async_sync); > } > > void > +acc_update_self_async (void *h, size_t s, int async) > +{ > + update_dev_host (0, h, s, async); > +} > + > +void > gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, > void *kinds) > { > Index: libgomp/openacc.f90 Given that this includes Fortran changes, you should have copied the fortran mailing list. Those changes look fairly mechanical and reasonable though. 
> =================================================================== > --- libgomp/openacc.f90 (revision 264192) > +++ libgomp/openacc.f90 (working copy) > @@ -332,6 +332,150 @@ module openacc_internal > logical acc_is_present_array_h > type (*), dimension (..), contiguous :: a > end function > + > + subroutine acc_copyin_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use 
openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension 
(*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > end interface > > interface > @@ -510,6 +654,60 @@ module openacc_internal > type (*), dimension (*) :: a > integer (c_size_t), value :: len > end function > + > + subroutine acc_copyin_async_l (a, len, async) & > + bind (C, name = "acc_copyin_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_create_async_l (a, len, async) & > + bind (C, name = "acc_create_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_copyout_async_l (a, len, async) & > + bind (C, name = "acc_copyout_async") > + use iso_c_binding, 
only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_delete_async_l (a, len, async) & > + bind (C, name = "acc_delete_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_update_device_async_l (a, len, async) & > + bind (C, name = "acc_update_device_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_update_self_async_l (a, len, async) & > + bind (C, name = "acc_update_self_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > end interface > end module > > @@ -529,6 +727,8 @@ module openacc > public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create > public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete > public :: acc_update_device, acc_update_self, acc_is_present > + public :: acc_copyin_async, acc_create_async, acc_copyout_async > + public :: acc_delete_async, acc_update_device_async, acc_update_self_async > > integer, parameter :: openacc_version = 201306 > > @@ -694,6 +894,42 @@ module openacc > ! acc_memcpy_to_device: Only available in C/C++ > ! 
acc_memcpy_from_device: Only available in C/C++ > > + interface acc_copyin_async > + procedure :: acc_copyin_async_32_h > + procedure :: acc_copyin_async_64_h > + procedure :: acc_copyin_async_array_h > + end interface > + > + interface acc_create_async > + procedure :: acc_create_async_32_h > + procedure :: acc_create_async_64_h > + procedure :: acc_create_async_array_h > + end interface > + > + interface acc_copyout_async > + procedure :: acc_copyout_async_32_h > + procedure :: acc_copyout_async_64_h > + procedure :: acc_copyout_async_array_h > + end interface > + > + interface acc_delete_async > + procedure :: acc_delete_async_32_h > + procedure :: acc_delete_async_64_h > + procedure :: acc_delete_async_array_h > + end interface > + > + interface acc_update_device_async > + procedure :: acc_update_device_async_32_h > + procedure :: acc_update_device_async_64_h > + procedure :: acc_update_device_async_array_h > + end interface > + > + interface acc_update_self_async > + procedure :: acc_update_self_async_32_h > + procedure :: acc_update_self_async_64_h > + procedure :: acc_update_self_async_array_h > + end interface > + > end module > > function acc_get_num_devices_h (d) > @@ -1078,3 +1314,189 @@ function acc_is_present_array_h (a) > type (*), dimension (..), contiguous :: a > acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1 > end function > + > +subroutine acc_copyin_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_copyin_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyin_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_copyin_async_l > + use 
openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyin_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_copyin_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_create_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_create_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_create_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_create_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_create_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_create_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_create_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyout_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, 
c_int > + use openacc_internal, only: acc_copyout_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyout_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_copyout_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyout_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_copyout_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_delete_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_delete_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_delete_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_delete_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_delete_async_l (a, int (len, kind = 
c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_delete_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_delete_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_device_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_update_device_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_device_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_update_device_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_device_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_update_device_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_self_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_update_self_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), 
dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_self_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_update_self_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_self_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_update_self_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > Index: libgomp/openacc.h > =================================================================== > --- libgomp/openacc.h (revision 264192) > +++ libgomp/openacc.h (working copy) > @@ -115,6 +115,16 @@ void acc_copyout_finalize_async (void *, size_t, i > void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW; > void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW; > > +/* Async functions, specified in OpenACC 2.5. 
*/ > +void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_create_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_delete_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; > +void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; > + > /* CUDA-specific routines. */ > void *acc_get_current_cuda_device (void) __GOACC_NOTHROW; > void *acc_get_current_cuda_context (void) __GOACC_NOTHROW; > Index: libgomp/openacc_lib.h I don't see a test case for this. I believe that openacc_lib.h is used by fixed-mode Fortran programs (those that end in a .f). Can you add a fixed-mode version of lib-16.f90? > =================================================================== > --- libgomp/openacc_lib.h (revision 264192) > +++ libgomp/openacc_lib.h (working copy) > @@ -403,3 +403,159 @@ > > ! acc_memcpy_to_device: Only available in C/C++ > ! 
acc_memcpy_from_device: Only available in C/C++ > + > + interface acc_copyin_async > + subroutine acc_copyin_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_create_async > + subroutine acc_create_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_copyout_async > + subroutine acc_copyout_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_64_h (a, len, async) > 
+ use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_delete_async > + subroutine acc_delete_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_update_device_async > + subroutine acc_update_device_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + 
end subroutine > + end interface > + > + interface acc_update_self_async > + subroutine acc_update_self_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c > =================================================================== > --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (working copy) > @@ -0,0 +1,42 @@ > +/* { dg-do run } */ > +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ > + > +#include <string.h> > +#include <stdlib.h> > +#include <openacc.h> > + > +int > +main (int argc, char **argv) > +{ > + const int N = 256; > + int i; > + int async = 8; > + unsigned char *h; > + > + h = (unsigned char *) malloc (N); > + > + for (i = 0; i < N; i++) > + { > + h[i] = i; > + } > + > + acc_copyin_async (h, N, async); > + > + memset (h, 0, N); > + > + acc_wait (async); > + > + acc_copyout_async (h, N, async + 1); > + > + acc_wait (async + 1); > + > + for (i = 0; i < N; i++) > + { > + if (h[i] != i) > + abort (); > + } > + > + free (h); > + > + return 0; > +} > Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c > =================================================================== > --- 
libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (working copy) > @@ -0,0 +1,45 @@ > +/* { dg-do run } */ > +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ > + > +#include <string.h> > +#include <stdlib.h> > +#include <openacc.h> > + > +int > +main (int argc, char **argv) > +{ > + const int N = 256; > + int i, q = 5; > + unsigned char *h, *g; > + void *d; > + > + h = (unsigned char *) malloc (N); > + g = (unsigned char *) malloc (N); > + for (i = 0; i < N; i++) > + { > + g[i] = i; > + } > + > + acc_create_async (h, N, q); > + > + acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q); > + memset (&h[0], 0, N); > + > + acc_wait (q); > + > + acc_update_self_async (h, N, q + 1); > + acc_delete_async (h, N, q + 1); > + > + acc_wait (q + 1); > + > + for (i = 0; i < N; i++) > + { > + if (h[i] != i) > + abort (); > + } > + > + free (h); > + free (g); > + > + return 0; > +} > Index: libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 > =================================================================== > --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (working copy) > @@ -0,0 +1,57 @@ > +! { dg-do run } > +! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } > + > +program main > + use openacc > + implicit none > + > + integer, parameter :: N = 256 > + integer, allocatable :: h(:) > + integer :: i > + integer :: async = 5 > + > + allocate (h(N)) > + > + do i = 1, N > + h(i) = i > + end do > + > + call acc_copyin (h) > + > + do i = 1, N > + h(i) = i + i > + end do > + > + call acc_update_device_async (h, sizeof (h), async) > + > + if (acc_is_present (h) .neqv. .TRUE.) 
call abort > + > + h(:) = 0 > + > + call acc_copyout_async (h, sizeof (h), async) > + > + call acc_wait (async) > + > + do i = 1, N > + if (h(i) /= i + i) call abort > + end do > + > + call acc_copyin (h, sizeof (h)) > + > + h(:) = 0 > + > + call acc_update_self_async (h, sizeof (h), async) > + > + if (acc_is_present (h) .neqv. .TRUE.) call abort > + > + do i = 1, N > + if (h(i) /= i + i) call abort > + end do > + > + call acc_delete_async (h, async) > + > + call acc_wait (async) > + > + if (acc_is_present (h) .neqv. .FALSE.) call abort > + > +end program > While I can't approve this patch, it seems reasonable to me. I like how you cleaned up things from OG8 (e.g., replacing 'return (n ? 1 : 0)' with 'return n != NULL'). Are there any other OG8 async patches in your queue? Thanks, Cesar
On 2018/9/11 1:22 AM, Cesar Philippidis wrote: > On 09/10/2018 08:04 AM, Chung-Lin Tang wrote: > >> GOACC_2.0 { >> Index: libgomp/oacc-mem.c >> =================================================================== >> --- libgomp/oacc-mem.c (revision 264192) >> +++ libgomp/oacc-mem.c (working copy) >> @@ -153,8 +153,9 @@ acc_free (void *d) >> gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); >> } >> >> -void >> -acc_memcpy_to_device (void *d, void *h, size_t s) >> +static void >> +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, >> + const char *libfnname) > > This showed up oddly in the diff, but memcpy_tofrom_device is a new > internal function that's not part of the public API. It's nice that you > were able to merge the to/from functions together. I think this is safe > in terms of backwards compatibility. Yes, this patch only adds new API functions, though some internal refactoring was done. No compatibility was compromised, if that wasn't apparent. > While I can't approve this patch, it seems reasonable to me. I like how > you cleaned up things from OG8 (e.g., replacing 'return (n ? 1 : 0)' with > 'return n != NULL'). Are there any other OG8 async patches in your queue? Yeah, there are more; this is just a smaller, independent part of the whole set of changes, as you also probably know. Thanks, Chung-Lin
Hi Chung-Lin! On Mon, 10 Sep 2018 23:04:18 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote: > This patch adds *_async versions of several OpenACC runtime library API functions, > which is to allow execution of a function asynchronously on particular async > stream, an addition to the standard since 2.5. Specifically, these functions: > > acc_copyin_async > acc_copyout_async > acc_copyout_finalize_async > acc_create_async > acc_delete_async > acc_delete_finalize_async > acc_memcpy_from_device_async > acc_memcpy_to_device_async > acc_update_device_async > acc_update_self_async > > which have an additional 'int async' argument in additional from the non-async version. > > libgomp tested with offloading with no regressions, is this okay for trunk? Thanks, approved. To record the review effort, please include "Reviewed-by: Thomas Schwinge <thomas@codesourcery.com>" in the commit log, see <https://gcc.gnu.org/wiki/Reviewed-by>. Grüße Thomas > 2018-09-10 Chung-Lin Tang <cltang@codesourcery.com> > > libgomp/ > * oacc-mem.c (memcpy_tofrom_device): New function, combined from > acc_memcpy_to/from_device functions, now with async parameter. > (acc_memcpy_to_device): Modify to use memcpy_tofrom_device. > (acc_memcpy_from_device): Likewise. > (acc_memcpy_to_device_async): New API function. > (acc_memcpy_from_device_async): Likewise. > (present_create_copy): Add async parameter and async setting/unsetting. > (acc_create): Adjust present_create_copy call. > (acc_copyin): Likewise. > (acc_present_or_create): Likewise. > (acc_present_or_copyin): Likewise. > (acc_create_async): New API function. > (acc_copyin_async): New API function. > (delete_copyout): Add async parameter and async setting/unsetting. > (acc_delete): Adjust delete_copyout call. > (acc_copyout): Likewise. > (acc_delete_async): New API function. > (acc_copyout_async): Likewise. > (update_dev_host): Add async parameter and async setting/unsetting. > (acc_update_device): Adjust update_dev_host call. 
> (acc_update_self): Likewise. > (acc_update_device_async): New API function. > (acc_update_self_async): Likewise. > * openacc.h (acc_copyin_async): Declare new API function. > (acc_create_async): Likewise. > (acc_copyout_async): Likewise. > (acc_delete_async): Likewise. > (acc_update_device_async): Likewise. > (acc_update_self_async): Likewise. > (acc_memcpy_to_device_async): Likewise. > (acc_memcpy_from_device_async): Likewise. > * openacc_lib.h (acc_copyin_async_32_h): New subroutine. > (acc_copyin_async_64_h): New subroutine. > (acc_copyin_async_array_h): New subroutine. > (acc_create_async_32_h): New subroutine. > (acc_create_async_64_h): New subroutine. > (acc_create_async_array_h): New subroutine. > (acc_copyout_async_32_h): New subroutine. > (acc_copyout_async_64_h): New subroutine. > (acc_copyout_async_array_h): New subroutine. > (acc_delete_async_32_h): New subroutine. > (acc_delete_async_64_h): New subroutine. > (acc_delete_async_array_h): New subroutine. > (acc_update_device_async_32_h): New subroutine. > (acc_update_device_async_64_h): New subroutine. > (acc_update_device_async_array_h): New subroutine. > (acc_update_self_async_32_h): New subroutine. > (acc_update_self_async_64_h): New subroutine. > (acc_update_self_async_array_h): New subroutine. > * openacc.f90 (acc_copyin_async_32_h): New subroutine. > (acc_copyin_async_64_h): New subroutine. > (acc_copyin_async_array_h): New subroutine. > (acc_create_async_32_h): New subroutine. > (acc_create_async_64_h): New subroutine. > (acc_create_async_array_h): New subroutine. > (acc_copyout_async_32_h): New subroutine. > (acc_copyout_async_64_h): New subroutine. > (acc_copyout_async_array_h): New subroutine. > (acc_delete_async_32_h): New subroutine. > (acc_delete_async_64_h): New subroutine. > (acc_delete_async_array_h): New subroutine. > (acc_update_device_async_32_h): New subroutine. > (acc_update_device_async_64_h): New subroutine. > (acc_update_device_async_array_h): New subroutine. 
> (acc_update_self_async_32_h): New subroutine. > (acc_update_self_async_64_h): New subroutine. > (acc_update_self_async_array_h): New subroutine. > * libgomp.map (OACC_2.5): Add acc_copyin_async*, acc_copyout_async*, > acc_copyout_finalize_async*, acc_create_async*, acc_delete_async*, > acc_delete_finalize_async*, acc_memcpy_from_device_async*, > acc_memcpy_to_device_async*, acc_update_device_async*, and > acc_update_self_async* entries. > * testsuite/libgomp.oacc-c-c++-common/lib-94.c: New test. > * testsuite/libgomp.oacc-c-c++-common/lib-95.c: New test. > * testsuite/libgomp.oacc-fortran/lib-16.f90: New test. > Index: libgomp/libgomp.map > =================================================================== > --- libgomp/libgomp.map (revision 264192) > +++ libgomp/libgomp.map (working copy) > @@ -388,14 +388,48 @@ OACC_2.0.1 { > > OACC_2.5 { > global: > + acc_copyin_async; > + acc_copyin_async_32_h_; > + acc_copyin_async_64_h_; > + acc_copyin_async_array_h_; > + acc_copyout_async; > + acc_copyout_async_32_h_; > + acc_copyout_async_64_h_; > + acc_copyout_async_array_h_; > acc_copyout_finalize; > acc_copyout_finalize_32_h_; > acc_copyout_finalize_64_h_; > acc_copyout_finalize_array_h_; > + acc_copyout_finalize_async; > + acc_copyout_finalize_async_32_h_; > + acc_copyout_finalize_async_64_h_; > + acc_copyout_finalize_async_array_h_; > + acc_create_async; > + acc_create_async_32_h_; > + acc_create_async_64_h_; > + acc_create_async_array_h_; > + acc_delete_async; > + acc_delete_async_32_h_; > + acc_delete_async_64_h_; > + acc_delete_async_array_h_; > acc_delete_finalize; > acc_delete_finalize_32_h_; > acc_delete_finalize_64_h_; > acc_delete_finalize_array_h_; > + acc_delete_finalize_async; > + acc_delete_finalize_async_32_h_; > + acc_delete_finalize_async_64_h_; > + acc_delete_finalize_async_array_h_; > + acc_memcpy_from_device_async; > + acc_memcpy_to_device_async; > + acc_update_device_async; > + acc_update_device_async_32_h_; > + acc_update_device_async_64_h_; > + 
acc_update_device_async_array_h_; > + acc_update_self_async; > + acc_update_self_async_32_h_; > + acc_update_self_async_64_h_; > + acc_update_self_async_array_h_; > } OACC_2.0.1; > > GOACC_2.0 { > Index: libgomp/oacc-mem.c > =================================================================== > --- libgomp/oacc-mem.c (revision 264192) > +++ libgomp/oacc-mem.c (working copy) > @@ -153,8 +153,9 @@ acc_free (void *d) > gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); > } > > -void > -acc_memcpy_to_device (void *d, void *h, size_t s) > +static void > +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, > + const char *libfnname) > { > /* No need to call lazy open here, as the device pointer must have > been obtained from a routine that did that. */ > @@ -164,31 +165,49 @@ acc_free (void *d) > > if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) > { > - memmove (d, h, s); > + if (from) > + memmove (h, d, s); > + else > + memmove (d, h, s); > return; > } > > - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) > - gomp_fatal ("error in %s", __FUNCTION__); > + if (async > acc_async_sync) > + thr->dev->openacc.async_set_async_func (async); > + > + bool ret = (from > + ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) > + : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); > + > + if (async > acc_async_sync) > + thr->dev->openacc.async_set_async_func (acc_async_sync); > + > + if (!ret) > + gomp_fatal ("error in %s", libfnname); > } > > void > -acc_memcpy_from_device (void *h, void *d, size_t s) > +acc_memcpy_to_device (void *d, void *h, size_t s) > { > - /* No need to call lazy open here, as the device pointer must have > - been obtained from a routine that did that. 
*/ > - struct goacc_thread *thr = goacc_thread (); > + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); > +} > > - assert (thr && thr->dev); > +void > +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) > +{ > + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); > +} > > - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) > - { > - memmove (h, d, s); > - return; > - } > +void > +acc_memcpy_from_device (void *h, void *d, size_t s) > +{ > + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); > +} > > - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) > - gomp_fatal ("error in %s", __FUNCTION__); > +void > +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) > +{ > + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); > } > > /* Return the device pointer that corresponds to host data H. Or NULL > @@ -428,7 +447,7 @@ acc_unmap_data (void *h) > #define FLAG_COPY (1 << 2) > > static void * > -present_create_copy (unsigned f, void *h, size_t s) > +present_create_copy (unsigned f, void *h, size_t s, int async) > { > void *d; > splay_tree_key n; > @@ -490,11 +509,17 @@ static void * > > gomp_mutex_unlock (&acc_dev->lock); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + > tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, > GOMP_MAP_VARS_OPENACC); > /* Initialize dynamic refcount. 
*/ > tgt->list[0].key->dynamic_refcount = 1; > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + > gomp_mutex_lock (&acc_dev->lock); > > d = tgt->to_free; > @@ -510,19 +535,32 @@ static void * > void * > acc_create (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); > } > > +void > +acc_create_async (void *h, size_t s, int async) > +{ > + present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); > +} > + > void * > acc_copyin (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, > + acc_async_sync); > } > > +void > +acc_copyin_async (void *h, size_t s, int async) > +{ > + present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); > +} > + > void * > acc_present_or_create (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); > } > > /* acc_pcreate is acc_present_or_create by a different name. */ > @@ -539,7 +577,8 @@ acc_pcreate (void *h, size_t s) > void * > acc_present_or_copyin (void *h, size_t s) > { > - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); > + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, > + acc_async_sync); > } > > /* acc_pcopyin is acc_present_or_copyin by a different name. 
*/ > @@ -557,7 +596,7 @@ acc_pcopyin (void *h, size_t s) > #define FLAG_FINALIZE (1 << 1) > > static void > -delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) > +delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) > { > size_t host_size; > splay_tree_key n; > @@ -633,7 +672,13 @@ static void > } > > if (f & FLAG_COPYOUT) > - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); > + { > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + acc_dev->dev2host_func (acc_dev->target_id, h, d, s); > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + } > > gomp_remove_var (acc_dev, n); > } > @@ -644,41 +689,54 @@ static void > void > acc_delete (void *h , size_t s) > { > - delete_copyout (0, h, s, __FUNCTION__); > + delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); > } > > void > +acc_delete_async (void *h , size_t s, int async) > +{ > + delete_copyout (0, h, s, async, __FUNCTION__); > +} > + > +void > acc_delete_finalize (void *h , size_t s) > { > - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__); > } > > void > acc_delete_finalize_async (void *h , size_t s, int async) > { > - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__); > } > > void > acc_copyout (void *h, size_t s) > { > - delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); > + delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); > } > > void > +acc_copyout_async (void *h, size_t s, int async) > +{ > + delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); > +} > + > +void > acc_copyout_finalize (void *h, size_t s) > { > - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync, > + __FUNCTION__); > } > > void > acc_copyout_finalize_async (void *h, 
size_t s, int async) > { > - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); > + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__); > } > > static void > -update_dev_host (int is_dev, void *h, size_t s) > +update_dev_host (int is_dev, void *h, size_t s, int async) > { > splay_tree_key n; > void *d; > @@ -704,11 +762,17 @@ static void > d = (void *) (n->tgt->tgt_start + n->tgt_offset > + (uintptr_t) h - n->host_start); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (async); > + > if (is_dev) > acc_dev->host2dev_func (acc_dev->target_id, d, h, s); > else > acc_dev->dev2host_func (acc_dev->target_id, h, d, s); > > + if (async > acc_async_sync) > + acc_dev->openacc.async_set_async_func (acc_async_sync); > + > gomp_mutex_unlock (&acc_dev->lock); > } > > @@ -715,16 +779,28 @@ static void > void > acc_update_device (void *h, size_t s) > { > - update_dev_host (1, h, s); > + update_dev_host (1, h, s, acc_async_sync); > } > > void > +acc_update_device_async (void *h, size_t s, int async) > +{ > + update_dev_host (1, h, s, async); > +} > + > +void > acc_update_self (void *h, size_t s) > { > - update_dev_host (0, h, s); > + update_dev_host (0, h, s, acc_async_sync); > } > > void > +acc_update_self_async (void *h, size_t s, int async) > +{ > + update_dev_host (0, h, s, async); > +} > + > +void > gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, > void *kinds) > { > Index: libgomp/openacc.f90 > =================================================================== > --- libgomp/openacc.f90 (revision 264192) > +++ libgomp/openacc.f90 (working copy) > @@ -332,6 +332,150 @@ module openacc_internal > logical acc_is_present_array_h > type (*), dimension (..), contiguous :: a > end function > + > + subroutine acc_copyin_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) 
:: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine 
acc_copyout_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension 
(*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_array_h (a, async) > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + end subroutine > end interface > > interface > @@ -510,6 +654,60 @@ module openacc_internal > type (*), dimension (*) :: a > integer (c_size_t), value :: len > end function > + > + subroutine acc_copyin_async_l (a, len, async) & > + bind (C, name = "acc_copyin_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_create_async_l (a, len, async) & > + bind (C, name = "acc_create_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_copyout_async_l (a, len, async) & > + bind (C, name = "acc_copyout_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_delete_async_l (a, len, async) & > + bind (C, name = "acc_delete_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine 
acc_update_device_async_l (a, len, async) & > + bind (C, name = "acc_update_device_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > + > + subroutine acc_update_self_async_l (a, len, async) & > + bind (C, name = "acc_update_self_async") > + use iso_c_binding, only: c_size_t, c_int > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_size_t), value :: len > + integer (c_int), value :: async > + end subroutine > end interface > end module > > @@ -529,6 +727,8 @@ module openacc > public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create > public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete > public :: acc_update_device, acc_update_self, acc_is_present > + public :: acc_copyin_async, acc_create_async, acc_copyout_async > + public :: acc_delete_async, acc_update_device_async, acc_update_self_async > > integer, parameter :: openacc_version = 201306 > > @@ -694,6 +894,42 @@ module openacc > ! acc_memcpy_to_device: Only available in C/C++ > ! 
acc_memcpy_from_device: Only available in C/C++ > > + interface acc_copyin_async > + procedure :: acc_copyin_async_32_h > + procedure :: acc_copyin_async_64_h > + procedure :: acc_copyin_async_array_h > + end interface > + > + interface acc_create_async > + procedure :: acc_create_async_32_h > + procedure :: acc_create_async_64_h > + procedure :: acc_create_async_array_h > + end interface > + > + interface acc_copyout_async > + procedure :: acc_copyout_async_32_h > + procedure :: acc_copyout_async_64_h > + procedure :: acc_copyout_async_array_h > + end interface > + > + interface acc_delete_async > + procedure :: acc_delete_async_32_h > + procedure :: acc_delete_async_64_h > + procedure :: acc_delete_async_array_h > + end interface > + > + interface acc_update_device_async > + procedure :: acc_update_device_async_32_h > + procedure :: acc_update_device_async_64_h > + procedure :: acc_update_device_async_array_h > + end interface > + > + interface acc_update_self_async > + procedure :: acc_update_self_async_32_h > + procedure :: acc_update_self_async_64_h > + procedure :: acc_update_self_async_array_h > + end interface > + > end module > > function acc_get_num_devices_h (d) > @@ -1078,3 +1314,189 @@ function acc_is_present_array_h (a) > type (*), dimension (..), contiguous :: a > acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1 > end function > + > +subroutine acc_copyin_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_copyin_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyin_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_copyin_async_l > + use 
openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyin_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_copyin_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_create_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_create_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_create_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_create_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_create_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_create_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_create_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyout_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, 
c_int > + use openacc_internal, only: acc_copyout_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyout_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_copyout_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_copyout_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_copyout_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_delete_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_delete_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_delete_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_delete_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_delete_async_l (a, int (len, kind = 
c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_delete_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_delete_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_device_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_update_device_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_device_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_update_device_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_device_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_update_device_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_self_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t, c_size_t, c_int > + use openacc_internal, only: acc_update_self_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), 
dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_self_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t, c_size_t, c_int > + use openacc_internal, only: acc_update_self_async_l > + use openacc_kinds, only: acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) > +end subroutine > + > +subroutine acc_update_self_async_array_h (a, async) > + use iso_c_binding, only: c_int > + use openacc_internal, only: acc_update_self_async_l > + use openacc_kinds, only: acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async > + call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) > +end subroutine > Index: libgomp/openacc.h > =================================================================== > --- libgomp/openacc.h (revision 264192) > +++ libgomp/openacc.h (working copy) > @@ -115,6 +115,16 @@ void acc_copyout_finalize_async (void *, size_t, i > void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW; > void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW; > > +/* Async functions, specified in OpenACC 2.5. 
*/ > +void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_create_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_delete_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW; > +void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; > +void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; > + > /* CUDA-specific routines. */ > void *acc_get_current_cuda_device (void) __GOACC_NOTHROW; > void *acc_get_current_cuda_context (void) __GOACC_NOTHROW; > Index: libgomp/openacc_lib.h > =================================================================== > --- libgomp/openacc_lib.h (revision 264192) > +++ libgomp/openacc_lib.h (working copy) > @@ -403,3 +403,159 @@ > > ! acc_memcpy_to_device: Only available in C/C++ > ! acc_memcpy_from_device: Only available in C/C++ > + > + interface acc_copyin_async > + subroutine acc_copyin_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyin_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_create_async > + subroutine acc_create_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ 
ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_create_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_copyout_async > + subroutine acc_copyout_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_copyout_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_delete_async > + subroutine acc_delete_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_delete_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end 
subroutine > + > + subroutine acc_delete_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_update_device_async > + subroutine acc_update_device_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_device_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > + > + interface acc_update_self_async > + subroutine acc_update_self_async_32_h (a, len, async) > + use iso_c_binding, only: c_int32_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int32_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_64_h (a, len, async) > + use iso_c_binding, only: c_int64_t > + import acc_handle_kind > + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a > + type (*), dimension (*) :: a > + integer (c_int64_t) len > + integer (acc_handle_kind) async > + end subroutine > + > + subroutine acc_update_self_async_array_h (a, async_) > + import acc_handle_kind > + type (*), dimension (..), contiguous :: a > + integer (acc_handle_kind) async_ > + end subroutine > + end interface > Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c > =================================================================== > --- 
libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (working copy) > @@ -0,0 +1,42 @@ > +/* { dg-do run } */ > +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ > + > +#include <string.h> > +#include <stdlib.h> > +#include <openacc.h> > + > +int > +main (int argc, char **argv) > +{ > + const int N = 256; > + int i; > + int async = 8; > + unsigned char *h; > + > + h = (unsigned char *) malloc (N); > + > + for (i = 0; i < N; i++) > + { > + h[i] = i; > + } > + > + acc_copyin_async (h, N, async); > + > + memset (h, 0, N); > + > + acc_wait (async); > + > + acc_copyout_async (h, N, async + 1); > + > + acc_wait (async + 1); > + > + for (i = 0; i < N; i++) > + { > + if (h[i] != i) > + abort (); > + } > + > + free (h); > + > + return 0; > +} > Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c > =================================================================== > --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (working copy) > @@ -0,0 +1,45 @@ > +/* { dg-do run } */ > +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ > + > +#include <string.h> > +#include <stdlib.h> > +#include <openacc.h> > + > +int > +main (int argc, char **argv) > +{ > + const int N = 256; > + int i, q = 5; > + unsigned char *h, *g; > + void *d; > + > + h = (unsigned char *) malloc (N); > + g = (unsigned char *) malloc (N); > + for (i = 0; i < N; i++) > + { > + g[i] = i; > + } > + > + acc_create_async (h, N, q); > + > + acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q); > + memset (&h[0], 0, N); > + > + acc_wait (q); > + > + acc_update_self_async (h, N, q + 1); > + acc_delete_async (h, N, q + 1); > + > + acc_wait (q + 1); > + > + for (i = 0; i < N; i++) > + { > + if (h[i] != i) > + abort (); > + } > + > + free (h); > + free (g); > + > + return 0; > +} > Index: 
libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 > =================================================================== > --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (working copy) > @@ -0,0 +1,57 @@ > +! { dg-do run } > +! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } > + > +program main > + use openacc > + implicit none > + > + integer, parameter :: N = 256 > + integer, allocatable :: h(:) > + integer :: i > + integer :: async = 5 > + > + allocate (h(N)) > + > + do i = 1, N > + h(i) = i > + end do > + > + call acc_copyin (h) > + > + do i = 1, N > + h(i) = i + i > + end do > + > + call acc_update_device_async (h, sizeof (h), async) > + > + if (acc_is_present (h) .neqv. .TRUE.) call abort > + > + h(:) = 0 > + > + call acc_copyout_async (h, sizeof (h), async) > + > + call acc_wait (async) > + > + do i = 1, N > + if (h(i) /= i + i) call abort > + end do > + > + call acc_copyin (h, sizeof (h)) > + > + h(:) = 0 > + > + call acc_update_self_async (h, sizeof (h), async) > + > + if (acc_is_present (h) .neqv. .TRUE.) call abort > + > + do i = 1, N > + if (h(i) /= i + i) call abort > + end do > + > + call acc_delete_async (h, async) > + > + call acc_wait (async) > + > + if (acc_is_present (h) .neqv. .FALSE.) call abort > + > +end program
Hi! On Mon, 10 Sep 2018 10:22:17 -0700, Cesar Philippidis <cesar@codesourcery.com> wrote: > On 09/10/2018 08:04 AM, Chung-Lin Tang wrote: > > Index: libgomp/openacc_lib.h > > I don't see a test case for this. Right. (Better test coverage would be desirable generally.) > I believe that openacc_lib.h is used > by fixed-mode Fortran programs (those that end in a .f). Can you add a > fixed-mode version of lib-16.f90? Not only with fixed-form, but also with free-form Fortran programs. Committed to trunk in r266683: commit d084eb0a61d209ee0d852089ea76a672f519883b Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4> Date: Fri Nov 30 20:38:57 2018 +0000 Add libgomp.oacc-fortran/lib-16-2.f90 This is a copy of libgomp.oacc-fortran/lib-16.f90, but does 'include "openacc_lib.h"' instead of 'use openacc'. libgomp/ * testsuite/libgomp.oacc-fortran/lib-16-2.f90: New file. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@266683 138bc75d-0d04-0410-961f-82ee72b054a4 --- libgomp/ChangeLog | 4 ++++ libgomp/testsuite/libgomp.oacc-fortran/{lib-16.f90 => lib-16-2.f90} | 3 ++- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git libgomp/ChangeLog libgomp/ChangeLog index d3c1bc36c145..a9dcbd808200 100644 --- libgomp/ChangeLog +++ libgomp/ChangeLog @@ -1,3 +1,7 @@ +2018-11-30 Thomas Schwinge <thomas@codesourcery.com> + + * testsuite/libgomp.oacc-fortran/lib-16-2.f90: New file. + 2018-10-19 Richard Biener <rguenther@suse.de> PR tree-optimization/88182 diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90 similarity index 94% copy from libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 copy to libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90 index 9701b52dd257..fa76f65912fb 100644 --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90 @@ -1,9 +1,10 @@ +! See also "lib-16.f90". ! { dg-do run } ! 
{ dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } program main - use openacc implicit none + include "openacc_lib.h" integer, parameter :: N = 256 integer, allocatable :: h(:) diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 index 9701b52dd257..011f9cf31db4 100644 --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 @@ -1,3 +1,4 @@ +! See also "lib-16-2.f90". ! { dg-do run } ! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } Grüße Thomas
Hi Chung-Lin! ;-) It's been a while: On 2018-09-10T23:04:18+0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote: > * testsuite/libgomp.oacc-c-c++-common/lib-94.c: New test. > * testsuite/libgomp.oacc-c-c++-common/lib-95.c: New test. > * testsuite/libgomp.oacc-fortran/lib-16.f90: New test. Do you happen to remember why in these testcases you're using the following pattern: > --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (working copy) > @@ -0,0 +1,42 @@ > +/* { dg-do run } */ > +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ > + > +#include <string.h> > +#include <stdlib.h> > +#include <openacc.h> > + > +int > +main (int argc, char **argv) > +{ > + const int N = 256; > + int i; > + int async = 8; > + unsigned char *h; > + > + h = (unsigned char *) malloc (N); > + > + for (i = 0; i < N; i++) > + { > + h[i] = i; > + } > + > + acc_copyin_async (h, N, async); > + > + memset (h, 0, N); > + > + acc_wait (async); You first issue 'acc_copyin_async', then (while potentially that's still accessing 'h') already 'memset' 'h' (potentially overwriting data that 'acc_copyin_async' is still working on), and only then 'acc_wait'? My understanding of OpenACC would swap 'memset' and 'acc_wait', but maybe you have a specific reason to do it in this way? In particular, the GCC nvptx offloading implementation "doesn't seem to care" (as discussed elsewhere; 'OpenACC "ephemeral" asynchronous host-to-device copies', etc.) -- but I suppose if you meant to test such implementation traits here, you'd have commented that? 
> + > + acc_copyout_async (h, N, async + 1); > + > + acc_wait (async + 1); > + > + for (i = 0; i < N; i++) > + { > + if (h[i] != i) > + abort (); > + } > + > + free (h); > + > + return 0; > +} > --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (working copy) > @@ -0,0 +1,45 @@ > +/* { dg-do run } */ > +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ > + > +#include <string.h> > +#include <stdlib.h> > +#include <openacc.h> > + > +int > +main (int argc, char **argv) > +{ > + const int N = 256; > + int i, q = 5; > + unsigned char *h, *g; > + void *d; > + > + h = (unsigned char *) malloc (N); > + g = (unsigned char *) malloc (N); > + for (i = 0; i < N; i++) > + { > + g[i] = i; > + } > + > + acc_create_async (h, N, q); > + > + acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q); > + memset (&h[0], 0, N); > + > + acc_wait (q); Similar here. > + acc_update_self_async (h, N, q + 1); > + acc_delete_async (h, N, q + 1); > + > + acc_wait (q + 1); > + > + for (i = 0; i < N; i++) > + { > + if (h[i] != i) > + abort (); > + } > + > + free (h); > + free (g); > + > + return 0; > +} > --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (nonexistent) > +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (working copy) (Later also similarly copied into 'libgomp.oacc-fortran/lib-16-2.f90'.) Similar: > @@ -0,0 +1,57 @@ > +! { dg-do run } > +! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } > + > +program main > + use openacc > + implicit none > + > + integer, parameter :: N = 256 > + integer, allocatable :: h(:) > + integer :: i > + integer :: async = 5 > + > + allocate (h(N)) > + > + do i = 1, N > + h(i) = i > + end do > + > + call acc_copyin (h) > + > + do i = 1, N > + h(i) = i + i > + end do > + > + call acc_update_device_async (h, sizeof (h), async) > + > + if (acc_is_present (h) .neqv. .TRUE.) 
call abort Don't we need 'acc_wait' here (while 'acc_update_device_async' may still be reading from 'h'), before overwriting 'h' here: > + > + h(:) = 0 > + > + call acc_copyout_async (h, sizeof (h), async) > + > + call acc_wait (async) > + > + do i = 1, N > + if (h(i) /= i + i) call abort > + end do > + > + call acc_copyin (h, sizeof (h)) > + > + h(:) = 0 > + > + call acc_update_self_async (h, sizeof (h), async) > + > + if (acc_is_present (h) .neqv. .TRUE.) call abort Don't we need 'acc_wait' here (to make sure we finish device to host copy of 'h'), before evaluating 'h' here: > + > + do i = 1, N > + if (h(i) /= i + i) call abort > + end do > + > + call acc_delete_async (h, async) > + > + call acc_wait (async) > + > + if (acc_is_present (h) .neqv. .FALSE.) call abort > + > +end program Julian has patches for most of these (as part of other commits). Grüße Thomas ----------------- Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank Thürauf
Hi! On 2021-06-08T19:32:22+0200, I wrote: > Hi Chung-Lin! > > ;-) It's been a while: > > On 2018-09-10T23:04:18+0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote: >> * testsuite/libgomp.oacc-c-c++-common/lib-94.c: New test. >> * testsuite/libgomp.oacc-c-c++-common/lib-95.c: New test. >> * testsuite/libgomp.oacc-fortran/lib-16.f90: New test. > > Do you happen to remember why in these testcases you're using the > following pattern: Apparently not ;-) -- no answer/objection, I've thus now pushed "Fix OpenACC 'async'/'wait' issues in 'libgomp.oacc-c-c++-common/lib-{94,95}.c', 'libgomp.oacc-fortran/lib-16{,-2}.f90'" to master branch in commit 599e275d7e0b3fb79ff704d4cb2d8fdb0231116e, see attached. Grüße Thomas >> --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (nonexistent) >> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (working copy) >> @@ -0,0 +1,42 @@ >> +/* { dg-do run } */ >> +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ >> + >> +#include <string.h> >> +#include <stdlib.h> >> +#include <openacc.h> >> + >> +int >> +main (int argc, char **argv) >> +{ >> + const int N = 256; >> + int i; >> + int async = 8; >> + unsigned char *h; >> + >> + h = (unsigned char *) malloc (N); >> + >> + for (i = 0; i < N; i++) >> + { >> + h[i] = i; >> + } >> + >> + acc_copyin_async (h, N, async); >> + >> + memset (h, 0, N); >> + >> + acc_wait (async); > > You first issue 'acc_copyin_async', then (while potentially that's still > accessing 'h') already 'memset' 'h' (potentially overwriting data that > 'acc_copyin_async' is still working on), and only then 'acc_wait'? > > My understanding of OpenACC would swap 'memset' and 'acc_wait', but maybe > you have a specific reason to do it in this way? > > In particular, the GCC nvptx offloading implementation "doesn't seem to > care" (as discussed elsewhere; 'OpenACC "ephemeral" asynchronous > host-to-device copies', etc.) 
-- but I suppose if you meant to test such > implementation traits here, you'd have commented that? > >> + >> + acc_copyout_async (h, N, async + 1); >> + >> + acc_wait (async + 1); >> + >> + for (i = 0; i < N; i++) >> + { >> + if (h[i] != i) >> + abort (); >> + } >> + >> + free (h); >> + >> + return 0; >> +} > >> --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (nonexistent) >> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (working copy) >> @@ -0,0 +1,45 @@ >> +/* { dg-do run } */ >> +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ >> + >> +#include <string.h> >> +#include <stdlib.h> >> +#include <openacc.h> >> + >> +int >> +main (int argc, char **argv) >> +{ >> + const int N = 256; >> + int i, q = 5; >> + unsigned char *h, *g; >> + void *d; >> + >> + h = (unsigned char *) malloc (N); >> + g = (unsigned char *) malloc (N); >> + for (i = 0; i < N; i++) >> + { >> + g[i] = i; >> + } >> + >> + acc_create_async (h, N, q); >> + >> + acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q); >> + memset (&h[0], 0, N); >> + >> + acc_wait (q); > > Similar here. > >> + acc_update_self_async (h, N, q + 1); >> + acc_delete_async (h, N, q + 1); >> + >> + acc_wait (q + 1); >> + >> + for (i = 0; i < N; i++) >> + { >> + if (h[i] != i) >> + abort (); >> + } >> + >> + free (h); >> + free (g); >> + >> + return 0; >> +} > >> --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (nonexistent) >> +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (working copy) > > (Later also similarly copied into 'libgomp.oacc-fortran/lib-16-2.f90'.) > > Similar: > >> @@ -0,0 +1,57 @@ >> +! { dg-do run } >> +! 
{ dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } >> + >> +program main >> + use openacc >> + implicit none >> + >> + integer, parameter :: N = 256 >> + integer, allocatable :: h(:) >> + integer :: i >> + integer :: async = 5 >> + >> + allocate (h(N)) >> + >> + do i = 1, N >> + h(i) = i >> + end do >> + >> + call acc_copyin (h) >> + >> + do i = 1, N >> + h(i) = i + i >> + end do >> + >> + call acc_update_device_async (h, sizeof (h), async) >> + >> + if (acc_is_present (h) .neqv. .TRUE.) call abort > > Don't we need 'acc_wait' here (while 'acc_update_device_async' may still > be reading from 'h'), before overwriting 'h' here: > >> + >> + h(:) = 0 >> + >> + call acc_copyout_async (h, sizeof (h), async) >> + >> + call acc_wait (async) >> + >> + do i = 1, N >> + if (h(i) /= i + i) call abort >> + end do >> + >> + call acc_copyin (h, sizeof (h)) >> + >> + h(:) = 0 >> + >> + call acc_update_self_async (h, sizeof (h), async) >> + >> + if (acc_is_present (h) .neqv. .TRUE.) call abort > > Don't we need 'acc_wait' here (to make sure we finish device to host copy > of 'h'), before evaluating 'h' here: > >> + >> + do i = 1, N >> + if (h(i) /= i + i) call abort >> + end do >> + >> + call acc_delete_async (h, async) >> + >> + call acc_wait (async) >> + >> + if (acc_is_present (h) .neqv. .FALSE.) call abort >> + >> +end program > > Julian has patches for most of these (as part of other commits). ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Index: libgomp/libgomp.map =================================================================== --- libgomp/libgomp.map (revision 264192) +++ libgomp/libgomp.map (working copy) @@ -388,14 +388,48 @@ OACC_2.0.1 { OACC_2.5 { global: + acc_copyin_async; + acc_copyin_async_32_h_; + acc_copyin_async_64_h_; + acc_copyin_async_array_h_; + acc_copyout_async; + acc_copyout_async_32_h_; + acc_copyout_async_64_h_; + acc_copyout_async_array_h_; acc_copyout_finalize; acc_copyout_finalize_32_h_; acc_copyout_finalize_64_h_; acc_copyout_finalize_array_h_; + acc_copyout_finalize_async; + acc_copyout_finalize_async_32_h_; + acc_copyout_finalize_async_64_h_; + acc_copyout_finalize_async_array_h_; + acc_create_async; + acc_create_async_32_h_; + acc_create_async_64_h_; + acc_create_async_array_h_; + acc_delete_async; + acc_delete_async_32_h_; + acc_delete_async_64_h_; + acc_delete_async_array_h_; acc_delete_finalize; acc_delete_finalize_32_h_; acc_delete_finalize_64_h_; acc_delete_finalize_array_h_; + acc_delete_finalize_async; + acc_delete_finalize_async_32_h_; + acc_delete_finalize_async_64_h_; + acc_delete_finalize_async_array_h_; + acc_memcpy_from_device_async; + acc_memcpy_to_device_async; + acc_update_device_async; + acc_update_device_async_32_h_; + acc_update_device_async_64_h_; + acc_update_device_async_array_h_; + acc_update_self_async; + acc_update_self_async_32_h_; + acc_update_self_async_64_h_; + acc_update_self_async_array_h_; } OACC_2.0.1; GOACC_2.0 { Index: libgomp/oacc-mem.c =================================================================== --- libgomp/oacc-mem.c (revision 264192) +++ libgomp/oacc-mem.c (working copy) @@ -153,8 +153,9 @@ acc_free (void *d) gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); } -void -acc_memcpy_to_device (void *d, void *h, size_t s) +static void +memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, + const char *libfnname) { /* No need to call lazy open here, as the device pointer must have been obtained 
from a routine that did that. */ @@ -164,31 +165,49 @@ acc_free (void *d) if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) { - memmove (d, h, s); + if (from) + memmove (h, d, s); + else + memmove (d, h, s); return; } - if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) - gomp_fatal ("error in %s", __FUNCTION__); + if (async > acc_async_sync) + thr->dev->openacc.async_set_async_func (async); + + bool ret = (from + ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) + : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); + + if (async > acc_async_sync) + thr->dev->openacc.async_set_async_func (acc_async_sync); + + if (!ret) + gomp_fatal ("error in %s", libfnname); } void -acc_memcpy_from_device (void *h, void *d, size_t s) +acc_memcpy_to_device (void *d, void *h, size_t s) { - /* No need to call lazy open here, as the device pointer must have - been obtained from a routine that did that. */ - struct goacc_thread *thr = goacc_thread (); + memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); +} - assert (thr && thr->dev); +void +acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) +{ + memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); +} - if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) - { - memmove (h, d, s); - return; - } +void +acc_memcpy_from_device (void *h, void *d, size_t s) +{ + memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); +} - if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) - gomp_fatal ("error in %s", __FUNCTION__); +void +acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) +{ + memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); } /* Return the device pointer that corresponds to host data H. 
Or NULL @@ -428,7 +447,7 @@ acc_unmap_data (void *h) #define FLAG_COPY (1 << 2) static void * -present_create_copy (unsigned f, void *h, size_t s) +present_create_copy (unsigned f, void *h, size_t s, int async) { void *d; splay_tree_key n; @@ -490,11 +509,17 @@ static void * gomp_mutex_unlock (&acc_dev->lock); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (async); + tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, GOMP_MAP_VARS_OPENACC); /* Initialize dynamic refcount. */ tgt->list[0].key->dynamic_refcount = 1; + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (acc_async_sync); + gomp_mutex_lock (&acc_dev->lock); d = tgt->to_free; @@ -510,19 +535,32 @@ static void * void * acc_create (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); } +void +acc_create_async (void *h, size_t s, int async) +{ + present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); +} + void * acc_copyin (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, + acc_async_sync); } +void +acc_copyin_async (void *h, size_t s, int async) +{ + present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); +} + void * acc_present_or_create (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); + return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); } /* acc_pcreate is acc_present_or_create by a different name. 
*/ @@ -539,7 +577,8 @@ acc_pcreate (void *h, size_t s) void * acc_present_or_copyin (void *h, size_t s) { - return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); + return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, + acc_async_sync); } /* acc_pcopyin is acc_present_or_copyin by a different name. */ @@ -557,7 +596,7 @@ acc_pcopyin (void *h, size_t s) #define FLAG_FINALIZE (1 << 1) static void -delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) +delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) { size_t host_size; splay_tree_key n; @@ -633,7 +672,13 @@ static void } if (f & FLAG_COPYOUT) - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + { + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (async); + acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (acc_async_sync); + } gomp_remove_var (acc_dev, n); } @@ -644,41 +689,54 @@ static void void acc_delete (void *h , size_t s) { - delete_copyout (0, h, s, __FUNCTION__); + delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); } void +acc_delete_async (void *h , size_t s, int async) +{ + delete_copyout (0, h, s, async, __FUNCTION__); +} + +void acc_delete_finalize (void *h , size_t s) { - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); + delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__); } void acc_delete_finalize_async (void *h , size_t s, int async) { - delete_copyout (FLAG_FINALIZE, h, s, __FUNCTION__); + delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__); } void acc_copyout (void *h, size_t s) { - delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); + delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); } void +acc_copyout_async (void *h, size_t s, int async) +{ + delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); +} + +void acc_copyout_finalize (void *h, size_t s) { - 
delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync, + __FUNCTION__); } void acc_copyout_finalize_async (void *h, size_t s, int async) { - delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, __FUNCTION__); + delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__); } static void -update_dev_host (int is_dev, void *h, size_t s) +update_dev_host (int is_dev, void *h, size_t s, int async) { splay_tree_key n; void *d; @@ -704,11 +762,17 @@ static void d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h - n->host_start); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (async); + if (is_dev) acc_dev->host2dev_func (acc_dev->target_id, d, h, s); else acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + if (async > acc_async_sync) + acc_dev->openacc.async_set_async_func (acc_async_sync); + gomp_mutex_unlock (&acc_dev->lock); } @@ -715,16 +779,28 @@ static void void acc_update_device (void *h, size_t s) { - update_dev_host (1, h, s); + update_dev_host (1, h, s, acc_async_sync); } void +acc_update_device_async (void *h, size_t s, int async) +{ + update_dev_host (1, h, s, async); +} + +void acc_update_self (void *h, size_t s) { - update_dev_host (0, h, s); + update_dev_host (0, h, s, acc_async_sync); } void +acc_update_self_async (void *h, size_t s, int async) +{ + update_dev_host (0, h, s, async); +} + +void gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, void *kinds) { Index: libgomp/openacc.f90 =================================================================== --- libgomp/openacc.f90 (revision 264192) +++ libgomp/openacc.f90 (working copy) @@ -332,6 +332,150 @@ module openacc_internal logical acc_is_present_array_h type (*), dimension (..), contiguous :: a end function + + subroutine acc_copyin_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ 
ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_array_h (a, async) + use openacc_kinds, only: 
acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t 
+ use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_array_h (a, async) + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + end subroutine end interface interface @@ -510,6 +654,60 @@ module openacc_internal type (*), dimension (*) :: a integer (c_size_t), value :: len end function + + subroutine acc_copyin_async_l (a, len, async) & + bind (C, name = "acc_copyin_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_create_async_l (a, len, async) & + bind (C, name = "acc_create_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_copyout_async_l (a, len, async) & + bind (C, name = "acc_copyout_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_delete_async_l (a, len, async) & + bind (C, name = "acc_delete_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine acc_update_device_async_l (a, len, async) & + bind (C, name = "acc_update_device_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine + + subroutine 
acc_update_self_async_l (a, len, async) & + bind (C, name = "acc_update_self_async") + use iso_c_binding, only: c_size_t, c_int + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_size_t), value :: len + integer (c_int), value :: async + end subroutine end interface end module @@ -529,6 +727,8 @@ module openacc public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete public :: acc_update_device, acc_update_self, acc_is_present + public :: acc_copyin_async, acc_create_async, acc_copyout_async + public :: acc_delete_async, acc_update_device_async, acc_update_self_async integer, parameter :: openacc_version = 201306 @@ -694,6 +894,42 @@ module openacc ! acc_memcpy_to_device: Only available in C/C++ ! acc_memcpy_from_device: Only available in C/C++ + interface acc_copyin_async + procedure :: acc_copyin_async_32_h + procedure :: acc_copyin_async_64_h + procedure :: acc_copyin_async_array_h + end interface + + interface acc_create_async + procedure :: acc_create_async_32_h + procedure :: acc_create_async_64_h + procedure :: acc_create_async_array_h + end interface + + interface acc_copyout_async + procedure :: acc_copyout_async_32_h + procedure :: acc_copyout_async_64_h + procedure :: acc_copyout_async_array_h + end interface + + interface acc_delete_async + procedure :: acc_delete_async_32_h + procedure :: acc_delete_async_64_h + procedure :: acc_delete_async_array_h + end interface + + interface acc_update_device_async + procedure :: acc_update_device_async_32_h + procedure :: acc_update_device_async_64_h + procedure :: acc_update_device_async_array_h + end interface + + interface acc_update_self_async + procedure :: acc_update_self_async_32_h + procedure :: acc_update_self_async_64_h + procedure :: acc_update_self_async_array_h + end interface + end module function acc_get_num_devices_h (d) @@ -1078,3 +1314,189 @@ function acc_is_present_array_h (a) type (*), 
dimension (..), contiguous :: a acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1 end function + +subroutine acc_copyin_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_copyin_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyin_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_copyin_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyin_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_copyin_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_create_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_create_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_create_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_create_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + 
integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_create_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_create_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_create_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyout_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_copyout_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyout_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_copyout_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_copyout_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_copyout_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_delete_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_delete_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer 
(acc_handle_kind) async + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_delete_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_delete_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_delete_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_delete_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_device_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_update_device_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_device_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_update_device_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_device_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_update_device_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension 
(..), contiguous :: a + integer (acc_handle_kind) async + call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_self_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t, c_size_t, c_int + use openacc_internal, only: acc_update_self_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_self_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t, c_size_t, c_int + use openacc_internal, only: acc_update_self_async_l + use openacc_kinds, only: acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) +end subroutine + +subroutine acc_update_self_async_array_h (a, async) + use iso_c_binding, only: c_int + use openacc_internal, only: acc_update_self_async_l + use openacc_kinds, only: acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async + call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) +end subroutine Index: libgomp/openacc.h =================================================================== --- libgomp/openacc.h (revision 264192) +++ libgomp/openacc.h (working copy) @@ -115,6 +115,16 @@ void acc_copyout_finalize_async (void *, size_t, i void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW; void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW; +/* Async functions, specified in OpenACC 2.5. 
*/ +void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_create_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_delete_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW; +void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; +void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; + /* CUDA-specific routines. */ void *acc_get_current_cuda_device (void) __GOACC_NOTHROW; void *acc_get_current_cuda_context (void) __GOACC_NOTHROW; Index: libgomp/openacc_lib.h =================================================================== --- libgomp/openacc_lib.h (revision 264192) +++ libgomp/openacc_lib.h (working copy) @@ -403,3 +403,159 @@ ! acc_memcpy_to_device: Only available in C/C++ ! acc_memcpy_from_device: Only available in C/C++ + + interface acc_copyin_async + subroutine acc_copyin_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyin_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_create_async + subroutine acc_create_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer 
(acc_handle_kind) async + end subroutine + + subroutine acc_create_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_create_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_copyout_async + subroutine acc_copyout_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_copyout_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_delete_async + subroutine acc_delete_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_delete_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface 
acc_update_device_async + subroutine acc_update_device_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_device_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface + + interface acc_update_self_async + subroutine acc_update_self_async_32_h (a, len, async) + use iso_c_binding, only: c_int32_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int32_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_64_h (a, len, async) + use iso_c_binding, only: c_int64_t + import acc_handle_kind + !GCC$ ATTRIBUTES NO_ARG_CHECK :: a + type (*), dimension (*) :: a + integer (c_int64_t) len + integer (acc_handle_kind) async + end subroutine + + subroutine acc_update_self_async_array_h (a, async_) + import acc_handle_kind + type (*), dimension (..), contiguous :: a + integer (acc_handle_kind) async_ + end subroutine + end interface Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (nonexistent) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c (working copy) @@ -0,0 +1,42 @@ +/* { dg-do run } */ +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ + +#include <string.h> +#include <stdlib.h> +#include <openacc.h> + +int +main (int argc, char **argv) +{ + 
const int N = 256; + int i; + int async = 8; + unsigned char *h; + + h = (unsigned char *) malloc (N); + + for (i = 0; i < N; i++) + { + h[i] = i; + } + + acc_copyin_async (h, N, async); + + memset (h, 0, N); + + acc_wait (async); + + acc_copyout_async (h, N, async + 1); + + acc_wait (async + 1); + + for (i = 0; i < N; i++) + { + if (h[i] != i) + abort (); + } + + free (h); + + return 0; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (nonexistent) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-95.c (working copy) @@ -0,0 +1,45 @@ +/* { dg-do run } */ +/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */ + +#include <string.h> +#include <stdlib.h> +#include <openacc.h> + +int +main (int argc, char **argv) +{ + const int N = 256; + int i, q = 5; + unsigned char *h, *g; + void *d; + + h = (unsigned char *) malloc (N); + g = (unsigned char *) malloc (N); + for (i = 0; i < N; i++) + { + g[i] = i; + } + + acc_create_async (h, N, q); + + acc_memcpy_to_device_async (acc_deviceptr (h), g, N, q); + memset (&h[0], 0, N); + + acc_wait (q); + + acc_update_self_async (h, N, q + 1); + acc_delete_async (h, N, q + 1); + + acc_wait (q + 1); + + for (i = 0; i < N; i++) + { + if (h[i] != i) + abort (); + } + + free (h); + free (g); + + return 0; +} Index: libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 =================================================================== --- libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (nonexistent) +++ libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 (working copy) @@ -0,0 +1,57 @@ +! { dg-do run } +! 
{ dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+! Run the *_async data routines; wait on the queue before the host touches h.
+program main
+  use openacc
+  implicit none
+
+  integer, parameter :: N = 256
+  integer, allocatable :: h(:)
+  integer :: i
+  integer :: async = 5
+
+  allocate (h(N))
+
+  do i = 1, N
+    h(i) = i
+  end do
+
+  call acc_copyin (h)
+
+  do i = 1, N
+    h(i) = i + i
+  end do
+
+  call acc_update_device_async (h, sizeof (h), async)
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+  call acc_wait (async)  ! host-to-device copy must finish before h is overwritten
+  h(:) = 0
+
+  call acc_copyout_async (h, sizeof (h), async)
+
+  call acc_wait (async)
+
+  do i = 1, N
+    if (h(i) /= i + i) call abort
+  end do
+
+  call acc_copyin (h, sizeof (h))
+
+  h(:) = 0
+
+  call acc_update_self_async (h, sizeof (h), async)
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+  call acc_wait (async)  ! device-to-host copy must finish before h is read
+  do i = 1, N
+    if (h(i) /= i + i) call abort
+  end do
+
+  call acc_delete_async (h, async)
+
+  call acc_wait (async)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+end program