diff mbox series

[v3,libgomp,OpenMP,5.0,committed] Implement omp_get_device_num

Message ID 7462afd0-2d5c-20cc-84a5-6184039f70c5@codesourcery.com
State New
Headers show
Series [v3,libgomp,OpenMP,5.0,committed] Implement omp_get_device_num | expand

Commit Message

Chung-Lin Tang Aug. 5, 2021, 3:30 p.m. UTC
On 2021/8/3 8:07 PM, Thomas Schwinge wrote:
>> Really suggest to have intelmic support be re-worked as an offload plugin inside
>> libgomp, rather than floating outside by itself.
> Well, it is a regular libgomp plugin, just its sources are not in
> 'libgomp/plugin/' and it's not built during libgomp build.  Are you
> suggesting just to move it into 'libgomp/plugin/'?  This may need some
> more complicated setup because of its 'liboffloadmic' dependency?

Well it appears that liboffloadmic is layered atop of a COI API (Common Offload Interface?)
that is supposed to be the true proprietary interface to Intel MIC devices.

I think it is more reasonable to have each libgomp plugin to directly be built
atop of the vendor-specific interface for the accelerator. Having another in-tree library
serve in-between makes things a bit unnecessarily complex.

(I'm not sure if I recall correctly, but did liboffloadmic have another use besides for
libgomp?)

>>>> --- a/libgomp/libgomp-plugin.h
>>>> +++ b/libgomp/libgomp-plugin.h
>>>> @@ -102,6 +102,12 @@ struct addr_pair
>>>>      uintptr_t end;
>>>>    };
>>>>    
>>>> +/* This symbol is to name a target side variable that holds the designated
>>>> +   'device number' of the target device. The symbol needs to be available to
>>>> +   libgomp code and the  offload plugin (which in the latter case must be
>>>> +   stringified).  */
>>>> +#define GOMP_DEVICE_NUM_VAR __gomp_device_num
>>> For a single var it is acceptable (though, please avoid the double space
>>> before offload plugin in the comment), but once we have more than one
>>> variable, I think we should simply have a struct which will contain all the
>>> parameters that need to be copied from the host to the offloading device at
>>> image load time (and have eventually another struct that holds parameters
>>> that we'll need to copy to the device on each kernel launch, I bet some ICVs
>>> will be one category, other ICVs another one).
> ACK.  Also other program state, like 'fenv' or the gfortran "state blob".
> This is<https://gcc.gnu.org/PR92827>  "Missing data/state
> sharing/propagation between host and offloading devices".

Okay, so we actually have a PR number for this :)


>> Actually, if you look at the 5.[01] specifications, omp_get_device_num() is not
>> defined in terms of an ICV. Maybe it conceptually ought to be, but the current
>> description of "the device number of the device on which the calling thread is
>> executing" is not one if the defined ICVs.
>>
>> It looks like there will eventually be some kind of ICV block handled in a similar
>> way, but I think that the modifications will be straightforward then. For now,
>> I think it's okay for GOMP_DEVICE_NUM_VAR to just be a normal global variable.
> There is, by the way, precedent for that:
> 'libgomp/config/nvptx/time.c:double __nvptx_clocktick', set up in
> 'libgomp/plugin/plugin-nvptx.c:nvptx_set_clocktick' ('cuModuleGetGlobal'
> to get the device address, followed by 'cuMemcpyHtoD'), invoked from
> 'libgomp/plugin/plugin-nvptx.c:GOMP_OFFLOAD_load_image', quite simple.
> 
> For the case discussed here, we're now adding more complex
> 'other_count'/'other_entries'/'num_others' bookkeeping.  (Great that all
> of the plugins plus 'libgomp/target.c' invented their own terminology...)
> ;-)

Well, that is kind of what nvptx is doing by itself internally.
(e.g. libgomp/config/gcn/time.c does not use such external setting by the plugin)

Maybe that "last" entry handled by load_image will eventually turn into a large
block struct to handle all such cases.

>> --- a/libgomp/plugin/plugin-gcn.c
>> +++ b/libgomp/plugin/plugin-gcn.c
...
>> +  if (status == HSA_STATUS_SUCCESS)
>> +    {
>> +      uint64_t device_num_varptr;
>> +      uint32_t device_num_varsize;
>> +
>> +      status = hsa_fns.hsa_executable_symbol_get_info_fn
>> +	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
>> +	 &device_num_varptr);
>> +      if (status != HSA_STATUS_SUCCESS)
>> +	hsa_fatal ("Could not extract a variable from its symbol", status);
>> +      status = hsa_fns.hsa_executable_symbol_get_info_fn
>> +	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
>> +	 &device_num_varsize);
>> +      if (status != HSA_STATUS_SUCCESS)
>> +	hsa_fatal ("Could not extract a variable size from its symbol", status);
>> +
>> +      pair->start = device_num_varptr;
>> +      pair->end = device_num_varptr + device_num_varsize;
>> +    }
>> +  else
>> +    pair->start = pair->end = 0;
>> +
> Is the 'else' branch to accomodate "old" executables running against
> "new" libgomp?  If yes, then please add a comment, "for compatibility
> with pre-GCC 12 executables" or similar.

No, it's because GOMP_DEVICE_NUM_VAR is only linked in when the program
actually uses omp_get_device_num().

Even when a program does use omp_get_device_num(), only that offload image
which contains that part of libgomp has the device number variable defined.

So the else case should actually be quite common.

> Also, add 'pair++;', to avoid future confusion?

Done.

>> +  if (r == CUDA_SUCCESS)
>> +    {
>> +      targ_tbl->start = (uintptr_t) device_num_varptr;
>> +      targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
>> +    }
>> +  else
>> +    targ_tbl->start = targ_tbl->end = 0;
>> +
> Same comment for 'else', please.
> 
> Also, 'targ_tbl++;', to avoid future confusion?

Done.

>>     nvptx_set_clocktick (module, dev);
>>   
>> -  return fn_entries + var_entries;
>> +  return fn_entries + var_entries + other_entries;
>>   }
>>   
>>   /* Unload the program described by TARGET_DATA.  DEV_DATA is the
>> --- a/libgomp/target.c
>> +++ b/libgomp/target.c
>> @@ -1974,6 +1974,9 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
>>     int num_funcs = host_funcs_end - host_func_table;
>>     int num_vars  = (host_vars_end - host_var_table) / 2;
>>   
>> +  /* Others currently is only 'device_num' */
>> +  int num_others = 1;
>> +
>>     /* Load image to device and get target addresses for the image.  */
>>     struct addr_pair *target_table = NULL;
>>     int i, num_target_entries;
>> @@ -1982,7 +1985,9 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
>>       = devicep->load_image_func (devicep->target_id, version,
>>   				target_data, &target_table);
>>   
> Do I understand right that the special-casing here:
> 
>> -  if (num_target_entries != num_funcs + num_vars)
>> +  if (num_target_entries != num_funcs + num_vars
>> +      /* Others (device_num) are included as trailing entries in pair list.  */
>> +      && num_target_entries != num_funcs + num_vars + num_others)
>>       {
>>         gomp_mutex_unlock (&devicep->lock);
>>         if (is_register_lock)
>> @@ -2054,6 +2059,31 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
>>         array++;
>>       }
>>   
>> +  /* Last entry is for the on-device 'device_num' variable. Tolerate case
>> +     where plugin does not return this entry.  */
>> +  if (num_funcs + num_vars < num_target_entries)
> ... and here is just for compatibility with the unmodified Intel MIC
> plugin?  Wouldn't it be simpler to just add a dummy value to that one, to
> avoid this special-casing?

Adding that "null" code to liboffloadmic is not lesser effort, and is not as robust as
simply checking here in gomp_load_image_to_device.

> 
>> +    {
>> +      struct addr_pair *device_num_var = &target_table[num_funcs + num_vars];
>> +      if (device_num_var->start != 0)
> Please also here add a comment about the '0' case, for completeness.

Done.

>> +	{
>> +	  /* The index of the devicep within devices[] is regarded as its
>> +	     'device number', which is different from the per-device type
>> +	     devicep->target_id.  */
>> +	  int device_num_val = (int) (devicep - &devices[0]);
>> +	  if (device_num_var->end - device_num_var->start != sizeof (int))
>> +	    {
>> +	      gomp_mutex_unlock (&devicep->lock);
> Add:
> 
>      if (is_register_lock)
>        gomp_mutex_unlock (&register_lock);
> 
> ..., I suppose?

Done, thanks for catching this one.

>> +	  /* Copy device_num value to place on device memory, hereby actually
>> +	     designating its device number into effect.  */
>> +	  gomp_copy_host2dev (devicep, NULL, (void *) device_num_var->start,
>> +			      &device_num_val, sizeof (int), false, NULL);
>> +	}
>> +    }
>> +
>>     free (target_table);
>>   }
> Am I understanding this correctly that instead of handling it all inside
> the plugins (like '__nvptx_clocktick'), we're here doing the
> 'other_count'/'other_entries'/'num_others' bookkeeping plus
> 'gomp_copy_host2dev', because we don't know 'device_num_val' inside the
> plugins, and also to introduce a more generic interface for future ICVs
> etc.?

As mentioned above, cases like "__nvptx_clocktick" should be seen as target
specific, and can be implemented entirely within nvptx's domain. It's really
not a design issue, and any target can do similar device initialization
if they can.

> It feels to me as if this interface can be improved still.  (For example,
> would it make sense to pass into the plugins (via 'load_image_func') some
> kind of table, containing 'device_num_val' etc., and then let each plugin
> to the setup?)  But I'm OK if we leave that for later, once we actually
> add more ICVs etc.

I thought of that when implementing the current patch too, but:

1) It would likely be yet another plugin-hook added (not undoable, but kind
    of superfluous IMHO).

2) It would be further exposing higher-level concepts (OpenMP or OpenACC)
    into the plugin, and generally speaking the plugin has a more limited
    view of the entire libgomp source. This means that there will be cases
    where putting some kind of setup/initialization in the plugin will be
    awkward and hard to implement (without pulling even more stuff into the
    plugin).

    Having the plugin simply do the job of finding the device location of
    an opaque variable with pre-arranged name and size, and return it for
    libgomp to do the setup work, is a better separation of interface.

>> --- a/libgomp/config/gcn/icv-device.c
>> +++ b/libgomp/config/gcn/icv-device.c
>> @@ -70,6 +70,16 @@ omp_is_initial_device (void)
>>     return 0;
>>   }
>>   
>> +/* This is set to the device number of current GPU during device initialization,
>> +   when the offload image containing this libgomp portion is loaded.  */
>> +static int GOMP_DEVICE_NUM_VAR;
>> +
>> +int
>> +omp_get_device_num (void)
>> +{
>> +  return GOMP_DEVICE_NUM_VAR;
>> +}
>> +
>>   ialias (omp_set_default_device)
>>   ialias (omp_get_default_device)
>>   ialias (omp_get_initial_device)
> I suppose also add 'ialias (omp_get_device_num)' here, like...

Done, thanks for catching.

>> --- a/libgomp/testsuite/lib/libgomp.exp
>> +++ b/libgomp/testsuite/lib/libgomp.exp
>> +# Return 1 if compiling for offload target intelmic
>> +proc check_effective_target_offload_target_intelmic { } {
>> +    return [libgomp_check_effective_target_offload_target "*-intelmic"]
>> +}
>> --- /dev/null
>> +++ b/libgomp/testsuite/libgomp.c-c++-common/target-45.c
>> @@ -0,0 +1,30 @@
>> +/* { dg-do run { target { ! offload_target_intelmic } } } */
> This means that the test case is skipped as soon as the compiler is
> configured for Intel MIC offloading -- even if that's not used during
> execution.
> 
>  From some older experiment of mine, I do have a
> 'check_effective_target_offload_device_intel_mic', which I'll propose as
> a follow-up, once this is in.

Great.

>> +  if (initial_device .and. host_device_num .ne. device_num) stop 2
> That one matches 'libgomp.c-c++-common/target-45.c':
> 
>      if (initial_device && host_device_num != device_num)
>        abort ();
> 
> ..., but here:
> 
>> +  if (initial_device .and. host_device_num .eq. device_num) stop 3
> ... shouldn't that be '.not.initial_device', like in:
> 
>      if (!initial_device && host_device_num == device_num)
>        abort ();

Yeah, Tobias also caught this as well :)

> 
> (Also, I'm not familiar with Fortran operator precedence rules, so
> probably would put the individual expressions into braces.;-)  -- But I
> trust you know better than I do, of course.)

Done.

Attached is the final "v3" patch that I committed.

Thanks,
Chung-Lin
From 0bac793ed6bad2c0c13cd1e93a1aa5808467afc8 Mon Sep 17 00:00:00 2001
From: Chung-Lin Tang <cltang@codesourcery.com>
Date: Thu, 5 Aug 2021 23:29:03 +0800
Subject: [PATCH] openmp: Implement omp_get_device_num routine

This patch implements the omp_get_device_num library routine, specified in
OpenMP 5.0.

GOMP_DEVICE_NUM_VAR is a macro symbol which defines name of a "device number"
variable, is defined on the device-side libgomp, has it's address returned to
host-side libgomp during device initialization, and the host libgomp then
sets its value to the designated device number.

libgomp/ChangeLog:

	* icv-device.c (omp_get_device_num): New API function, host side.
	* fortran.c (omp_get_device_num_): New interface function.
	* libgomp-plugin.h (GOMP_DEVICE_NUM_VAR): Define macro symbol.
	* libgomp.map (OMP_5.0.2): New version space with omp_get_device_num,
	omp_get_device_num_.
	* libgomp.texi (omp_get_device_num): Add documentation for new API
	function.
	* omp.h.in (omp_get_device_num): Add declaration.
	* omp_lib.f90.in (omp_get_device_num): Likewise.
	* omp_lib.h.in (omp_get_device_num): Likewise.
	* target.c (gomp_load_image_to_device): If additional entry for device
	number exists at end of returned entries from 'load_image_func' hook,
	copy the assigned device number over to the device variable.

	* config/gcn/icv-device.c (GOMP_DEVICE_NUM_VAR): Define static global.
	(omp_get_device_num): New API function, device side.
	* plugin/plugin-gcn.c ("symcat.h"): Add include.
	(GOMP_OFFLOAD_load_image): Add addresses of device GOMP_DEVICE_NUM_VAR
	at end of returned 'target_table' entries.

	* config/nvptx/icv-device.c (GOMP_DEVICE_NUM_VAR): Define static global.
	(omp_get_device_num): New API function, device side.
	* plugin/plugin-nvptx.c ("symcat.h"): Add include.
	(GOMP_OFFLOAD_load_image): Add addresses of device GOMP_DEVICE_NUM_VAR
	at end of returned 'target_table' entries.

	* testsuite/lib/libgomp.exp
	(check_effective_target_offload_target_intelmic): New function for
	testing for intelmic offloading.
	* testsuite/libgomp.c-c++-common/target-45.c: New test.
	* testsuite/libgomp.fortran/target10.f90: New test.
---
 libgomp/config/gcn/icv-device.c               | 11 ++++++
 libgomp/config/nvptx/icv-device.c             | 11 ++++++
 libgomp/fortran.c                             |  7 ++++
 libgomp/icv-device.c                          |  9 +++++
 libgomp/libgomp-plugin.h                      |  6 +++
 libgomp/libgomp.map                           |  8 +++-
 libgomp/libgomp.texi                          | 29 ++++++++++++++
 libgomp/omp.h.in                              |  1 +
 libgomp/omp_lib.f90.in                        |  6 +++
 libgomp/omp_lib.h.in                          |  3 ++
 libgomp/plugin/plugin-gcn.c                   | 38 ++++++++++++++++++-
 libgomp/plugin/plugin-nvptx.c                 | 25 ++++++++++--
 libgomp/target.c                              | 36 +++++++++++++++++-
 libgomp/testsuite/lib/libgomp.exp             |  5 +++
 .../libgomp.c-c++-common/target-45.c          | 30 +++++++++++++++
 .../testsuite/libgomp.fortran/target10.f90    | 20 ++++++++++
 16 files changed, 238 insertions(+), 7 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.c-c++-common/target-45.c
 create mode 100644 libgomp/testsuite/libgomp.fortran/target10.f90
diff mbox series

Patch

diff --git a/libgomp/config/gcn/icv-device.c b/libgomp/config/gcn/icv-device.c
index 72d4f7cff74..34e0f8346f2 100644
--- a/libgomp/config/gcn/icv-device.c
+++ b/libgomp/config/gcn/icv-device.c
@@ -70,6 +70,16 @@  omp_is_initial_device (void)
   return 0;
 }
 
+/* This is set to the device number of current GPU during device initialization,
+   when the offload image containing this libgomp portion is loaded.  */
+static volatile int GOMP_DEVICE_NUM_VAR;
+
+int
+omp_get_device_num (void)
+{
+  return GOMP_DEVICE_NUM_VAR;
+}
+
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
 ialias (omp_get_initial_device)
@@ -77,3 +87,4 @@  ialias (omp_get_num_devices)
 ialias (omp_get_num_teams)
 ialias (omp_get_team_num)
 ialias (omp_is_initial_device)
+ialias (omp_get_device_num)
diff --git a/libgomp/config/nvptx/icv-device.c b/libgomp/config/nvptx/icv-device.c
index 3b96890f338..b63149d0c34 100644
--- a/libgomp/config/nvptx/icv-device.c
+++ b/libgomp/config/nvptx/icv-device.c
@@ -58,8 +58,19 @@  omp_is_initial_device (void)
   return 0;
 }
 
+/* This is set to the device number of current GPU during device initialization,
+   when the offload image containing this libgomp portion is loaded.  */
+static volatile int GOMP_DEVICE_NUM_VAR;
+
+int
+omp_get_device_num (void)
+{
+  return GOMP_DEVICE_NUM_VAR;
+}
+
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
 ialias (omp_get_initial_device)
 ialias (omp_get_num_devices)
 ialias (omp_is_initial_device)
+ialias (omp_get_device_num)
diff --git a/libgomp/fortran.c b/libgomp/fortran.c
index e042702ac91..07f97656e51 100644
--- a/libgomp/fortran.c
+++ b/libgomp/fortran.c
@@ -83,6 +83,7 @@  ialias_redirect (omp_get_partition_place_nums)
 ialias_redirect (omp_set_default_device)
 ialias_redirect (omp_get_default_device)
 ialias_redirect (omp_get_num_devices)
+ialias_redirect (omp_get_device_num)
 ialias_redirect (omp_get_num_teams)
 ialias_redirect (omp_get_team_num)
 ialias_redirect (omp_is_initial_device)
@@ -599,6 +600,12 @@  omp_get_initial_device_ (void)
   return omp_get_initial_device ();
 }
 
+int32_t
+omp_get_device_num_ (void)
+{
+  return omp_get_device_num ();
+}
+
 int32_t
 omp_get_max_task_priority_ (void)
 {
diff --git a/libgomp/icv-device.c b/libgomp/icv-device.c
index c1bedf46647..f11bdfa85c4 100644
--- a/libgomp/icv-device.c
+++ b/libgomp/icv-device.c
@@ -61,8 +61,17 @@  omp_is_initial_device (void)
   return 1;
 }
 
+int
+omp_get_device_num (void)
+{
+  /* By specification, this is equivalent to omp_get_initial_device
+     on the host.  */
+  return omp_get_initial_device ();
+}
+
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
 ialias (omp_get_initial_device)
 ialias (omp_get_num_devices)
 ialias (omp_is_initial_device)
+ialias (omp_get_device_num)
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 62645ce9954..cf24a2bee41 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -102,6 +102,12 @@  struct addr_pair
   uintptr_t end;
 };
 
+/* This symbol is to name a target side variable that holds the designated
+   'device number' of the target device. The symbol needs to be available to
+   libgomp code and the offload plugin (which in the latter case must be
+   stringified).  */
+#define GOMP_DEVICE_NUM_VAR __gomp_device_num
+
 /* Miscellaneous functions.  */
 extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc));
 extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc));
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 69aa69562b8..cc44885cba9 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -199,12 +199,18 @@  OMP_5.0.1 {
 	omp_fulfill_event_;
 } OMP_5.0;
 
+OMP_5.0.2 {
+  global:
+	omp_get_device_num;
+	omp_get_device_num_;
+} OMP_5.0.1;
+
 OMP_5.1 {
   global:
 	omp_display_env;
 	omp_display_env_;
 	omp_display_env_8_;
-} OMP_5.0.1;
+} OMP_5.0.2;
 
 GOMP_1.0 {
   global:
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 2c1f1b5968b..fc9e708a8d2 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -165,6 +165,7 @@  linkage, and do not throw exceptions.
 * omp_get_ancestor_thread_num:: Ancestor thread ID
 * omp_get_cancellation::        Whether cancellation support is enabled
 * omp_get_default_device::      Get the default device for target regions
+* omp_get_device_num::          Get device that current thread is running on
 * omp_get_dynamic::             Dynamic teams setting
 * omp_get_initial_device::      Device number of host device
 * omp_get_level::               Number of parallel regions
@@ -385,6 +386,34 @@  For OpenMP 5.1, this must be equal to the value returned by the
 
 
 
+@node omp_get_device_num
+@section @code{omp_get_device_num} -- Return device number of current device
+@table @asis
+@item @emph{Description}:
+This function returns a device number that represents the device that the
+current thread is executing on. For OpenMP 5.0, this must be equal to the
+value returned by the @code{omp_get_initial_device} function when called
+from the host.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{int omp_get_device_num(void);}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{integer function omp_get_device_num()}
+@end multitable
+
+@item @emph{See also}:
+@ref{omp_get_initial_device}
+
+@item @emph{Reference}:
+@uref{https://www.openmp.org, OpenMP specification v5.0}, Section 3.2.37.
+@end table
+
+
+
 @node omp_get_level
 @section @code{omp_get_level} -- Obtain the current nesting level
 @table @asis
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index c93db968d2e..da34a9d98a6 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -243,6 +243,7 @@  extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW;
 extern void omp_set_default_device (int) __GOMP_NOTHROW;
 extern int omp_get_default_device (void) __GOMP_NOTHROW;
 extern int omp_get_num_devices (void) __GOMP_NOTHROW;
+extern int omp_get_device_num (void) __GOMP_NOTHROW;
 extern int omp_get_num_teams (void) __GOMP_NOTHROW;
 extern int omp_get_team_num (void) __GOMP_NOTHROW;
 
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index 5fc6587e49e..d7e804f4fd5 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -550,6 +550,12 @@ 
           end function omp_get_initial_device
         end interface
 
+        interface
+          function omp_get_device_num ()
+            integer (4) :: omp_get_device_num
+          end function omp_get_device_num
+        end interface
+
         interface
           function omp_get_max_task_priority ()
             integer (4) :: omp_get_max_task_priority
diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in
index 9873cea9ac1..20c32645e3c 100644
--- a/libgomp/omp_lib.h.in
+++ b/libgomp/omp_lib.h.in
@@ -244,6 +244,9 @@ 
       external omp_get_initial_device
       integer(4) omp_get_initial_device
 
+      external omp_get_device_num
+      integer(4) omp_get_device_num
+
       external omp_get_max_task_priority
       integer(4) omp_get_max_task_priority
 
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 2548614a2e5..f26d7361106 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -29,6 +29,7 @@ 
 /* {{{ Includes and defines  */
 
 #include "config.h"
+#include "symcat.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -3305,6 +3306,7 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   struct kernel_info *kernel;
   int kernel_count = image_desc->kernel_count;
   unsigned var_count = image_desc->global_variable_count;
+  int other_count = 1;
 
   agent = get_agent_info (ord);
   if (!agent)
@@ -3321,7 +3323,8 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
 
   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
-  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
+  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
+  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
 			     * sizeof (struct addr_pair));
   *target_table = pair;
   module = (struct module_info *)
@@ -3396,6 +3399,37 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
       pair++;
     }
 
+  GCN_DEBUG ("Looking for variable %s\n", STRINGX (GOMP_DEVICE_NUM_VAR));
+
+  hsa_status_t status;
+  hsa_executable_symbol_t var_symbol;
+  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+						 STRINGX (GOMP_DEVICE_NUM_VAR),
+						 agent->id, 0, &var_symbol);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      uint64_t device_num_varptr;
+      uint32_t device_num_varsize;
+
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+	 &device_num_varptr);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable from its symbol", status);
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
+	 &device_num_varsize);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable size from its symbol", status);
+
+      pair->start = device_num_varptr;
+      pair->end = device_num_varptr + device_num_varsize;
+    }
+  else
+    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
+    pair->start = pair->end = 0;
+  pair++;
+
   /* Ensure that constructors are run first.  */
   struct GOMP_kernel_launch_attributes kla =
     { 3,
@@ -3418,7 +3452,7 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   if (module->fini_array_func)
     kernel_count--;
 
-  return kernel_count + var_count;
+  return kernel_count + var_count + other_count;
 }
 
 /* Unload GCN object-code module described by struct gcn_image_desc in
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 1215212d501..0f16e1cf00d 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -34,6 +34,7 @@ 
 #define _GNU_SOURCE
 #include "openacc.h"
 #include "config.h"
+#include "symcat.h"
 #include "libgomp-plugin.h"
 #include "oacc-plugin.h"
 #include "gomp-constants.h"
@@ -1265,7 +1266,7 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   CUmodule module;
   const char *const *var_names;
   const struct targ_fn_launch *fn_descs;
-  unsigned int fn_entries, var_entries, i, j;
+  unsigned int fn_entries, var_entries, other_entries, i, j;
   struct targ_fn_descriptor *targ_fns;
   struct addr_pair *targ_tbl;
   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
@@ -1295,8 +1296,11 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   fn_entries = img_header->fn_num;
   fn_descs = img_header->fn_descs;
 
+  /* Currently, the only other entry kind is 'device number'.  */
+  other_entries = 1;
+
   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
-				 * (fn_entries + var_entries));
+				 * (fn_entries + var_entries + other_entries));
   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
 				 * fn_entries);
 
@@ -1345,9 +1349,24 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
       targ_tbl->end = targ_tbl->start + bytes;
     }
 
+  CUdeviceptr device_num_varptr;
+  size_t device_num_varsize;
+  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
+				  &device_num_varsize, module,
+				  STRINGX (GOMP_DEVICE_NUM_VAR));
+  if (r == CUDA_SUCCESS)
+    {
+      targ_tbl->start = (uintptr_t) device_num_varptr;
+      targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
+    }
+  else
+    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
+    targ_tbl->start = targ_tbl->end = 0;
+  targ_tbl++;
+
   nvptx_set_clocktick (module, dev);
 
-  return fn_entries + var_entries;
+  return fn_entries + var_entries + other_entries;
 }
 
 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
diff --git a/libgomp/target.c b/libgomp/target.c
index 453b3210e40..67fcf41cc2e 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1974,6 +1974,9 @@  gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
   int num_funcs = host_funcs_end - host_func_table;
   int num_vars  = (host_vars_end - host_var_table) / 2;
 
+  /* Others currently is only 'device_num' */
+  int num_others = 1;
+
   /* Load image to device and get target addresses for the image.  */
   struct addr_pair *target_table = NULL;
   int i, num_target_entries;
@@ -1982,7 +1985,9 @@  gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
     = devicep->load_image_func (devicep->target_id, version,
 				target_data, &target_table);
 
-  if (num_target_entries != num_funcs + num_vars)
+  if (num_target_entries != num_funcs + num_vars
+      /* Others (device_num) are included as trailing entries in pair list.  */
+      && num_target_entries != num_funcs + num_vars + num_others)
     {
       gomp_mutex_unlock (&devicep->lock);
       if (is_register_lock)
@@ -2054,6 +2059,35 @@  gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       array++;
     }
 
+  /* Last entry is for the on-device 'device_num' variable. Tolerate case
+     where plugin does not return this entry.  */
+  if (num_funcs + num_vars < num_target_entries)
+    {
+      struct addr_pair *device_num_var = &target_table[num_funcs + num_vars];
+      /* Start address will be non-zero for last entry if GOMP_DEVICE_NUM_VAR
+	 was found in this image.  */
+      if (device_num_var->start != 0)
+	{
+	  /* The index of the devicep within devices[] is regarded as its
+	     'device number', which is different from the per-device type
+	     devicep->target_id.  */
+	  int device_num_val = (int) (devicep - &devices[0]);
+	  if (device_num_var->end - device_num_var->start != sizeof (int))
+	    {
+	      gomp_mutex_unlock (&devicep->lock);
+	      if (is_register_lock)
+		gomp_mutex_unlock (&register_lock);
+	      gomp_fatal ("offload plugin managed 'device_num' not of expected "
+			  "format");
+	    }
+
+	  /* Copy device_num value to place on device memory, hereby actually
+	     designating its device number into effect.  */
+	  gomp_copy_host2dev (devicep, NULL, (void *) device_num_var->start,
+			      &device_num_val, sizeof (int), false, NULL);
+	}
+    }
+
   free (target_table);
 }
 
diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index a2050151e84..ba8a73275c5 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -374,6 +374,11 @@  proc check_effective_target_offload_target_amdgcn { } {
     return [libgomp_check_effective_target_offload_target "amdgcn"]
 }
 
+# Return 1 if compiling for offload target intelmic
+proc check_effective_target_offload_target_intelmic { } {
+    return [libgomp_check_effective_target_offload_target "*-intelmic"]
+}
+
 # Return 1 if offload device is available.
 proc check_effective_target_offload_device { } {
     return [check_runtime_nocache offload_device_available_ {
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-45.c b/libgomp/testsuite/libgomp.c-c++-common/target-45.c
new file mode 100644
index 00000000000..ec0d202e51c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-45.c
@@ -0,0 +1,30 @@ 
+/* { dg-do run { target { ! offload_target_intelmic } } } */
+
+#include <omp.h>
+#include <stdlib.h>
+
+int main (void)
+{
+
+  int host_device_num = omp_get_device_num ();
+
+  if (host_device_num != omp_get_initial_device ())
+    abort ();
+
+  int device_num;
+  int initial_device;
+
+  #pragma omp target map(from: device_num, initial_device)
+  {
+    initial_device = omp_is_initial_device ();
+    device_num = omp_get_device_num ();
+  }
+
+  if (initial_device && host_device_num != device_num)
+    abort ();
+
+  if (!initial_device && host_device_num == device_num)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.fortran/target10.f90 b/libgomp/testsuite/libgomp.fortran/target10.f90
new file mode 100644
index 00000000000..0b939ad7a0d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target10.f90
@@ -0,0 +1,20 @@ 
+! { dg-do run { target { ! offload_target_intelmic } } }
+
+program main
+  use omp_lib
+  implicit none
+  integer :: device_num, host_device_num
+  logical :: initial_device
+
+  host_device_num = omp_get_device_num ()
+  if (host_device_num .ne. omp_get_initial_device ()) stop 1
+
+  !$omp target map(from: device_num, initial_device)
+  initial_device = omp_is_initial_device ()
+  device_num = omp_get_device_num ()
+  !$omp end target
+
+  if (initial_device .and. (host_device_num .ne. device_num)) stop 2
+  if ((.not. initial_device) .and. (host_device_num .eq. device_num)) stop 3
+
+end program main