diff mbox series

[v2,1/8] libgomp: Disentangle shared memory from managed

Message ID 20240628102449.562467-2-ams@baylibre.com
State New
Headers show
Series OpenMP: Unified Shared Memory via Managed Memory | expand

Commit Message

Andrew Stubbs June 28, 2024, 10:24 a.m. UTC
Some GPU compute systems allow the GPU to access host memory without much
prior setup, but that's not necessarily the fast way to do it.  For shared
memory APUs this is almost certainly the correct choice, but on AMD there
is a difference between "fine-grained" and "coarse-grained" memory, and
on NVIDIA, CUDA generally runs better if it knows the status of the memory
you access.

Therefore, for performance, we want to use "managed memory", in which the OS
drivers handle page migration on the fly, but this will require some
additional configuration steps that I will implement in later patches.  Until
those patches are applied, there may be a temporary regression in USM support.

This patch disables the basic stop-gap shared memory so we can introduce
fast Unified Shared Memory using the managed memory APIs in the next patches.

If a device has integrated memory then the patch attempts to continue using
the current behaviour.  The new plugin API to achieve this is made optional
so as not to break compatibility.  It needs to be a new API because the
existing capability setting runs before the devices have been scanned and does
not allow different capabilities for different devices.

libgomp/ChangeLog:

	* libgomp-plugin.h (GOMP_OFFLOAD_get_dev_caps): New prototype.
	* libgomp.h (struct gomp_device_descr): Add get_dev_caps_func.
	* plugin/plugin-gcn.c (GOMP_OFFLOAD_get_dev_caps): New function.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_get_dev_caps): New function.
	* target.c (gomp_load_plugin_for_device): Load the get_dev_caps API.
	(gomp_target_init): Don't assume unified shared memory is the same
	as actual shared memory.  Use get_dev_caps to allow plugins to set
	different capabilities for different devices.
---
 libgomp/libgomp-plugin.h      |  1 +
 libgomp/libgomp.h             |  1 +
 libgomp/plugin/plugin-gcn.c   | 40 ++++++++++++++++++++++++++++++++---
 libgomp/plugin/plugin-nvptx.c | 16 ++++++++++++++
 libgomp/target.c              |  9 ++++----
 5 files changed, 59 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 0d2e3f0a6ec..100dbca1633 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -128,6 +128,7 @@  extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
 extern unsigned int GOMP_OFFLOAD_get_caps (void);
+extern unsigned int GOMP_OFFLOAD_get_dev_caps (int);
 extern int GOMP_OFFLOAD_get_type (void);
 extern int GOMP_OFFLOAD_get_num_devices (unsigned int);
 extern bool GOMP_OFFLOAD_init_device (int);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index c3aabd4b7d3..f48bf7418f0 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1402,6 +1402,7 @@  struct gomp_device_descr
   /* Function handlers.  */
   __typeof (GOMP_OFFLOAD_get_name) *get_name_func;
   __typeof (GOMP_OFFLOAD_get_caps) *get_caps_func;
+  __typeof (GOMP_OFFLOAD_get_dev_caps) *get_dev_caps_func;
   __typeof (GOMP_OFFLOAD_get_type) *get_type_func;
   __typeof (GOMP_OFFLOAD_get_num_devices) *get_num_devices_func;
   __typeof (GOMP_OFFLOAD_init_device) *init_device_func;
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3d882b5ab63..c8c588e8efa 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -3321,9 +3321,43 @@  GOMP_OFFLOAD_get_name (void)
 unsigned int
 GOMP_OFFLOAD_get_caps (void)
 {
-  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
-  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
-	    | GOMP_OFFLOAD_CAP_OPENACC_200;
+  return GOMP_OFFLOAD_CAP_OPENMP_400 | GOMP_OFFLOAD_CAP_OPENACC_200;
+}
+
+/* Return any capabilities that are specific to one device only.  */
+
+unsigned int
+GOMP_OFFLOAD_get_dev_caps (int n)
+{
+  /* The device agents have been enumerated, but might not have been
+     initialized, so get_agent_info won't work here.  */
+  struct agent_info *agent = &hsa_context.agents[n];
+
+  char name[64];
+  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+						       HSA_AGENT_INFO_NAME,
+						       &name);
+  if (status != HSA_STATUS_SUCCESS)
+    return 0;
+
+  gcn_isa device_isa = isa_code (name);
+  unsigned int caps = 0;
+
+  /* APU devices might have shared memory.
+     Don't add devices to this check if they support shared memory
+     via XNACK and page migration!  */
+  if (device_isa == EF_AMDGPU_MACH_AMDGCN_GFX1036 /* Expect "yes".  */
+      || device_isa == EF_AMDGPU_MACH_AMDGCN_GFX1103 /* Observed "no".  */)
+    {
+      bool b;
+      hsa_system_info_t type = HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT;
+      status = hsa_fns.hsa_system_get_info_fn (type, &b);
+      if (status == HSA_STATUS_SUCCESS
+	  && b)
+	caps |= GOMP_OFFLOAD_CAP_SHARED_MEM;
+    }
+
+  return caps;
 }
 
 /* Identify as GCN accelerator.  */
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5caf8fe5026..2ef3198ebe1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1185,6 +1185,22 @@  GOMP_OFFLOAD_get_caps (void)
   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
 }
 
+unsigned int
+GOMP_OFFLOAD_get_dev_caps (int n)
+{
+  unsigned int caps = 0;
+
+  /* APU devices might share memory with the host system.  */
+  int pi;
+  CUresult r;
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+			 CU_DEVICE_ATTRIBUTE_INTEGRATED, n);
+  if (r == CUDA_SUCCESS && pi != 0)
+    caps |= GOMP_OFFLOAD_CAP_SHARED_MEM;
+
+  return caps;
+}
+
 int
 GOMP_OFFLOAD_get_type (void)
 {
diff --git a/libgomp/target.c b/libgomp/target.c
index adbd18101f2..effd48bb92f 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -5285,6 +5285,7 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
 
   DLSYM (get_name);
   DLSYM (get_caps);
+  DLSYM_OPT (get_dev_caps, get_dev_caps);
   DLSYM (get_type);
   DLSYM (get_num_devices);
   DLSYM (init_device);
@@ -5456,11 +5457,6 @@  gomp_target_init (void)
 	      {
 		/* Augment DEVICES and NUM_DEVICES.  */
 
-		/* If USM has been requested and is supported by all devices
-		   of this type, set the capability accordingly.  */
-		if (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY)
-		  current_device.capabilities |= GOMP_OFFLOAD_CAP_SHARED_MEM;
-
 		devs = realloc (devs, (num_devs + new_num_devs)
 				      * sizeof (struct gomp_device_descr));
 		if (!devs)
@@ -5479,6 +5475,9 @@  gomp_target_init (void)
 		for (i = 0; i < new_num_devs; i++)
 		  {
 		    current_device.target_id = i;
+		    if (current_device.get_dev_caps_func)
+		      current_device.capabilities
+		        |= current_device.get_dev_caps_func (i);
 		    devs[num_devs] = current_device;
 		    gomp_mutex_init (&devs[num_devs].lock);
 		    num_devs++;