diff mbox series

[OG12,commit] amdgcn, libgomp: USM allocation update

Message ID 7bb722dc-0e73-dce2-d05f-d471663366a4@codesourcery.com
State New
Headers show
Series [OG12,commit] amdgcn, libgomp: USM allocation update | expand

Commit Message

Andrew Stubbs Oct. 24, 2022, 4:26 p.m. UTC
I've committed this patch to the devel/omp/gcc-12 branch. I will have to 
fold it into my previous OpenMP memory management patch series when I 
repost it.

The patch changes the internal memory allocation method such that memory 
is allocated in the regular heap and then marked as "coarse-grained", as 
opposed to allocating coarse-grained memory in the first place. The 
difference is that this is CPU first, not GPU first, which is typically 
the right way around, especially when we are using this for all possible 
allocations.

Andrew
amdgcn, libgomp: USM allocation update

Allocate Unified Shared Memory via malloc and hsa_amd_svm_attributes_set,
instead of hsa_allocate_memory.  This scheme should be more efficient for
memory that is first accessed by the CPU.

libgomp/ChangeLog:

	* plugin/plugin-gcn.c (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED): New.
	(HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT): New.
	(HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG): New.
	(HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED): New.
	(hsa_amd_svm_attribute_pair_t): New.
	(struct hsa_runtime_fn_info): Add hsa_amd_svm_attributes_set_fn.
	(dump_hsa_system_info): Dump HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED and
	HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT.
	(DLSYM_OPT_FN): New.
	(init_hsa_runtime_functions): Add hsa_amd_svm_attributes_set.
	(GOMP_OFFLOAD_usm_alloc): Use malloc and hsa_amd_svm_attributes_set.
	(GOMP_OFFLOAD_usm_free): Use regular free.
	* testsuite/libgomp.c/usm-1.c: Add -mxnack=on for amdgcn.
	* testsuite/libgomp.c/usm-2.c: Likewise.
	* testsuite/libgomp.c/usm-3.c: Likewise.
	* testsuite/libgomp.c/usm-4.c: Likewise.
diff mbox series

Patch

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index dd493f63912..4871a6a793b 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -113,6 +113,16 @@  struct gcn_thread
   int async;
 };
 
+/* TEMPORARY IMPORT, UNTIL hsa_ext_amd.h GETS UPDATED.  */
+const static int HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201;
+const static int HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202;
+const static int HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0;
+const static int HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1;
+typedef struct hsa_amd_svm_attribute_pair_s {
+  uint64_t attribute;
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
 /* As an HSA runtime is dlopened, following structure defines function
    pointers utilized by the HSA plug-in.  */
 
@@ -195,6 +205,9 @@  struct hsa_runtime_fn_info
   hsa_status_t (*hsa_code_object_deserialize_fn)
     (void *serialized_code_object, size_t serialized_code_object_size,
      const char *options, hsa_code_object_t *code_object);
+  hsa_status_t (*hsa_amd_svm_attributes_set_fn)
+    (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+     size_t attribute_count);
 };
 
 /* Structure describing the run-time and grid properties of an HSA kernel
@@ -720,6 +733,24 @@  dump_hsa_system_info (void)
     }
   else
     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+
+  bool svm_supported;
+  status = hsa_fns.hsa_system_get_info_fn
+    (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n",
+	       (svm_supported ? "TRUE" : "FALSE"));
+  else
+    GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n");
+
+  bool svm_accessible;
+  status = hsa_fns.hsa_system_get_info_fn
+    (HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n",
+	       (svm_accessible ? "TRUE" : "FALSE"));
+  else
+    GCN_WARNING ("HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n");
 }
 
 /* Dump information about the available hardware.  */
@@ -1361,6 +1392,8 @@  init_hsa_runtime_functions (void)
   hsa_fns.function##_fn = dlsym (handle, #function); \
   if (hsa_fns.function##_fn == NULL) \
     return false;
+#define DLSYM_OPT_FN(function) \
+  hsa_fns.function##_fn = dlsym (handle, #function);
   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
   if (handle == NULL)
     return false;
@@ -1395,6 +1428,7 @@  init_hsa_runtime_functions (void)
   DLSYM_FN (hsa_signal_load_acquire)
   DLSYM_FN (hsa_queue_destroy)
   DLSYM_FN (hsa_code_object_deserialize)
+  DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
   return true;
 #undef DLSYM_FN
 }
@@ -3886,15 +3920,38 @@  static struct usm_splay_tree_s usm_map = { NULL };
 
 /* Allocate memory suitable for Unified Shared Memory.
 
-   In fact, AMD memory need only be "coarse grained", which target
-   allocations already are.  We do need to track allocations so that
-   GOMP_OFFLOAD_is_usm_ptr can look them up.  */
+   Normal heap memory is already enabled for USM, but by default it is "fine-
+   grained" memory, meaning that the GPU must access it via the system bus,
+   slowly.  Changing the page to "coarse-grained" mode means that the page
+   is migrated on-demand and can therefore be accessed quickly by both CPU and
+   GPU (although care should be taken to prevent thrashing the page back and
+   forth).
+
+   GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
+   the initial location is GPU memory; this function returns system memory.
+
+   We record and track allocations so that GOMP_OFFLOAD_is_usm_ptr can look
+   them up.  */
 
 void *
 GOMP_OFFLOAD_usm_alloc (int device, size_t size)
 {
-  void *ptr = GOMP_OFFLOAD_alloc (device, size);
+  void *ptr = malloc (size);
+  if (!ptr || !hsa_fns.hsa_amd_svm_attributes_set_fn)
+    return ptr;
+
+  /* Register the heap allocation as coarse grained, which implies USM.  */
+  struct hsa_amd_svm_attribute_pair_s attr = {
+    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+  };
+  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (ptr, size,
+							       &attr, 1);
+  if (status != HSA_STATUS_SUCCESS)
+    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+		       " please update your drivers and/or kernel");
 
+  /* Record the allocation for GOMP_OFFLOAD_is_usm_ptr.  */
   usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
   node->key.addr = ptr;
   node->key.size = size;
@@ -3918,7 +3975,8 @@  GOMP_OFFLOAD_usm_free (int device, void *ptr)
       free (node);
     }
 
-  return GOMP_OFFLOAD_free (device, ptr);
+  free (ptr);
+  return true;
 }
 
 /* True if the memory was allocated via GOMP_OFFLOAD_usm_alloc.  */
diff --git a/libgomp/testsuite/libgomp.c/usm-1.c b/libgomp/testsuite/libgomp.c/usm-1.c
index e73f1816f9a..f7bf897b839 100644
--- a/libgomp/testsuite/libgomp.c/usm-1.c
+++ b/libgomp/testsuite/libgomp.c/usm-1.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-2.c b/libgomp/testsuite/libgomp.c/usm-2.c
index 31f2bae7145..3f52adbd7e1 100644
--- a/libgomp/testsuite/libgomp.c/usm-2.c
+++ b/libgomp/testsuite/libgomp.c/usm-2.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-3.c b/libgomp/testsuite/libgomp.c/usm-3.c
index 2c78a0d8ced..225cba5fe58 100644
--- a/libgomp/testsuite/libgomp.c/usm-3.c
+++ b/libgomp/testsuite/libgomp.c/usm-3.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>
diff --git a/libgomp/testsuite/libgomp.c/usm-4.c b/libgomp/testsuite/libgomp.c/usm-4.c
index 1ac5498f73f..d4addfc587a 100644
--- a/libgomp/testsuite/libgomp.c/usm-4.c
+++ b/libgomp/testsuite/libgomp.c/usm-4.c
@@ -1,5 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target omp_usm } */
+/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */
 
 #include <omp.h>
 #include <stdint.h>