[og7] Adjust k80 resources

Message ID 127e6708-d713-a34d-753c-d0ce978a8172@codesourcery.com
State New
Headers show

Commit Message

Cesar Philippidis Aug. 11, 2017, 7:38 p.m.
I've pushed this patch to openacc-gcc-7-branch to teach the libgomp
nvptx plugin how to cope with the hardware resources on K80 boards. K80
boards have two physical GPUs on a single board. Consequently, the CUDA
driver reports that 2x the amount of registers and shared memory are
available on those GPUs. But that's not true if only a single GPU is
being utilized. Consequently, this prevented the runtime from informing
the user that the K80 does not have sufficient hardware resources to
execute certain offloaded kernels.

Unfortunately, I don't have a test case which reproduces this failure,
but it does show up in various OpenACC tests such as cloverleaf. I'll
try to create a reduced test case that uses a lot of hardware registers
later.

Cesar

Patch

2017-08-11  Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* plugin/cuda/cuda.h (CUdevice_attribute): Add
	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
	CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR.
	* plugin/plugin-nvptx.c (struct ptx_device): Add
	compute_capability_major, compute_capability_minor members.
	(nvptx_open_device): Probe driver for those values.  Adjust 
	regs_per_sm and max_shared_memory_per_multiprocessor for K80
	hardware.

diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 25d5d1913b0..94a693cbdef 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -69,6 +69,8 @@  typedef enum {
   CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
   CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
 } CUdevice_attribute;
 
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 37e1f6efbe1..10f000ab3c1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -285,7 +285,9 @@  struct ptx_device
   bool map;
   bool concur;
   bool mkern;
-  int  mode;
+  int mode;
+  int compute_capability_major;
+  int compute_capability_minor;
   int clock_khz;
   int num_sms;
   int regs_per_block;
@@ -448,6 +450,14 @@  nvptx_open_device (int n)
   ptx_dev->mode = pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+  ptx_dev->compute_capability_major = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+  ptx_dev->compute_capability_minor = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
   ptx_dev->mkern = pi;
 
@@ -512,20 +522,37 @@  nvptx_open_device (int n)
 
   GOMP_PLUGIN_debug (0, "Nvidia device %d:\n\tGPU_OVERLAP = %d\n"
 		     "\tCAN_MAP_HOST_MEMORY = %d\n\tCONCURRENT_KERNELS = %d\n"
-		     "\tCOMPUTE_MODE = %d\n\tINTEGRATED = %d\n"
+		     "\tCOMPUTE_MODE = %d\n"
+		     "\tCU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = %d\n"
+		     "\tCU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = %d\n"
+		     "\tINTEGRATED = %d\n"
 		     "\tMAX_THREADS_PER_BLOCK = %d\n\tWARP_SIZE = %d\n"
 		     "\tMULTIPROCESSOR_COUNT = %d\n"
 		     "\tMAX_THREADS_PER_MULTIPROCESSOR = %d\n"
 		     "\tMAX_REGISTERS_PER_MULTIPROCESSOR = %d\n"
 		     "\tMAX_SHARED_MEMORY_PER_MULTIPROCESSOR = %d\n",
 		     ptx_dev->ord, ptx_dev->overlap, ptx_dev->map,
-		     ptx_dev->concur, ptx_dev->mode, ptx_dev->mkern,
-		     ptx_dev->max_threads_per_block, ptx_dev->warp_size,
-		     ptx_dev->num_sms,
+		     ptx_dev->concur, ptx_dev->mode,
+		     ptx_dev->compute_capability_major,
+		     ptx_dev->compute_capability_minor,
+		     ptx_dev->mkern, ptx_dev->max_threads_per_block,
+		     ptx_dev->warp_size, ptx_dev->num_sms,
 		     ptx_dev->max_threads_per_multiprocessor,
 		     ptx_dev->regs_per_sm,
 		     ptx_dev->max_shared_memory_per_multiprocessor);
 
+  /* K80 (SM_37) boards contain two physical GPUs.  Consequently they
+     report 2x larger values for MAX_REGISTERS_PER_MULTIPROCESSOR and
+     MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.  Those values need to be
+     adjusted in order to allow nvptx_exec to select an
+     appropriate num_workers.  */
+  if (ptx_dev->compute_capability_major == 3
+      && ptx_dev->compute_capability_minor == 7)
+    {
+      ptx_dev->regs_per_sm /= 2;
+      ptx_dev->max_shared_memory_per_multiprocessor /= 2;
+    }
+
   return ptx_dev;
 }