diff mbox

[gomp4] runtime default compute dimensions

Message ID 8b489f0c-0159-80e5-28a8-1460029326d3@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Aug. 30, 2016, 2:53 p.m. UTC
This patch interrogates the target device to determine default  gemotry at 
runtime.  This has the greatest difference on gang partitioning, where there's a 
noticeable sawtooth in the relationship between number of gangs and execution 
time.  Picking the number of gangs as an exact multiple of number of physical 
multi-cpus gets the best performance. Picking one more than that gives a step 
increase in execution time.  The sawtooth gets blunter as the multiplication 
factor increases, as one might expect when scheduling smaller and smaller 
parcels of work onto a limited set of physical cpus.

nathan
diff mbox

Patch

2016-08-30  Nathan Sidwell  <nathan@codesourcery.com>

	* plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
	to determine default geometry.

Index: plugin/plugin-nvptx.c
===================================================================
--- plugin/plugin-nvptx.c	(revision 239862)
+++ plugin/plugin-nvptx.c	(working copy)
@@ -938,14 +938,42 @@  nvptx_exec (void (*fn), size_t mapnum, v
 		}
 	    }
 
-	  /* Do some sanity checking.  The CUDA API doesn't appear to
-	     provide queries to determine these limits.  */
+	  int warp_size, block_size, dev_size, cpu_size;
+	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
+	  /* 32 is the default for known hardware.  */
+	  int gang = 0, worker = 32, vector = 32;
+
+	  if (CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&block_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&dev_size, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&cpu_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev))
+	    {
+	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+				 " dev_size=%d, cpu_size=%d\n",
+				 warp_size, block_size, dev_size, cpu_size);
+	      gang = (cpu_size / block_size) * dev_size;
+	      worker = block_size / warp_size;
+	      vector = warp_size;
+	    }
+
+	  /* There is no upper bound on the gang size.  The best size
+	     matches the hardware configuration.  Logical gangs are
+	     scheduled onto physical hardware.  To maximize usage, we
+	     should guess a large number.  */
 	  if (default_dims[GOMP_DIM_GANG] < 1)
-	    default_dims[GOMP_DIM_GANG] = 32;
+	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
+	  /* The worker size must not exceed the hardware.  */
 	  if (default_dims[GOMP_DIM_WORKER] < 1
-	      || default_dims[GOMP_DIM_WORKER] > 32)
-	    default_dims[GOMP_DIM_WORKER] = 32;
-	  default_dims[GOMP_DIM_VECTOR] = 32;
+	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+	    default_dims[GOMP_DIM_WORKER] = worker;
+	  /* The vector size must exactly match the hardware.  */
+	  if (default_dims[GOMP_DIM_VECTOR] < 1
+	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+	    default_dims[GOMP_DIM_VECTOR] = vector;
 
 	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
 			     default_dims[GOMP_DIM_GANG],