[libgomp,nvptx] Move device property sampling from nvptx_exec to nvptx_open

Message ID 763d7205-08d8-75ea-2bdc-e0bbba3b0d64@suse.de
State New
Series [libgomp,nvptx] Move device property sampling from nvptx_exec to nvptx_open

Commit Message

Tom de Vries July 26, 2018, 12:13 p.m. UTC
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 89326e57741..5022e462a3d 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -1120,6 +1126,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
>    void *hp, *dp;
>    struct nvptx_thread *nvthd = nvptx_thread ();
>    const char *maybe_abort_msg = "(perhaps abort was called)";
> +  int dev_size = nvthd->ptx_dev->num_sms;
>  
>    function = targ_fn->fn;
>  
> @@ -1150,23 +1156,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
>  	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
>  	    default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
>  
> -	  int warp_size, block_size, dev_size, cpu_size;
> +	  int warp_size, block_size, cpu_size;
>  	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
>  	  /* 32 is the default for known hardware.  */
>  	  int gang = 0, worker = 32, vector = 32;
> -	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
> +	  CUdevice_attribute cu_tpb, cu_ws, cu_tpm;
>  
>  	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
>  	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
> -	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
>  	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
>  
>  	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
>  				 dev) == CUDA_SUCCESS
>  	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
>  				    dev) == CUDA_SUCCESS
> -	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
> -				    dev) == CUDA_SUCCESS
>  	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
>  				    dev) == CUDA_SUCCESS)
>  	    {

This is a good idea (and should have been an independent patch of course).

Furthermore, it's better to move the remaining cuDeviceGetAttribute
calls to nvptx_open, as was already suggested by Thomas here (
https://gcc.gnu.org/ml/gcc-patches/2017-02/msg01020.html ).
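
To make that concrete, here is a minimal sketch of the idea: sample the
remaining attributes once at open time and cache them per device, so the
launch path needs no driver calls.  The struct and function names below
are placeholders for illustration, not the plugin's actual identifiers;
the patch below wires the same attributes into struct ptx_device and
nvptx_open_device instead.

#include <stdbool.h>
#include <cuda.h>

struct dev_limits
{
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
};

/* Query the device limits once; on success the caller can keep reading
   the cached values instead of calling back into the CUDA driver.  */
static bool
sample_dev_limits (CUdevice dev, struct dev_limits *l)
{
  return (cuDeviceGetAttribute (&l->warp_size,
                                CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)
          == CUDA_SUCCESS
          && cuDeviceGetAttribute (&l->max_threads_per_block,
                                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                   dev) == CUDA_SUCCESS
          && cuDeviceGetAttribute (&l->max_threads_per_multiprocessor,
                                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                                   dev) == CUDA_SUCCESS);
}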

Committed to trunk.

- Tom

Patch

[libgomp, nvptx] Move device property sampling from nvptx_exec to nvptx_open

Move sampling of device properties from nvptx_exec to nvptx_open, and assume
the sampling always succeeds.  This simplifies the default dimension
initialization code in nvptx_exec.
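
For concreteness, a worked example of the resulting default dimensions.
The device limits below are made-up sample values for illustration, not
measurements from any particular GPU:

  /* Illustrative sample values only.  */
  int warp_size = 32;    /* CU_DEVICE_ATTRIBUTE_WARP_SIZE */
  int block_size = 1024; /* CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK */
  int cpu_size = 2048;   /* CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR */
  int dev_size = 20;     /* num_sms */

  int gang = (cpu_size / block_size) * dev_size;  /* (2048 / 1024) * 20 = 40 */
  int worker = block_size / warp_size;            /* 1024 / 32 = 32 */
  int vector = warp_size;                          /* 32 */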

2018-07-26  Cesar Philippidis  <cesar@codesourcery.com>
	    Tom de Vries  <tdevries@suse.de>

	* plugin/plugin-nvptx.c (struct ptx_device): Add warp_size,
	max_threads_per_block and max_threads_per_multiprocessor fields.
	(nvptx_open_device): Initialize new fields.
	(nvptx_exec): Use num_sms, and new fields.

---
 libgomp/plugin/plugin-nvptx.c | 53 +++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 89326e57741..5d9b5151e95 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -414,6 +414,9 @@  struct ptx_device
   int num_sms;
   int regs_per_block;
   int regs_per_sm;
+  int warp_size;
+  int max_threads_per_block;
+  int max_threads_per_multiprocessor;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -800,6 +803,15 @@  nvptx_open_device (int n)
       GOMP_PLUGIN_error ("Only warp size 32 is supported");
       return NULL;
     }
+  ptx_dev->warp_size = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+  ptx_dev->max_threads_per_block = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+  ptx_dev->max_threads_per_multiprocessor = pi;
 
   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
@@ -1150,33 +1162,20 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
 	    default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
 
-	  int warp_size, block_size, dev_size, cpu_size;
-	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
-	  /* 32 is the default for known hardware.  */
-	  int gang = 0, worker = 32, vector = 32;
-	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
-
-	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
-	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
-	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
-
-	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
-				 dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
-				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
-				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
-				    dev) == CUDA_SUCCESS)
-	    {
-	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
-				 " dev_size=%d, cpu_size=%d\n",
-				 warp_size, block_size, dev_size, cpu_size);
-	      gang = (cpu_size / block_size) * dev_size;
-	      worker = block_size / warp_size;
-	      vector = warp_size;
-	    }
+	  int gang, worker, vector;
+	  {
+	    int warp_size = nvthd->ptx_dev->warp_size;
+	    int block_size = nvthd->ptx_dev->max_threads_per_block;
+	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
+	    int dev_size = nvthd->ptx_dev->num_sms;
+	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+			       " dev_size=%d, cpu_size=%d\n",
+			       warp_size, block_size, dev_size, cpu_size);
+
+	    gang = (cpu_size / block_size) * dev_size;
+	    worker = block_size / warp_size;
+	    vector = warp_size;
+	  }
 
 	  /* There is no upper bound on the gang size.  The best size
 	     matches the hardware configuration.  Logical gangs are