diff mbox series

[amdgcn] Scale number of threads/workers with VGPR usage

Message ID 3ffd97f1-3725-3918-ca33-f90c40d58992@codesourcery.com
State New
Headers show
Series [amdgcn] Scale number of threads/workers with VGPR usage | expand

Commit Message

Kwok Cheung Yeung Jan. 31, 2020, 1:56 p.m. UTC
The GCN architecture has 4 SIMD units per compute unit, with 256 VGPRs per SIMD
unit. OpenMP threads or OpenACC workers must be distributed across the SIMD
units, with each thread/worker fitting entirely within a single SIMD unit. VGPRs
are shared by the kernels running in a SIMD unit, so a compute unit can run 4
workers that each use up to 256 VGPRs, 8 workers that each use up to 128 VGPRs,
16 workers that each use up to 64 VGPRs, and so on.

If more threads/workers are requested than can be supported, then the runtime 
fails with the message:

libgomp: GCN fatal error: Asynchronous queue error
Runtime message: HSA_STATUS_ERROR_INVALID_ISA: The instruction set architecture 
is invalid.

This patch adds code to mkoffload such that the number of VGPRs (and SGPRs for 
good measure) requested by a kernel is reported to libgomp at runtime. When 
launching a kernel, if libgomp detects that the number of threads/workers 
exceeds what can be supported by the hardware, it automatically scales down the 
number to the maximum supported value.

This behaviour can be overridden using environment variables to set an explicit
number of threads/workers (GCN_NUM_THREADS/GCN_NUM_WORKERS), but there is not
much point IMO, as a kernel launched with more threads/workers than the hardware
can support will just fail to run.

Tested on a GCN3 accelerator: 6 tests in the libgomp testsuite newly pass and
no regressions were noted. Okay for trunk?

Kwok

     gcc/
     * config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count to
     definition of hsa_kernel_description.  Parse assembly to find SGPR and
     VGPR count of kernel and store in hsa_kernel_description.

     libgomp/
     * plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
     and vgpr_count fields.
     (struct kernel_info): Add a field for a hsa_kernel_description.
     (run_kernel): Reduce the number of threads/workers if the requested
     number would require too many VGPRs.
     (init_basic_kernel_info): Initialize description field with
     the hsa_kernel_description entry for the kernel.

Comments

Andrew Stubbs Jan. 31, 2020, 2:12 p.m. UTC | #1
On 31/01/2020 13:56, Kwok Cheung Yeung wrote:
> The GCN architecture has 4 SIMD units per compute unit, with 256 VGPRs 
> per SIMD unit. OpenMP threads or OpenACC workers must be distributed 
> across the SIMD units, with each thread/worker fitting entirely within a 
> single SIMD unit. VGPRs are shared by the kernels running in a SIMD 
> unit, so we can have 4 workers that use up to 256 VGPRs, 8 workers that 
> use up to 128 VGPRs, 16 workers that use up to 64 VGPRs and so on.
> 
> If more threads/workers are requested than can be supported, then the 
> runtime fails with the message:
> 
> libgomp: GCN fatal error: Asynchronous queue error
> Runtime message: HSA_STATUS_ERROR_INVALID_ISA: The instruction set 
> architecture is invalid.
> 
> This patch adds code to mkoffload such that the number of VGPRs (and 
> SGPRs for good measure) requested by a kernel is reported to libgomp at 
> runtime. When launching a kernel, if libgomp detects that the number of 
> threads/workers exceeds what can be supported by the hardware, it 
> automatically scales down the number to the maximum supported value.
> 
> This behaviour can be overridden using environment variables to set an 
> explicit number of threads/workers (GCN_NUM_THREADS/GCN_NUM_WORKERS), 
> but there is not much point IMO as the kernel will just fail to run.
> 
> Tested on a GCN3 accelerator with 6 new passes and no regressions noted 
> in libgomp. Okay for trunk?
> 
> Kwok
> 
>      gcc/
>      * config/gcn/mkoffload.c (process_asm): Add sgpr_count and 
> vgpr_count to
>      definition of hsa_kernel_description.  Parse assembly to find SGPR and
>      VGPR count of kernel and store in hsa_kernel_description.
> 
>      libgomp/
>      * plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
>      and vgpr_count fields.
>      (struct kernel_info): Add a field for a hsa_kernel_description.
>      (run_kernel): Reduce the number of threads/workers if the requested
>      number would require too many VGPRs.
>      (init_basic_kernel_info): Initialize description field with
>      the hsa_kernel_description entry for the kernel.
> 

OK.

Andrew
diff mbox series

Patch

diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c
index 0062f15..723da10 100644
--- a/gcc/config/gcn/mkoffload.c
+++ b/gcc/config/gcn/mkoffload.c
@@ -211,12 +211,13 @@  access_check (const char *name, int mode)
 static void
 process_asm (FILE *in, FILE *out, FILE *cfile)
 {
-  int fn_count = 0, var_count = 0, dims_count = 0;
-  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  int fn_count = 0, var_count = 0, dims_count = 0, regcount_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os, regcounts_os;
   obstack_init (&fns_os);
   obstack_init (&vars_os);
   obstack_init (&varsizes_os);
   obstack_init (&dims_os);
+  obstack_init (&regcounts_os);
 
   struct oaccdims
   {
@@ -224,13 +225,20 @@  process_asm (FILE *in, FILE *out, FILE *cfile)
     char *name;
   } dim;
 
+  struct regcount
+  {
+    int sgpr_count;
+    int vgpr_count;
+    char *kernel_name;
+  } regcount;
+
   /* Always add _init_array and _fini_array as kernels.  */
   obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
   obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
   fn_count += 2;
 
   char buf[1000];
-  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE;
   while (fgets (buf, sizeof (buf), in))
     {
       switch (state)
@@ -243,6 +251,22 @@  process_asm (FILE *in, FILE *out, FILE *cfile)
 		obstack_grow (&dims_os, &dim, sizeof (dim));
 		dims_count++;
 	      }
+	    else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n",
+			     &regcount.kernel_name) == 1)
+	      break;
+
+	    break;
+	  }
+	case IN_AMD_KERNEL_CODE_T:
+	  {
+	    gcc_assert (regcount.kernel_name);
+	    if (sscanf (buf, " wavefront_sgpr_count = %d\n",
+			&regcount.sgpr_count) == 1)
+	      break;
+	    else if (sscanf (buf, " workitem_vgpr_count = %d\n",
+			     &regcount.vgpr_count) == 1)
+	      break;
+
 	    break;
 	  }
 	case IN_VARS:
@@ -282,19 +306,36 @@  process_asm (FILE *in, FILE *out, FILE *cfile)
 	state = IN_VARS;
       else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
 	state = IN_FUNCS;
+      else if (sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0)
+	{
+	  state = IN_AMD_KERNEL_CODE_T;
+	  regcount.sgpr_count = regcount.vgpr_count = -1;
+	}
       else if (sscanf (buf, " .section %c", &dummy) > 0
 	       || sscanf (buf, " .text%c", &dummy) > 0
 	       || sscanf (buf, " .bss%c", &dummy) > 0
 	       || sscanf (buf, " .data%c", &dummy) > 0
 	       || sscanf (buf, " .ident %c", &dummy) > 0)
 	state = IN_CODE;
+      else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0)
+	{
+	  state = IN_CODE;
+	  gcc_assert (regcount.kernel_name != NULL
+		      && regcount.sgpr_count >= 0
+		      && regcount.vgpr_count >= 0);
+	  obstack_grow (&regcounts_os, &regcount, sizeof (regcount));
+	  regcount_count++;
+	  regcount.kernel_name = NULL;
+	  regcount.sgpr_count = regcount.vgpr_count = -1;
+	}
 
-      if (state == IN_CODE)
+      if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T)
 	fputs (buf, out);
     }
 
   char **fns = XOBFINISH (&fns_os, char **);
   struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+  struct regcount *regcounts = XOBFINISH (&regcounts_os, struct regcount *);
 
   fprintf (cfile, "#include <stdlib.h>\n");
   fprintf (cfile, "#include <stdbool.h>\n\n");
@@ -322,6 +363,8 @@  process_asm (FILE *in, FILE *out, FILE *cfile)
   fprintf (cfile, "static const struct hsa_kernel_description {\n"
 	   "  const char *name;\n"
 	   "  int oacc_dims[3];\n"
+	   "  int sgpr_count;\n"
+	   "  int vgpr_count;\n"
 	   "} gcn_kernels[] = {\n  ");
   dim.d[0] = dim.d[1] = dim.d[2] = 0;
   const char *comma;
@@ -329,15 +372,24 @@  process_asm (FILE *in, FILE *out, FILE *cfile)
     {
       /* Find if we recorded dimensions for this function.  */
       int *d = dim.d;		/* Previously zeroed.  */
+      int sgpr_count = 0;
+      int vgpr_count = 0;
       for (int j = 0; j < dims_count; j++)
 	if (strcmp (fns[i], dims[j].name) == 0)
 	  {
 	    d = dims[j].d;
 	    break;
 	  }
+      for (int j = 0; j < regcount_count; j++)
+	if (strcmp (fns[i], regcounts[j].kernel_name) == 0)
+	  {
+	    sgpr_count = regcounts[j].sgpr_count;
+	    vgpr_count = regcounts[j].vgpr_count;
+	    break;
+	  }
 
-      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma,
-	       fns[i], d[0], d[1], d[2]);
+      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}, %d, %d}", comma,
+	       fns[i], d[0], d[1], d[2], sgpr_count, vgpr_count);
 
       free (fns[i]);
     }
@@ -346,7 +398,10 @@  process_asm (FILE *in, FILE *out, FILE *cfile)
   obstack_free (&fns_os, NULL);
   for (i = 0; i < dims_count; i++)
     free (dims[i].name);
+  for (i = 0; i < regcount_count; i++)
+    free (regcounts[i].kernel_name);
   obstack_free (&dims_os, NULL);
+  obstack_free (&regcounts_os, NULL);
 }
 
 /* Embed an object file into a C source file.  */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 22676b4..25547ef 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -371,6 +371,8 @@  struct hsa_kernel_description
 {
   const char *name;
   int oacc_dims[3];  /* Only present for GCN kernels.  */
+  int sgpr_count;
+  int vpgr_count;
 };
 
 /* Mkoffload uses this structure to describe an offload variable.  */
@@ -478,6 +480,8 @@  struct kernel_info
   struct agent_info *agent;
   /* The specific module where the kernel takes place.  */
   struct module_info *module;
+  /* Information provided by mkoffload associated with the kernel.  */
+  struct hsa_kernel_description *description;
   /* Mutex enforcing that at most once thread ever initializes a kernel for
      use.  A thread should have locked agent->module_rwlock for reading before
      acquiring it.  */
@@ -2102,6 +2106,24 @@  run_kernel (struct kernel_info *kernel, void *vars,
 	    struct GOMP_kernel_launch_attributes *kla,
 	    struct goacc_asyncqueue *aq, bool module_locked)
 {
+  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
+	     kernel->description->vpgr_count);
+
+  /* Reduce the number of threads/workers if there are insufficient
+     VGPRs available to run the kernels together.  */
+  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
+    {
+      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
+      int max_threads = (256 / granulated_vgprs) * 4;
+      if (kla->gdims[2] > max_threads)
+	{
+	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
+		       " per team/gang - reducing to %d threads/workers.\n",
+		       kla->gdims[2], max_threads);
+	  kla->gdims[2] = max_threads;
+	}
+    }
+
   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
 	     (aq ? aq->id : 0));
   GCN_DEBUG ("GCN launch attribs: gdims:[");
@@ -2303,6 +2325,7 @@  init_basic_kernel_info (struct kernel_info *kernel,
   kernel->agent = agent;
   kernel->module = module;
   kernel->name = d->name;
+  kernel->description = d;
   if (pthread_mutex_init (&kernel->init_mutex, NULL))
     {
       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");