diff mbox series

[3/5,amdgcn] Restrict register usage in non-kernel functions

Message ID df45c108-a44e-3741-fee9-72103bb5bbec@codesourcery.com
State New
Headers show
Series Reduce register usage on AMD GCN | expand

Commit Message

Kwok Cheung Yeung Nov. 14, 2019, 3:32 p.m. UTC
This patch restricts non-kernel functions to using a maximum of 64 SGPRs 
and 24 VGPRs.

Kernels can request various pieces of information from the HSA runtime, 
and these will be loaded into the registers consecutively before the 
kernel executes. These registers are normally fixed. Since non-kernel 
functions cannot make these requests, they have to assume that the 
default set of information has been requested. If a non-leaf kernel 
requests information not in the defaults, a warning is now emitted as 
pieces of info needed by callees may have shifted locations. A leaf 
kernel can do whatever it wants.

I have setup FIXED_REGISTERS for the default case now - if a different 
set of startup info is requested (which should be rare), then the set of 
fixed registers will be adjusted accordingly by 
gcn_conditional_register_usage. Compared to before, v0, s2 and s3 are 
now unfixed (due to the newlib patch 'Stash reent marker in upper bits 
of s1 on AMD GCN' and the first patch in this series).

Okay to commit?

Kwok


2019-11-14  Kwok Cheung Yeung  <kcy@codesourcery.com>

	gcc/
	* config/gcn/gcn.c (default_requested_args): New.
	(gcn_parse_amdgpu_hsa_kernel_attribute): Initialize requested args
	set with default_requested_args.
	(gcn_conditional_register_usage): Limit register usage of non-kernel
	functions.  Reassign fixed registers if a non-standard set of args is
	requested.
	* config/gcn/gcn.h (FIXED_REGISTERS): Fix registers according to ABI.
---
  gcc/config/gcn/gcn.c | 63 
++++++++++++++++++++++++++++++----------------------
  gcc/config/gcn/gcn.h |  6 ++---
  2 files changed, 39 insertions(+), 30 deletions(-)

Comments

Andrew Stubbs Nov. 15, 2019, 11:24 a.m. UTC | #1
On 14/11/2019 15:32, Kwok Cheung Yeung wrote:
> This patch restricts non-kernel functions to using a maximum of 64 SGPRs 
> and 24 VGPRs.
> 
> Kernels can request various pieces of information from the HSA runtime, 
> and these will be loaded into the registers consecutively before the 
> kernel executes. These registers are normally fixed. Since non-kernel 
> functions cannot make these requests, they have to assume that the 
> default set of information has been requested. If a non-leaf kernel 
> requests information not in the defaults, a warning is now emitted as 
> pieces of info needed by callees may have shifted locations. A leaf 
> kernel can do whatever it wants.
> 
> I have setup FIXED_REGISTERS for the default case now - if a different 
> set of startup info is requested (which should be rare), then the set of 
> fixed registers will be adjusted accordingly by 
> gcn_conditional_register_usage. Compared to before, v0, s2 and s3 are 
> now unfixed (due to the newlib patch 'Stash reent marker in upper bits 
> of s1 on AMD GCN' and the first patch in this series).
> 
> Okay to commit?

I think this is fine, but please check the white-space and formatting.

There's something odd going on. Probably the patch got mangled by 
Thunderbird. Please post future patches as attachments. I believe the 
GCC guidelines request using the "inline" content disposition setting 
you can only find in the TB config editor, but any kind of attachment is 
OK with me.

Andrew
diff mbox series

Patch

diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 09dfabb..8a2f7d7 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -191,6 +191,17 @@  static const struct gcn_kernel_arg_type
    {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
  };

+static const long default_requested_args
+	= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
+	  | (1 << DISPATCH_PTR_ARG)
+	  | (1 << QUEUE_PTR_ARG)
+	  | (1 << KERNARG_SEGMENT_PTR_ARG)
+	  | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
+	  | (1 << WORKGROUP_ID_X_ARG)
+	  | (1 << WORK_ITEM_ID_X_ARG)
+	  | (1 << WORK_ITEM_ID_Y_ARG)
+	  | (1 << WORK_ITEM_ID_Z_ARG);
+
  /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
     This function also sets the default values for some arguments.

@@ -201,10 +212,7 @@  gcn_parse_amdgpu_hsa_kernel_attribute (struct 
gcn_kernel_args *args,
  				       tree list)
  {
    bool err = false;
-  args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG)
-		     | (1 << QUEUE_PTR_ARG)
-		     | (1 << KERNARG_SEGMENT_PTR_ARG)
-		     | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG));
+  args->requested = default_requested_args;
    args->nargs = 0;

    for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
@@ -242,8 +250,6 @@  gcn_parse_amdgpu_hsa_kernel_attribute (struct 
gcn_kernel_args *args,
        args->requested |= (1 << a);
        args->order[args->nargs++] = a;
      }
-  args->requested |= (1 << WORKGROUP_ID_X_ARG);
-  args->requested |= (1 << WORK_ITEM_ID_Z_ARG);

    /* Requesting WORK_ITEM_ID_Z_ARG implies requesting 
WORK_ITEM_ID_X_ARG and
       WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
@@ -253,10 +259,6 @@  gcn_parse_amdgpu_hsa_kernel_attribute (struct 
gcn_kernel_args *args,
    if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
      args->requested |= (1 << WORK_ITEM_ID_X_ARG);

-  /* Always enable this so that kernargs is in a predictable place for
-     gomp_print, etc.  */
-  args->requested |= (1 << DISPATCH_PTR_ARG);
-
    int sgpr_regno = FIRST_SGPR_REG;
    args->nsgprs = 0;
    for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
@@ -2041,27 +2043,34 @@  gcn_secondary_reload (bool in_p, rtx x, 
reg_class_t rclass,
  static void
  gcn_conditional_register_usage (void)
  {
-  int i;
+  if (!cfun || !cfun->machine)
+    return;

-  /* FIXME: Do we need to reset fixed_regs?  */
+  if (cfun->machine->normal_function)
+    {
+      /* Restrict the set of SGPRs and VGPRs used by non-kernel 
functions.  */
+      for (int i = SGPR_REGNO (62); i <= LAST_SGPR_REG; i++)
+	fixed_regs[i] = 1, call_used_regs[i] = 1;

-/* Limit ourselves to 1/16 the register file for maximimum sized 
workgroups.
-   There are enough SGPRs not to limit those.
-   TODO: Adjust this more dynamically.  */
-  for (i = FIRST_VGPR_REG + 64; i <= LAST_VGPR_REG; i++)
-    fixed_regs[i] = 1, call_used_regs[i] = 1;
+      for (int i = VGPR_REGNO (24); i <= LAST_VGPR_REG; i++)
+	fixed_regs[i] = 1, call_used_regs[i] = 1;

-  if (!cfun || !cfun->machine || cfun->machine->normal_function)
-    {
-      /* Normal functions can't know what kernel argument registers are
-         live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs.  */
-      for (i = 0; i < 16; i++)
-	fixed_regs[FIRST_SGPR_REG + i] = 1;
-      for (i = 0; i < 3; i++)
-	fixed_regs[FIRST_VGPR_REG + i] = 1;
        return;
      }

+  /* If the set of requested args is the default set, nothing more needs to
+     be done.  */
+  if (cfun->machine->args.requested == default_requested_args)
+    return;
+
+  /* Requesting a set of args different from the default violates the 
ABI.  */
+  if (!leaf_function_p ())
+    warning (0, "A non-default set of initial values has been requested, "
+		"which violates the ABI!");
+
+  for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
+    fixed_regs[i] = 0;
+
    /* Fix the runtime argument register containing values that may be
       needed later.  DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
       needed after the prologue so there's no need to fix them.  */
@@ -2069,10 +2078,10 @@  gcn_conditional_register_usage (void)

fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
    if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
      {
+      /* The upper 32-bits of the 64-bit descriptor are not used, so allow
+	the containing registers to be used for other purposes.  */
        fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
        fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 
1] = 1;
-      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 
2] = 1;
-      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 
3] = 1;
      }
    if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
      {
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h
index b3b2d1a..dd3789b 100644
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -160,9 +160,9 @@ 
  
  #define FIXED_REGISTERS {			    \
      /* Scalars.  */				    \
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		    \
+    1, 1, 0, 0, 1, 1, 0, 0, 1, 1,		    \
  /*		fp    sp    lr.  */		    \
-    0, 0, 0, 0, 1, 1, 1, 1, 0, 0,		    \
+    1, 1, 0, 0, 1, 1, 1, 1, 0, 0,		    \
  /*  exec_save, cc_save */			    \
      1, 1, 1, 1, 0, 0, 0, 0, 0, 0,		    \
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		    \
@@ -180,7 +180,7 @@ 
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1,		    \
      /* VGRPs */					    \
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+    0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \