From 3f5524adacff23710cf1cab393a56bf23853cafa Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Wed, 21 Dec 2022 21:25:19 +0100
Subject: [PATCH] [WIP] nvptx: '-mframe-malloc-threshold',
'-Wframe-malloc-threshold'
---
gcc/config/nvptx/nvptx.cc | 102 ++++++++++++++++--
gcc/config/nvptx/nvptx.h | 3 +
gcc/config/nvptx/nvptx.opt | 12 +++
gcc/doc/invoke.texi | 16 ++-
.../nvptx/frame-malloc-threshold-1.c | 29 +++++
.../nvptx/frame-malloc-threshold-2.c | 13 +++
.../nvptx/frame-malloc-threshold-3.c | 14 +++
.../nvptx/frame-malloc-threshold-4.c | 16 +++
.../nvptx/frame-malloc-threshold-5.c | 15 +++
.../nvptx/frame-malloc-threshold-6.c | 15 +++
.../nvptx/frame-malloc-threshold-7.c | 15 +++
11 files changed, 240 insertions(+), 10 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-1.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-2.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-3.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-4.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-5.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-6.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-7.c
@@ -178,6 +178,16 @@ static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;
/* Global lock variable, needed for 128bit worker & gang reductions. */
static GTY(()) tree global_lock_var;
+/* True if any function 'has_malloc_frame'.
+ Because of 'nvptx_name_replacement', we can't just:
+ nvptx_record_fndecl (builtin_decl_explicit (BUILT_IN_FREE));
+ nvptx_record_fndecl (builtin_decl_explicit (BUILT_IN_MALLOC));
+ ..., but instead have to track them individually.
+*/
+static bool need_free_malloc_decl;
+static bool have_free_decl;
+static bool have_malloc_decl;
+
/* True if any function references __nvptx_stacks. */
static bool need_softstack_decl;
static bool have_softstack_decl;
@@ -976,6 +986,11 @@ write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
s << " GLOBAL";
s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
s << name << "\n";
+
+ if (strcmp (name, "free") == 0)
+ have_free_decl = true;
+ else if (strcmp (name, "malloc") == 0)
+ have_malloc_decl = true;
}
/* Emit a linker marker for a variable decl or defn. */
@@ -1231,22 +1246,66 @@ nvptx_maybe_record_fnsym (rtx sym)
nvptx_record_needed_fndecl (decl);
}
+//TODO
/* Emit a local array to hold some part of a conventional stack frame
and initialize REGNO to point to it. If the size is zero, it'll
never be valid to dereference, so we can simply initialize to
zero. */
static void
-init_frame (FILE *file, int regno, unsigned align, unsigned size)
+init_frame (FILE *file, int regno, int align, HOST_WIDE_INT size)
{
- if (size)
- fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
- align, reg_names[regno], size);
fprintf (file, "\t.reg.u%d %s;\n",
POINTER_SIZE, reg_names[regno]);
- fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
- : "\tmov.u%d %s, 0;\n"),
- POINTER_SIZE, reg_names[regno], reg_names[regno]);
+
+ if (regno == FRAME_POINTER_REGNUM
+ && ((unsigned HOST_WIDE_INT) size
+ >= (unsigned HOST_WIDE_INT) nvptx_frame_malloc_threshold))
+ {
+ warning_at (DECL_SOURCE_LOCATION (current_function_decl),
+ OPT_Wframe_malloc_threshold,
+ "using %<malloc%> for frame with size of %wu bytes", size);
+
+ /* <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#dynamic-global-memory-allocation-and-operations>
+ (2022-12-21, v12.0) states that in addition to the "in-kernel
+ 'malloc()' function" there also exists an "in-kernel
+ '__nv_aligned_device_malloc()' function", where "the address of the
+ allocated memory will be a multiple of 'align'". However that's not
+ documented on
+ <https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/#system-calls>
+ (2022-12-21, v12.0), so we shall not use that function. */
+ /* <https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/#system-calls>
+ (2022-12-21, v12.0) does not, but
+ <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#dynamic-global-memory-allocation-and-operations>
+ (2022-12-21, v12.0) does state that the pointer returned by
+ "in-kernel 'malloc()' [...] is guaranteed to be aligned to a
+ 16-byte boundary". */
+ if (align > 16)
+ sorry ("unfulfilled %d bytes alignment for frame", align);
+
+ /* We don't need to support 'realloc', so instead of newlib 'malloc'
+ directly use the PTX 'malloc'. */
+ fprintf (file,
+ "\t{\n"
+ "\t .param .u64 %%ptr;\n"
+ "\t .param .u64 %%size;\n"
+ "\t st.param.u64 [%%size], " HOST_WIDE_INT_PRINT_DEC ";\n"
+ "\t call (%%ptr), malloc, (%%size);\n"
+ "\t ld.param.u64 %s, [%%ptr];\n"
+ "\t}\n",
+ size, reg_names[regno]);
+ cfun->machine->has_malloc_frame = true;
+ need_free_malloc_decl = true;
+ }
+ else
+ {
+ if (size)
+ fprintf (file, "\t.local .align %d .b8 %s_ar[" HOST_WIDE_INT_PRINT_DEC "];\n",
+ align, reg_names[regno], size);
+ fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
+ : "\tmov.u%d %s, 0;\n"),
+ POINTER_SIZE, reg_names[regno], reg_names[regno]);
+ }
}
/* Emit soft stack frame setup sequence. */
@@ -1744,12 +1803,22 @@ nvptx_output_set_softstack (unsigned src_regno)
}
return "";
}
+
/* Output a return instruction. Also copy the return value to its outgoing
location. */
const char *
nvptx_output_return (void)
{
+ if (cfun->machine->has_malloc_frame)
+ fprintf (asm_out_file,
+ "\t{\n"
+ "\t .param .u64 %%ptr;\n"
+ "\t st.param.u64 [%%ptr], %s;\n"
+ "\t call free, (%%ptr);\n"
+ "\t}\n",
+ reg_names[FRAME_POINTER_REGNUM]);
+
machine_mode mode = (machine_mode)cfun->machine->return_mode;
if (mode != VOIDmode)
@@ -4470,8 +4539,8 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
rtx_code_label *label = NULL;
empty = false;
- /* The frame size might not be DImode compatible, but the frame
- array's declaration will be. So it's ok to round up here. */
+ /* The frame size might not be DImode-compatible, but the actual frame
+ allocated by 'init_frame' will be. So it's ok to round up here. */
fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
/* Detect single iteration loop. */
if (fs == 1)
@@ -5989,6 +6058,21 @@ write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
static void
nvptx_file_end (void)
{
+ if (need_free_malloc_decl)
+ {
+ if (!have_free_decl)
+ {
+ write_fn_marker (func_decls, false, true, "free");
+ func_decls << ".extern .func free (.param .b64 %ptr);\n";
+ }
+ if (!have_malloc_decl)
+ {
+ write_fn_marker (func_decls, false, true, "malloc");
+ func_decls
+ << ".extern .func (.param .b64 %ptr) malloc (.param .b64 %size);\n";
+ }
+ }
+
hash_table<tree_hasher>::iterator iter;
tree decl;
FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
@@ -214,6 +214,8 @@ struct nvptx_args {
#define TRAMPOLINE_SIZE 32
#define TRAMPOLINE_ALIGNMENT 256
+
+#define NVPTX_FRAME_MALLOC_THRESHOLD_INIT 257
/* We don't run reload, so this isn't actually used, but it still needs to be
defined. Showing an argp->fp elimination also stops
@@ -244,6 +246,7 @@ struct GTY(()) machine_function
bool is_varadic; /* This call is varadic */
bool has_varadic; /* Current function has a varadic call. */
bool has_chain; /* Current function has outgoing static chain. */
+ bool has_malloc_frame;
bool has_softstack; /* Current function has a soft stack frame. */
bool has_simtreg; /* Current function has an OpenMP SIMD region. */
int num_args; /* Number of args of current call. */
@@ -28,6 +28,18 @@ Target RejectNegative Mask(ABI64)
Ignored, but preserved for backward compatibility. Only 64-bit ABI is
supported.
+mframe-malloc-threshold=
+Target Joined RejectNegative Host_Wide_Int ByteSize Var(nvptx_frame_malloc_threshold) Init(NVPTX_FRAME_MALLOC_THRESHOLD_INIT)
+-mframe-malloc-threshold=<byte-size> When the frame size reaches <byte-size>, frame allocation switches from '.local' memory to 'malloc'.
+
+mno-frame-malloc-threshold
+Target Alias(mframe-malloc-threshold=,18446744073709551615EiB,none)
+Always use '.local' memory for frame allocation. Equivalent to -mframe-malloc-threshold=<SIZE_MAX> or larger.
+
+Wframe-malloc-threshold
+Target Warning
+Warn when the threshold is reached where frame allocation switches from '.local' memory to 'malloc'.
+
mmainkernel
Target RejectNegative
Link in code for a __main kernel.
@@ -1179,7 +1179,9 @@ Objective-C and Objective-C++ Dialects}.
-march=@var{arch} -mbmx -mno-bmx -mcdx -mno-cdx}
@emph{Nvidia PTX Options}
-@gccoptlist{-m64 -mmainkernel -moptimize}
+@gccoptlist{-m64 @gol
+-mframe-malloc-threshold=@var{byte-size} @gol
+-mmainkernel -moptimize}
@emph{OpenRISC Options}
@gccoptlist{-mboard=@var{name} -mnewlib -mhard-mul -mhard-div @gol
@@ -28367,6 +28369,18 @@ This option sets the values of the preprocessor macros
for instance, for @samp{3.1} the macros have the values @samp{3} and
@samp{1}, respectively.
+@item -mframe-malloc-threshold=@var{byte-size}
+@opindex mframe-malloc-threshold=
+@opindex mno-frame-malloc-threshold
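+When the frame size of a function reaches @var{byte-size}, allocate the frame via PTX @samp{malloc} (released via @samp{free} on return) instead of in @samp{.local} memory.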
+
+This is not relevant if @code{-msoft-stack} is enabled.
+
+@option{-mframe-malloc-threshold=257} is enabled by default.
+This may be disabled either by specifying
+@var{byte-size} of @samp{SIZE_MAX} or more or by
+@option{-mno-frame-malloc-threshold}.
+
@item -mmainkernel
@opindex mmainkernel
Link in code for a __main kernel. This is for stand-alone instead of
new file mode 100644
@@ -0,0 +1,29 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+/* { dg-additional-options -Wframe-malloc-threshold } */
+
+/* PTX-provided 'free', 'malloc'; cf. 'nvptx_name_replacement'. */
+void ptx_free (void *) __asm__ ("free");
+void *ptx_malloc (__SIZE_TYPE__) __asm__ ("malloc");
+
+int f (void)
+/* { dg-warning {using 'malloc' for frame with size of [0-9]+ bytes} {} { target *-*-* } .-1 } */
+{
+ char a[1234];
+
+ ptx_malloc (5);
+
+ ptx_free (ptx_malloc (1));
+}
+
+/* We exceed the default '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-not {%frame_ar} } }
+ { dg-final { scan-assembler-times {(?n)call free,.*;} 2 } }
+ { dg-final { scan-assembler-times {(?n)call .*, malloc, .*;} 3 } }
+*/
+
+/* Of the implicit (via 'need_free_malloc_decl') and explicit declarations of
+ 'free', 'malloc', only one is emitted each:
+ { dg-final { scan-assembler-times {(?n)\.extern .* free .*;} 1 } }
+ { dg-final { scan-assembler-times {(?n)\.extern .* malloc .*;} 1 } }
+*/
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+
+int f (void)
+{
+ char a[1234];
+}
+
+/* We exceed the default '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-not {%frame_ar} } }
+ { dg-final { scan-assembler-times {(?n)call free,.*;} 1 } }
+ { dg-final { scan-assembler-times {(?n)call .*, malloc, .*;} 1 } }
+*/
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+/* { dg-additional-options -Wframe-malloc-threshold } */
+
+int f (void)
+{
+ char a[256];
+}
+
+/* We don't exceed the default '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } }
+ { dg-final { scan-assembler-not {free} } }
+ { dg-final { scan-assembler-not {malloc} } }
+*/
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+/* { dg-additional-options -mframe-malloc-threshold=32 } */
+/* { dg-additional-options -Wframe-malloc-threshold } */
+
+int f (void)
+/* { dg-warning {using 'malloc' for frame with size of [0-9]+ bytes} {} { target *-*-* } .-1 } */
+{
+ char a[32];
+}
+
+/* We reach the specified '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-not {%frame_ar} } }
+ { dg-final { scan-assembler-times {(?n)call free,.*;} 1 } }
+ { dg-final { scan-assembler-times {(?n)call .*, malloc, .*;} 1 } }
+*/
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+/* { dg-additional-options -mframe-malloc-threshold=1249 } */
+/* { dg-additional-options -Wframe-malloc-threshold } */
+
+int f (void)
+{
+ char a[1234];
+}
+
+/* We don't exceed the specified '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } }
+ { dg-final { scan-assembler-not {free} } }
+ { dg-final { scan-assembler-not {malloc} } }
+*/
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+/* { dg-additional-options -mframe-malloc-threshold=2KiB } */
+/* { dg-additional-options -Wframe-malloc-threshold } */
+
+int f (void)
+{
+ char a[1234];
+}
+
+/* We don't exceed the specified '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } }
+ { dg-final { scan-assembler-not {free} } }
+ { dg-final { scan-assembler-not {malloc} } }
+*/
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+/* { dg-options {-save-temps -O0} } */
+/* { dg-additional-options -mno-frame-malloc-threshold } */
+/* { dg-additional-options -Wframe-malloc-threshold } */
+
+int f (void)
+{
+ char a[1234];
+}
+
+/* We'll never exceed the specified unlimited '-mframe-malloc-threshold=[...]'.
+ { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } }
+ { dg-final { scan-assembler-not {free} } }
+ { dg-final { scan-assembler-not {malloc} } }
+*/
--
2.35.1