@@ -142,6 +142,9 @@ static GTY(()) tree global_lock_var;
/* True if any function references __nvptx_stacks. */
static bool need_softstack_decl;
+/* True if any function references __nvptx_uni. */
+static bool need_unisimt_decl;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -1049,6 +1052,34 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
fprintf (file, "\t}\n");
}
+/* Emit code to initialize predicate and master lane index registers for
+ -muniform-simt code generation variant. */
+
+static void
+nvptx_init_unisimt_predicate (FILE *file)
+{
+ int bits = POINTER_SIZE;
+ int master = REGNO (cfun->machine->unisimt_master);
+ int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
+ fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
+ fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
+ fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
+ bits == 64 ? ".wide" : ".lo");
+ fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
+ fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
+ fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
+ /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */
+ fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+ /* Compute predicate as 'tid.x == master'. */
+ fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+ fprintf (file, "\t}\n");
+ need_unisimt_decl = true;
+}
+
/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
function, including local var decls and copies from the arguments to
local regs. */
@@ -1138,6 +1169,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (cfun->machine->axis_predicate[1])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[1]), "x");
+ if (cfun->machine->unisimt_predicate)
+ nvptx_init_unisimt_predicate (file);
}
/* Output instruction that sets soft stack pointer in shared memory to the
@@ -2401,6 +2434,89 @@ nvptx_reorg_subreg (void)
}
}
+/* Return a SImode "master lane index" register for uniform-simt, allocating on
+ first use. */
+
+static rtx
+nvptx_get_unisimt_master ()
+{
+ rtx &master = cfun->machine->unisimt_master;
+ return master ? master : master = gen_reg_rtx (SImode);
+}
+
+/* Return a BImode "predicate" register for uniform-simt, similar to above. */
+
+static rtx
+nvptx_get_unisimt_predicate ()
+{
+ rtx &pred = cfun->machine->unisimt_predicate;
+ return pred ? pred : pred = gen_reg_rtx (BImode);
+}
+
+/* Return true if given call insn references one of the functions provided by
+ the CUDA runtime: malloc, free, vprintf. */
+
+static bool
+nvptx_call_insn_is_syscall_p (rtx_insn *insn)
+{
+ rtx pat = PATTERN (insn);
+ gcc_checking_assert (GET_CODE (pat) == PARALLEL);
+ pat = XVECEXP (pat, 0, 0);
+ if (GET_CODE (pat) == SET)
+ pat = SET_SRC (pat);
+ gcc_checking_assert (GET_CODE (pat) == CALL
+ && GET_CODE (XEXP (pat, 0)) == MEM);
+ rtx addr = XEXP (XEXP (pat, 0), 0);
+ if (GET_CODE (addr) != SYMBOL_REF)
+ return false;
+ const char *name = XSTR (addr, 0);
+ /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
+ references with forced assembler name refer to PTX syscalls. For vprintf,
+ accept both normal and forced-assembler-name references. */
+ return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
+ || !strcmp (name, "*malloc")
+ || !strcmp (name, "*free"));
+}
+
+/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
+ propagate its value from lane MASTER to current lane. */
+
+static void
+nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
+{
+ rtx reg;
+ if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
+ emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
+}
+
+/* Adjust code for uniform-simt code generation variant by making atomics and
+ "syscalls" conditionally executed, and inserting shuffle-based propagation
+ for registers being set. */
+
+static void
+nvptx_reorg_uniform_simt ()
+{
+ rtx_insn *insn, *next;
+
+ for (insn = get_insns (); insn; insn = next)
+ {
+ next = NEXT_INSN (insn);
+ if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
+ && !(NONJUMP_INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) == PARALLEL
+ && get_attr_atomic (insn)))
+ continue;
+ rtx pat = PATTERN (insn);
+ rtx master = nvptx_get_unisimt_master ();
+ for (int i = 0; i < XVECLEN (pat, 0); i++)
+ nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
+ rtx pred = nvptx_get_unisimt_predicate ();
+ pred = gen_rtx_NE (BImode, pred, const0_rtx);
+ pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
+ validate_change (insn, &PATTERN (insn), pat, false);
+ }
+}
+
/* Loop structure of the function. The entire function is described as
a NULL loop. */
@@ -3922,6 +4038,9 @@ nvptx_reorg (void)
/* Replace subregs. */
nvptx_reorg_subreg ();
+ if (TARGET_UNIFORM_SIMT)
+ nvptx_reorg_uniform_simt ();
+
regstat_free_n_sets_and_refs ();
df_finish_pass (true);
@@ -4118,6 +4237,11 @@ nvptx_file_end (void)
fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
POINTER_SIZE);
}
+ if (need_unisimt_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+ fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+ }
}
/* Expander for the shuffle builtins. */
@@ -33,6 +33,8 @@
builtin_define ("__nvptx__"); \
if (TARGET_SOFT_STACK) \
builtin_define ("__nvptx_softstack__"); \
+ if (TARGET_UNIFORM_SIMT) \
+ builtin_define ("__nvptx_unisimt__"); \
} while (0)
/* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
@@ -215,6 +217,8 @@ struct GTY(()) machine_function
int return_mode; /* Return mode of current fn.
(machine_mode not defined yet.) */
rtx axis_predicate[2]; /* Neutering predicates. */
+ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
+ rtx unisimt_predicate; /* Predicate for -muniform-simt. */
};
#endif
@@ -62,6 +62,9 @@
(define_attr "subregs_ok" "false,true"
(const_string "false"))
+(define_attr "atomic" "false,true"
+ (const_string "false"))
+
;; The nvptx operand predicates, in general, don't permit subregs and
;; only literal constants, which differ from the generic ones, which
;; permit subregs and symbolc constants (as appropriate)
@@ -1215,7 +1218,8 @@
(set (match_dup 1)
(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
""
- "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;")
+ "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_exchange<mode>"
[(set (match_operand:SDIM 0 "nvptx_register_operand" "=R") ;; output
@@ -1226,7 +1230,8 @@
(set (match_dup 1)
(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
""
- "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;")
+ "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_fetch_add<mode>"
[(set (match_operand:SDIM 1 "memory_operand" "+m")
@@ -1238,7 +1243,8 @@
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
(match_dup 1))]
""
- "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+ "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "atomic_fetch_addsf"
[(set (match_operand:SF 1 "memory_operand" "+m")
@@ -1250,7 +1256,8 @@
(set (match_operand:SF 0 "nvptx_register_operand" "=R")
(match_dup 1))]
""
- "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+ "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_code_iterator any_logic [and ior xor])
(define_code_attr logic [(and "and") (ior "or") (xor "xor")])
@@ -1266,7 +1273,8 @@
(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
(match_dup 1))]
"0"
- "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+ "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
+ [(set_attr "atomic" "true")])
(define_insn "nvptx_barsync"
[(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
@@ -36,3 +36,7 @@ Optimize partition neutering.
msoft-stack
Target Report Mask(SOFT_STACK)
Use custom stacks instead of local memory for automatic storage.
+
+muniform-simt
+Target Report Mask(UNIFORM_SIMT)
+Generate code that can keep local state uniform across all lanes.
@@ -19621,6 +19621,16 @@ for OpenMP offloading, but the option is exposed on its own for the purpose
of testing the compiler; to generate code suitable for linking into programs
using OpenMP offloading, use option @option{-mgomp}.
+@item -muniform-simt
+@opindex muniform-simt
+Generate code that allows to keep all lanes in each warp active, even when
+observable effects from execution should appear as if only one lane was
+active. This is achieved by instrumenting syscalls and atomic instructions in
+a lightweight way that allows to switch behavior at runtime. This code
+generation variant is used for OpenMP offloading, but the option is exposed on
+its own for the purpose of testing the compiler; to generate code suitable for
+linking into programs using OpenMP offloading, use option @option{-mgomp}.
+
@end table
@node PDP-11 Options
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-options "-O2 -muniform-simt" } */
+/* { dg-do run } */
+
+#include <stdarg.h>
+
+static __attribute__((noinline,noclone)) int print (const char *fmt, ...)
+{
+ va_list va;
+ va_start (va, fmt);
+ int r = __builtin_vprintf (fmt, va);
+ va_end (va);
+ return r;
+}
+
+int main()
+{
+ static int v;
+ __sync_fetch_and_add (&v, 1);
+ if (print ("%d", v) != 1) /* { dg-output "^1$" } */
+ __builtin_abort ();
+ return 0;
+}
@@ -38,6 +38,10 @@ __main (int *rval_ptr, int argc, void **argv)
static char stack[131072] __attribute__((aligned(8)));
__nvptx_stacks[0] = stack + sizeof stack;
#endif
+#ifdef __nvptx_unisimt__
+ extern unsigned __nvptx_uni[32] __attribute__((shared));
+ __nvptx_uni[0] = 0;
+#endif
exit (main (argc, argv));
}
@@ -1,4 +1,4 @@
-/* Define shared memory array for -msoft-stack.
+/* Define shared memory arrays for -msoft-stack and -muniform-simt.
Copyright (C) 2015-2016 Free Software Foundation, Inc.
@@ -22,3 +22,4 @@
<http://www.gnu.org/licenses/>. */
char *__nvptx_stacks[32] __attribute__((shared,nocommon));
+unsigned __nvptx_uni[32] __attribute__((shared,nocommon));