diff mbox

[gomp4] New nvptx pattern and internal builtin

Message ID 55A92142.6050008@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt July 17, 2015, 3:37 p.m. UTC
I've made this change at the request of Cesar who says it's needed for 
his reductions work. It makes a new instruction to represent shfl.down, 
a thread communication instruction, and some builtin functions for 
internal use to access it.

Committed on gomp-4_0-branch.


Bernd
diff mbox

Patch

Index: gcc/ChangeLog.gomp
===================================================================
--- gcc/ChangeLog.gomp	(revision 225936)
+++ gcc/ChangeLog.gomp	(working copy)
@@ -1,3 +1,17 @@ 
+2015-07-17  Bernd Schmidt  <bernds@codesourcery.com>
+
+	* config/nvptx/nvptx.c (enum nvptx_builtins,
+	struct builtin_description): New.
+	(nvptx_builtin_decls, bdesc_2arg): New static variables.
+	(def_builtin): New macro.
+	(nvptx_builtin_decl, nvptx_init_builtins, nvptx_expand_binop_builtin,
+	nvptx_expand_builtin): New static functions.
+	(TARGET_INIT_BUILTINS, TARGET_EXPAND_BUILTIN, TARGET_BUILTIN_DECL):
+	Define.
+	* config/nvptx/nvptx.md (UNSPECV_SHFL_DOWN): New constant.
+	(thread_shuffle_down<mode>): New pattern.
+	(thread_shuffle_downdi): New expander.
+
 2015-07-17  Julian Brown  <julian@codesourcery.com>
 
 	* gimplify.c (gimplify_scan_omp_clauses): Handle
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 225936)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3058,6 +3058,139 @@  nvptx_file_end (void)
     }
 }
 
+/* Codes for all the NVPTX builtins.  */
+enum nvptx_builtins
+{
+  NVPTX_BUILTIN_SHUFFLE_DOWN,	/* __builtin_nvptx_shuffle_down.  */
+  NVPTX_BUILTIN_SHUFFLE_DOWNF,	/* __builtin_nvptx_shuffle_downf.  */
+  NVPTX_BUILTIN_SHUFFLE_DOWNLL,	/* __builtin_nvptx_shuffle_downll.  */
+
+  NVPTX_BUILTIN_MAX		/* Number of codes; sizes the decl table.  */
+};
+
+/* Decl of each builtin, indexed by its nvptx_builtins code.  */
+static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
+
+/* Return the decl for builtin CODE, or error_mark_node if out of range.  */
+static tree
+nvptx_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
+{
+  bool known_p = code < NVPTX_BUILTIN_MAX;
+
+  /* Only index the table with a code that lies inside it.  */
+  return known_p ? nvptx_builtin_decls[code] : error_mark_node;
+}
+
+/* Register builtin NAME of type TYPE and record its decl under CODE.  */
+#define def_builtin(NAME, TYPE, CODE)					\
+do {									\
+  nvptx_builtin_decls[CODE]						\
+    = add_builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD,	\
+			    NULL, NULL_TREE);				\
+} while (0)
+
+/* Create the decls for the shuffle-down builtins of this target.  */
+static void
+nvptx_init_builtins (void)
+{
+  tree unsigned_fn, ull_fn, float_fn;
+  /* Each builtin takes a value and an int and returns the value's type.  */
+  unsigned_fn = build_function_type_list (unsigned_type_node,
+					  unsigned_type_node,
+					  integer_type_node, NULL_TREE);
+  ull_fn = build_function_type_list (long_long_unsigned_type_node,
+				     long_long_unsigned_type_node,
+				     integer_type_node, NULL_TREE);
+  float_fn = build_function_type_list (float_type_node, float_type_node,
+				       integer_type_node, NULL_TREE);
+  def_builtin ("__builtin_nvptx_shuffle_down", unsigned_fn,
+	       NVPTX_BUILTIN_SHUFFLE_DOWN);
+  def_builtin ("__builtin_nvptx_shuffle_downf", float_fn,
+	       NVPTX_BUILTIN_SHUFFLE_DOWNF);
+  def_builtin ("__builtin_nvptx_shuffle_downll", ull_fn,
+	       NVPTX_BUILTIN_SHUFFLE_DOWNLL);
+}
+
+/* Subroutine of nvptx_expand_builtin to take care of binop insns.  Expand
+   call EXP via insn ICODE and return the rtx holding the result, or 0.  */
+
+static rtx
+nvptx_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  rtx op1 = expand_expr (arg1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  machine_mode op0mode = GET_MODE (op0);
+  machine_mode op1mode = GET_MODE (op1);
+  machine_mode tmode = insn_data[icode].operand[0].mode;
+  machine_mode mode0 = insn_data[icode].operand[1].mode;
+  machine_mode mode1 = insn_data[icode].operand[2].mode;
+
+  /* The result lives in TARGET, or in a fresh pseudo if TARGET won't do.  */
+  if (! target
+      || GET_MODE (target) != tmode
+      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  gcc_assert ((op0mode == mode0 || op0mode == VOIDmode)
+	      && (op1mode == mode1 || op1mode == VOIDmode));
+
+  if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if (! (*insn_data[icode].operand[2].predicate) (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  pat = GEN_FCN (icode) (target, op0, op1);
+
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+
+  return target;
+}
+
+
+struct builtin_description
+{
+  const enum insn_code icode;		/* Insn used to expand the builtin.  */
+  const char *const name;		/* Name of the builtin function.  */
+  const enum nvptx_builtins code;	/* Function code of the builtin.  */
+};
+
+static const struct builtin_description bdesc_2arg[] =
+{
+  { CODE_FOR_thread_shuffle_downsi, "__builtin_nvptx_shuffle_down", NVPTX_BUILTIN_SHUFFLE_DOWN },
+  { CODE_FOR_thread_shuffle_downsf, "__builtin_nvptx_shuffle_downf", NVPTX_BUILTIN_SHUFFLE_DOWNF },
+  { CODE_FOR_thread_shuffle_downdi, "__builtin_nvptx_shuffle_downll", NVPTX_BUILTIN_SHUFFLE_DOWNLL }
+};
+
+/* Expand EXP, a call to one of this target's builtins, and return the rtx
+   produced by the matching expander.  TARGET is handed on as the suggested
+   destination; SUBTARGET, MODE and IGNORE are not consulted.  The builtin
+   is identified by its function code, which must appear in bdesc_2arg;
+   reaching the end of that table is treated as unreachable.  */
+
+static rtx
+nvptx_expand_builtin (tree exp, rtx target ATTRIBUTE_UNUSED,
+		     rtx subtarget ATTRIBUTE_UNUSED,
+		     machine_mode mode ATTRIBUTE_UNUSED,
+		     int ignore ATTRIBUTE_UNUSED)
+{
+  tree callee = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+  unsigned int fcode = DECL_FUNCTION_CODE (callee);
+  const struct builtin_description *entry = bdesc_2arg;
+  const struct builtin_description *end = entry + ARRAY_SIZE (bdesc_2arg);
+
+  for (; entry != end; entry++)
+    if (entry->code == fcode)
+      return nvptx_expand_binop_builtin (entry->icode, exp, target);
+
+  gcc_unreachable ();
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -3145,6 +3278,13 @@  nvptx_file_end (void)
 #undef TARGET_CANNOT_COPY_INSN_P
 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
 
+#undef TARGET_INIT_BUILTINS
+#define TARGET_INIT_BUILTINS nvptx_init_builtins
+#undef TARGET_EXPAND_BUILTIN
+#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
+#undef  TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL nvptx_builtin_decl
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 225936)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -70,6 +70,8 @@  (define_c_enum "unspecv" [
    UNSPECV_FORKED
    UNSPECV_JOINING
    UNSPECV_JOIN
+
+   UNSPECV_SHFL_DOWN
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -1416,6 +1418,39 @@  (define_insn "nvptx_broadcast<mode>"
   ""
   "%.\\tshfl.idx.b32\\t%0, %1, 0, 31;")
 
+(define_insn "thread_shuffle_down<mode>" ; input and result share <mode>
+  [(set (match_operand:BITS 0 "nvptx_register_operand" "")
+	(unspec_volatile:BITS [(match_operand:BITS 1 "nvptx_register_operand" "")
+			       (match_operand:SI 2 "nvptx_nonmemory_operand" "")]
+			      UNSPECV_SHFL_DOWN))]
+  ""
+  "%.\\tshfl.down.b32\\t%0, %1, %2, 31;")
+
+(define_expand "thread_shuffle_downdi"
+  [(set (match_operand:DI 0 "nvptx_register_operand" "")
+	(unspec_volatile:DI [(match_operand:DI 1 "nvptx_register_operand" "")
+			     (match_operand:SI 2 "nvptx_nonmemory_operand" "")]
+			    UNSPECV_SHFL_DOWN))]
+  ""
+{
+  /* Shuffle the two 32-bit halves separately, then reassemble them.  */
+  rtx t = gen_reg_rtx (DImode);
+  emit_insn (gen_lshrdi3 (t, operands[1], GEN_INT (32)));
+  rtx op0 = force_reg (SImode, gen_lowpart (SImode, t));
+  rtx op1 = force_reg (SImode, gen_lowpart (SImode, operands[1]));
+  rtx targ0 = gen_reg_rtx (SImode), targ1 = gen_reg_rtx (SImode);
+  emit_insn (gen_thread_shuffle_downsi (targ0, op0, operands[2]));
+  emit_insn (gen_thread_shuffle_downsi (targ1, op1, operands[2]));
+  rtx t2 = gen_reg_rtx (DImode), t3 = gen_reg_rtx (DImode);
+  emit_insn (gen_extendsidi2 (t2, targ0));
+  /* The low half must be zero-extended or its sign bit corrupts the OR.  */
+  emit_insn (gen_zero_extendsidi2 (t3, targ1));
+  rtx t4 = gen_reg_rtx (DImode);
+  emit_insn (gen_ashldi3 (t4, t2, GEN_INT (32)));
+  emit_insn (gen_iordi3 (operands[0], t3, t4));
+  DONE;
+})
+
 ;; extract parts of a 64 bit object into 2 32-bit ints
 (define_insn "unpack<mode>si2"
   [(set (match_operand:SI 0 "nvptx_register_operand" "")