diff mbox

[gomp4] ptx builtins

Message ID 55BA0A96.90302@acm.org
State New
Headers show

Commit Message

Nathan Sidwell July 30, 2015, 11:29 a.m. UTC
I've committed this to gomp4 branch.

The vector neutering code already has machinery to generate DI/DF shuffles from
the underlying SI instruction.  This patch generalizes that machinery and changes
the shuffle-down builtins to use it.  Less code duplication - yay!  It also adds a
DF mode shuffle down, as that was missing.

nathan
2015-07-30  Nathan Sidwell  <nathan@acm.org>

	gcc/
	* config/nvptx/nvptx.md (UNSPEC_BROADCAST, UNSPECV_SHFL_DOWN):
	Replace with ...
	(UNSPEC_SHUFFLE): ... this.
	(nvptx_broadcast<mode>): Replace with ...
	(nvptx_shuffle<mode>): ... this.
	(thread_shuffle_down<mode>, thread_shuffle_downdi): Delete.
	* config/nvptx/nvptx.c (SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY,
	SHUFFLE_IDX): New defines.
	(nvptx_gen_shuffle): Break out of nvptx_gen_vcast and generalize.
	(nvptx_gen_vcast): Use nvptx_gen_shuffle.
	(nvptx_print_operand): Add 'S' case.
	(nvptx_cannot_copy_insn_p): Adjust.
	(nvptx_expand_shuffle_down): New builtin expander for shuffles.
	(enum nvptx_types): Add NT_DBL_DBL_INT case.
	(struct builtin_description): Use ptr to fn for expander.  Remove
	icode and num_args.
	(builtins): Adjust.
	(nvptx_init_builtins): Adjust.
	(nvptx_expand_builtin): Invoke builtin-specific expander function.
diff mbox

Patch

Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 226377)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -55,7 +55,7 @@ 
 
    UNSPEC_BIT_CONV
 
-   UNSPEC_BROADCAST
+   UNSPEC_SHUFFLE
    UNSPEC_BR_UNIFIED
 ])
 
@@ -70,8 +70,6 @@ 
    UNSPECV_FORKED
    UNSPECV_JOINING
    UNSPECV_JOIN
-
-   UNSPECV_SHFL_DOWN
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -1410,46 +1408,15 @@ 
 })
 
 ;; only 32-bit shuffles exist.
-(define_insn "nvptx_broadcast<mode>"
+(define_insn "nvptx_shuffle<mode>"
   [(set (match_operand:BITS 0 "nvptx_register_operand" "")
 	(unspec:BITS
-		[(match_operand:BITS 1 "nvptx_register_operand" "")]
-		  UNSPEC_BROADCAST))]
+		[(match_operand:BITS 1 "nvptx_register_operand" "")
+		 (match_operand:SI 2 "nvptx_nonmemory_operand" "")
+		 (match_operand:SI 3 "const_int_operand" "")]
+		  UNSPEC_SHUFFLE))]
   ""
-  "%.\\tshfl.idx.b32\\t%0, %1, 0, 31;")
-
-(define_insn "thread_shuffle_down<mode>"
-  [(set (match_operand:BITS 0 "nvptx_register_operand" "")
-	(unspec_volatile:BITS [(match_operand:SI 1 "nvptx_register_operand" "")
-			       (match_operand:SI 2 "nvptx_nonmemory_operand" "")]
-			      UNSPECV_SHFL_DOWN))]
-  ""
-  "%.\\tshfl.down.b32\\t%0, %1, %2, 31;")
-
-(define_expand "thread_shuffle_downdi"
-  [(set (match_operand:DI 0 "nvptx_register_operand" "")
-	(unspec_volatile:DI [(match_operand:DI 1 "nvptx_register_operand" "")
-			     (match_operand:SI 2 "nvptx_nonmemory_operand" "")]
-			    UNSPECV_SHFL_DOWN))]
-  ""
-{
-  rtx t = gen_reg_rtx (DImode);
-  emit_insn (gen_lshrdi3 (t, operands[1], GEN_INT (32)));
-  rtx op0 = force_reg (SImode, gen_lowpart (SImode, t));
-  rtx op1 = force_reg (SImode, gen_lowpart (SImode, operands[1]));
-  rtx targ0 = gen_reg_rtx (SImode);
-  rtx targ1 = gen_reg_rtx (SImode);
-  emit_insn (gen_thread_shuffle_downsi (targ0, op0, operands[2]));
-  emit_insn (gen_thread_shuffle_downsi (targ1, op1, operands[2]));
-  rtx t2 = gen_reg_rtx (DImode);
-  rtx t3 = gen_reg_rtx (DImode);
-  emit_insn (gen_extendsidi2 (t2, targ0));
-  emit_insn (gen_extendsidi2 (t3, targ1));
-  rtx t4 = gen_reg_rtx (DImode);
-  emit_insn (gen_ashldi3 (t4, t2, GEN_INT (32)));
-  emit_insn (gen_iordi3 (operands[0], t3, t4));
-  DONE;
-})
+  "%.\\tshfl.%S3.b32\\t%0, %1, %2, 31;")
 
 ;; extract parts of a 64 bit object into 2 32-bit ints
 (define_insn "unpack<mode>si2"
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 226377)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -64,6 +64,11 @@ 
 /* This file should be included last.  */
 #include "target-def.h"
 
+#define SHUFFLE_UP 0
+#define SHUFFLE_DOWN 1
+#define SHUFFLE_BFLY 2
+#define SHUFFLE_IDX 3
+
 /* Record the function decls we've written, and the libfuncs and function
    decls corresponding to them.  */
 static std::stringstream func_decls;
@@ -1132,17 +1137,17 @@  nvptx_gen_pack (rtx dst, rtx src0, rtx s
    across the vectors of a single warp.  */
 
 static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
 {
   rtx res;
 
-  switch (GET_MODE (reg))
+  switch (GET_MODE (dst))
     {
     case SImode:
-      res = gen_nvptx_broadcastsi (reg, reg);
+      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
       break;
     case SFmode:
-      res = gen_nvptx_broadcastsf (reg, reg);
+      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
       break;
     case DImode:
     case DFmode:
@@ -1151,10 +1156,10 @@  nvptx_gen_vcast (rtx reg)
 	rtx tmp1 = gen_reg_rtx (SImode);
 
 	start_sequence ();
-	emit_insn (nvptx_gen_unpack (tmp0, tmp1, reg));
-	emit_insn (nvptx_gen_vcast (tmp0));
-	emit_insn (nvptx_gen_vcast (tmp1));
-	emit_insn (nvptx_gen_pack (reg, tmp0, tmp1));
+	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
+	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
+	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
+	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
 	res = get_insns ();
 	end_sequence ();
       }
@@ -1164,21 +1169,29 @@  nvptx_gen_vcast (rtx reg)
 	rtx tmp = gen_reg_rtx (SImode);
 	
 	start_sequence ();
-	emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
-	emit_insn (nvptx_gen_vcast (tmp));
-	emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
+	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
+	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
+	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
 	res = get_insns ();
 	end_sequence ();
       }
       break;
       
-    case HImode:
-    case QImode:
-    default:debug_rtx (reg);gcc_unreachable ();
+    default:
+      gcc_unreachable ();
     }
   return res;
 }
 
+/* Generate an instruction or sequence to broadcast register REG
+   across the vectors of a single warp.  */
+
+static rtx
+nvptx_gen_vcast (rtx reg)
+{
+  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
+}
+
 /* Structure used when generating a worker-level spill or fill.  */
 
 struct wcast_data_t
@@ -1862,6 +1875,7 @@  nvptx_print_operand_address (FILE *file,
    A -- print an address space identifier for a MEM
    c -- print an opcode suffix for a comparison operator, including a type code
    f -- print a full reg even for something that must always be split
+   S -- print a shuffle kind
    t -- print a type opcode suffix, promoting QImode to 32 bits
    T -- print a type size in bits
    u -- print a type opcode suffix without promotions.  */
@@ -1913,6 +1927,15 @@  nvptx_print_operand (FILE *file, rtx x,
       fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
       break;
 
+    case 'S':
+      {
+	unsigned kind = UINTVAL (x);
+	static const char *const kinds[] = 
+	  {"up", "down", "bfly", "idx"};
+	fprintf (file, "%s", kinds[kind]);
+      }
+      break;
+      
     case 'T':
       fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
       break;
@@ -2996,8 +3019,8 @@  nvptx_cannot_copy_insn_p (rtx_insn *insn
 {
   switch (recog_memoized (insn))
     {
-    case CODE_FOR_nvptx_broadcastsi:
-    case CODE_FOR_nvptx_broadcastsf:
+    case CODE_FOR_nvptx_shufflesi:
+    case CODE_FOR_nvptx_shufflesf:
     case CODE_FOR_nvptx_barsync:
     case CODE_FOR_nvptx_fork:
     case CODE_FOR_nvptx_forked:
@@ -3101,11 +3124,39 @@  nvptx_file_end (void)
     }
 }
 
+/* Expander for the shuffle down builtins.  */
+static rtx
+nvptx_expand_shuffle_down (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+    return target;
+  
+  if (! target)
+    target = gen_reg_rtx (mode);
+
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+			NULL_RTX, mode, EXPAND_NORMAL);
+  if (!REG_P (src))
+    src = copy_to_mode_reg (mode, src);
+
+  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
+			NULL_RTX, SImode, EXPAND_NORMAL);
+  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
+    idx = copy_to_mode_reg (SImode, idx);
+
+  rtx pat = nvptx_gen_shuffle (target, src, idx, SHUFFLE_DOWN);
+  if (pat)
+    emit_insn (pat);
+
+  return target;
+}
+
 enum nvptx_types
   {
     NT_UINT_UINT_INT,
     NT_ULL_ULL_INT,
     NT_FLT_FLT_INT,
+    NT_DBL_DBL_INT,
 
     NT_MAX
   };
@@ -3113,19 +3164,20 @@  enum nvptx_types
 struct builtin_description
 {
   const char *name;
-  enum insn_code icode;
   unsigned short type;
-  unsigned short num_args;
+  rtx (*expander) (tree, rtx, machine_mode, int);
 };
 
 static const struct builtin_description builtins[] =
 {
-  {"__builtin_nvptx_shuffle_down", CODE_FOR_thread_shuffle_downsi,
-   NT_UINT_UINT_INT, 2},
-  {"__builtin_nvptx_shuffle_downf", CODE_FOR_thread_shuffle_downsf,
-   NT_FLT_FLT_INT, 2},
-  { "__builtin_nvptx_shuffle_downll", CODE_FOR_thread_shuffle_downdi,
-    NT_ULL_ULL_INT, 2},
+  {"__builtin_nvptx_shuffle_down", NT_UINT_UINT_INT,
+   nvptx_expand_shuffle_down},
+  {"__builtin_nvptx_shuffle_downll", NT_ULL_ULL_INT,
+   nvptx_expand_shuffle_down},
+  {"__builtin_nvptx_shuffle_downf", NT_FLT_FLT_INT,
+   nvptx_expand_shuffle_down},
+  {"__builtin_nvptx_shuffle_downd", NT_DBL_DBL_INT,
+   nvptx_expand_shuffle_down},
 };
 
 #define NVPTX_BUILTIN_MAX (sizeof (builtins) / sizeof (builtins[0]))
@@ -3159,6 +3211,9 @@  nvptx_init_builtins (void)
   types[NT_FLT_FLT_INT]
     = build_function_type_list (float_type_node, float_type_node,
 				integer_type_node, NULL_TREE);
+  types[NT_DBL_DBL_INT]
+    = build_function_type_list (double_type_node, double_type_node,
+				integer_type_node, NULL_TREE);
 
   for (ix = 0; ix != NVPTX_BUILTIN_MAX; ix++)
     nvptx_builtin_decls[ix]
@@ -3180,34 +3235,8 @@  nvptx_expand_builtin (tree exp, rtx targ
 {
   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   const struct builtin_description *d = &builtins[DECL_FUNCTION_CODE (fndecl)];
-  unsigned icode = d->icode;
-  rtx operands[2]; /* maxium operands */
-  unsigned ix;
-  machine_mode tmode = insn_data[icode].operand[0].mode;
-
-  if (ignore)
-    return target;
-  
-  if (! target
-      || mode != tmode
-      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
-    target = gen_reg_rtx (tmode);
-
-  for (ix = d->num_args; ix--;)
-    {
-      machine_mode m = insn_data[icode].operand[ix + 1].mode;
-      rtx op = expand_expr (CALL_EXPR_ARG (exp, ix),
-			    NULL_RTX, VOIDmode, EXPAND_NORMAL);
-      if (! (*insn_data[icode].operand[ix + 1].predicate) (op, m))
-	op = copy_to_mode_reg (m, op);
-      operands[ix] = op;
-    }
 
-  rtx pat = GEN_FCN (icode) (target, operands[0], operands[1]);
-  if (pat)
-    emit_insn (pat);
-
-  return target;
+  return d->expander (exp, target, mode, ignore);
 }
 
 #undef TARGET_OPTION_OVERRIDE