diff mbox

[gomp4] New builtins, preparation for oacc vector-single

Message ID 555C77A8.2030300@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt May 20, 2015, 12:01 p.m. UTC
To implement OpenACC vector-single mode, we need to ensure that only one 
thread out of the group representing a worker executes. The others skip 
computations but follow along the CFG, so the results of conditional 
branch decisions must be broadcast to them.

The patch below adds a new builtin and nvptx pattern to implement that 
broadcast functionality.

Committed on gomp-4_0-branch.


Bernd

Comments

Jakub Jelinek May 20, 2015, 12:39 p.m. UTC | #1
On Wed, May 20, 2015 at 02:01:44PM +0200, Bernd Schmidt wrote:
> To implement OpenACC vector-single mode, we need to ensure that only one
> thread out of the group representing a worker executes. The others skip
> computations but follow along the CFG, so the results of conditional branch
> decisions must be broadcast to them.
> 
> The patch below adds a new builtin and nvptx pattern to implement that
> broadcast functionality.

So, is the goal of this that threads in the warp other than the 0th
don't do anything except in vectorized regions, where all the threads
in the warp participate in the vectorization?
Thus, for OpenMP, should the whole warp be a single thread
(thus omp_get_thread_num () would be tid.x >> 5)?
If so, is the GCC vectorizer going to be taught about this?

	Jakub
Bernd Schmidt May 20, 2015, 1:54 p.m. UTC | #2
On 05/20/2015 02:39 PM, Jakub Jelinek wrote:
> On Wed, May 20, 2015 at 02:01:44PM +0200, Bernd Schmidt wrote:
>> To implement OpenACC vector-single mode, we need to ensure that only one
>> thread out of the group representing a worker executes. The others skip
>> computations but follow along the CFG, so the results of conditional branch
>> decisions must be broadcast to them.
>>
>> The patch below adds a new builtin and nvptx pattern to implement that
>> broadcast functionality.
>
> So, is the goal of this that threads in the warp other than the 0th
> don't do anything except in vectorized regions, where all the threads
> in the warp participate in the vectorization?

Yes.

> Thus, for OpenMP, should the whole warp be a single thread
> (thus omp_get_thread_num () would be tid.x >> 5)?

Do you mean for an OMP port to nvptx? I haven't looked at OpenMP enough 
to say if or how it could be mapped to GPU hardware; it's not something 
we intend to do for this project.


Bernd
diff mbox

Patch

Index: gcc/ChangeLog.gomp
===================================================================
--- gcc/ChangeLog.gomp	(revision 223360)
+++ gcc/ChangeLog.gomp	(working copy)
@@ -1,3 +1,16 @@ 
+2015-05-19  Bernd Schmidt  <bernds@codesourcery.com>
+
+	* omp-builtins.def (GOACC_thread_broadcast,
+	GOACC_thread_broadcast_ll): New builtins.
+	* optabs.def (oacc_thread_broadcast_optab): New optab.
+	* builtins.c (expand_builtin_oacc_thread_broadcast): New function.
+	(expand_builtin): Use it.
+	* config/nvptx/nvptx.c (nvptx_cannot_copy_insn_p): New function.
+	(TARGET_CANNOT_COPY_INSN_P): Define.
+	* config/nvptx/nvptx.md (UNSPECV_WARP_BCAST): New constant.
+	(oacc_thread_broadcastsi): New pattern.
+	(oacc_thread_broadcastdi): New expander.
+
 2015-05-19  Tom de Vries  <tom@codesourcery.com>
 
 	* omp-low.c (enclosing_target_ctx): Comment out.
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 223360)
+++ gcc/builtins.c	(working copy)
@@ -6022,6 +6022,43 @@  expand_oacc_ganglocal_ptr (rtx target AT
   return NULL_RTX;
 }
 
+/* Expand a GOACC_thread_broadcast builtin call EXP, using TARGET for the
+   result if suitable.  Without a pattern, just expand the argument.  */
+
+static rtx
+expand_builtin_oacc_thread_broadcast (tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  enum insn_code icode;
+  /* Look for a pattern in the argument's mode, then in each wider mode.  */
+  machine_mode mode = TYPE_MODE (TREE_TYPE (arg0));
+  gcc_assert (INTEGRAL_MODE_P (mode));
+  do
+    {
+      icode = direct_optab_handler (oacc_thread_broadcast_optab, mode);
+      mode = GET_MODE_WIDER_MODE (mode);
+    }
+  while (icode == CODE_FOR_nothing && mode != VOIDmode);
+  if (icode == CODE_FOR_nothing)
+    return expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  /* Reuse TARGET only if it is a register of the pattern's output mode.  */
+  rtx tmp = target;
+  machine_mode mode0 = insn_data[icode].operand[0].mode;
+  machine_mode mode1 = insn_data[icode].operand[1].mode;
+  if (tmp == NULL_RTX || !REG_P (tmp) || GET_MODE (tmp) != mode0)
+    tmp = gen_reg_rtx (mode0);
+  rtx op1 = expand_expr (arg0, NULL_RTX, mode1, EXPAND_NORMAL);
+  if (GET_MODE (op1) != mode1)
+    op1 = convert_to_mode (mode1, op1, 0);
+  /* Emit the insn; if the generator produced nothing, return zero.  */
+  rtx insn = GEN_FCN (icode) (tmp, op1);
+  if (insn != NULL_RTX)
+    {
+      emit_insn (insn);
+      return tmp;
+    }
+  return const0_rtx;
+}
 
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient
@@ -7177,6 +7214,10 @@  expand_builtin (tree exp, rtx target, rt
 	return target;
       break;
 
+    case BUILT_IN_GOACC_THREAD_BROADCAST:
+    case BUILT_IN_GOACC_THREAD_BROADCAST_LL:
+      return expand_builtin_oacc_thread_broadcast (exp, target);
+
     default:	/* just do library call, if unknown builtin */
       break;
     }
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 223360)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -2029,6 +2029,15 @@  nvptx_vector_alignment (const_tree type)
 
   return MIN (align, BIGGEST_ALIGNMENT);
 }
+
+/* Implement TARGET_CANNOT_COPY_INSN_P: true for the SI thread broadcast.  */
+
+static bool
+nvptx_cannot_copy_insn_p (rtx_insn *insn)
+{
+  return recog_memoized (insn) == CODE_FOR_oacc_thread_broadcastsi;
+}
+
 
 /* Record a symbol for mkoffload to enter into the mapping table.  */
 
@@ -2153,6 +2162,9 @@  nvptx_file_end (void)
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
 
+#undef  TARGET_CANNOT_COPY_INSN_P
+#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 223360)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -61,6 +61,7 @@  (define_c_enum "unspecv" [
    UNSPECV_LOCK
    UNSPECV_CAS
    UNSPECV_XCHG
+   UNSPECV_WARP_BCAST
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -1322,6 +1323,37 @@  (define_expand "oacc_ctaid"
     FAIL;
 })
 
+(define_insn "oacc_thread_broadcastsi"	; SImode warp broadcast insn
+  [(set (match_operand:SI 0 "nvptx_register_operand" "")
+	(unspec_volatile:SI [(match_operand:SI 1 "nvptx_register_operand" "")]
+			    UNSPECV_WARP_BCAST))]
+  ""			; no enabling condition
+  "%.\\tshfl.idx.b32\\t%0, %1, 0, 31;")
+
+(define_expand "oacc_thread_broadcastdi"
+  [(set (match_operand:DI 0 "nvptx_register_operand" "")
+	(unspec_volatile:DI [(match_operand:DI 1 "nvptx_register_operand" "")]
+			    UNSPECV_WARP_BCAST))]
+  ""
+{
+  /* Broadcast the high and low SImode halves separately, then recombine.  */
+  rtx t = gen_reg_rtx (DImode);
+  emit_insn (gen_lshrdi3 (t, operands[1], GEN_INT (32)));
+  rtx op0 = force_reg (SImode, gen_lowpart (SImode, t));
+  rtx op1 = force_reg (SImode, gen_lowpart (SImode, operands[1]));
+  rtx targ0 = gen_reg_rtx (SImode), targ1 = gen_reg_rtx (SImode);
+  emit_insn (gen_oacc_thread_broadcastsi (targ0, op0));
+  emit_insn (gen_oacc_thread_broadcastsi (targ1, op1));
+  rtx t2 = gen_reg_rtx (DImode), t3 = gen_reg_rtx (DImode);
+  emit_insn (gen_extendsidi2 (t2, targ0));
+  /* Zero-extend the low half so its sign bit cannot leak into the high.  */
+  emit_insn (gen_zero_extendsidi2 (t3, targ1));
+  rtx t4 = gen_reg_rtx (DImode);
+  emit_insn (gen_ashldi3 (t4, t2, GEN_INT (32)));
+  emit_insn (gen_iordi3 (operands[0], t3, t4));
+  DONE;
+})
+
 (define_insn "ganglocal_ptr<mode>"
   [(set (match_operand:P 0 "nvptx_register_operand" "")
 	(unspec:P [(const_int 0)] UNSPEC_SHARED_DATA))]
Index: gcc/fortran/ChangeLog.gomp
===================================================================
--- gcc/fortran/ChangeLog.gomp	(revision 223360)
+++ gcc/fortran/ChangeLog.gomp	(working copy)
@@ -1,3 +1,7 @@ 
+2015-05-19  Bernd Schmidt  <bernds@codesourcery.com>
+
+	* types.def (BT_FN_ULONGLONG_ULONGLONG): Define.
+
 2015-05-13  Cesar Philippidis  <cesar@codesourcery.com>
 
 	* f95-lang.c (gfc_attribute_table): Add and "oacc function"
Index: gcc/fortran/types.def
===================================================================
--- gcc/fortran/types.def	(revision 223360)
+++ gcc/fortran/types.def	(working copy)
@@ -84,6 +84,7 @@  DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTRPTR,
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_VPTR, BT_VOID, BT_VOLATILE_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_INT_INT, BT_INT, BT_INT)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_UINT, BT_UINT, BT_UINT)
+DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_PTR_PTR, BT_PTR, BT_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_INT, BT_VOID, BT_INT)
 DEF_FUNCTION_TYPE_1 (BT_FN_BOOL_INT, BT_BOOL, BT_INT)
Index: gcc/omp-builtins.def
===================================================================
--- gcc/omp-builtins.def	(revision 223360)
+++ gcc/omp-builtins.def	(working copy)
@@ -77,6 +77,10 @@  DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_GA
 		   BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DEVICEPTR, "GOACC_deviceptr",
 		   BT_FN_PTR_PTR, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_THREAD_BROADCAST, "GOACC_thread_broadcast",
+		   BT_FN_UINT_UINT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_THREAD_BROADCAST_LL, "GOACC_thread_broadcast_ll",
+		   BT_FN_ULONGLONG_ULONGLONG, ATTR_NOTHROW_LEAF_LIST)
 
 DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
 			    BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
Index: gcc/optabs.def
===================================================================
--- gcc/optabs.def	(revision 223360)
+++ gcc/optabs.def	(working copy)
@@ -332,3 +332,5 @@  OPTAB_D (atomic_xor_optab, "atomic_xor$I
 
 OPTAB_D (get_thread_pointer_optab, "get_thread_pointer$I$a")
 OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
+
+OPTAB_D (oacc_thread_broadcast_optab, "oacc_thread_broadcast$I$a")