diff mbox

[gomp4] optimize launch dimensions

Message ID 55C13F2F.3070600@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Aug. 4, 2015, 10:39 p.m. UTC
I've committed this to gomp4 branch.  It optimizes the new GOACC_DIM_SIZE and 
GOACC_DIM_POS bultins for constant dimensions.  In addition:

*) added a target-specific dimension validation and defaulting hook.  Provided a 
ptx implementation.

*) Made GOACC_DIM_POS pure, to allow some optimization with it (but not 
migration across fork/join)

*) Delete fork/join markers on the host.

This uncovers a defect in the invokation of the device compiler.  We fail to 
propagate -fno-diagnostics-show-caret and similar options to it.  Leading to the 
two tests I altered failing with unexpected diagnostic.  I'll be fixing that 
shortly.

nathan
diff mbox

Patch

2015-08-04  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* doc/tm.texi.in (TARGET_GOACC_VALIDATE_DIMS): Add hook.
	* doc/tm.texi: Regenerated.
	* target.def (TARGET_GOACC): New hook prefix.
	(validate_dims): New.
	* targhooks.h (default_goacc_validate_dims): Declare.
	* internal-fn.def: Add comments.
	(GOACC_DIM_POS): Make pure.
	* config/nvptx/nvptx.c (nvptx_validate_dims): New.
	(TARGET_GOACC_VALIDATE_DIMS): Override.
	* omp-low.h (set_oacc_fn_attrib): Leave default dims as NULL.
	(oacc_xform_dim): New.
	(execute_oacc_transform): Process launch dimensions. Optimize
	DIM_SIZE and DIM_POS.  Delete FORK & JOIN on host.
	(default_oacc_validate_dims): New.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/routine-1.c: Add warning.
	* testsuite/libgomp.oacc-c-c++-common/routine-2.c: Add warning.

Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def	(revision 226595)
+++ gcc/internal-fn.def	(working copy)
@@ -64,7 +64,22 @@  DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST
 DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL)
 DEF_INTERNAL_FN (GOACC_DATA_END_WITH_ARG, ECF_NOTHROW, ".r")
+
+/* FORK and JOIN mark the points at which partitioned execution is
+   entered or exited.  We arrange for these two function to be
+   unduplicable and uncombinable in order to preserve the SESE CFG
+   property of partitioned loops.  These are non-const functions to prevent
+   optimizations migrating memory accesses across a partition change
+   boundary.  They take a single INTEGER_CST
+   argument and return nothing.  */
 DEF_INTERNAL_FN (GOACC_FORK, ECF_NOTHROW | ECF_LEAF, ".")
 DEF_INTERNAL_FN (GOACC_JOIN, ECF_NOTHROW | ECF_LEAF, ".")
+
+/* DIM_SIZE and DIM_POS return the size of a particular compute
+   dimension and the executing thread's position within that
+   dimension.  DIM_POS is pure (and not const) so that it isn't
+   thought to clobber memory and can be gcse'd within a single
+   parallel region, but not across FORK/JOIN boundaries.  They take a
+   single INTEGER_CST argument.  */
 DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
-DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_NOTHROW | ECF_LEAF, ".")
+DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 226595)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3524,6 +3524,57 @@  nvptx_expand_builtin (tree exp, rtx targ
   return d->expander (exp, target, mode, ignore);
 }
 
+/* Validate compute dimensions, fill in defaults.  */
+
+static tree
+nvptx_validate_dims (tree decl, tree dims)
+{
+  tree adims[GOMP_DIM_MAX];
+  unsigned ix;
+  bool changed = false;
+  tree pos = dims;
+
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
+    {
+      adims[ix] = TREE_VALUE (pos);
+      pos = TREE_CHAIN (pos);
+    }
+  /* Define vector size for known hardware.  */
+#define PTX_VECTOR_LENGTH 32
+  /* If the worker size is not 1, the vector size must be 32.  If
+     the vector size is not 1, it must be 32.  */
+  if ((adims[GOMP_DIM_WORKER]
+       && TREE_INT_CST_LOW (adims[GOMP_DIM_WORKER]) != 1)
+      || (adims[GOMP_DIM_VECTOR]
+	  && TREE_INT_CST_LOW (adims[GOMP_DIM_VECTOR]) != 1))
+    {
+      if (!adims[GOMP_DIM_VECTOR]
+	  || TREE_INT_CST_LOW (adims[GOMP_DIM_VECTOR]) != PTX_VECTOR_LENGTH)
+	{
+	  tree use = build_int_cst (integer_type_node, PTX_VECTOR_LENGTH);
+	  if (adims[GOMP_DIM_VECTOR])
+	    warning_at (DECL_SOURCE_LOCATION (decl), 0,
+			TREE_INT_CST_LOW (adims[GOMP_DIM_VECTOR])
+			? "using vector_length (%E), ignoring %E"
+			: "using vector_length (%E), ignoring runtime setting",
+			use, adims[GOMP_DIM_VECTOR]);
+	  adims[GOMP_DIM_VECTOR] = use;
+	}
+    }
+
+  /* Set defaults.  */
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
+    if (!adims[ix])
+      adims[ix] = integer_one_node;
+
+  /* Write results.  */
+  pos = dims;
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++, pos = TREE_CHAIN (pos))
+    TREE_VALUE (pos) = adims[ix];
+  
+  return dims;
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -3618,6 +3669,9 @@  nvptx_expand_builtin (tree exp, rtx targ
 #undef  TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
 
+#undef TARGET_GOACC_VALIDATE_DIMS
+#define TARGET_GOACC_VALIDATE_DIMS nvptx_validate_dims
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
Index: gcc/target.def
===================================================================
--- gcc/target.def	(revision 226595)
+++ gcc/target.def	(working copy)
@@ -1639,6 +1639,21 @@  int, (struct cgraph_node *), NULL)
 
 HOOK_VECTOR_END (simd_clone)
 
+/* Functions relating to openacc.  */
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_GOACC_"
+HOOK_VECTOR (TARGET_GOACC, goacc)
+
+DEFHOOK
+(validate_dims,
+"This hook should check the launch dimensions provided/ It should fill\n\
+in default values and verify non-defaults.  The TREE_LIST is unshared\n\
+and may be overwritten.  Diagnostics should be issued as appropriate.",
+tree, (tree, tree),
+default_goacc_validate_dims)
+
+HOOK_VECTOR_END (goacc)
+
 /* Functions relating to vectorization.  */
 #undef HOOK_PREFIX
 #define HOOK_PREFIX "TARGET_VECTORIZE_"
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h	(revision 226595)
+++ gcc/targhooks.h	(working copy)
@@ -107,6 +107,8 @@  extern unsigned default_add_stmt_cost (v
 extern void default_finish_cost (void *, unsigned *, unsigned *, unsigned *);
 extern void default_destroy_cost_data (void *);
 
+extern tree default_goacc_validate_dims (tree, tree);
+
 /* These are here, and not in hooks.[ch], because not all users of
    hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS.  */
 
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi	(revision 226595)
+++ gcc/doc/tm.texi	(working copy)
@@ -5740,6 +5740,12 @@  usable.  In that case, the smaller the n
 to use it.
 @end deftypefn
 
+@deftypefn {Target Hook} tree TARGET_GOACC_VALIDATE_DIMS (tree, @var{tree})
+This hook should check the launch dimensions provided/ It should fill
+in default values and verify non-defaults.  The TREE_LIST is unshared
+and may be overwritten.  Diagnostics should be issued as appropriate.
+@end deftypefn
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in	(revision 226595)
+++ gcc/doc/tm.texi.in	(working copy)
@@ -4245,6 +4245,8 @@  address;  but often a machine-dependent
 
 @hook TARGET_SIMD_CLONE_USABLE
 
+@hook TARGET_GOACC_VALIDATE_DIMS
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 226595)
+++ gcc/omp-low.c	(working copy)
@@ -9293,7 +9293,9 @@  oacc_launch_pack (unsigned code, tree de
 
    The attribute value is a TREE_LIST.  A set of dimensions is
    represented as a list of INTEGER_CST.  Those that are runtime
-   expres are represented as an INTEGER_CST of zero.
+   expres are represented as an INTEGER_CST of zero.  Defaults are set
+   as NULL_TREE and will be filled in later by the target hook
+   TARGET_OACC_VALIDATE_DIMS.
 
    TOOO. Normally the attribute will just contain a single such list.  If
    however it contains a list of lists, this will represent the use of
@@ -9317,14 +9319,12 @@  set_oacc_fn_attrib (tree clauses, tree f
   for (ix = GOMP_DIM_MAX; ix--;)
     {
       tree clause = find_omp_clause (clauses, ids[ix]);
-      tree dim;
+      tree dim = NULL_TREE;
 
-      if (!clause)
-	dim = integer_one_node;
-      else
+      if (clause)
 	dim = OMP_CLAUSE_EXPR (clause, ids[ix]);
       dims[ix] = dim;
-      if (TREE_CODE (dim) != INTEGER_CST)
+      if (dim && TREE_CODE (dim) != INTEGER_CST)
 	{
 	  dim = integer_zero_node;
 	  non_const |= GOMP_DIM_MASK (ix);
@@ -14586,35 +14586,126 @@  oacc_xform_on_device (gimple_stmt_iterat
   gsi_replace_with_seq (gsi, replace, false);
 }
 
+/* Transform oacc_dim_size and oacc_dim_pos internal function calls to
+   constants, where possible.  */
+
+static void
+oacc_xform_dim (gimple_stmt_iterator *gsi, gimple stmt,
+		tree dims, bool is_pos)
+{
+  tree arg = gimple_call_arg (stmt, 0);
+  unsigned axis = (unsigned)TREE_INT_CST_LOW (arg);
+  while (axis--)
+    dims = TREE_CHAIN (dims);
+  unsigned size = TREE_INT_CST_LOW (TREE_VALUE (dims));
+
+  if (size == 0)
+    /* Dimension size is dynamic.  */
+    return;
+  if (is_pos)
+    {
+      if (size != 1)
+	/* Size is more than 1.  */
+	return;
+      size = 0;
+    }
+
+  /* Replace the internal call with a constant.  */
+  tree lhs = gimple_call_lhs (stmt);
+  gimple g = gimple_build_assign
+    (lhs, build_int_cst (unsigned_type_node, size));
+  gsi_replace (gsi, g, false);
+}
+
 /* Main entry point for oacc transformations which run on the device
-   compilerafter LTO, so we know what the target device is at this
+   compiler after LTO, so we know what the target device is at this
    point (including the host fallback).  */
 
 static unsigned int
 execute_oacc_transform ()
 {
   basic_block bb;
-
-  if (!get_oacc_fn_attrib (current_function_decl))
+  tree attrs = get_oacc_fn_attrib (current_function_decl);
+  
+  if (!attrs)
+    /* Not an offloaded function.  */
     return 0;
 
+  tree dims = TREE_VALUE (attrs);
+  if (dims)
+    dims = targetm.goacc.validate_dims (current_function_decl, dims);
+  /* Safe to overwrite, this attribute chain is unshared.  */
+  TREE_VALUE (attrs) = dims;
+  
   FOR_ALL_BB_FN (bb, cfun)
     {
-      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
-	   !gsi_end_p (gsi); gsi_next (&gsi))
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
 	{
 	  gimple stmt = gsi_stmt (gsi);
 
-	  /* acc_on_device must be evaluated at compile time for
-	     constant arguments.  */
-	  if (gimple_call_builtin_p (stmt, BUILT_IN_ACC_ON_DEVICE))
-	    oacc_xform_on_device (&gsi, stmt);
+	  if (is_gimple_call (stmt))
+	    {
+	      /* acc_on_device must be evaluated at compile time for
+		 constant arguments.  */
+	      if (gimple_call_builtin_p (stmt, BUILT_IN_ACC_ON_DEVICE))
+		oacc_xform_on_device (&gsi, stmt);
+
+	      if (gimple_call_internal_p (stmt))
+		switch (gimple_call_internal_fn (stmt))
+		  {
+		  default: break;
+
+		  case IFN_GOACC_DIM_SIZE:
+		    if (dims)
+		      oacc_xform_dim (&gsi, stmt, dims, false);
+		    break;
+
+		  case IFN_GOACC_DIM_POS:
+		    if (dims)
+		      oacc_xform_dim (&gsi, stmt, dims, true);
+		    break;
+
+#ifndef ACCEL_COMPILER
+		  case IFN_GOACC_FORK:
+		  case IFN_GOACC_JOIN:
+		    /* These are irrelevant on the host.  */
+		    replace_uses_by (gimple_vdef (stmt), gimple_vuse (stmt));
+		    gsi_remove (&gsi, true);
+		    /* Removal will have advanced the iterator.  */
+		    continue;
+#endif
+		  }
+	    }
+	  gsi_next (&gsi);
 	}
     }
 
   return 0;
 }
 
+/* Default launch dimension validator.  */
+
+tree
+default_goacc_validate_dims (tree ARG_UNUSED (decl), tree dims)
+{
+  tree pos = dims;
+  for (unsigned ix = GOMP_DIM_MAX; ix--; pos = TREE_CHAIN (pos))
+    {
+      tree val = TREE_VALUE (pos);
+      
+#ifdef ACCEL_COMPILER
+      if (!val)
+	val = integer_one_node;
+#else
+      /* Set to 1 on the host. */
+      val = integer_one_node;
+#endif
+      TREE_VALUE (pos) =  val;
+    }
+
+  return dims;
+}
+
 namespace {
 
 const pass_data pass_data_oacc_transform =
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/routine-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/routine-1.c	(revision 226595)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/routine-1.c	(working copy)
@@ -1,6 +1,6 @@ 
 /* FIXME: remove -fno-var-tracking from dg-aditional-options.  */
 
-/* { dg-do run } */
+/* { dg-do run { target openacc_nvidia_accel_selected }  } */
 /* { dg-additional-options "-fno-inline -fno-var-tracking" } */
 
 #include <stdio.h>
@@ -23,7 +23,7 @@  main()
 
   a = (int *)malloc (sizeof (int) * n);
 
-#pragma acc parallel copy (a[0:n]) vector_length (5)
+#pragma acc parallel copy (a[0:n]) vector_length (5) /* { dg-warning "using vector_length" } */
   {
 #pragma acc loop
     for (i = 0; i < n; i++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/routine-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/routine-2.c	(revision 226595)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/routine-2.c	(working copy)
@@ -1,6 +1,6 @@ 
 /* FIXME: remove -fno-var-tracking from dg-additional-options.  */
 
-/* { dg-do run } */
+/* { dg-do run { target openacc_nvidia_accel_selected }  } */
 /* { dg-additional-options "-fno-inline -fno-var-tracking" } */
 
 #include <stdio.h>
@@ -24,7 +24,7 @@  main()
 
   a = (int *)malloc (sizeof (int) * n);
 
-#pragma acc parallel copy (a[0:n]) vector_length (5)
+#pragma acc parallel copy (a[0:n]) vector_length (5) /* { dg-warning "using vector_length" } */
   {
 #pragma acc loop
     for (i = 0; i < n; i++)