diff mbox

[hsa,branch] Map collapse(2) and collapse(3) to HSA grid dimensions

Message ID 20160202175616.GO32511@virgil.suse.cz
State New
Headers show

Commit Message

Martin Jambor Feb. 2, 2016, 5:56 p.m. UTC
Hi,

with HSA merged, the hsa branch can be used for development of new
features again.  Thus, I have committed there a patch which I finished
after the merge proposal and thus I kept in a private branch so far,
which allows collapse(2) and collapse(3) clauses to be gridified and
the individual loops to be directly mapped to HSA grid dimensions.

In order to achieve, that I needed to introduce hsa-specific builtins
which expand to HSAIL instructions giving information about specific
HSA grid dimensions.  I hope I have done that right, any comments are
welcome.

Other than that, the changes are small because as I was restructuring
the code, I was moving it in this direction for some time already.
Committed to the branch (a few days ago actually, sorry for that).

Thanks,

Martin


2016-01-26  Martin Jambor  <mjambor@suse.cz>

gcc/
        * Makefile.in (BUILTINS_DEF): Add hsa-builtins.def.
        * builtins.def: Include hsa-builtins.def.
        (DEF_HSA_BUILTIN): Define.
        * hsa-builtins.def: New file.
        * hsa-gen.c (query_hsa_grid): Accept dimension as an hsa_op_immed.
        Add a new override.
        (gen_hsa_insns_for_call): Handle BUILT_IN_HSA_GET_WORKITEM_ABSID.
        * omp-low.c (grid_get_kernel_launch_attributes): Support up to
        three dimensions.
        (grid_expand_omp_for_loop): Likewise.
        (lower_omp_for_lastprivate): Do not extract looptemps from grid loops.
        (grid_target_follows_gridifiable_pattern): Allow collapse up to 3.
        * tree-inline.h (copy_body_data): New field
        decl_creation_prevention_level.  Moved remap_var_for_cilk to minimize
        padding.

gcc/fortran/
        * f95-lang.c: Include hsa-builtins.def.
        (DEF_HSA_BUILTIN): Define.

libgomp/
        * plugin/plugin-hsa.c (parse_target_attributes): Support up to three
        dimensions.
        (get_group_size): New function.
        (GOMP_OFFLOAD_run): Support up to three dimensions.
diff mbox

Patch

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index ab9cbbf..a996708 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -899,7 +899,8 @@  RTL_H = $(RTL_BASE_H) $(FLAGS_H) genrtl.h
 READ_MD_H = $(OBSTACK_H) $(HASHTAB_H) read-md.h
 PARAMS_H = params.h params-enum.h params.def
 BUILTINS_DEF = builtins.def sync-builtins.def omp-builtins.def \
-	gtm-builtins.def sanitizer.def cilkplus.def cilk-builtins.def
+	gtm-builtins.def sanitizer.def cilkplus.def cilk-builtins.def \
+	hsa-builtins.def
 INTERNAL_FN_DEF = internal-fn.def
 INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF)
 TREE_CORE_H = tree-core.h coretypes.h all-tree.def tree.def \
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 2fc7f65..14d2335 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -188,6 +188,16 @@  along with GCC; see the file COPYING3.  If not see
 		|| flag_cilkplus \
 		|| flag_offload_abi != OFFLOAD_ABI_UNSET))
 
+#undef DEF_HSA_BUILTIN
+#ifdef ENABLE_HSA
+#define DEF_HSA_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
+  DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
+               false, false, true, ATTRS, false, \
+	       (!flag_disable_hsa))
+#else
+#define DEF_HSA_BUILTIN(ENUM, NAME, TYPE, ATTRS)
+#endif
+
 /* Builtin used by implementation of Cilk Plus.  Most of these are decomposed
    by the compiler but a few are implemented in libcilkrts.  */ 
 #undef DEF_CILK_BUILTIN_STUB
@@ -932,6 +942,9 @@  DEF_GCC_BUILTIN (BUILT_IN_LINE, "LINE", BT_FN_INT, ATTR_NOTHROW_LEAF_LIST)
 /* Offloading and Multi Processing builtins.  */
 #include "omp-builtins.def"
 
+/* Heterogeneous Systems Architecture.  */
+#include "hsa-builtins.def"
+
 /* Cilk keywords builtins.  */
 #include "cilk-builtins.def"
 
diff --git a/gcc/fortran/f95-lang.c b/gcc/fortran/f95-lang.c
index 9c3a311..efa750de 100644
--- a/gcc/fortran/f95-lang.c
+++ b/gcc/fortran/f95-lang.c
@@ -1234,6 +1234,17 @@  gfc_init_builtin_functions (void)
 #undef DEF_GOMP_BUILTIN
     }
 
+#ifdef ENABLE_HSA
+  if (!flag_disable_hsa)
+    {
+#undef DEF_HSA_BUILTIN
+#define DEF_HSA_BUILTIN(code, name, type, attr) \
+      gfc_define_builtin ("__builtin_" name, builtin_types[type], \
+			  code, name, attr);
+#include "../hsa-builtins.def"
+    }
+#endif
+
   gfc_define_builtin ("__builtin_trap", builtin_types[BT_FN_VOID],
 		      BUILT_IN_TRAP, NULL, ATTR_NOTHROW_LEAF_LIST);
   TREE_THIS_VOLATILE (builtin_decl_explicit (BUILT_IN_TRAP)) = 1;
diff --git a/gcc/hsa-builtins.def b/gcc/hsa-builtins.def
new file mode 100644
index 0000000..e4681c1
--- /dev/null
+++ b/gcc/hsa-builtins.def
@@ -0,0 +1,31 @@ 
+/* This file contains the definitions and documentation for the
+   Offloading and Multi Processing builtins used in the GNU compiler.
+   Copyright (C) 2005-2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Before including this file, you should define a macro:
+
+     DEF_HSA_BUILTIN (ENUM, NAME, TYPE, ATTRS)
+
+   See builtins.def for details.  */
+
+/* The reason why they aren't in gcc/builtins.def is that the Fortran front end
+   doesn't source those.  */
+
+DEF_HSA_BUILTIN (BUILT_IN_HSA_GET_WORKITEM_ABSID, "hsa_get_workitem_absid",
+	  	 BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c
index ce3c1ea..966c4c8 100644
--- a/gcc/hsa-gen.c
+++ b/gcc/hsa-gen.c
@@ -3722,15 +3722,11 @@  hsa_insn_basic::set_output_in_type (hsa_op_reg *dest, unsigned op_index,
    HBB.  */
 
 static void
-query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode, int dimension,
+query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode,  hsa_op_immed *dimension,
 		hsa_bb *hbb)
 {
-  /* We're using just one-dimensional kernels, so hard-coded
-     dimension X.  */
-  hsa_op_immed *imm
-    = new hsa_op_immed (dimension, (BrigKind16_t) BRIG_TYPE_U32);
   hsa_insn_basic *insn = new hsa_insn_basic (2, opcode, BRIG_TYPE_U32, NULL,
-					     imm);
+					     dimension);
   hbb->append_insn (insn);
   insn->set_output_in_type (dest, 0, hbb);
 }
@@ -3739,7 +3735,7 @@  query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode, int dimension,
    Instructions are appended to basic block HBB.  */
 
 static void
-query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension,
+query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, hsa_op_immed *dimension,
 		hsa_bb *hbb)
 {
   tree lhs = gimple_call_lhs (dyn_cast <gcall *> (stmt));
@@ -3751,6 +3747,18 @@  query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension,
   query_hsa_grid (dest, opcode, dimension, hbb);
 }
 
+/* Generate a special HSA-related instruction for gimple STMT.
+   Instructions are appended to basic block HBB.  */
+
+static void
+query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension,
+		hsa_bb *hbb)
+{
+  hsa_op_immed *bdim = new hsa_op_immed (dimension,
+					 (BrigKind16_t) BRIG_TYPE_U32);
+  query_hsa_grid (stmt, opcode, bdim, hbb);
+}
+
 /* Emit instructions that set hsa_num_threads according to provided VALUE.
    Instructions are appended to basic block HBB.  */
 
@@ -5506,6 +5514,14 @@  gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb)
 
 	break;
       }
+    case BUILT_IN_HSA_GET_WORKITEM_ABSID:
+      {
+	hsa_op_immed *bdim = new hsa_op_immed (gimple_call_arg (stmt, 0), true);
+	if (bdim->m_type != BRIG_TYPE_U32)
+	  bdim->get_in_type (BRIG_TYPE_U32, hbb);
+	query_hsa_grid (stmt, BRIG_OPCODE_WORKITEMABSID, bdim, hbb);
+	break;
+      }
     case BUILT_IN_OMP_GET_THREAD_NUM:
       {
 	query_hsa_grid (stmt, BRIG_OPCODE_WORKITEMABSID, 0, hbb);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index e0ac1d5..8379d3e 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -12733,7 +12733,6 @@  grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi,
 				   gomp_target *tgt_stmt)
 {
   grid_create_kernel_launch_attr_types ();
-  tree u32_one = build_one_cst (uint32_type_node);
   tree lattrs = create_tmp_var (grid_attr_trees->kernel_launch_attributes_type,
 				"__kernel_launch_attrs");
 
@@ -12758,10 +12757,10 @@  grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi,
 
   tree dimref = build3 (COMPONENT_REF, uint32_type_node, lattrs,
 			grid_attr_trees->kernel_lattrs_dimnum_decl, NULL_TREE);
-  /* At this moment we cannot gridify a loop with a collapse clause.  */
-  /* TODO: Adjust when we support bigger collapse.  */
-  gcc_assert (max_dim == 0);
-  gsi_insert_before (gsi, gimple_build_assign (dimref, u32_one), GSI_SAME_STMT);
+  gcc_checking_assert (max_dim <= 2);
+  tree dimensions = build_int_cstu (uint32_type_node, max_dim + 1);
+  gsi_insert_before (gsi, gimple_build_assign (dimref, dimensions),
+		     GSI_SAME_STMT);
   TREE_ADDRESSABLE (lattrs) = 1;
   return build_fold_addr_expr (lattrs);
 }
@@ -13409,53 +13408,59 @@  expand_omp_target (struct omp_region *region)
 static void
 grid_expand_omp_for_loop (struct omp_region *kfor)
 {
-  tree t, threadid;
-  tree type, itype;
   gimple_stmt_iterator gsi;
-  tree n1, step;
-  struct omp_for_data fd;
-
   gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry));
   gcc_checking_assert (gimple_omp_for_kind (for_stmt)
 		       == GF_OMP_FOR_KIND_GRID_LOOP);
+  size_t collapse = gimple_omp_for_collapse (for_stmt);
+  struct omp_for_data_loop *loops
+    = (struct omp_for_data_loop *)
+    alloca (gimple_omp_for_collapse (for_stmt)
+	    * sizeof (struct omp_for_data_loop));
+
+  struct omp_for_data fd;
+
   basic_block body_bb = FALLTHRU_EDGE (kfor->entry)->dest;
 
-  gcc_assert (gimple_omp_for_collapse (for_stmt) == 1);
   gcc_assert (kfor->cont);
-  extract_omp_for_data (for_stmt, &fd, NULL);
-
-  itype = type = TREE_TYPE (fd.loop.v);
-  if (POINTER_TYPE_P (type))
-    itype = signed_type_for (type);
+  extract_omp_for_data (for_stmt, &fd, loops);
 
   gsi = gsi_start_bb (body_bb);
 
-  n1 = fd.loop.n1;
-  step = fd.loop.step;
-  n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1),
-				 true, NULL_TREE, true, GSI_SAME_STMT);
-  step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
-				   true, NULL_TREE, true, GSI_SAME_STMT);
-  threadid = build_call_expr (builtin_decl_explicit
-			      (BUILT_IN_OMP_GET_THREAD_NUM), 0);
-  threadid = fold_convert (itype, threadid);
-  threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
-				       true, GSI_SAME_STMT);
+  for (size_t dim = 0; dim < collapse; dim++)
+    {
+      tree type, itype;
+      itype = type = TREE_TYPE (fd.loops[dim].v);
+      if (POINTER_TYPE_P (type))
+	itype = signed_type_for (type);
 
-  tree startvar = fd.loop.v;
-  t = fold_build2 (MULT_EXPR, itype, threadid, step);
-  if (POINTER_TYPE_P (type))
-    t = fold_build_pointer_plus (n1, t);
-  else
-    t = fold_build2 (PLUS_EXPR, type, t, n1);
-  t = fold_convert (type, t);
-  t = force_gimple_operand_gsi (&gsi, t,
-				DECL_P (startvar)
-				&& TREE_ADDRESSABLE (startvar),
-				NULL_TREE, true, GSI_SAME_STMT);
-  gassign *assign_stmt = gimple_build_assign (startvar, t);
-  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+      tree n1 = fd.loops[dim].n1;
+      tree step = fd.loops[dim].step;
+      n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1),
+				     true, NULL_TREE, true, GSI_SAME_STMT);
+      step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
+				       true, NULL_TREE, true, GSI_SAME_STMT);
+      tree threadid = build_call_expr (builtin_decl_explicit
+				       (BUILT_IN_HSA_GET_WORKITEM_ABSID), 1,
+				       build_int_cstu (unsigned_type_node, dim));
+      threadid = fold_convert (itype, threadid);
+      threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
+					   true, GSI_SAME_STMT);
 
+      tree startvar = fd.loops[dim].v;
+      tree t = fold_build2 (MULT_EXPR, itype, threadid, step);
+      if (POINTER_TYPE_P (type))
+	t = fold_build_pointer_plus (n1, t);
+      else
+	t = fold_build2 (PLUS_EXPR, type, t, n1);
+      t = fold_convert (type, t);
+      t = force_gimple_operand_gsi (&gsi, t,
+				    DECL_P (startvar)
+				    && TREE_ADDRESSABLE (startvar),
+				    NULL_TREE, true, GSI_SAME_STMT);
+      gassign *assign_stmt = gimple_build_assign (startvar, t);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+    }
   /* Remove the omp for statement */
   gsi = gsi_last_bb (kfor->entry);
   gsi_remove (&gsi, true);
@@ -14837,7 +14842,8 @@  lower_omp_for_lastprivate (struct omp_for_data *fd, gimple_seq *body_p,
   tree n2 = fd->loop.n2;
   if (fd->collapse > 1
       && TREE_CODE (n2) != INTEGER_CST
-      && gimple_omp_for_combined_into_p (fd->for_stmt))
+      && gimple_omp_for_combined_into_p (fd->for_stmt)
+      && gimple_omp_for_kind (fd->for_stmt) != GF_OMP_FOR_KIND_GRID_LOOP)
     {
       struct omp_context *taskreg_ctx = NULL;
       if (gimple_code (ctx->outer->stmt) == GIMPLE_OMP_FOR)
@@ -17324,13 +17330,13 @@  grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p
 			 "distribute construct\n ");
       return false;
     }
-  if (dist->collapse > 1)
+  if (dist->collapse > 3)
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, tloc,
 			 "Will not turn target construct into a gridified GPGPU "
 			 "kernel because the distribute construct contains "
-			 "collapse clause\n");
+			 "collapse clause with parameter greater than 3\n");
       return false;
     }
   struct omp_for_data fd;
@@ -17405,13 +17411,13 @@  grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p
 			 "loop\n");
       return false;
     }
-  if (gfor->collapse > 1)
+  if (gfor->collapse > 3)
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, tloc,
 			 "Will not turn target construct into a gridified GPGPU "
 			 "kernel because the inner loop contains collapse "
-			 "clause\n");
+			 "clause with parameter greater than 3\n");
       return false;
     }
 
diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c
index d888493..687a840 100644
--- a/libgomp/plugin/plugin-hsa.c
+++ b/libgomp/plugin/plugin-hsa.c
@@ -1148,18 +1148,43 @@  parse_target_attributes (void **input,
   struct GOMP_kernel_launch_attributes *kla;
   kla = (struct GOMP_kernel_launch_attributes *) *input;
   *result = kla;
-  if (kla->ndim != 1)
-    GOMP_PLUGIN_fatal ("HSA does not yet support number of dimensions "
-		       "different from one.");
-  if (kla->gdims[0] == 0)
-    return false;
-
-  HSA_DEBUG ("GOMP_OFFLOAD_run called with grid size %u and group size %u\n",
-	     kla->gdims[0], kla->wdims[0]);
+  if (kla->ndim == 0 || kla->ndim > 3)
+    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
 
+  HSA_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
+  unsigned i;
+  for (i = 0; i < kla->ndim; i++)
+    {
+      HSA_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
+		 kla->gdims[i], kla->wdims[i]);
+      if (kla->gdims[i] == 0)
+	return false;
+    }
   return true;
 }
 
+/* Return the group size given the requested GROUP size, GRID size and number
+   of grid dimensions NDIM.  */
+
+static uint32_t
+get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
+{
+  if (group == 0)
+    {
+      /* TODO: Provide a default via environment or device characteristics.  */
+      if (ndim == 1)
+	group = 64;
+      else if (ndim == 2)
+	group = 8;
+      else
+	group = 4;
+    }
+
+  if (group > grid)
+    group = grid;
+  return group;
+}
+
 /* Return true if the HSA runtime can run function FN_PTR.  */
 
 bool
@@ -1232,19 +1257,36 @@  GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void **args)
 	   + index % agent->command_q->size;
 
   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
-  packet->setup |= (uint16_t) 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  packet->setup
+    |= (uint16_t) kla->ndim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
   packet->grid_size_x = kla->gdims[0];
-  uint32_t wgs = kla->wdims[0];
-  if (wgs == 0)
-    /* TODO: Provide a default via environment.  */
-    wgs = 64;
-  else if (wgs > kla->gdims[0])
-    wgs = kla->gdims[0];
-  packet->workgroup_size_x = wgs;
-  packet->grid_size_y = 1;
-  packet->workgroup_size_y = 1;
-  packet->grid_size_z = 1;
-  packet->workgroup_size_z = 1;
+  packet->workgroup_size_x = get_group_size (kla->ndim, kla->gdims[0],
+					     kla->wdims[0]);
+
+  if (kla->ndim >= 2)
+    {
+      packet->grid_size_y = kla->gdims[1];
+      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
+						 kla->wdims[1]);
+    }
+  else
+    {
+      packet->grid_size_y = 1;
+      packet->workgroup_size_y = 1;
+    }
+
+  if (kla->ndim == 3)
+    {
+      packet->grid_size_z = kla->gdims[2];
+      packet->workgroup_size_z = get_group_size (kla->ndim, kla->gdims[2],
+					     kla->wdims[2]);
+    }
+  else
+    {
+      packet->grid_size_z = 1;
+      packet->workgroup_size_z = 1;
+    }
+
   packet->private_segment_size = kernel->private_segment_size;
   packet->group_segment_size = kernel->group_segment_size;
   packet->kernel_object = kernel->object;