diff mbox

[RFC,gomp4] Offloading patches (2/3): Add tables generation

Message ID 20131217113957.GA39975@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Michael Zolotukhin Dec. 17, 2013, 11:39 a.m. UTC
Hi everybody,

Here is a patch 2/3: Add tables generation.

This patch is just a slightly modified patch sent a couple of weeks ago.  When
compiling with '-fopenmp' compiler generates a special symbol, containing
addresses and sizes of globals/omp_fn-functions, and places it into a special
section.  Later, at linking, these sections are merged together and we get a
single table with all addresses/sizes for entire binary.  Also, in this patch we
start to pass '__OPENMP_TARGET__' symbol to GOMP_target calls.

Thanks,
Michael


---
 gcc/omp-low.c |  119 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 gcc/omp-low.h |    1 +
 gcc/toplev.c  |    3 +
 3 files changed, 115 insertions(+), 8 deletions(-)

Comments

Michael Zolotukhin Jan. 16, 2014, 11:36 a.m. UTC | #1
Ping.

On 17 December 2013 15:39, Michael V. Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Hi everybody,
>
> Here is a patch 2/3: Add tables generation.
>
> This patch is just a slightly modified patch sent a couple of weeks ago.  When
> compiling with '-fopenmp' compiler generates a special symbol, containing
> addresses and sizes of globals/omp_fn-functions, and places it into a special
> section.  Later, at linking, these sections are merged together and we get a
> single table with all addresses/sizes for entire binary.  Also, in this patch we
> start to pass '__OPENMP_TARGET__' symbol to GOMP_target calls.
>
> Thanks,
> Michael
>
>
> ---
>  gcc/omp-low.c |  119 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
>  gcc/omp-low.h |    1 +
>  gcc/toplev.c  |    3 +
>  3 files changed, 115 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/omp-low.c b/gcc/omp-low.c
> index e0f7d1d..f860204 100644
> --- a/gcc/omp-low.c
> +++ b/gcc/omp-low.c
> @@ -58,6 +58,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "optabs.h"
>  #include "cfgloop.h"
>  #include "target.h"
> +#include "common/common-target.h"
>  #include "omp-low.h"
>  #include "gimple-low.h"
>  #include "tree-cfgcleanup.h"
> @@ -8371,19 +8372,22 @@ expand_omp_target (struct omp_region *region)
>      }
>
>    gimple g;
> -  /* FIXME: This will be address of
> -     extern char __OPENMP_TARGET__[] __attribute__((visibility ("hidden")))
> -     symbol, as soon as the linker plugin is able to create it for us.  */
> -  tree openmp_target = build_zero_cst (ptr_type_node);
> +  tree openmp_target
> +    = build_decl (UNKNOWN_LOCATION, VAR_DECL,
> +                 get_identifier ("__OPENMP_TARGET__"), ptr_type_node);
> +  TREE_PUBLIC (openmp_target) = 1;
> +  DECL_EXTERNAL (openmp_target) = 1;
>    if (kind == GF_OMP_TARGET_KIND_REGION)
>      {
>        tree fnaddr = build_fold_addr_expr (child_fn);
> -      g = gimple_build_call (builtin_decl_explicit (start_ix), 7,
> -                            device, fnaddr, openmp_target, t1, t2, t3, t4);
> +      g = gimple_build_call (builtin_decl_explicit (start_ix), 7, device,
> +                            fnaddr, build_fold_addr_expr (openmp_target),
> +                            t1, t2, t3, t4);
>      }
>    else
> -    g = gimple_build_call (builtin_decl_explicit (start_ix), 6,
> -                          device, openmp_target, t1, t2, t3, t4);
> +    g = gimple_build_call (builtin_decl_explicit (start_ix), 6, device,
> +                          build_fold_addr_expr (openmp_target),
> +                          t1, t2, t3, t4);
>    gimple_set_location (g, gimple_location (entry_stmt));
>    gsi_insert_before (&gsi, g, GSI_SAME_STMT);
>    if (kind != GF_OMP_TARGET_KIND_REGION)
> @@ -12379,4 +12383,103 @@ make_pass_omp_simd_clone (gcc::context *ctxt)
>    return new pass_omp_simd_clone (ctxt);
>  }
>
> +/* Helper function for omp_finish_file routine.
> +   Takes decls from V_DECLS and adds their addresses and sizes to
> +   constructor-vector V_CTOR.  It will be later used as DECL_INIT for decl
> +   representing a global symbol for OpenMP descriptor.
> +   If IS_FUNCTION is true, we use 1 for size.  */
> +static void
> +add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
> +                                        vec<constructor_elt, va_gc> *v_ctor,
> +                                        bool is_function)
> +{
> +  unsigned int len = 0, i;
> +  tree size, it;
> +  len = vec_safe_length (v_decls);
> +  for (i = 0; i < len; i++)
> +    {
> +      /* Decls are placed in reversed order in fat-objects, so we need to
> +        revert them back if we compile target.  */
> +      if (!flag_openmp_target)
> +       it = (*v_decls)[i];
> +      else
> +       it = (*v_decls)[len - i - 1];
> +      size = is_function ? integer_one_node : DECL_SIZE (it);
> +      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, build_fold_addr_expr (it));
> +      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE,
> +                             fold_convert (const_ptr_type_node,
> +                                           size));
> +    }
> +}
> +
> +/* Create new symbol containing (address, size) pairs for omp-marked
> +   functions and global variables.  */
> +void
> +omp_finish_file (void)
> +{
> +  struct cgraph_node *node;
> +  struct varpool_node *vnode;
> +  const char *section_name = ".offload_func_table_section";
> +  tree new_decl, new_decl_type;
> +  vec<constructor_elt, va_gc> *v;
> +  vec<tree, va_gc> *v_func, *v_var;
> +  tree ctor;
> +  int num = 0;
> +
> +  if (!targetm_common.have_named_sections)
> +    return;
> +
> +  vec_alloc (v_func, 0);
> +  vec_alloc (v_var, 0);
> +
> +  /* Collect all omp-target functions.  */
> +  FOR_EACH_DEFINED_FUNCTION (node)
> +    {
> +      /* TODO: This check could fail on functions, created by omp
> +        parallel/task pragmas.  It's better to name outlined for offloading
> +        functions in some different way and to check here the function name.
> +        It could be something like "*_omp_tgtfn" in contrast with "*_omp_fn"
> +        for functions from omp parallel/task pragmas.  */
> +      if (!lookup_attribute ("omp declare target",
> +                            DECL_ATTRIBUTES (node->decl))
> +         || !DECL_ARTIFICIAL (node->decl))
> +       continue;
> +      vec_safe_push (v_func, node->decl);
> +      num ++;
> +    }
> +  /* Collect all omp-target global variables.  */
> +  FOR_EACH_DEFINED_VARIABLE (vnode)
> +    {
> +      if (!lookup_attribute ("omp declare target",
> +                            DECL_ATTRIBUTES (vnode->decl))
> +         || TREE_CODE (vnode->decl) != VAR_DECL
> +         || DECL_SIZE (vnode->decl) == 0)
> +       continue;
> +
> +      vec_safe_push (v_var, vnode->decl);
> +      num ++;
> +    }
> +
> +  if (num == 0)
> +    return;
> +
> +  vec_alloc (v, num * 2);
> +
> +  add_decls_addresses_to_decl_constructor (v_func, v, true);
> +  add_decls_addresses_to_decl_constructor (v_var, v, false);
> +
> +  new_decl_type = build_array_type_nelts (pointer_sized_int_node, num * 2);
> +  ctor = build_constructor (new_decl_type, v);
> +  TREE_CONSTANT (ctor) = 1;
> +  TREE_STATIC (ctor) = 1;
> +  new_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
> +                        get_identifier (".omp_table"), new_decl_type);
> +  TREE_STATIC (new_decl) = 1;
> +  DECL_INITIAL (new_decl) = ctor;
> +  DECL_SECTION_NAME (new_decl) = build_string (strlen (section_name),
> +                                              section_name);
> +
> +  varpool_assemble_decl (varpool_node_for_decl (new_decl));
> +}
> +
>  #include "gt-omp-low.h"
> diff --git a/gcc/omp-low.h b/gcc/omp-low.h
> index 6b5a2ff..813189d 100644
> --- a/gcc/omp-low.h
> +++ b/gcc/omp-low.h
> @@ -27,5 +27,6 @@ extern void omp_expand_local (basic_block);
>  extern void free_omp_regions (void);
>  extern tree omp_reduction_init (tree, tree);
>  extern bool make_gimple_omp_edges (basic_block, struct omp_region **);
> +extern void omp_finish_file (void);
>
>  #endif /* GCC_OMP_LOW_H */
> diff --git a/gcc/toplev.c b/gcc/toplev.c
> index 5fedcea..af010ff 100644
> --- a/gcc/toplev.c
> +++ b/gcc/toplev.c
> @@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "diagnostic-color.h"
>  #include "context.h"
>  #include "pass_manager.h"
> +#include "omp-low.h"
>
>  #if defined(DBX_DEBUGGING_INFO) || defined(XCOFF_DEBUGGING_INFO)
>  #include "dbxout.h"
> @@ -576,6 +577,8 @@ compile_file (void)
>        if (flag_sanitize & SANITIZE_THREAD)
>         tsan_finish_file ();
>
> +      omp_finish_file ();
> +
>        output_shared_constant_pool ();
>        output_object_blocks ();
>        finish_tm_clone_pairs ();
> --
> 1.7.1
>
Bernd Schmidt Jan. 28, 2014, 11:20 a.m. UTC | #2
On 12/17/2013 12:39 PM, Michael V. Zolotukhin wrote:
> Here is a patch 2/3: Add tables generation.
>
> This patch is just a slightly modified patch sent a couple of weeks ago.  When
> compiling with '-fopenmp' compiler generates a special symbol, containing
> addresses and sizes of globals/omp_fn-functions, and places it into a special
> section.  Later, at linking, these sections are merged together and we get a
> single table with all addresses/sizes for entire binary.  Also, in this patch we
> start to pass '__OPENMP_TARGET__' symbol to GOMP_target calls.

I also have a question about the code in this patch. I can see how the 
table is constructed - what's not clear to me is how it is going to be 
used? How do you map from a function or variable you want to look up to 
an index in this table?


Bernd
Ilya Verbin Jan. 28, 2014, 12:52 p.m. UTC | #3
Adding gcc-patches.


Hi Bernd,

The table is used in libgomp (see my patch [1]), as proposed by Jakub
(see [2]).  The idea is that the order of entries in the host and
target tables must be identical.  This allows to set up one-to-one
correspondence between host and target addresses.
In my patch libgomp calls device_init_func from the plugin, that loads
all libraries to the target, and returns the joint table with
addresses for all libraries.  But probably it's better to have a
function like device_load, that will load only one library image, an
return a table for this library.

[1] http://gcc.gnu.org/ml/gcc-patches/2013-12/msg01896.html
[2] http://gcc.gnu.org/ml/gcc-patches/2013-09/msg01392.html

  -- Ilya


2014/1/28 Bernd Schmidt <bernds@codesourcery.com>
>
> On 12/17/2013 12:39 PM, Michael V. Zolotukhin wrote:
>>
>> Here is a patch 2/3: Add tables generation.
>>
>> This patch is just a slightly modified patch sent a couple of weeks ago.  When
>> compiling with '-fopenmp' compiler generates a special symbol, containing
>> addresses and sizes of globals/omp_fn-functions, and places it into a special
>> section.  Later, at linking, these sections are merged together and we get a
>> single table with all addresses/sizes for entire binary.  Also, in this patch we
>> start to pass '__OPENMP_TARGET__' symbol to GOMP_target calls.
>
>
> I also have a question about the code in this patch. I can see how the table is constructed - what's not clear to me is how it is going to be used? How do you map from a function or variable you want to look up to an index in this table?
>
>
> Bernd
>
Bernd Schmidt Jan. 29, 2014, 2:43 p.m. UTC | #4
On 01/28/2014 01:52 PM, Ilya Verbin wrote:
> The table is used in libgomp (see my patch [1]), as proposed by Jakub
> (see [2]).  The idea is that the order of entries in the host and
> target tables must be identical.  This allows to set up one-to-one
> correspondence between host and target addresses.

I was worried the answer was going to be something like this. These are 
produced by two different compilers/linkers, aren't they? This seems 
really fragile to me and a recipe for hard-to-debug silent failures.

For ptx, I think we can create both tables at the same time in the 
image-generating tool, so I'm hoping that it won't be a problem. I'll 
let you know what I find, but it would still be nice to come up with 
something more robust.

> Could you please describe how functions would be invoked on
> PTX?

First we let the driver compile and load all the ptx code, then we'd 
call a cuda library function with the name of the function passed as a 
string. So I guess the host table would contain pointers to the host 
version of the functions, and the target table would contain a pointer 
to the name.


Bernd
Ilya Verbin Jan. 29, 2014, 4:06 p.m. UTC | #5
2014/1/29 Bernd Schmidt <bernds@codesourcery.com>:
> I was worried the answer was going to be something like this. These are
> produced by two different compilers/linkers, aren't they? This seems really
> fragile to me and a recipe for hard-to-debug silent failures.

Yes.

> First we let the driver compile and load all the ptx code, then we'd call a
> cuda library function with the name of the function passed as a string. So I
> guess the host table would contain pointers to the host version of the
> functions, and the target table would contain a pointer to the name.

We also wanted to use function names at the beginning of our work [1].
Then Jakub noticed that the names aren't necessarily unique: [2], [3], [4], etc.
This led to the current approach of host/target address tables.

>
> Bernd
>

[1] http://gcc.gnu.org/ml/gcc/2013-08/msg00251.html
[2] http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00250.html
[3] http://gcc.gnu.org/ml/gcc/2013-09/msg00146.html
[4] http://gcc.gnu.org/ml/gcc/2013-10/msg00184.html

  -- Ilya
Ilya Verbin Feb. 14, 2014, 2:49 p.m. UTC | #6
Hi Bernd and Thomas,

Are you planning to support offloading from DSO in PTX/CUDA
environment?  If yes, how are you going to solve the problem of the
collision of function names from different DSOs?

However, if we decide to use element-wise host-target address mapping,
there are opportunities to make this approach more robust.  E.g. we
can store some hash(name) in the compiler-generated tables along with
the address and size.  When libgomp will perform device
initialization, it will compare hashes from the host and target DSOs.
This should reveal possible errors during the initialization, and will
avoid hard-to-debug silent failures.

  -- Ilya
diff mbox

Patch

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index e0f7d1d..f860204 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -58,6 +58,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "optabs.h"
 #include "cfgloop.h"
 #include "target.h"
+#include "common/common-target.h"
 #include "omp-low.h"
 #include "gimple-low.h"
 #include "tree-cfgcleanup.h"
@@ -8371,19 +8372,22 @@  expand_omp_target (struct omp_region *region)
     }
 
   gimple g;
-  /* FIXME: This will be address of
-     extern char __OPENMP_TARGET__[] __attribute__((visibility ("hidden")))
-     symbol, as soon as the linker plugin is able to create it for us.  */
-  tree openmp_target = build_zero_cst (ptr_type_node);
+  tree openmp_target
+    = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+		  get_identifier ("__OPENMP_TARGET__"), ptr_type_node);
+  TREE_PUBLIC (openmp_target) = 1;
+  DECL_EXTERNAL (openmp_target) = 1;
   if (kind == GF_OMP_TARGET_KIND_REGION)
     {
       tree fnaddr = build_fold_addr_expr (child_fn);
-      g = gimple_build_call (builtin_decl_explicit (start_ix), 7,
-			     device, fnaddr, openmp_target, t1, t2, t3, t4);
+      g = gimple_build_call (builtin_decl_explicit (start_ix), 7, device,
+			     fnaddr, build_fold_addr_expr (openmp_target),
+			     t1, t2, t3, t4);
     }
   else
-    g = gimple_build_call (builtin_decl_explicit (start_ix), 6,
-			   device, openmp_target, t1, t2, t3, t4);
+    g = gimple_build_call (builtin_decl_explicit (start_ix), 6, device,
+			   build_fold_addr_expr (openmp_target),
+			   t1, t2, t3, t4);
   gimple_set_location (g, gimple_location (entry_stmt));
   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
   if (kind != GF_OMP_TARGET_KIND_REGION)
@@ -12379,4 +12383,103 @@  make_pass_omp_simd_clone (gcc::context *ctxt)
   return new pass_omp_simd_clone (ctxt);
 }
 
+/* Helper function for omp_finish_file routine.
+   Takes decls from V_DECLS and adds their addresses and sizes to
+   constructor-vector V_CTOR.  It will be later used as DECL_INIT for decl
+   representing a global symbol for OpenMP descriptor.
+   If IS_FUNCTION is true, we use 1 for size.  */
+static void
+add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
+					 vec<constructor_elt, va_gc> *v_ctor,
+					 bool is_function)
+{
+  unsigned int len = 0, i;
+  tree size, it;
+  len = vec_safe_length (v_decls);
+  for (i = 0; i < len; i++)
+    {
+      /* Decls are placed in reversed order in fat-objects, so we need to
+	 revert them back if we compile target.  */
+      if (!flag_openmp_target)
+	it = (*v_decls)[i];
+      else
+	it = (*v_decls)[len - i - 1];
+      size = is_function ? integer_one_node : DECL_SIZE (it);
+      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, build_fold_addr_expr (it));
+      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE,
+			      fold_convert (const_ptr_type_node,
+					    size));
+    }
+}
+
+/* Create new symbol containing (address, size) pairs for omp-marked
+   functions and global variables.  */
+void
+omp_finish_file (void)
+{
+  struct cgraph_node *node;
+  struct varpool_node *vnode;
+  const char *section_name = ".offload_func_table_section";
+  tree new_decl, new_decl_type;
+  vec<constructor_elt, va_gc> *v;
+  vec<tree, va_gc> *v_func, *v_var;
+  tree ctor;
+  int num = 0;
+
+  if (!targetm_common.have_named_sections)
+    return;
+
+  vec_alloc (v_func, 0);
+  vec_alloc (v_var, 0);
+
+  /* Collect all omp-target functions.  */
+  FOR_EACH_DEFINED_FUNCTION (node)
+    {
+      /* TODO: This check could fail on functions, created by omp
+	 parallel/task pragmas.  It's better to name outlined for offloading
+	 functions in some different way and to check here the function name.
+	 It could be something like "*_omp_tgtfn" in contrast with "*_omp_fn"
+	 for functions from omp parallel/task pragmas.  */
+      if (!lookup_attribute ("omp declare target",
+			     DECL_ATTRIBUTES (node->decl))
+	  || !DECL_ARTIFICIAL (node->decl))
+	continue;
+      vec_safe_push (v_func, node->decl);
+      num ++;
+    }
+  /* Collect all omp-target global variables.  */
+  FOR_EACH_DEFINED_VARIABLE (vnode)
+    {
+      if (!lookup_attribute ("omp declare target",
+			     DECL_ATTRIBUTES (vnode->decl))
+	  || TREE_CODE (vnode->decl) != VAR_DECL
+	  || DECL_SIZE (vnode->decl) == 0)
+	continue;
+
+      vec_safe_push (v_var, vnode->decl);
+      num ++;
+    }
+
+  if (num == 0)
+    return;
+
+  vec_alloc (v, num * 2);
+
+  add_decls_addresses_to_decl_constructor (v_func, v, true);
+  add_decls_addresses_to_decl_constructor (v_var, v, false);
+
+  new_decl_type = build_array_type_nelts (pointer_sized_int_node, num * 2);
+  ctor = build_constructor (new_decl_type, v);
+  TREE_CONSTANT (ctor) = 1;
+  TREE_STATIC (ctor) = 1;
+  new_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+			 get_identifier (".omp_table"), new_decl_type);
+  TREE_STATIC (new_decl) = 1;
+  DECL_INITIAL (new_decl) = ctor;
+  DECL_SECTION_NAME (new_decl) = build_string (strlen (section_name),
+					       section_name);
+
+  varpool_assemble_decl (varpool_node_for_decl (new_decl));
+}
+
 #include "gt-omp-low.h"
diff --git a/gcc/omp-low.h b/gcc/omp-low.h
index 6b5a2ff..813189d 100644
--- a/gcc/omp-low.h
+++ b/gcc/omp-low.h
@@ -27,5 +27,6 @@  extern void omp_expand_local (basic_block);
 extern void free_omp_regions (void);
 extern tree omp_reduction_init (tree, tree);
 extern bool make_gimple_omp_edges (basic_block, struct omp_region **);
+extern void omp_finish_file (void);
 
 #endif /* GCC_OMP_LOW_H */
diff --git a/gcc/toplev.c b/gcc/toplev.c
index 5fedcea..af010ff 100644
--- a/gcc/toplev.c
+++ b/gcc/toplev.c
@@ -78,6 +78,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "diagnostic-color.h"
 #include "context.h"
 #include "pass_manager.h"
+#include "omp-low.h"
 
 #if defined(DBX_DEBUGGING_INFO) || defined(XCOFF_DEBUGGING_INFO)
 #include "dbxout.h"
@@ -576,6 +577,8 @@  compile_file (void)
       if (flag_sanitize & SANITIZE_THREAD)
 	tsan_finish_file ();
 
+      omp_finish_file ();
+
       output_shared_constant_pool ();
       output_object_blocks ();
       finish_tm_clone_pairs ();