diff mbox

Ping Re: [gomp4] Dumping gimple for offload.

Message ID 20131119095829.GA19301@msticlxl7.ims.intel.com
State New
Headers show

Commit Message

Ilya Tocar Nov. 19, 2013, 9:58 a.m. UTC
On 14 Nov 11:27, Richard Biener wrote:
> > +  /* Set when symbol needs to be dumped for lto/offloading.  */
> > +  unsigned need_dump : 1;
> > +
> 
> That's very non-descriptive.  What's "offloading"?  But yes, something
> like this is what I was asking for.

I've changed it into:
Set when symbol needs to be dumped into LTO bytecode for LTO,
or in pragma omp target case, for separate compilation targeting
a different architecture.

Ok for gomp4 branch now?

2013-11-19 Ilya Tocar  <ilya.tocar@intel.com> 

	* cgraph.h (symtab_node): Add need_dump.
	* cgraphunit.c (ipa_passes): Run ipa_write_summaries for omp.
	(compile): Intialize streamer for omp. 
	* ipa-inline-analysis.c (inline_generate_summary): Add flag_openmp.
	* lto-cgraph.c (lto_set_symtab_encoder_in_partition): Respect
	need_dump flag.
	(select_what_to_dump): New.
	* lto-streamer.c (section_name_prefix): New.
	(lto_get_section_name): Use section_name_prefix.
	(lto_streamer_init): Add flag_openmp.
	* lto-streamer.h (OMP_SECTION_NAME_PREFIX): New.
	(section_name_prefix): Ditto.
	(select_what_to_dump): Ditto.
	* lto/lto-partition.c (add_symbol_to_partition_1): Set need_dump.
	(lto_promote_cross_file_statics): Dump everyhtinh.
	* passes.c (ipa_write_summaries): Add parameter,
	call select_what_to_dump.
	* tree-pass.h (void ipa_write_summaries): Add parameter.


---
 gcc/cgraph.h              |  5 +++++
 gcc/cgraphunit.c          | 15 +++++++++++++--
 gcc/ipa-inline-analysis.c |  2 +-
 gcc/lto-cgraph.c          | 14 ++++++++++++++
 gcc/lto-streamer.c        |  5 +++--
 gcc/lto-streamer.h        |  6 ++++++
 gcc/lto/lto-partition.c   |  3 +++
 gcc/passes.c              |  6 ++++--
 gcc/tree-pass.h           |  2 +-
 9 files changed, 50 insertions(+), 8 deletions(-)

Comments

Richard Biener Nov. 20, 2013, 9:34 a.m. UTC | #1
On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
> On 14 Nov 11:27, Richard Biener wrote:
>> > +  /* Set when symbol needs to be dumped for lto/offloading.  */
>> > +  unsigned need_dump : 1;
>> > +
>>
>> That's very non-descriptive.  What's "offloading"?  But yes, something
>> like this is what I was asking for.
>
> I've changed it into:
> Set when symbol needs to be dumped into LTO bytecode for LTO,
> or in pragma omp target case, for separate compilation targeting
> a different architecture.
>
> Ok for gomp4 branch now?

Works for me.  I'll let branch maintainers decide if it follows whatever
is done there (I haven't found time to follow stuff here).

Richard.

> 2013-11-19 Ilya Tocar  <ilya.tocar@intel.com>
>
>         * cgraph.h (symtab_node): Add need_dump.
>         * cgraphunit.c (ipa_passes): Run ipa_write_summaries for omp.
>         (compile): Intialize streamer for omp.
>         * ipa-inline-analysis.c (inline_generate_summary): Add flag_openmp.
>         * lto-cgraph.c (lto_set_symtab_encoder_in_partition): Respect
>         need_dump flag.
>         (select_what_to_dump): New.
>         * lto-streamer.c (section_name_prefix): New.
>         (lto_get_section_name): Use section_name_prefix.
>         (lto_streamer_init): Add flag_openmp.
>         * lto-streamer.h (OMP_SECTION_NAME_PREFIX): New.
>         (section_name_prefix): Ditto.
>         (select_what_to_dump): Ditto.
>         * lto/lto-partition.c (add_symbol_to_partition_1): Set need_dump.
>         (lto_promote_cross_file_statics): Dump everyhtinh.
>         * passes.c (ipa_write_summaries): Add parameter,
>         call select_what_to_dump.
>         * tree-pass.h (void ipa_write_summaries): Add parameter.
>
>
> ---
>  gcc/cgraph.h              |  5 +++++
>  gcc/cgraphunit.c          | 15 +++++++++++++--
>  gcc/ipa-inline-analysis.c |  2 +-
>  gcc/lto-cgraph.c          | 14 ++++++++++++++
>  gcc/lto-streamer.c        |  5 +++--
>  gcc/lto-streamer.h        |  6 ++++++
>  gcc/lto/lto-partition.c   |  3 +++
>  gcc/passes.c              |  6 ++++--
>  gcc/tree-pass.h           |  2 +-
>  9 files changed, 50 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/cgraph.h b/gcc/cgraph.h
> index fb0fe93..9f799f4 100644
> --- a/gcc/cgraph.h
> +++ b/gcc/cgraph.h
> @@ -105,6 +105,11 @@ public:
>    /* Set when symbol has address taken. */
>    unsigned address_taken : 1;
>
> +  /* Set when symbol needs to be dumped into LTO bytecode for LTO,
> +     or in pragma omp target case, for separate compilation targeting
> +     a different architecture.  */
> +  unsigned need_dump : 1;
> +
>
>    /* Ordering of all symtab entries.  */
>    int order;
> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
> index c3a8967..53cd250 100644
> --- a/gcc/cgraphunit.c
> +++ b/gcc/cgraphunit.c
> @@ -2019,7 +2019,18 @@ ipa_passes (void)
>                               passes->all_lto_gen_passes);
>
>    if (!in_lto_p)
> -    ipa_write_summaries ();
> +    {
> +      if (flag_openmp)
> +       {
> +         section_name_prefix = OMP_SECTION_NAME_PREFIX;
> +         ipa_write_summaries (true);
> +       }
> +      if (flag_lto)
> +       {
> +         section_name_prefix = LTO_SECTION_NAME_PREFIX;
> +         ipa_write_summaries (false);
> +       }
> +    }
>
>    if (flag_generate_lto)
>      targetm.asm_out.lto_end ();
> @@ -2110,7 +2121,7 @@ compile (void)
>    cgraph_state = CGRAPH_STATE_IPA;
>
>    /* If LTO is enabled, initialize the streamer hooks needed by GIMPLE.  */
> -  if (flag_lto)
> +  if (flag_lto || flag_openmp)
>      lto_streamer_hooks_init ();
>
>    /* Don't run the IPA passes if there was any error or sorry messages.  */
> diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c
> index 4458723..62faa52 100644
> --- a/gcc/ipa-inline-analysis.c
> +++ b/gcc/ipa-inline-analysis.c
> @@ -3813,7 +3813,7 @@ inline_generate_summary (void)
>
>    /* When not optimizing, do not bother to analyze.  Inlining is still done
>       because edge redirection needs to happen there.  */
> -  if (!optimize && !flag_lto && !flag_wpa)
> +  if (!optimize && !flag_lto && !flag_wpa && !flag_openmp)
>      return;
>
>    function_insertion_hook_holder =
> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> index 6a52da8..697c069 100644
> --- a/gcc/lto-cgraph.c
> +++ b/gcc/lto-cgraph.c
> @@ -238,6 +238,9 @@ void
>  lto_set_symtab_encoder_in_partition (lto_symtab_encoder_t encoder,
>                                      symtab_node *node)
>  {
> +  /* Ignore not needed nodes.  */
> +  if (!node->need_dump)
> +    return;
>    int index = lto_symtab_encoder_encode (encoder, node);
>    encoder->nodes[index].in_partition = true;
>  }
> @@ -751,6 +754,17 @@ add_references (lto_symtab_encoder_t encoder,
>        lto_symtab_encoder_encode (encoder, ref->referred);
>  }
>
> +/* Select what needs to be dumped. In lto case dump everything.
> +   In omp target case only dump stuff makrked with attribute.  */
> +void
> +select_what_to_dump (bool is_omp)
> +{
> +  struct symtab_node *snode;
> +  FOR_EACH_SYMBOL(snode)
> +    snode->need_dump = !is_omp || lookup_attribute ("omp declare target",
> +                                                   DECL_ATTRIBUTES (snode->decl));
> +}
> +
>  /* Find all symbols we want to stream into given partition and insert them
>     to encoders.
>
> diff --git a/gcc/lto-streamer.c b/gcc/lto-streamer.c
> index 1540e4c..ffafb0e 100644
> --- a/gcc/lto-streamer.c
> +++ b/gcc/lto-streamer.c
> @@ -43,6 +43,7 @@ struct lto_stats_d lto_stats;
>  static bitmap_obstack lto_obstack;
>  static bool lto_obstack_initialized;
>
> +const char *section_name_prefix = LTO_SECTION_NAME_PREFIX;
>
>  /* Return a string representing LTO tag TAG.  */
>
> @@ -172,7 +173,7 @@ lto_get_section_name (int section_type, const char *name, struct lto_file_decl_d
>      sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, f->id);
>    else
>      sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, get_random_seed (false));
> -  return concat (LTO_SECTION_NAME_PREFIX, sep, add, post, NULL);
> +  return concat (section_name_prefix, sep, add, post, NULL);
>  }
>
>
> @@ -310,7 +311,7 @@ lto_streamer_init (void)
>  bool
>  gate_lto_out (void)
>  {
> -  return ((flag_generate_lto || in_lto_p)
> +  return ((flag_generate_lto || in_lto_p || flag_openmp)
>           /* Don't bother doing anything if the program has errors.  */
>           && !seen_error ());
>  }
> diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
> index 797e92e..f4c46db 100644
> --- a/gcc/lto-streamer.h
> +++ b/gcc/lto-streamer.h
> @@ -139,6 +139,11 @@ along with GCC; see the file COPYING3.  If not see
>     name for the functions and static_initializers.  For other types of
>     sections a '.' and the section type are appended.  */
>  #define LTO_SECTION_NAME_PREFIX         ".gnu.lto_"
> +#define OMP_SECTION_NAME_PREFIX         ".gnu.target_lto_"
> +
> +/* Can be either OMP_SECTION_NAME_PREFIX when we stream pragma omp target
> +   stuff, or LTO_SECTION_NAME_PREFIX for lto case.  */
> +extern const char  *section_name_prefix;
>
>  #define LTO_major_version 2
>  #define LTO_minor_version 2
> @@ -895,6 +900,7 @@ bool referenced_from_this_partition_p (struct ipa_ref_list *,
>  bool reachable_from_this_partition_p (struct cgraph_node *,
>                                       lto_symtab_encoder_t);
>  lto_symtab_encoder_t compute_ltrans_boundary (lto_symtab_encoder_t encoder);
> +void select_what_to_dump (bool);
>
>
>  /* In lto-symtab.c.  */
> diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
> index 6a3d881..2d2aa63 100644
> --- a/gcc/lto/lto-partition.c
> +++ b/gcc/lto/lto-partition.c
> @@ -190,6 +190,7 @@ add_symbol_to_partition_1 (ltrans_partition part, symtab_node *node)
>    gcc_assert (c != SYMBOL_EXTERNAL
>               && (c == SYMBOL_DUPLICATE || !symbol_partitioned_p (node)));
>
> +  node->need_dump = true;
>    lto_set_symtab_encoder_in_partition (part->encoder, node);
>
>    if (symbol_partitioned_p (node))
> @@ -917,6 +918,8 @@ lto_promote_cross_file_statics (void)
>
>    gcc_assert (flag_wpa);
>
> +  select_what_to_dump (false);
> +
>    /* First compute boundaries.  */
>    n_sets = ltrans_partitions.length ();
>    for (i = 0; i < n_sets; i++)
> diff --git a/gcc/passes.c b/gcc/passes.c
> index 19e5869..88b1538 100644
> --- a/gcc/passes.c
> +++ b/gcc/passes.c
> @@ -2335,7 +2335,7 @@ ipa_write_summaries_1 (lto_symtab_encoder_t encoder)
>  /* Write out summaries for all the nodes in the callgraph.  */
>
>  void
> -ipa_write_summaries (void)
> +ipa_write_summaries (bool is_omp)
>  {
>    lto_symtab_encoder_t encoder;
>    int i, order_pos;
> @@ -2343,9 +2343,11 @@ ipa_write_summaries (void)
>    struct cgraph_node *node;
>    struct cgraph_node **order;
>
> -  if (!flag_generate_lto || seen_error ())
> +  if (!(flag_generate_lto || flag_openmp) || seen_error () )
>      return;
>
> +  select_what_to_dump (is_omp);
> +
>    encoder = lto_symtab_encoder_new (false);
>
>    /* Create the callgraph set in the same order used in
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index fa403c7..8d51d80 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -595,7 +595,7 @@ extern void pass_fini_dump_file (struct opt_pass *);
>  extern const char *get_current_pass_name (void);
>  extern void print_current_pass (FILE *);
>  extern void debug_pass (void);
> -extern void ipa_write_summaries (void);
> +extern void ipa_write_summaries (bool is_omp);
>  extern void ipa_write_optimization_summaries (struct lto_symtab_encoder_d *);
>  extern void ipa_read_summaries (void);
>  extern void ipa_read_optimization_summaries (void);
> --
> 1.8.3.1
>
Jakub Jelinek Nov. 20, 2013, 9:36 a.m. UTC | #2
On Wed, Nov 20, 2013 at 10:34:30AM +0100, Richard Biener wrote:
> On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
> > On 14 Nov 11:27, Richard Biener wrote:
> >> > +  /* Set when symbol needs to be dumped for lto/offloading.  */
> >> > +  unsigned need_dump : 1;
> >> > +
> >>
> >> That's very non-descriptive.  What's "offloading"?  But yes, something
> >> like this is what I was asking for.
> >
> > I've changed it into:
> > Set when symbol needs to be dumped into LTO bytecode for LTO,
> > or in pragma omp target case, for separate compilation targeting
> > a different architecture.
> >
> > Ok for gomp4 branch now?
> 
> Works for me.  I'll let branch maintainers decide if it follows whatever
> is done there (I haven't found time to follow stuff here).

Ok then.

	Jakub
Thomas Schwinge Nov. 25, 2013, 4:13 p.m. UTC | #3
Hi!

Just some suggestion related to terminology.


On Tue, 19 Nov 2013 13:58:29 +0400, Ilya Tocar <tocarip.intel@gmail.com> wrote:
> On 14 Nov 11:27, Richard Biener wrote:
> > > +  /* Set when symbol needs to be dumped for lto/offloading.  */
> > > +  unsigned need_dump : 1;
> > > +
> > 
> > That's very non-descriptive.  What's "offloading"?  But yes, something
> > like this is what I was asking for.
> 
> I've changed it into:
> Set when symbol needs to be dumped into LTO bytecode for LTO,
> or in pragma omp target case, for separate compilation targeting
> a different architecture.

Can we in fact agree to use the term "offload" to mean exactly that?
We'll need this in other contexts, too, such as for configuring the
"secondary" lto1 (which is, in fact, the one that will be processing the
main GCC's "offloaded" code)?  I'm happy to go looking for a proper
section in GCC's (internal?) manual to document what "offloading" means
in GCC's context, and I'm likewise happy to hear if there's any better
term existing for describing basically the process of »separate
compilation targeting a different architecture«?  (I'm not a native
speaker.)  By the way, in this context, I like saing "offloading" better
than "acceleration", because while we strive for acceleration, offloading
is what we technically do.


> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
> index c3a8967..53cd250 100644
> --- a/gcc/cgraphunit.c
> +++ b/gcc/cgraphunit.c
> @@ -2019,7 +2019,18 @@ ipa_passes (void)
>  			      passes->all_lto_gen_passes);
>  
>    if (!in_lto_p)
> -    ipa_write_summaries ();
> +    {
> +      if (flag_openmp)

The following comment applies to several more instances in this patch:
after the front end's parsing stage, we should now basically everywhere
treat flag_openacc and flag_openmp the same.  The idea is that all the
existing OpenMP omp_* infrastructure in the middle end and following is
now applicable to not only OpenMP but also OpenACC and any other
"acceleration" mechanisms.  Again, is there a better term to use instead
of "acceleration" for describing the union of Cilk+, OpenACC, OpenMP, and
similar techniques?

Also, would it then make sense to define a flag à la:

    #define flag_acceleration (flag_openacc | flag_openmp)

..., and begin using that everywhere after the front ends where
flag_openmp is currently used?


> +/* Select what needs to be dumped. In lto case dump everything.
> +   In omp target case only dump stuff makrked with attribute.  */
> +void
> +select_what_to_dump (bool is_omp)

Likewise, while this obviously (and unsurprisingly) continues with the
existing convention, I'd suggest that in the future we name such things
more generically: is_offload or is_acceleration (or, again, any better
term that is generically applicable).

Or, going by what I've been told before by Jakub and Nathan in
<http://news.gmane.org/find-root.php?message_id=%3C523AC6FF.6030007%40acm.org%3E>:
»Names are sticky.«, should we instead continue to name all these new
things omp_*, too, and declare that the "omp" tag is an artifact of
history?  (But it certainly is an obstacle to anyone new to the code.  I
realize that people are a bit tired of refactoring, which recently has
been applied in other contexts, and has caused some "churn".  But still,
I'd personally go ahread and rename the current omp_* middle end bits to
any new tag that we agree on, just to make things more clear to everyone
new to the code.  But, I don't insist on that (by now I manage to
internally map omp_* to acceleration_* or similar), so you long-time
contributors of course get to have the finaly say about this.  I'm only
commenting from still a new contributor's point of view.)


> diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
> index 797e92e..f4c46db 100644
> --- a/gcc/lto-streamer.h
> +++ b/gcc/lto-streamer.h
> @@ -139,6 +139,11 @@ along with GCC; see the file COPYING3.  If not see
>     name for the functions and static_initializers.  For other types of
>     sections a '.' and the section type are appended.  */
>  #define LTO_SECTION_NAME_PREFIX         ".gnu.lto_"
> +#define OMP_SECTION_NAME_PREFIX         ".gnu.target_lto_"

Also here, the "target" tag is confusing in that "target" typically has a
different meaning in the compiler context -- while this one here is used
for implementing OpenMP's target construct, what it technically does
should be described by .gnu.offload_gimple_ or somthing similar.  (Note
that also the LTO term is no longer really applicable, as we're not
neccessarily doing link-time optimization here, but instead rather just
use the GIMPLE dumping ("offloading") infrastructure that happens to
already be implemented in GCC's LTO support.

Again, all this doesn't matter to anyone who's already versed with the
code, but it does matter to people who are new, and still have to learn
all these fine details.  (Of which there are many, many more.  Yes, I
know, life's tough, and I'm used to that, but still.)  ;-)


And I'm certainly not asking anyone to spend their own time with this
refactoring work, but I'd just like to inquire the general permission,
the option of allowing me (or anyone else, of course) to do this later
on.  Also, if you again say: »Names are sticky.«, then that's fine with
me, too.


Grüße,
 Thomas
Jakub Jelinek Nov. 25, 2013, 5:01 p.m. UTC | #4
On Mon, Nov 25, 2013 at 05:13:25PM +0100, Thomas Schwinge wrote:
> > --- a/gcc/cgraphunit.c
> > +++ b/gcc/cgraphunit.c
> > @@ -2019,7 +2019,18 @@ ipa_passes (void)
> >  			      passes->all_lto_gen_passes);
> >  
> >    if (!in_lto_p)
> > -    ipa_write_summaries ();
> > +    {
> > +      if (flag_openmp)
> 
> The following comment applies to several more instances in this patch:
> after the front end's parsing stage, we should now basically everywhere
> treat flag_openacc and flag_openmp the same.  The idea is that all the
> existing OpenMP omp_* infrastructure in the middle end and following is
> now applicable to not only OpenMP but also OpenACC and any other
> "acceleration" mechanisms.  Again, is there a better term to use instead
> of "acceleration" for describing the union of Cilk+, OpenACC, OpenMP, and
> similar techniques?

I don't think acceleration is a good term for everything say OpenMP does,
and calling OpenMP clauses acceleration clauses just because some other
standards decided to copy/modify a subset of the OpenMP syntax is weird.
> 
> Also, would it then make sense to define a flag à la:
> 
>     #define flag_acceleration (flag_openacc | flag_openmp)
> 
> ..., and begin using that everywhere after the front ends where
> flag_openmp is currently used?

We certainly can have some helper macros, but flag_openacc | flag_openmp
isn't the right definition of all of them, right now we have
flag_openmp, flag_enable_cilkplus (misnamed, should be really
flag_cilkplus), flag_openacc and flag_openmp_simd.  For some things
(e.g. related to offloading, you want flag_openacc | flag_openmp,
for others, e.g. SIMD, you want flag_openmp | flag_openmp_simd |
flag_cilkplus, for others some different combination.  So flag_acceleration
would be certainly misleading.

> Also here, the "target" tag is confusing in that "target" typically has a
> different meaning in the compiler context -- while this one here is used

I agree that target word there is weird.

> for implementing OpenMP's target construct, what it technically does
> should be described by .gnu.offload_gimple_ or somthing similar.  (Note
> that also the LTO term is no longer really applicable, as we're not
> neccessarily doing link-time optimization here, but instead rather just
> use the GIMPLE dumping ("offloading") infrastructure that happens to
> already be implemented in GCC's LTO support.

But LTO is right here IMHO, we are streaming the LTO bytecode there, not
GIMPLE.

	Jakub
Bernd Schmidt Nov. 29, 2013, 12:17 p.m. UTC | #5
On 11/20/2013 10:36 AM, Jakub Jelinek wrote:
> On Wed, Nov 20, 2013 at 10:34:30AM +0100, Richard Biener wrote:
>> On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
>>> On 14 Nov 11:27, Richard Biener wrote:
>>>>> +  /* Set when symbol needs to be dumped for lto/offloading.  */
>>>>> +  unsigned need_dump : 1;
>>>>> +
>>>>
>>>> That's very non-descriptive.  What's "offloading"?  But yes, something
>>>> like this is what I was asking for.
>>>
>>> I've changed it into:
>>> Set when symbol needs to be dumped into LTO bytecode for LTO,
>>> or in pragma omp target case, for separate compilation targeting
>>> a different architecture.
>>>
>>> Ok for gomp4 branch now?
>>
>> Works for me.  I'll let branch maintainers decide if it follows whatever
>> is done there (I haven't found time to follow stuff here).
> 
> Ok then.

We've been working on similar patches for our OpenACC project. The goal
is to have functions generated during omp-low that will ultimately
execute on a ptx target, write them out using LTO infrastructure and
read them back in using a nvptx-none lto1.

Unforunately, with multiple teams working in the same area there's
obviously going to be some measure of duplication. What I'd like to do
is to post a snapshot of what I currently have, to show the general
ideas and hopefully get some discussion of what the final picture should
look like. The next few mails in reply to this one will contain patches
that work towards the following general outline. I've been trying to
keep this flexible enough so that it won't be suitable just for the
OpenACC work but for whatever else people want to achieve in this area.

1. New configure options are added, --enable-accelerator and
--enable-as-accelerator-for. The names are certainly up for discussion.
These allow the compiler to know which target combinations are
available. The host compiler will be configured with
--enable-accelerator, and the offload/accelerator compiler is configured
with both options (mostly to ensure they both agree on the spelling of
the accelerator target name).
2. Using --enable-as-accelerator-for= changes the install paths, so that
the accelerator compilers end up in (for example)
   bin/x86_64-linux-gnu-accel-nvptx-gcc-4.9.0
   libexec/x86_64-linux-gnu/accel/nvptx/4.9.0/lto1
which should keep them separate in case a target can be used both as a
normal target and as an accelerator.
3. Some machinery is added to build the accelerator gcc directly in the
same tree as the host compiler, in a separate "accel-gcc" subdir. This
works for nvptx because that target doesn't even want to build a libgcc.
It may not be suitable for other accelerators if they want to build
target libraries, but otherwise I think it would be a nice convenience.
However, building separately should work fine as well as long as the
right options are used for configuring all the involved compilers.
4. We add a vector of target machines to the compiler. Normally this is
just initialized to the single machine for which the compiler is
configured, but when e.g. OpenACC with an accelerator is enabled, the
accelerator machine is added to that list. It should cope fine with
multiple different accelerator devices.
5. There's a new DECL_TARGET which refers to this list of target
machines. It's set when creating a child function from e.g. "#pragma acc
parallel"
6. ipa_write_summaries iterates over DECL_TARGET machines to write out
LTO for each of them. LTO sections for a different target get a separate
prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...".
7. lto_wrapper recognizes them and calls the various gcc drivers as
needed. This is where the series ends, and this step is still incomplete.

As mentioned, this patch series is still incomplete and has rough edges,
but I hope it will generate discussion. Further details that will need
to be addressed are (among others) option handling between compilers for
different targets, and slightly rewriting the incoming gimple to be
valid for the target (nvptx requires variables to go into various
different address spaces).

The patches I'll send assume that the present patch from this thread has
been reverted, but otherwise they should apply to current gomp-4_0-branch.

Thoughts, comments? Does anyone have a good name for these accelerator
targets or output targets, something that avoids the overloaded word
"target" (I was thinking "destination machine" maybe)?


Bernd
Richard Biener Nov. 29, 2013, 12:36 p.m. UTC | #6
On Fri, Nov 29, 2013 at 1:17 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 11/20/2013 10:36 AM, Jakub Jelinek wrote:
>> On Wed, Nov 20, 2013 at 10:34:30AM +0100, Richard Biener wrote:
>>> On Tue, Nov 19, 2013 at 10:58 AM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
>>>> On 14 Nov 11:27, Richard Biener wrote:
>>>>>> +  /* Set when symbol needs to be dumped for lto/offloading.  */
>>>>>> +  unsigned need_dump : 1;
>>>>>> +
>>>>>
>>>>> That's very non-descriptive.  What's "offloading"?  But yes, something
>>>>> like this is what I was asking for.
>>>>
>>>> I've changed it into:
>>>> Set when symbol needs to be dumped into LTO bytecode for LTO,
>>>> or in pragma omp target case, for separate compilation targeting
>>>> a different architecture.
>>>>
>>>> Ok for gomp4 branch now?
>>>
>>> Works for me.  I'll let branch maintainers decide if it follows whatever
>>> is done there (I haven't found time to follow stuff here).
>>
>> Ok then.
>
> We've been working on similar patches for our OpenACC project. The goal
> is to have functions generated during omp-low that will ultimately
> execute on a ptx target, write them out using LTO infrastructure and
> read them back in using a nvptx-none lto1.
>
> Unforunately, with multiple teams working in the same area there's
> obviously going to be some measure of duplication. What I'd like to do
> is to post a snapshot of what I currently have, to show the general
> ideas and hopefully get some discussion of what the final picture should
> look like. The next few mails in reply to this one will contain patches
> that work towards the following general outline. I've been trying to
> keep this flexible enough so that it won't be suitable just for the
> OpenACC work but for whatever else people want to achieve in this area.
>
> 1. New configure options are added, --enable-accelerator and
> --enable-as-accelerator-for. The names are certainly up for discussion.
> These allow the compiler to know which target combinations are
> available. The host compiler will be configured with
> --enable-accelerator, and the offload/accelerator compiler is configured
> with both options (mostly to ensure they both agree on the spelling of
> the accelerator target name).
> 2. Using --enable-as-accelerator-for= changes the install paths, so that
> the accelerator compilers end up in (for example)
>    bin/x86_64-linux-gnu-accel-nvptx-gcc-4.9.0
>    libexec/x86_64-linux-gnu/accel/nvptx/4.9.0/lto1
> which should keep them separate in case a target can be used both as a
> normal target and as an accelerator.
> 3. Some machinery is added to build the accelerator gcc directly in the
> same tree as the host compiler, in a separate "accel-gcc" subdir. This
> works for nvptx because that target doesn't even want to build a libgcc.
> It may not be suitable for other accelerators if they want to build
> target libraries, but otherwise I think it would be a nice convenience.
> However, building separately should work fine as well as long as the
> right options are used for configuring all the involved compilers.
> 4. We add a vector of target machines to the compiler. Normally this is
> just initialized to the single machine for which the compiler is
> configured, but when e.g. OpenACC with an accelerator is enabled, the
> accelerator machine is added to that list. It should cope fine with
> multiple different accelerator devices.
> 5. There's a new DECL_TARGET which refers to this list of target
> machines. It's set when creating a child function from e.g. "#pragma acc
> parallel"
> 6. ipa_write_summaries iterates over DECL_TARGET machines to write out
> LTO for each of them. LTO sections for a different target get a separate
> prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...".
> 7. lto_wrapper recognizes them and calls the various gcc drivers as
> needed. This is where the series ends, and this step is still incomplete.
>
> As mentioned, this patch series is still incomplete and has rough edges,
> but I hope it will generate discussion. Further details that will need
> to be addressed are (among others) option handling between compilers for
> different targets, and slightly rewriting the incoming gimple to be
> valid for the target (nvptx requires variables to go into various
> different address spaces).
>
> The patches I'll send assume that the present patch from this thread has
> been reverted, but otherwise they should apply to current gomp-4_0-branch.
>
> Thoughts, comments? Does anyone have a good name for these accelerator
> targets or output targets, something that avoids the overloaded word
> "target" (I was thinking "destination machine" maybe)?

Note that we (SUSE/AMD) sofar think we can go an easier route, not
adding a real backend that targets HSAIL/BRIG but instead use a
custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA
based register allocator).  Which if course simplifies driving this a bit
as we don't need to write/read any GIMPLE.

The idea is of course that the "highlevel" target languages, being it
HSAIL/BRIG or PTX run through another compiler + optimizer anyway,
so machine specific optimization is not necessary (fingers crossing...).

Not sure if anybody announced it yet (but gcc-cvs readers may have
noticed), there is a 'hsa' branch in svn covering work done sofar
(see gcc/README.hsa for how to use it).

Richard.

>
> Bernd
>
Bernd Schmidt Nov. 29, 2013, 12:50 p.m. UTC | #7
On 11/29/2013 01:36 PM, Richard Biener wrote:
> Note that we (SUSE/AMD) sofar think we can go an easier route, not
> adding a real backend that targets HSAIL/BRIG but instead use a
> custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA
> based register allocator).  Which if course simplifies driving this a bit
> as we don't need to write/read any GIMPLE.
> 
> The idea is of course that the "highlevel" target languages, being it
> HSAIL/BRIG or PTX run through another compiler + optimizer anyway,
> so machine specific optimization is not necessary (fingers crossing...).
> 
> Not sure if anybody announced it yet (but gcc-cvs readers may have
> noticed), there is a 'hsa' branch in svn covering work done sofar
> (see gcc/README.hsa for how to use it).

That's also an interesting idea. Did you resurrect the gimple-backend
branch that I think existed a while ago?

I'm not sure ptx is really high-level enough for that approach to work
well though. And gimple looks different for x86 and ptx due to the use
of address spaces, so I have doubts whether such an approach would be
suitable.


Bernd
Richard Biener Nov. 29, 2013, 12:59 p.m. UTC | #8
On Fri, Nov 29, 2013 at 1:50 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 11/29/2013 01:36 PM, Richard Biener wrote:
>> Note that we (SUSE/AMD) sofar think we can go an easier route, not
>> adding a real backend that targets HSAIL/BRIG but instead use a
>> custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA
>> based register allocator).  Which if course simplifies driving this a bit
>> as we don't need to write/read any GIMPLE.
>>
>> The idea is of course that the "highlevel" target languages, being it
>> HSAIL/BRIG or PTX run through another compiler + optimizer anyway,
>> so machine specific optimization is not necessary (fingers crossing...).
>>
>> Not sure if anybody announced it yet (but gcc-cvs readers may have
>> noticed), there is a 'hsa' branch in svn covering work done sofar
>> (see gcc/README.hsa for how to use it).
>
> That's also an interesting idea. Did you resurrect the gimple-backend
> branch that I think existed a while ago?

No - I didn't even know there was one ;)  I know of the gimple-frontend
branch.

> I'm not sure ptx is really high-level enough for that approach to work
> well though. And gimple looks different for x86 and ptx due to the use
> of address spaces, so I have doubts whether such an approach would be
> suitable.

We'll see - it's good to have both variants tried.  I doubt HSAIL is
more high-level than PTX though, but the unified address space
you have with the HSA framework certainly simplifies things
(not that there isn't talks about optional different memory spaces
in the HSA specs ...).

Richard.

>
> Bernd
>
Kirill Yukhin Nov. 29, 2013, 1:05 p.m. UTC | #9
Hello Bernd, 
On 29 Nov 13:17, Bernd Schmidt wrote:
> 5. There's a new DECL_TARGET which refers to this list of target
> machines. It's set when creating a child function from e.g. "#pragma acc
> parallel"
Actually, I do not understand, what term `target machine' means here. 
Are you talking about to target toolchain (target compiler, assembler, linker,
libraries etc)?

> 6. ipa_write_summaries iterates over DECL_TARGET machines to write out
> LTO for each of them. LTO sections for a different target get a separate
> prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...".
Why we want separate sections for different targets? As far as I understand
this is going to be generic Gimple, which should be identical to PTX, MIC etc.
We cannot use target built-ins inside such a common regions, right?

I also think it worst saying that currently we're working on passing of omp_target
sections to target compiler (we call it `streaming in') so we can produce target
objects from lto sections containing IR marked to be `target'. 
Multiple targets are handled by means of dedicated targets descriptor, containing vector of
target compilers which will be executed on given sections one-by-one producing set
of objects for every target.
This sections are not related on exact target, as I mentioned above.

We're also working on generation of dedicated tables which will be needed 
for host<->target address mapping (see Jakub's mails on the subject).

Hope to post initial versions nearest wws.

--
Thanks, K
Bernd Schmidt Nov. 29, 2013, 1:15 p.m. UTC | #10
On 11/29/2013 02:05 PM, Kirill Yukhin wrote:
> On 29 Nov 13:17, Bernd Schmidt wrote:
>> 5. There's a new DECL_TARGET which refers to this list of target
>> machines. It's set when creating a child function from e.g. "#pragma acc
>> parallel"
> Actually, I do not understand, what term `target machine' means here. 
> Are you talking about to target toolchain (target compiler, assembler, linker,
> libraries etc)?

The idea is that if an x86-linux toolchain is configured with
--enable-accelerator=nvptx, there would be two machines in the list -
the normal host, x86-linux, and nvptx. We can directly generate code for
the normal host, and everything else goes through lto writeout ->
lto_wrapper -> target gcc/lto1.

>> 6. ipa_write_summaries iterates over DECL_TARGET machines to write out
>> LTO for each of them. LTO sections for a different target get a separate
>> prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...".
> Why we want separate sections for different targets? As far as I understand
> this is going to be generic Gimple, which should be identical to PTX, MIC etc.
> We cannot use target built-ins inside such a common regions, right?

But they need to be read in by a different compiler and fed to the
appropriate nvptx-none lto1. I imagine the easiest way to do this is to
encode the name in the section and then call multiple different gcc
frontends from lto-wrapper, so that's what I've been working towards.

> I also think it worst saying that currently we're working on passing of omp_target
> sections to target compiler (we call it `streaming in') so we can produce target
> objects from lto sections containing IR marked to be `target'. 
> Multiple targets are handled by means of dedicated targets descriptor, containing vector of
> target compilers which will be executed on given sections one-by-one producing set
> of objects for every target.
> This sections are not related on exact target, as I mentioned above.

None of this has been posted yet, correct? If it has, can you point me
at the right place in the archive?


Bernd
Jakub Jelinek Nov. 29, 2013, 3:03 p.m. UTC | #11
On Fri, Nov 29, 2013 at 01:36:48PM +0100, Richard Biener wrote:
> > Thoughts, comments? Does anyone have a good name for these accelerator
> > targets or output targets, something that avoids the overloaded word
> > "target" (I was thinking "destination machine" maybe)?

I think offload is best word here.

> Note that we (SUSE/AMD) sofar think we can go an easier route, not
> adding a real backend that targets HSAIL/BRIG but instead use a
> custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA
> based register allocator).  Which if course simplifies driving this a bit
> as we don't need to write/read any GIMPLE.
> 
> The idea is of course that the "highlevel" target languages, being it
> HSAIL/BRIG or PTX run through another compiler + optimizer anyway,
> so machine specific optimization is not necessary (fingers crossing...).
> 
> Not sure if anybody announced it yet (but gcc-cvs readers may have
> noticed), there is a 'hsa' branch in svn covering work done sofar
> (see gcc/README.hsa for how to use it).

But you probably don't want to translate GIMPLE right out of IPA into
HSAIL/BRIG, do you?  And various further passes depend already (well, also
the early ones a little bit, but that is something to fix) heavily on
targetm.* and target macros, so do you plan to switch targetm to something
else and compile again a subset of functions for the HSAIL target?
Otherwise, how could you e.g. vectorize code (assuming HSAIL has vector
support)?

	Jakub
Jakub Jelinek Nov. 29, 2013, 3:16 p.m. UTC | #12
On Fri, Nov 29, 2013 at 01:17:56PM +0100, Bernd Schmidt wrote:
> We've been working on similar patches for our OpenACC project. The goal
> is to have functions generated during omp-low that will ultimately
> execute on a ptx target, write them out using LTO infrastructure and
> read them back in using a nvptx-none lto1.

Please see the past threads about this topic, e.g.
"Questions about LTO infrastructure and pragma omp target"
thread on gcc ml from August till now, also
"Offloading Support in libgomp"
and
"Target compilation for offloading"

It certainly doesn't make sense to invent different infrastructures for
OpenMP offloading and for OpenACC offloading, after all, the current
OpenACC code on gomp-4_0-branch I think meant to use the libgomp APIs.

> 4. We add a vector of target machines to the compiler. Normally this is
> just initialized to the single machine for which the compiler is
> configured, but when e.g. OpenACC with an accelerator is enabled, the
> accelerator machine is added to that list. It should cope fine with
> multiple different accelerator devices.

This was discussed that it would be nice to allow users during linking
(or compilation already?) to choose for which offloading targets code should
be compiled, and have a mechanism to use original non-target specific
options + have a way to override those for the offloading target.

> 5. There's a new DECL_TARGET which refers to this list of target
> machines. It's set when creating a child function from e.g. "#pragma acc
> parallel"
> 6. ipa_write_summaries iterates over DECL_TARGET machines to write out

Right now on gomp-4_0-branch a special attribute on the decls (VAR_DECL
as well as FUNCTION_DECL) is used for these, but if there are spare bits,
something else could be used instead.

> LTO for each of them. LTO sections for a different target get a separate
> prefix encoding the machine name, e.g. ".gnu.tlto_nvptx_...".

As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in
theory be target independent, so it is undesirable to emit it several times
for each offloading target.  Instead just stream once and let during linking
decide what to support.

	Jakub
Bernd Schmidt Nov. 29, 2013, 4:57 p.m. UTC | #13
On 11/29/2013 04:16 PM, Jakub Jelinek wrote:
> As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in
> theory be target independent, so it is undesirable to emit it several times
> for each offloading target.

That's not what happens. It's just partitioned into disjoint sets, one
for each target, which is then written out with the target name encoded
in the section name.


Bernd
Jakub Jelinek Nov. 29, 2013, 5:03 p.m. UTC | #14
On Fri, Nov 29, 2013 at 05:57:25PM +0100, Bernd Schmidt wrote:
> On 11/29/2013 04:16 PM, Jakub Jelinek wrote:
> > As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in
> > theory be target independent, so it is undesirable to emit it several times
> > for each offloading target.
> 
> That's not what happens. It's just partitioned into disjoint sets, one
> for each target, which is then written out with the target name encoded
> in the section name.

But why?  Does OpenACC have some way to say, this is to be offloaded for
offloading target XYZ?  In OpenMP 4.0, you have just host (initial device)
code and then some code that is either offloaded if possible, or not
(thus, either code/variables are emitted only for the primary target, no
offloading, or both for primary target and offloading).  But the tought was
to stream the generic GIMPLE at IPA time for that (target independent) and
only choose later on (during linking) for which offloading targets you
actually want to compile it (if any).

	Jakub
Bernd Schmidt Nov. 29, 2013, 5:07 p.m. UTC | #15
On 11/29/2013 06:03 PM, Jakub Jelinek wrote:
> On Fri, Nov 29, 2013 at 05:57:25PM +0100, Bernd Schmidt wrote:
>> On 11/29/2013 04:16 PM, Jakub Jelinek wrote:
>>> As you want to dump the GIMPLE IL right out of ~ IPA stage, it should in
>>> theory be target independent, so it is undesirable to emit it several times
>>> for each offloading target.
>>
>> That's not what happens. It's just partitioned into disjoint sets, one
>> for each target, which is then written out with the target name encoded
>> in the section name.
> 
> But why?  Does OpenACC have some way to say, this is to be offloaded for
> offloading target XYZ?  In OpenMP 4.0, you have just host (initial device)
> code and then some code that is either offloaded if possible, or not
> (thus, either code/variables are emitted only for the primary target, no
> offloading, or both for primary target and offloading).  But the tought was
> to stream the generic GIMPLE at IPA time for that (target independent) and
> only choose later on (during linking) for which offloading targets you
> actually want to compile it (if any).

By what mechanism do you choose? This is unclear to me from what I've
seen. Does this involve user action, and what's the advantage of doing
it this way?
The model I was imagining is that we choose the OpenACC target at
configure time and things happen automatically from there on.


Bernd
Jakub Jelinek Nov. 29, 2013, 5:12 p.m. UTC | #16
On Fri, Nov 29, 2013 at 06:07:38PM +0100, Bernd Schmidt wrote:
> By what mechanism do you choose? This is unclear to me from what I've
> seen. Does this involve user action, and what's the advantage of doing
> it this way?

See the 3 threads I've mentioned.  The compiler would know the list of
available offloading targets (after all, it needs to build libgomp plugins
for those targets), and that would be the default, and user could override
that through link time command line options (say, ok, while gcc has been
configured to support all of hsail-none, ptx-none and x86_64-k10m-linux
offloading targets, I only want to support here one of those, and
please use these additional options for compilation of that target).

	Jakub
Richard Biener Dec. 2, 2013, 12:20 p.m. UTC | #17
On Fri, Nov 29, 2013 at 4:03 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Nov 29, 2013 at 01:36:48PM +0100, Richard Biener wrote:
>> > Thoughts, comments? Does anyone have a good name for these accelerator
>> > targets or output targets, something that avoids the overloaded word
>> > "target" (I was thinking "destination machine" maybe)?
>
> I think offload is best word here.
>
>> Note that we (SUSE/AMD) sofar think we can go an easier route, not
>> adding a real backend that targets HSAIL/BRIG but instead use a
>> custom GIMPLE SSA -> HSAIL/BRIG translator (including a SSA
>> based register allocator).  Which if course simplifies driving this a bit
>> as we don't need to write/read any GIMPLE.
>>
>> The idea is of course that the "highlevel" target languages, being it
>> HSAIL/BRIG or PTX run through another compiler + optimizer anyway,
>> so machine specific optimization is not necessary (fingers crossing...).
>>
>> Not sure if anybody announced it yet (but gcc-cvs readers may have
>> noticed), there is a 'hsa' branch in svn covering work done sofar
>> (see gcc/README.hsa for how to use it).
>
> But you probably don't want to translate GIMPLE right out of IPA into
> HSAIL/BRIG, do you?

Actually we do currently.

> And various further passes depend already (well, also
> the early ones a little bit, but that is something to fix) heavily on
> targetm.* and target macros, so do you plan to switch targetm to something
> else and compile again a subset of functions for the HSAIL target?
> Otherwise, how could you e.g. vectorize code (assuming HSAIL has vector
> support)?

Yeah, we seem to run host specific passes before HSAIL/BRIG generation
so we have to address this somehow.

Richard.

>
>         Jakub
Bernd Schmidt Dec. 3, 2013, 1 p.m. UTC | #18
On 11/29/2013 06:12 PM, Jakub Jelinek wrote:
> On Fri, Nov 29, 2013 at 06:07:38PM +0100, Bernd Schmidt wrote:
>> By what mechanism do you choose? This is unclear to me from what I've
>> seen. Does this involve user action, and what's the advantage of doing
>> it this way?
> 
> See the 3 threads I've mentioned.  The compiler would know the list of
> available offloading targets (after all, it needs to build libgomp plugins
> for those targets), and that would be the default, and user could override
> that through link time command line options (say, ok, while gcc has been
> configured to support all of hsail-none, ptx-none and x86_64-k10m-linux
> offloading targets, I only want to support here one of those, and
> please use these additional options for compilation of that target).

Ok, IIUC the model is that we just compile all target code for all
targets (or a subset of them). Is that correct? In that case I can see
how the code that's on the branch now is sufficient; I'd assumed
something more fine-grained would be desirable.
It would be helpful to see the other pieces of this work if they already
exist.


Bernd
Michael Zolotukhin Dec. 3, 2013, 1:44 p.m. UTC | #19
Hi Bernd,

I am working on offloading support for OpenMP4, so I'll try to share my vision
of how everything works and answer your questions.

GCC compiles host version of the code (as usual) and dumps Gimple, as it does
for LTO, but for offloading.  Gimple IR is stored only for functions/variables
that have target versions - for OpenMP that means the functions/variables under
'omp declare target' pragma and outlined functions from 'omp target' regions.
Dumped gimple is then picked up by lto-plugin, which performs compilations for
all specified targets (specified either in configure or by compilation flags).
(Note that though we use lto-infrastructure, it possible that we don't do any
link-time optimizations.)

Compiled target images are then embedded into a data-sections of the host binary,
and are registered in a global for all targets descriptor.  This descriptor is
basically an array of pointers to beginning of the images and its sizes.

Along with pointers to images, the descriptor contains pointer to
functions/globals tables and some other data.  Function tables are used to find
the correspondence between host and target versions of functions, that could be
offloaded.

To compile code for target, lto-wrapper calls target compiler, giving it
fat-object with dumped gimple as an input.  This target compiler could actually
be anything that understands Gimple as its input and produces a target image.
(Patch for this part hasn't been submitted yet, but I hope to do it in a near
future).

Another big part of the entire picture is libgomp - a runtime for supporting all
this infrastructure.  To support different targets, we added plugin-support to
libgomp.  Thus, libgomp operates in 'general' terms and calls plugin-hooks, when
it needs to do anything target-specific (examples of such hooks are:
device_run_function, device_init_device, etc.).

So, for each target we need to have two important pieces:
1) compiler, that accepts gimple as its input and produces target image
2) plugin for libgomp, which will implement all device-specific part

That's just a general overview, so I could explain some parts in more details if
you need.  Not everything has already been implemented yet, here is the current
status (also, very general and maybe rough):
 - Plugins support in libgomp.  Done in gomp4-branch.
 - Streaming gimple IR into dedicated sections for later use by target
   compilers.  Done in gomp4-branch.
 - Invoking target compilers and embedding target images into host binary.  Work
   in progress, hope to send the patch soon.
 - Configure options, compile options for specifying offloading targets.  Work
   in progress.

Thanks,
Michael

On 03 Dec 14:00, Bernd Schmidt wrote:
> On 11/29/2013 06:12 PM, Jakub Jelinek wrote:
> > On Fri, Nov 29, 2013 at 06:07:38PM +0100, Bernd Schmidt wrote:
> >> By what mechanism do you choose? This is unclear to me from what I've
> >> seen. Does this involve user action, and what's the advantage of doing
> >> it this way?
> > 
> > See the 3 threads I've mentioned.  The compiler would know the list of
> > available offloading targets (after all, it needs to build libgomp plugins
> > for those targets), and that would be the default, and user could override
> > that through link time command line options (say, ok, while gcc has been
> > configured to support all of hsail-none, ptx-none and x86_64-k10m-linux
> > offloading targets, I only want to support here one of those, and
> > please use these additional options for compilation of that target).
> 
> Ok, IIUC the model is that we just compile all target code for all
> targets (or a subset of them). Is that correct? In that case I can see
> how the code that's on the branch now is sufficient; I'd assumed
> something more fine-grained would be desirable.
> It would be helpful to see the other pieces of this work if they already
> exist.
> 
> 
> Bernd
>
diff mbox

Patch

diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index fb0fe93..9f799f4 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -105,6 +105,11 @@  public:
   /* Set when symbol has address taken. */
   unsigned address_taken : 1;
 
+  /* Set when symbol needs to be dumped into LTO bytecode for LTO,
+     or in pragma omp target case, for separate compilation targeting
+     a different architecture.  */
+  unsigned need_dump : 1;
+
 
   /* Ordering of all symtab entries.  */
   int order;
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index c3a8967..53cd250 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -2019,7 +2019,18 @@  ipa_passes (void)
 			      passes->all_lto_gen_passes);
 
   if (!in_lto_p)
-    ipa_write_summaries ();
+    {
+      if (flag_openmp)
+	{
+	  section_name_prefix = OMP_SECTION_NAME_PREFIX;
+	  ipa_write_summaries (true);
+	}
+      if (flag_lto)
+	{
+	  section_name_prefix = LTO_SECTION_NAME_PREFIX;
+	  ipa_write_summaries (false);
+	}
+    }
 
   if (flag_generate_lto)
     targetm.asm_out.lto_end ();
@@ -2110,7 +2121,7 @@  compile (void)
   cgraph_state = CGRAPH_STATE_IPA;
 
   /* If LTO is enabled, initialize the streamer hooks needed by GIMPLE.  */
-  if (flag_lto)
+  if (flag_lto || flag_openmp)
     lto_streamer_hooks_init ();
 
   /* Don't run the IPA passes if there was any error or sorry messages.  */
diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c
index 4458723..62faa52 100644
--- a/gcc/ipa-inline-analysis.c
+++ b/gcc/ipa-inline-analysis.c
@@ -3813,7 +3813,7 @@  inline_generate_summary (void)
 
   /* When not optimizing, do not bother to analyze.  Inlining is still done
      because edge redirection needs to happen there.  */
-  if (!optimize && !flag_lto && !flag_wpa)
+  if (!optimize && !flag_lto && !flag_wpa && !flag_openmp)
     return;
 
   function_insertion_hook_holder =
diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 6a52da8..697c069 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -238,6 +238,9 @@  void
 lto_set_symtab_encoder_in_partition (lto_symtab_encoder_t encoder,
 				     symtab_node *node)
 {
+  /* Ignore not needed nodes.  */
+  if (!node->need_dump)
+    return;
   int index = lto_symtab_encoder_encode (encoder, node);
   encoder->nodes[index].in_partition = true;
 }
@@ -751,6 +754,17 @@  add_references (lto_symtab_encoder_t encoder,
       lto_symtab_encoder_encode (encoder, ref->referred);
 }
 
+/* Select what needs to be dumped. In lto case dump everything.
+   In omp target case only dump stuff makrked with attribute.  */
+void
+select_what_to_dump (bool is_omp)
+{
+  struct symtab_node *snode;
+  FOR_EACH_SYMBOL(snode)
+    snode->need_dump = !is_omp || lookup_attribute ("omp declare target",
+						    DECL_ATTRIBUTES (snode->decl));
+}
+
 /* Find all symbols we want to stream into given partition and insert them
    to encoders.
 
diff --git a/gcc/lto-streamer.c b/gcc/lto-streamer.c
index 1540e4c..ffafb0e 100644
--- a/gcc/lto-streamer.c
+++ b/gcc/lto-streamer.c
@@ -43,6 +43,7 @@  struct lto_stats_d lto_stats;
 static bitmap_obstack lto_obstack;
 static bool lto_obstack_initialized;
 
+const char *section_name_prefix = LTO_SECTION_NAME_PREFIX;
 
 /* Return a string representing LTO tag TAG.  */
 
@@ -172,7 +173,7 @@  lto_get_section_name (int section_type, const char *name, struct lto_file_decl_d
     sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, f->id);
   else
     sprintf (post, "." HOST_WIDE_INT_PRINT_HEX_PURE, get_random_seed (false)); 
-  return concat (LTO_SECTION_NAME_PREFIX, sep, add, post, NULL);
+  return concat (section_name_prefix, sep, add, post, NULL);
 }
 
 
@@ -310,7 +311,7 @@  lto_streamer_init (void)
 bool
 gate_lto_out (void)
 {
-  return ((flag_generate_lto || in_lto_p)
+  return ((flag_generate_lto || in_lto_p || flag_openmp)
 	  /* Don't bother doing anything if the program has errors.  */
 	  && !seen_error ());
 }
diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
index 797e92e..f4c46db 100644
--- a/gcc/lto-streamer.h
+++ b/gcc/lto-streamer.h
@@ -139,6 +139,11 @@  along with GCC; see the file COPYING3.  If not see
    name for the functions and static_initializers.  For other types of
    sections a '.' and the section type are appended.  */
 #define LTO_SECTION_NAME_PREFIX         ".gnu.lto_"
+#define OMP_SECTION_NAME_PREFIX         ".gnu.target_lto_"
+
+/* Can be either OMP_SECTION_NAME_PREFIX when we stream pragma omp target
+   stuff, or LTO_SECTION_NAME_PREFIX for lto case.  */
+extern const char  *section_name_prefix;
 
 #define LTO_major_version 2
 #define LTO_minor_version 2
@@ -895,6 +900,7 @@  bool referenced_from_this_partition_p (struct ipa_ref_list *,
 bool reachable_from_this_partition_p (struct cgraph_node *,
 				      lto_symtab_encoder_t);
 lto_symtab_encoder_t compute_ltrans_boundary (lto_symtab_encoder_t encoder);
+void select_what_to_dump (bool);
 
 
 /* In lto-symtab.c.  */
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 6a3d881..2d2aa63 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -190,6 +190,7 @@  add_symbol_to_partition_1 (ltrans_partition part, symtab_node *node)
   gcc_assert (c != SYMBOL_EXTERNAL
 	      && (c == SYMBOL_DUPLICATE || !symbol_partitioned_p (node)));
 
+  node->need_dump = true;
   lto_set_symtab_encoder_in_partition (part->encoder, node);
 
   if (symbol_partitioned_p (node))
@@ -917,6 +918,8 @@  lto_promote_cross_file_statics (void)
 
   gcc_assert (flag_wpa);
 
+  select_what_to_dump (false);
+
   /* First compute boundaries.  */
   n_sets = ltrans_partitions.length ();
   for (i = 0; i < n_sets; i++)
diff --git a/gcc/passes.c b/gcc/passes.c
index 19e5869..88b1538 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -2335,7 +2335,7 @@  ipa_write_summaries_1 (lto_symtab_encoder_t encoder)
 /* Write out summaries for all the nodes in the callgraph.  */
 
 void
-ipa_write_summaries (void)
+ipa_write_summaries (bool is_omp)
 {
   lto_symtab_encoder_t encoder;
   int i, order_pos;
@@ -2343,9 +2343,11 @@  ipa_write_summaries (void)
   struct cgraph_node *node;
   struct cgraph_node **order;
 
-  if (!flag_generate_lto || seen_error ())
+  if (!(flag_generate_lto || flag_openmp) || seen_error () )
     return;
 
+  select_what_to_dump (is_omp);
+
   encoder = lto_symtab_encoder_new (false);
 
   /* Create the callgraph set in the same order used in
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index fa403c7..8d51d80 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -595,7 +595,7 @@  extern void pass_fini_dump_file (struct opt_pass *);
 extern const char *get_current_pass_name (void);
 extern void print_current_pass (FILE *);
 extern void debug_pass (void);
-extern void ipa_write_summaries (void);
+extern void ipa_write_summaries (bool is_omp);
 extern void ipa_write_optimization_summaries (struct lto_symtab_encoder_d *);
 extern void ipa_read_summaries (void);
 extern void ipa_read_optimization_summaries (void);