Patchwork Fix incorrect devirtualization (PR middle-end/48661)

login
register
mail settings
Submitter Jakub Jelinek
Date April 18, 2011, 9:40 p.m.
Message ID <20110418214008.GE17079@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/91874/
State New
Headers show

Comments

Jakub Jelinek - April 18, 2011, 9:40 p.m.
Hi!

If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
adjusted, but none of the callers are prepared to handle that.
Thus, this patch makes devirtualization give up in those cases.

Bootstrapped/regtested on x86_64-linux and i686-linux, trunk and 4.6.
On the trunk the testcase ICEs before and after the patch in some new callgraph
checking (added today or so, Honza?), on the branch it works just fine.

Ok for trunk/4.6?

2011-04-18  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/48661
	* gimple-fold.c (gimple_get_virt_method_for_binfo): Return NULL
	if TREE_TYPE (v) is non-NULL.

	* gimple-fold.c (gimple_get_virt_method_for_binfo): Renamed from
	gimple_get_virt_mehtod_for_binfo.
	* gimple.h (gimple_get_virt_method_for_binfo): Likewise.
	* ipa-cp.c (ipcp_process_devirtualization_opportunities): Adjust
	callers.
	* ipa-prop.c (try_make_edge_direct_virtual_call): Likewise.

	* g++.dg/torture/pr48661.C: New test.


	Jakub
Richard Guenther - April 18, 2011, 9:47 p.m.
On Mon, Apr 18, 2011 at 11:40 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> adjusted, but none of the callers are prepared to handle that.
> Thus, this patch makes devirtualization give up in those cases.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, trunk and 4.6.
> On the trunk the testcase ICEs before and after the patch in some new callgraph
> checking (added today or so, Honza?), on the branch it works just fine.
>
> Ok for trunk/4.6?

Ok.

Thanks,
Richard.

> 2011-04-18  Jakub Jelinek  <jakub@redhat.com>
>
>        PR middle-end/48661
>        * gimple-fold.c (gimple_get_virt_method_for_binfo): Return NULL
>        if TREE_TYPE (v) is non-NULL.
>
>        * gimple-fold.c (gimple_get_virt_method_for_binfo): Renamed from
>        gimple_get_virt_mehtod_for_binfo.
>        * gimple.h (gimple_get_virt_method_for_binfo): Likewise.
>        * ipa-cp.c (ipcp_process_devirtualization_opportunities): Adjust
>        callers.
>        * ipa-prop.c (try_make_edge_direct_virtual_call): Likewise.
>
>        * g++.dg/torture/pr48661.C: New test.
>
> --- gcc/gimple-fold.c.jj        2011-03-14 14:12:15.000000000 +0100
> +++ gcc/gimple-fold.c   2011-04-18 18:35:22.000000000 +0200
> @@ -1374,7 +1374,7 @@ gimple_fold_builtin (gimple stmt)
>    is a thunk (other than a this adjustment which is dealt with by DELTA). */
>
>  tree
> -gimple_get_virt_mehtod_for_binfo (HOST_WIDE_INT token, tree known_binfo,
> +gimple_get_virt_method_for_binfo (HOST_WIDE_INT token, tree known_binfo,
>                                  tree *delta, bool refuse_thunks)
>  {
>   HOST_WIDE_INT i;
> @@ -1393,6 +1393,10 @@ gimple_get_virt_mehtod_for_binfo (HOST_W
>       v = TREE_CHAIN (v);
>     }
>
> +  /* If BV_VCALL_INDEX is non-NULL, give up.  */
> +  if (TREE_TYPE (v))
> +    return NULL_TREE;
> +
>   fndecl = TREE_VALUE (v);
>   node = cgraph_get_node_or_alias (fndecl);
>   if (refuse_thunks
> --- gcc/gimple.h.jj     2011-03-14 14:12:15.000000000 +0100
> +++ gcc/gimple.h        2011-04-18 18:35:40.000000000 +0200
> @@ -892,7 +892,7 @@ unsigned get_gimple_rhs_num_ops (enum tr
>  gimple gimple_alloc_stat (enum gimple_code, unsigned MEM_STAT_DECL);
>  const char *gimple_decl_printable_name (tree, int);
>  bool gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace);
> -tree gimple_get_virt_mehtod_for_binfo (HOST_WIDE_INT, tree, tree *, bool);
> +tree gimple_get_virt_method_for_binfo (HOST_WIDE_INT, tree, tree *, bool);
>  void gimple_adjust_this_by_delta (gimple_stmt_iterator *, tree);
>  /* Returns true iff T is a valid GIMPLE statement.  */
>  extern bool is_gimple_stmt (tree);
> --- gcc/ipa-cp.c.jj     2011-04-13 12:39:28.000000000 +0200
> +++ gcc/ipa-cp.c        2011-04-18 18:36:11.000000000 +0200
> @@ -1242,7 +1242,7 @@ ipcp_process_devirtualization_opportunit
>        {
>          tree binfo = VEC_index (tree, info->params[param_index].types, j);
>          tree d;
> -         tree t = gimple_get_virt_mehtod_for_binfo (token, binfo, &d, true);
> +         tree t = gimple_get_virt_method_for_binfo (token, binfo, &d, true);
>
>          if (!t)
>            {
> --- gcc/ipa-prop.c.jj   2011-04-13 12:39:28.000000000 +0200
> +++ gcc/ipa-prop.c      2011-04-18 18:36:30.000000000 +0200
> @@ -1730,7 +1730,7 @@ try_make_edge_direct_virtual_call (struc
>   type = ie->indirect_info->otr_type;
>   binfo = get_binfo_at_offset (binfo, ie->indirect_info->anc_offset, type);
>   if (binfo)
> -    target = gimple_get_virt_mehtod_for_binfo (token, binfo, &delta, true);
> +    target = gimple_get_virt_method_for_binfo (token, binfo, &delta, true);
>   else
>     return NULL;
>
> --- gcc/testsuite/g++.dg/torture/pr48661.C.jj   2011-04-18 18:50:49.000000000 +0200
> +++ gcc/testsuite/g++.dg/torture/pr48661.C      2011-04-18 18:50:11.000000000 +0200
> @@ -0,0 +1,77 @@
> +// PR middle-end/48661
> +// { dg-do run }
> +
> +extern "C" void abort ();
> +
> +__attribute__((noinline))
> +double
> +foo (double x, double y)
> +{
> +  asm volatile ("" : : : "memory");
> +  return x + y;
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +bar (int x)
> +{
> +  if (x != 123)
> +    abort ();
> +}
> +
> +struct A
> +{
> +  double a1, a2;
> +};
> +
> +struct B
> +{
> +  virtual int m () const = 0 ;
> +};
> +
> +struct C
> +{
> +  virtual ~C () {}
> +};
> +
> +struct D : virtual public B, public C
> +{
> +  explicit D (const A &x) : d(123) { foo (x.a2, x.a1); }
> +  int m () const { return d; }
> +  int d;
> +};
> +
> +struct E
> +{
> +  E () : d(0) {}
> +  virtual void n (const B &x) { d = x.m (); x.m (); x.m (); }
> +  int d;
> +};
> +
> +void
> +test ()
> +{
> +  A a;
> +  a.a1 = 0;
> +  a.a2 = 1;
> +  E p;
> +  D q (a);
> +  const B &b = q;
> +  bar (b.m ());
> +  p.n (b);
> +  bar (p.d);
> +}
> +
> +void
> +baz ()
> +{
> +  A a;
> +  D p2 (a);
> +}
> +
> +int
> +main ()
> +{
> +  test ();
> +  return 0;
> +}
>
>        Jakub
>
Jason Merrill - April 18, 2011, 10:33 p.m.
On 04/18/2011 02:40 PM, Jakub Jelinek wrote:
> If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> adjusted, but none of the callers are prepared to handle that.

Well, it means that we do dynamic adjustment at runtime.  If we're able 
to do devirtualization, we should be able to figure out the right offset 
as well, just not in 4.6.

Jason
H.J. Lu - April 18, 2011, 11:35 p.m.
On Mon, Apr 18, 2011 at 2:40 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> adjusted, but none of the callers are prepared to handle that.
> Thus, this patch makes devirtualization give up in those cases.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, trunk and 4.6.
> On the trunk the testcase ICEs before and after the patch in some new callgraph
> checking (added today or so, Honza?), on the branch it works just fine.
>
> Ok for trunk/4.6?
>
> 2011-04-18  Jakub Jelinek  <jakub@redhat.com>
>
>        PR middle-end/48661
>        * gimple-fold.c (gimple_get_virt_method_for_binfo): Return NULL
>        if TREE_TYPE (v) is non-NULL.
>
>        * gimple-fold.c (gimple_get_virt_method_for_binfo): Renamed from
>        gimple_get_virt_mehtod_for_binfo.
>        * gimple.h (gimple_get_virt_method_for_binfo): Likewise.
>        * ipa-cp.c (ipcp_process_devirtualization_opportunities): Adjust
>        callers.
>        * ipa-prop.c (try_make_edge_direct_virtual_call): Likewise.
>
>        * g++.dg/torture/pr48661.C: New test.
>

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=48674
Jan Hubicka - April 19, 2011, 12:06 a.m.
> Hi!
> 
> If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> adjusted, but none of the callers are prepared to handle that.
> Thus, this patch makes devirtualization give up in those cases.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, trunk and 4.6.
> On the trunk the testcase ICEs before and after the patch in some new callgraph
> checking (added today or so, Honza?), on the branch it works just fine.

The ICE for me is:
jakub.C:77:1: error: a call to thunk improperly represented in the call graph:
# VUSE <.MEM_11>
D.2319_3 = D::_ZTv0_n24_NK1D1mEv (&q.D.2159);

void test()/26(14) @0x7ffff77066f0 (asm: _Z4testv) availability:available analyzed needed reachable body externally_visible finalized
  called by: int main()/28 (1.00 per call) (can throw external) 
  calls: void bar(int)/1 (1.00 per call) void bar(int)/1 (1.00 per call) virtual int D::m() const/15 (1.00 per call) D::D(const A&)/14 (1.00 per call) 
  References: 
  Refering this function: 
  has 3 outgoing edges for indirect calls.

this is not my snaity check, but Martins and I think it means that somehow your
change is ineffective on mainline and we still devirtualize. 
Martin, I declare this problem yours.

Honza
Jan Hubicka - April 19, 2011, 12:15 a.m.
> > Hi!
> > 
> > If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> > adjusted, but none of the callers are prepared to handle that.
> > Thus, this patch makes devirtualization give up in those cases.
> > 
> > Bootstrapped/regtested on x86_64-linux and i686-linux, trunk and 4.6.
> > On the trunk the testcase ICEs before and after the patch in some new callgraph
> > checking (added today or so, Honza?), on the branch it works just fine.
> 
> The ICE for me is:
> jakub.C:77:1: error: a call to thunk improperly represented in the call graph:
> # VUSE <.MEM_11>
> D.2319_3 = D::_ZTv0_n24_NK1D1mEv (&q.D.2159);
> 
> void test()/26(14) @0x7ffff77066f0 (asm: _Z4testv) availability:available analyzed needed reachable body externally_visible finalized
>   called by: int main()/28 (1.00 per call) (can throw external) 
>   calls: void bar(int)/1 (1.00 per call) void bar(int)/1 (1.00 per call) virtual int D::m() const/15 (1.00 per call) D::D(const A&)/14 (1.00 per call) 
>   References: 
>   Refering this function: 
>   has 3 outgoing edges for indirect calls.
> 
> this is not my snaity check, but Martins and I think it means that somehow your
> change is ineffective on mainline and we still devirtualize. 
> Martin, I declare this problem yours.

Actually what happens here is that CCP devirtualize by propagating the
constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
a direct call.  This is all good and desirable.

I think good solution would be to fold further and inline the thunk adjustment,
just like the type based devirtualization does.  Even once I get far enough
with my cgraph cleanuping project to make cgraph represent thunks nicely, we
would win if in these cases ccp and other passes simply inlined the this adjustment,
like we do with type based devirtualization already.
Martin, I guess it is matter of looking up the thunk info by associated cgraph node
alias and extending fold_stmts of passes that now drop the OBJ_TYPE_REF wrappers?

Honza
Jakub Jelinek - April 19, 2011, 7:07 a.m.
On Mon, Apr 18, 2011 at 03:33:18PM -0700, Jason Merrill wrote:
> On 04/18/2011 02:40 PM, Jakub Jelinek wrote:
> >If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> >adjusted, but none of the callers are prepared to handle that.
> 
> Well, it means that we do dynamic adjustment at runtime.  If we're
> able to do devirtualization, we should be able to figure out the
> right offset as well, just not in 4.6.

Sure, but how exactly?  If BV_VCALL_INDEX is non-NULL,
I guess we need to find for the given binfo corresponding rtti_binfo
that was (or would be if vtable isn't emitted in the current TU)
used in build_rtti_vtbl_entries to compute the difference between
BINFO_OFFSETs:
  offset = size_diffop_loc (input_location,
                        BINFO_OFFSET (vid->rtti_binfo), BINFO_OFFSET (b));
That is the value actually added on top of the fixed delta, right?
Or do you think it should just add to the delta the value read from the
vcall_offset from the vtable?

But first of all PR48674 needs solving on the trunk.
I think cgraph thunk nodes have fixed_offset field so it is easy to figure
out that adjustment, but for vcall_offset adjustment the same applies here,
is there a way to find out what value will be added, or does it need to be
dynamic?  And what kind of info do we need to find that out.

	Jakub
Richard Guenther - April 19, 2011, 7:51 a.m.
On Tue, Apr 19, 2011 at 2:15 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> > Hi!
>> >
>> > If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
>> > adjusted, but none of the callers are prepared to handle that.
>> > Thus, this patch makes devirtualization give up in those cases.
>> >
>> > Bootstrapped/regtested on x86_64-linux and i686-linux, trunk and 4.6.
>> > On the trunk the testcase ICEs before and after the patch in some new callgraph
>> > checking (added today or so, Honza?), on the branch it works just fine.
>>
>> The ICE for me is:
>> jakub.C:77:1: error: a call to thunk improperly represented in the call graph:
>> # VUSE <.MEM_11>
>> D.2319_3 = D::_ZTv0_n24_NK1D1mEv (&q.D.2159);
>>
>> void test()/26(14) @0x7ffff77066f0 (asm: _Z4testv) availability:available analyzed needed reachable body externally_visible finalized
>>   called by: int main()/28 (1.00 per call) (can throw external)
>>   calls: void bar(int)/1 (1.00 per call) void bar(int)/1 (1.00 per call) virtual int D::m() const/15 (1.00 per call) D::D(const A&)/14 (1.00 per call)
>>   References:
>>   Refering this function:
>>   has 3 outgoing edges for indirect calls.
>>
>> this is not my snaity check, but Martins and I think it means that somehow your
>> change is ineffective on mainline and we still devirtualize.
>> Martin, I declare this problem yours.
>
> Actually what happens here is that CCP devirtualize by propagating the
> constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
> a direct call.  This is all good and desirable.
>
> I think good solution would be to fold further and inline the thunk adjustment,
> just like the type based devirtualization does.  Even once I get far enough
> with my cgraph cleanuping project to make cgraph represent thunks nicely, we
> would win if in these cases ccp and other passes simply inlined the this adjustment,
> like we do with type based devirtualization already.
> Martin, I guess it is matter of looking up the thunk info by associated cgraph node
> alias and extending fold_stmts of passes that now drop the OBJ_TYPE_REF wrappers?

Huh.  No, I don't think we want to do any "inlining" as part of
folding.  At least not if it
is a correctness issue (is it?).  Why does the inliner not simply
inline the thunk function
body?

Richard.

> Honza
>
Jan Hubicka - April 19, 2011, 10:16 a.m.
> Huh.  No, I don't think we want to do any "inlining" as part of
> folding.  At least not if it
> is a correctness issue (is it?).  Why does the inliner not simply
> inline the thunk function
> body?

Because thunk functions have no bodies in gimple form and are no functions (at
the moment) in cgraph sense.

What IPA infrastructure needs to learn is to handle inlining of those as well
as compute proper jump functions.  We spoke with Martin on this some time ago,
we ought to get this done in 4.7. Thunks are very irritating spcial cases from
cgraph POV ;(

I however like the idea that code doing devirtualizatio (type based or folding based)
will also do adjustment to call the actual functions instead of thunk.  Direct
calls to thunks are quite nonsential.

Honza
s
> 
> Richard.
> 
> > Honza
> >
Richard Guenther - April 19, 2011, 10:25 a.m.
On Tue, 19 Apr 2011, Jan Hubicka wrote:

> > Huh.  No, I don't think we want to do any "inlining" as part of
> > folding.  At least not if it
> > is a correctness issue (is it?).  Why does the inliner not simply
> > inline the thunk function
> > body?
> 
> Because thunk functions have no bodies in gimple form and are no functions (at
> the moment) in cgraph sense.

Well, I see thunks as simple forwarders which we can represent with
a real function with body in gimple.  We don't have to _emit_ it
that way later, but at least IPA passes could do regular stmt
analysis on them and not need any special thunk knowledge (maybe
apart from making the inliner not inline the forwarding call into
the thunk itself).

> What IPA infrastructure needs to learn is to handle inlining of those as well
> as compute proper jump functions.  We spoke with Martin on this some time ago,
> we ought to get this done in 4.7. Thunks are very irritating spcial cases from
> cgraph POV ;(

And I don't see why they have to be very irritating spcial cases ;)

> I however like the idea that code doing devirtualizatio (type based or folding based)
> will also do adjustment to call the actual functions instead of thunk.  Direct
> calls to thunks are quite nonsential.

Well, sure.  If we discover a direct call to a thunk late, after inlining
we can do some tricks to inline the adjustment, even during folding.  Just
analysis passes should simply see the thunk.

Richard.
Jan Hubicka - April 19, 2011, 10:48 a.m.
> On Tue, 19 Apr 2011, Jan Hubicka wrote:
> 
> > > Huh.  No, I don't think we want to do any "inlining" as part of
> > > folding.  At least not if it
> > > is a correctness issue (is it?).  Why does the inliner not simply
> > > inline the thunk function
> > > body?
> > 
> > Because thunk functions have no bodies in gimple form and are no functions (at
> > the moment) in cgraph sense.
> 
> Well, I see thunks as simple forwarders which we can represent with
> a real function with body in gimple.  We don't have to _emit_ it

That is true only for non-variadic thunks. (and we actually generate bodies for
those early, unless target asked us to do otherwise).  Sure, Diego once
mentioned that we might have some kind of special thunk_call gimple statement,
but that seems to me that amount of gimple code that would need to handle
thunk_call strictly exceeds amount of IPA code that would need to get past
special kind of functions.

Also as as we make them regular functions, we need to go all the way through,
as IPA optimizers will happily modify them so simply using existing target
interface instead of cfgexpand later won't help.  In a way, with tailcall
support, the target interface is really only interesting for variadic case.
At least for i386. I did not inspect details of implementation of thunks on
all the archs that do asm thunks to be sure.

> that way later, but at least IPA passes could do regular stmt
> analysis on them and not need any special thunk knowledge (maybe
> apart from making the inliner not inline the forwarding call into
> the thunk itself).
> 
> > What IPA infrastructure needs to learn is to handle inlining of those as well
> > as compute proper jump functions.  We spoke with Martin on this some time ago,
> > we ought to get this done in 4.7. Thunks are very irritating spcial cases from
> > cgraph POV ;(
> 
> And I don't see why they have to be very irritating spcial cases ;)

Well, becuase of variadic thunks, I think ;)

Honza
> 
> > I however like the idea that code doing devirtualizatio (type based or folding based)
> > will also do adjustment to call the actual functions instead of thunk.  Direct
> > calls to thunks are quite nonsential.
> 
> Well, sure.  If we discover a direct call to a thunk late, after inlining
> we can do some tricks to inline the adjustment, even during folding.  Just
> analysis passes should simply see the thunk.
> 
> Richard.
Richard Guenther - April 19, 2011, 11:02 a.m.
On Tue, 19 Apr 2011, Jan Hubicka wrote:

> > On Tue, 19 Apr 2011, Jan Hubicka wrote:
> > 
> > > > Huh.  No, I don't think we want to do any "inlining" as part of
> > > > folding.  At least not if it
> > > > is a correctness issue (is it?).  Why does the inliner not simply
> > > > inline the thunk function
> > > > body?
> > > 
> > > Because thunk functions have no bodies in gimple form and are no functions (at
> > > the moment) in cgraph sense.
> > 
> > Well, I see thunks as simple forwarders which we can represent with
> > a real function with body in gimple.  We don't have to _emit_ it
> 
> That is true only for non-variadic thunks. (and we actually generate bodies for
> those early, unless target asked us to do otherwise).  Sure, Diego once
> mentioned that we might have some kind of special thunk_call gimple statement,
> but that seems to me that amount of gimple code that would need to handle
> thunk_call strictly exceeds amount of IPA code that would need to get past
> special kind of functions.
> 
> Also as as we make them regular functions, we need to go all the way through,
> as IPA optimizers will happily modify them so simply using existing target
> interface instead of cfgexpand later won't help.  In a way, with tailcall
> support, the target interface is really only interesting for variadic case.
> At least for i386. I did not inspect details of implementation of thunks on
> all the archs that do asm thunks to be sure.
> 
> > that way later, but at least IPA passes could do regular stmt
> > analysis on them and not need any special thunk knowledge (maybe
> > apart from making the inliner not inline the forwarding call into
> > the thunk itself).
> > 
> > > What IPA infrastructure needs to learn is to handle inlining of those as well
> > > as compute proper jump functions.  We spoke with Martin on this some time ago,
> > > we ought to get this done in 4.7. Thunks are very irritating spcial cases from
> > > cgraph POV ;(
> > 
> > And I don't see why they have to be very irritating spcial cases ;)
> 
> Well, becuase of variadic thunks, I think ;)

I thought the idea was to use __builtin_va_arg_pack and friends.
Of course the inliner would still need to know how to inline such
a va-arg forwarder, and we would need a way to expand them (well,
or just go the existing special casing).  We might need this
kind of inliner support anyway because of the partial inlining
opportunity in libquantum which is for a varargs function like

void foo (...)
{
  if (always-zero-static)
    return;

  ...
}

thus partial inlining would create such a forwarding call.

But well, it's just my random 2 cents on the issue ;)

Richard.
Jan Hubicka - April 19, 2011, 2:10 p.m.
> I thought the idea was to use __builtin_va_arg_pack and friends.
> Of course the inliner would still need to know how to inline such
> a va-arg forwarder, and we would need a way to expand them (well,
> or just go the existing special casing).  We might need this
> kind of inliner support anyway because of the partial inlining
> opportunity in libquantum which is for a varargs function like
> 
> void foo (...)
> {
>   if (always-zero-static)
>     return;
> 
>   ...
> }
> 
> thus partial inlining would create such a forwarding call.

Yep, I know this is related issue.  We have builtlin_apply and
builtin_va_arg_pack. Neither of those solves the problem here.

With __builtin_va_arg_pack the thunk would be something like:
int b(int *this, ...);

int a(int *this, ...)
{ 
  return b(this+1, __builtin_va_arg_pack ());
}
and similarly for your libquantum testcase. However one gets:
jh@gcc10:~/trunk/build/gcc$ gcc -O2 t.c
t.c: In function 'a':
t.c:5: error: invalid use of '__builtin_va_arg_pack ()'
becuase va_arg_pack works only when eliminated by the inliner.

Consequentely thunks would be forced to be always inline and this contradits
their purpose to be addressed from virtual tables.

So we would need to develop equivalent of va_arg_pack that works
when not inlining. It can not be implemented generally, only for tail
calls and making nice interface for this seems tricky.

__builtin_apply don't fit here because it 
 1) produces lousy code
 2) requires knowledge about size of argument block
 3) doesn't let you to modify one of arguments.

We could go by deriving __builtlin_return into somethign like:

int a(int *this, ...)
{ 
  __builtin_return_va_arg_pack (b, this + 1)
}

that would have semantic of tail calling the function with the same arguments as the function was called with
with overwritting the arguments it is given that would be required to match the first few named arguments
of the function.

Since this construct is departed enough from gimple_call we probably would want
to have something like gimple_tailcall_va_arg_pack or so.  We will need to
develop expand machinery - I am not sure one can implement expander of this
without introducing new macros. Maybe it is - it essentially means to load
argument registers from original argument registers and tailcall and even for
funnier architectures I think one can just copy what calls.c does already for
tailcalls with the slight semantics changes...
I am not really aware with all details of all calling conventions we support to
be sure however.

I am not really sure if it is really prettier than the irritating special case
in cgraph, however?
> 
> But well, it's just my random 2 cents on the issue ;)
:)

Honza
> 
> Richard.
Richard Guenther - April 19, 2011, 2:16 p.m.
On Tue, 19 Apr 2011, Jan Hubicka wrote:

> > I thought the idea was to use __builtin_va_arg_pack and friends.
> > Of course the inliner would still need to know how to inline such
> > a va-arg forwarder, and we would need a way to expand them (well,
> > or just go the existing special casing).  We might need this
> > kind of inliner support anyway because of the partial inlining
> > opportunity in libquantum which is for a varargs function like
> > 
> > void foo (...)
> > {
> >   if (always-zero-static)
> >     return;
> > 
> >   ...
> > }
> > 
> > thus partial inlining would create such a forwarding call.
> 
> Yep, I know this is related issue.  We have builtlin_apply and
> builtin_va_arg_pack. Neither of those solves the problem here.
> 
> With __builtin_va_arg_pack the thunk would be something like:
> int b(int *this, ...);
> 
> int a(int *this, ...)
> { 
>   return b(this+1, __builtin_va_arg_pack ());
> }
> and similarly for your libquantum testcase. However one gets:
> jh@gcc10:~/trunk/build/gcc$ gcc -O2 t.c
> t.c: In function 'a':
> t.c:5: error: invalid use of '__builtin_va_arg_pack ()'
> becuase va_arg_pack works only when eliminated by the inliner.
> 
> Consequentely thunks would be forced to be always inline and this contradits
> their purpose to be addressed from virtual tables.
> 
> So we would need to develop equivalent of va_arg_pack that works
> when not inlining. It can not be implemented generally, only for tail
> calls and making nice interface for this seems tricky.
> 
> __builtin_apply don't fit here because it 
>  1) produces lousy code
>  2) requires knowledge about size of argument block
>  3) doesn't let you to modify one of arguments.
> 
> We could go by deriving __builtlin_return into somethign like:
> 
> int a(int *this, ...)
> { 
>   __builtin_return_va_arg_pack (b, this + 1)
> }
> 
> that would have semantic of tail calling the function with the same arguments as the function was called with
> with overwritting the arguments it is given that would be required to match the first few named arguments
> of the function.
> 
> Since this construct is departed enough from gimple_call we probably would want
> to have something like gimple_tailcall_va_arg_pack or so.  We will need to
> develop expand machinery - I am not sure one can implement expander of this
> without introducing new macros. Maybe it is - it essentially means to load
> argument registers from original argument registers and tailcall and even for
> funnier architectures I think one can just copy what calls.c does already for
> tailcalls with the slight semantics changes...
> I am not really aware with all details of all calling conventions we support to
> be sure however.
> 
> I am not really sure if it is really prettier than the irritating special case
> in cgraph, however?

Well, even for the partial inlining case I'd devise a scheme that
if a call to such a forwarder remains it gets expanded as a call to
the original (non-split) function (similar to emitting a call to
the asm-thunk instead of expanding the gimple representation of the 
thunk).  I realize this is another special case, but I like it more
as it appears to be more flexible to me ;)  (it's similar to
the redefined extern inline function case, one function body for
inlining and another one for call targets)

> > But well, it's just my random 2 cents on the issue ;)
> :)
:)

Richard.
Jan Hubicka - April 19, 2011, 2:50 p.m.
Hi,
> 
> Well, even for the partial inlining case I'd devise a scheme that
> if a call to such a forwarder remains it gets expanded as a call to
> the original (non-split) function (similar to emitting a call to
> the asm-thunk instead of expanding the gimple representation of the 
> thunk).  I realize this is another special case, but I like it more

For partial inlining I do have plan.  I intend to simply mark edges of calls
containing __builtin_va_arg_pack argument, arrange it to disappear when
function gets inlinined and make inliner to do final pass after inlining small
function inlining all those calls.

This will work nicely for partial inlining - when all header functions gets
inlined, the offline copy will remain.  When header won't get inlined, the
function will be merged again.

When some of them gets inlined, the partially inlined function will get
unnecesiraly dupicated, but just twice that is not bad for this weird side case
we speak about.

It won't help thunks where offline copy may neccesarily exist and we already
have thunks to avoid that duplication of callee function body after all.

I think we have options:

  1) keep them "non functions" in the callgraph.  Then we need
     a) I) make inliner aware of expanding the thunk whenever we decide to inline
           and call edge that "goes through thunk".
        OR
	II) make the direct calls to thunks not appearing in our IL and being
            directly folded into inline code of the thunk
     b) teach ipa-prop to compute jump functions correctly for thunked edges:
        for devirtualization happening at IPA we will neccesarily need represent
	those edges without having the adjustment visible at analysis time.
  2) make thunk gimple representible.  For this we need
     a) introduce gimple representation of thunk tail call, i.e. gimple_tailcall_va_arg_pack or something
     b) make gimple passes to understand it.
     c) I) invent expansion machinery
        OR
        II) teach IPA passes to special case bodies of thunk and do not change them so current
	    ASM machinery stays around
     d) teach ipa-inline and ipa-prop to handle this new type of call and propagate through.

Martin implemented 1-II variant since. It doesn't seem that much worse than
together alternatives to me and is significandly easier to implement.

Main drawback is the need for nice cgraph representation.  Representing thunks
completely on side as we do now has turned out to be confusing and not really
pretty. So we want thunks appear as full fledged cgraph node with call edge to
function they are thunking.

This will neccesarily impose some extra work on IPA passes since they will not
be able to easilly expect that everything in cgraph is a function.  But we have
precisely the same problem with alias and unless we want to go for aliases
represented as gimple bodies symmetric to "empty thunks", we will need this
special case anyway.

As for alias/thunk, I plan to make them symbol table entries and make callgraph
edges to have "caller" and "target". Target of call is the symbol called, not
neccesarily function.  Then I plan to have cgraph_edge_callee accesor that will
look into real target since it is what majority of IPA code cares about.

Also we will need to have cgraph_edge_callee_body_visibility instead of global
cgraph_body_visibility we have now since it depends if you call through alias
that can be overwritten. We can also have function that will return the thunk
adjustments happening on the edge IMO. Since IPA cares about this on two places
(computation of jump function and inlining), it give relatively low pain.

With this API, 1-a-I or 1-a-II would not be too ugly. Still combination of both
makes most sense for me: there is little to lose by inserting the thunk code as
soon as we devirutalize during local optimization and if we won't do so we will
just keep missing optimizations. We will need II to nicely handle case when
devirutalization happens at WPA.

2-I variant is probably most generic and will solve the aforementioned partial
inlining better in a sense that we will not need to duplicate function body in
the case variadic function gets partially inlined only into some of its
callees. At the same time it seems quite invasive and IMO have very little pratical
benefit.  I also don't know if having extra exotic gimple construct to worry
about really makes compiler more maintainable.

Honza

> as it appears to be more flexible to me ;)  (it's similar to
> the redefined extern inline function case, one function body for
> inlining and another one for call targets)
> 
> > > But well, it's just my random 2 cents on the issue ;)
> > :)
> :)
> 
> Richard.
Jason Merrill - April 19, 2011, 11:11 p.m.
On 04/19/2011 12:07 AM, Jakub Jelinek wrote:
> On Mon, Apr 18, 2011 at 03:33:18PM -0700, Jason Merrill wrote:

>> Well, it means that we do dynamic adjustment at runtime.  If we're
>> able to do devirtualization, we should be able to figure out the
>> right offset as well, just not in 4.6.
>
> Sure, but how exactly?  If BV_VCALL_INDEX is non-NULL,
> I guess we need to find for the given binfo corresponding rtti_binfo
> that was (or would be if vtable isn't emitted in the current TU)
> used in build_rtti_vtbl_entries to compute the difference between
> BINFO_OFFSETs:

The vcall entries are added by add_vcall_offset; the rtti entry is just 
used for dynamic_cast<void*>.

> Or do you think it should just add to the delta the value read from the
> vcall_offset from the vtable?

That's what the thunk does; if devirtualization can look up the vcall 
offset in the vtable, then it should do that.

Jason
Martin Jambor - April 20, 2011, 2 p.m.
Hi,

On Tue, Apr 19, 2011 at 09:07:35AM +0200, Jakub Jelinek wrote:
> On Mon, Apr 18, 2011 at 03:33:18PM -0700, Jason Merrill wrote:
> > On 04/18/2011 02:40 PM, Jakub Jelinek wrote:
> > >If TREE_BINFO has BV_VCALL_INDEX set, this needs to be dynamically
> > >adjusted, but none of the callers are prepared to handle that.
> > 
> > Well, it means that we do dynamic adjustment at runtime.  If we're
> > able to do devirtualization, we should be able to figure out the
> > right offset as well, just not in 4.6.
> 
> Sure, but how exactly?

We could re-use code in assemble_thunk and thunk_adjust in
cgraphunit.c which IIRC is able to deal with these kinds of thunks
too.

That is, if do not opt for some more systematic approach, like making
(most of) them "real function with body in gimple" instead.

Martin
Martin Jambor - April 20, 2011, 2:12 p.m.
Hi,

On Tue, Apr 19, 2011 at 02:15:18AM +0200, Jan Hubicka wrote:
> Actually what happens here is that CCP devirtualize by propagating the
> constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
> a direct call.  This is all good and desirable.
> 
> I think good solution would be to fold further and inline the thunk
> adjustment, just like the type based devirtualization does.  Even
> once I get far enough with my cgraph cleanuping project to make
> cgraph represent thunks nicely, we would win if in these cases ccp
> and other passes simply inlined the this adjustment, like we do with
> type based devirtualization already.

> Martin, I guess it is matter of looking up the thunk info by
> associated cgraph node alias and extending fold_stmts of passes that
> now drop the OBJ_TYPE_REF wrappers?

Well, if you have a cgraph node then yes.  But if the method is
implemented in a different compilation unit you don't.  And as I
already said today on IRC, I don't think it is possible to tell
whether a function is a thunk by looking at the decl alone (the front
hand has a flag for it as Jakub noted, though), let alone what kind of
thunk it is.

The more I think about this the more I would also like to make thunks
as ordinary real functions as possible, with perhaps some kind of
totally opaque decls/cgraph_nodes for the most obscure types which
could be generated by assembly.

Martin
Jan Hubicka - April 20, 2011, 2:38 p.m.
> Hi,
> 
> On Tue, Apr 19, 2011 at 02:15:18AM +0200, Jan Hubicka wrote:
> > Actually what happens here is that CCP devirtualize by propagating the
> > constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
> > a direct call.  This is all good and desirable.
> > 
> > I think good solution would be to fold further and inline the thunk
> > adjustment, just like the type based devirtualization does.  Even
> > once I get far enough with my cgraph cleanuping project to make
> > cgraph represent thunks nicely, we would win if in these cases ccp
> > and other passes simply inlined the this adjustment, like we do with
> > type based devirtualization already.
> 
> > Martin, I guess it is matter of looking up the thunk info by
> > associated cgraph node alias and extending fold_stmts of passes that
> > now drop the OBJ_TYPE_REF wrappers?
> 
> Well, if you have a cgraph node then yes.  But if the method is
> implemented in a different compilation unit you don't.  And as I
> already said today on IRC, I don't think it is possible to tell
> whether a function is a thunk by looking at the decl alone (the front
> hand has a flag for it as Jakub noted, though), let alone what kind of
> thunk it is.

Well, you don't care about thunks resisting in other unit/partition...

Honza
> 
> The more I think about this the more I would also like to make thunks
> as ordinary real functions as possible, with perhaps some kind of
> totally opaque decls/cgraph_nodes for the most obscure types which
> could be generated by assembly.
> 
> Martin
Richard Guenther - April 20, 2011, 3:18 p.m.
On Wed, 20 Apr 2011, Jan Hubicka wrote:

> > Hi,
> > 
> > On Tue, Apr 19, 2011 at 02:15:18AM +0200, Jan Hubicka wrote:
> > > Actually what happens here is that CCP devirtualize by propagating the
> > > constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
> > > a direct call.  This is all good and desirable.
> > > 
> > > I think good solution would be to fold further and inline the thunk
> > > adjustment, just like the type based devirtualization does.  Even
> > > once I get far enough with my cgraph cleanuping project to make
> > > cgraph represent thunks nicely, we would win if in these cases ccp
> > > and other passes simply inlined the this adjustment, like we do with
> > > type based devirtualization already.
> > 
> > > Martin, I guess it is matter of looking up the thunk info by
> > > associated cgraph node alias and extending fold_stmts of passes that
> > > now drop the OBJ_TYPE_REF wrappers?
> > 
> > Well, if you have a cgraph node then yes.  But if the method is
> > implemented in a different compilation unit you don't.  And as I
> > already said today on IRC, I don't think it is possible to tell
> > whether a function is a thunk by looking at the decl alone (the front
> > hand has a flag for it as Jakub noted, though), let alone what kind of
> > thunk it is.
> 
> Well, you don't care about thunks resisting in other unit/partition...

Sure you do - LTO might bring them into scope if you fold them to 
a direct call early.

Richard.
Martin Jambor - April 20, 2011, 3:22 p.m.
Hi,

On Wed, Apr 20, 2011 at 04:38:25PM +0200, Jan Hubicka wrote:
> > Hi,
> > 
> > On Tue, Apr 19, 2011 at 02:15:18AM +0200, Jan Hubicka wrote:
> > > Actually what happens here is that CCP devirtualize by propagating the
> > > constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
> > > a direct call.  This is all good and desirable.
> > > 
> > > I think good solution would be to fold further and inline the thunk
> > > adjustment, just like the type based devirtualization does.  Even
> > > once I get far enough with my cgraph cleanuping project to make
> > > cgraph represent thunks nicely, we would win if in these cases ccp
> > > and other passes simply inlined the this adjustment, like we do with
> > > type based devirtualization already.
> > 
> > > Martin, I guess it is matter of looking up the thunk info by
> > > associated cgraph node alias and extending fold_stmts of passes that
> > > now drop the OBJ_TYPE_REF wrappers?
> > 
> > Well, if you have a cgraph node then yes.  But if the method is
> > implemented in a different compilation unit you don't.  And as I
> > already said today on IRC, I don't think it is possible to tell
> > whether a function is a thunk by looking at the decl alone (the front
> > hand has a flag for it as Jakub noted, though), let alone what kind of
> > thunk it is.
> 
> Well, you don't care about thunks resisting in other unit/partition...
> 

Unless you fold in early optimizations and LTO later, deciding to
inline the function but forgetting about the thunk adjustment.

Martin
Jan Hubicka - April 20, 2011, 3:23 p.m.
> On Wed, 20 Apr 2011, Jan Hubicka wrote:
> 
> > > Hi,
> > > 
> > > On Tue, Apr 19, 2011 at 02:15:18AM +0200, Jan Hubicka wrote:
> > > > Actually what happens here is that CCP devirtualize by propagating the
> > > > constructors and due to Richard's new code to drop OBJ_TYPE_REF we finally get
> > > > a direct call.  This is all good and desirable.
> > > > 
> > > > I think good solution would be to fold further and inline the thunk
> > > > adjustment, just like the type based devirtualization does.  Even
> > > > once I get far enough with my cgraph cleanuping project to make
> > > > cgraph represent thunks nicely, we would win if in these cases ccp
> > > > and other passes simply inlined the this adjustment, like we do with
> > > > type based devirtualization already.
> > > 
> > > > Martin, I guess it is matter of looking up the thunk info by
> > > > associated cgraph node alias and extending fold_stmts of passes that
> > > > now drop the OBJ_TYPE_REF wrappers?
> > > 
> > > Well, if you have a cgraph node then yes.  But if the method is
> > > implemented in a different compilation unit you don't.  And as I
> > > already said today on IRC, I don't think it is possible to tell
> > > whether a function is a thunk by looking at the decl alone (the front
> > > hand has a flag for it as Jakub noted, though), let alone what kind of
> > > thunk it is.
> > 
> > Well, you don't care about thunks resisting in other unit/partition...
> 
> Sure you do - LTO might bring them into scope if you fold them to 
> a direct call early.

Well, we do have way to represent the thunk adjust over edges and lto-symtab
can just feed the data in while merging external decl with thunk.
Honza
> 
> Richard.

Patch

--- gcc/gimple-fold.c.jj	2011-03-14 14:12:15.000000000 +0100
+++ gcc/gimple-fold.c	2011-04-18 18:35:22.000000000 +0200
@@ -1374,7 +1374,7 @@  gimple_fold_builtin (gimple stmt)
    is a thunk (other than a this adjustment which is dealt with by DELTA). */
 
 tree
-gimple_get_virt_mehtod_for_binfo (HOST_WIDE_INT token, tree known_binfo,
+gimple_get_virt_method_for_binfo (HOST_WIDE_INT token, tree known_binfo,
 				  tree *delta, bool refuse_thunks)
 {
   HOST_WIDE_INT i;
@@ -1393,6 +1393,10 @@  gimple_get_virt_mehtod_for_binfo (HOST_W
       v = TREE_CHAIN (v);
     }
 
+  /* If BV_VCALL_INDEX is non-NULL, give up.  */
+  if (TREE_TYPE (v))
+    return NULL_TREE;
+
   fndecl = TREE_VALUE (v);
   node = cgraph_get_node_or_alias (fndecl);
   if (refuse_thunks
--- gcc/gimple.h.jj	2011-03-14 14:12:15.000000000 +0100
+++ gcc/gimple.h	2011-04-18 18:35:40.000000000 +0200
@@ -892,7 +892,7 @@  unsigned get_gimple_rhs_num_ops (enum tr
 gimple gimple_alloc_stat (enum gimple_code, unsigned MEM_STAT_DECL);
 const char *gimple_decl_printable_name (tree, int);
 bool gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace);
-tree gimple_get_virt_mehtod_for_binfo (HOST_WIDE_INT, tree, tree *, bool);
+tree gimple_get_virt_method_for_binfo (HOST_WIDE_INT, tree, tree *, bool);
 void gimple_adjust_this_by_delta (gimple_stmt_iterator *, tree);
 /* Returns true iff T is a valid GIMPLE statement.  */
 extern bool is_gimple_stmt (tree);
--- gcc/ipa-cp.c.jj	2011-04-13 12:39:28.000000000 +0200
+++ gcc/ipa-cp.c	2011-04-18 18:36:11.000000000 +0200
@@ -1242,7 +1242,7 @@  ipcp_process_devirtualization_opportunit
 	{
 	  tree binfo = VEC_index (tree, info->params[param_index].types, j);
 	  tree d;
-	  tree t = gimple_get_virt_mehtod_for_binfo (token, binfo, &d, true);
+	  tree t = gimple_get_virt_method_for_binfo (token, binfo, &d, true);
 
 	  if (!t)
 	    {
--- gcc/ipa-prop.c.jj	2011-04-13 12:39:28.000000000 +0200
+++ gcc/ipa-prop.c	2011-04-18 18:36:30.000000000 +0200
@@ -1730,7 +1730,7 @@  try_make_edge_direct_virtual_call (struc
   type = ie->indirect_info->otr_type;
   binfo = get_binfo_at_offset (binfo, ie->indirect_info->anc_offset, type);
   if (binfo)
-    target = gimple_get_virt_mehtod_for_binfo (token, binfo, &delta, true);
+    target = gimple_get_virt_method_for_binfo (token, binfo, &delta, true);
   else
     return NULL;
 
--- gcc/testsuite/g++.dg/torture/pr48661.C.jj	2011-04-18 18:50:49.000000000 +0200
+++ gcc/testsuite/g++.dg/torture/pr48661.C	2011-04-18 18:50:11.000000000 +0200
@@ -0,0 +1,77 @@ 
+// PR middle-end/48661
+// { dg-do run }
+
+extern "C" void abort ();
+
+__attribute__((noinline))
+double
+foo (double x, double y)
+{
+  asm volatile ("" : : : "memory");
+  return x + y;
+}
+
+__attribute__((noinline, noclone))
+void
+bar (int x)
+{
+  if (x != 123)
+    abort ();
+}
+
+struct A
+{
+  double a1, a2;
+};
+
+struct B 
+{
+  virtual int m () const = 0 ;
+};
+
+struct C
+{
+  virtual ~C () {}
+};
+
+struct D : virtual public B, public C
+{ 
+  explicit D (const A &x) : d(123) { foo (x.a2, x.a1); }
+  int m () const { return d; }
+  int d;
+}; 
+
+struct E
+{
+  E () : d(0) {}
+  virtual void n (const B &x) { d = x.m (); x.m (); x.m (); }
+  int d;
+};
+
+void
+test ()
+{
+  A a;
+  a.a1 = 0;
+  a.a2 = 1;
+  E p;
+  D q (a);
+  const B &b = q;
+  bar (b.m ());
+  p.n (b);
+  bar (p.d);
+}
+
+void
+baz ()
+{
+  A a;
+  D p2 (a);
+}
+
+int
+main ()
+{
+  test ();
+  return 0;
+}