diff mbox

[x86,63534] Fix '-p' profile for 32 bit PIC mode

Message ID CAOvf_xxG8pbtr=0v3scZTJVsNpu5Sgkjz9K6W+S8bbEaGPRjig@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko Oct. 24, 2014, 2:12 p.m. UTC
The following patch align stack for mcount and there should be no
problems with unwind as ix86_frame_pointer_required is true when
crtl->profile is true and flag_fentry is false (we call mcount after
function prolog).
When flag_fentry is true it is set to false in 32bit PIC mode:
  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
    {
      if (opts->x_flag_fentry > 0)
        sorry ("-mfentry isn%'t supported for 32-bit in combination "
      "with -fpic");
      opts->x_flag_fentry = 0;
    }



2014-10-24  Evgeny Stupachenko  <evstupac@gmail.com>

        PR target/63534
        * config/i386/i386.c (x86_function_profiler): Add GOT register init
        for mcount call.

   else
     {

On Fri, Oct 17, 2014 at 6:38 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Oct 17, 2014 at 06:30:42PM +0400, Evgeny Stupachenko wrote:
>> Hi,
>>
>> The patch fixes profile in 32bits PIC mode (only -p option affected).
>>
>> x86 bootstrap, make check passed
>>
>> spec2000 o2 -p train data on Corei7:
>> CINT -5%
>> CFP  +1,5
>> compared to a compiler before "enabling ebx".
>>
>> There is a potential performance improve after the patch applied
>> suggested by Jakub:
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63534#c8
>> There is opened bug on this: PR63527. However the fix of the bug is
>> more complicated.
>>
>> Is it ok?
>
> Unfortunately I don't think it is ok.
> 1) you don't set the appropriate bit in pic_labels_used (for ebx)
> 2) more importantly, it causes the stack to be misaligned (i.e. violating
>    ABI) for the _mcount call, and, break unwind info.
>
>> 2014-10-16  Evgeny Stupachenko  <evstupac@gmail.com>
>>
>>         PR target/63534
>>         * config/i386/i386.c (x86_function_profiler): Add GOT register init
>>         for mcount call.
>>
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index a3ca2ed..5117572 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -39119,11 +39126,15 @@ x86_function_profiler (FILE *file, int
>> labelno ATTRIBUTE_UNUSED)
>>      }
>>    else if (flag_pic)
>>      {
>> +      fprintf (file,"\tpush\t%%ebx\n");
>> +      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
>> +      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
>>  #ifndef NO_PROFILE_COUNTERS
>>        fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
>> PROFILE_COUNT_REGISTER "\n",
>>                LPREFIX, labelno);
>>  #endif
>>        fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
>> +      fprintf (file,"\tpop\t%%ebx\n");
>>      }
>>    else
>>      {
>
>         Jakub

Comments

Jakub Jelinek Oct. 24, 2014, 2:29 p.m. UTC | #1
On Fri, Oct 24, 2014 at 06:12:15PM +0400, Evgeny Stupachenko wrote:
> The following patch align stack for mcount and there should be no
> problems with unwind as ix86_frame_pointer_required is true when
> crtl->profile is true and flag_fentry is false (we call mcount after
> function prolog).
> When flag_fentry is true it is set to false in 32bit PIC mode:
>   if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
>     {
>       if (opts->x_flag_fentry > 0)
>         sorry ("-mfentry isn%'t supported for 32-bit in combination "
>       "with -fpic");
>       opts->x_flag_fentry = 0;
>     }

What is wrong in emitting the set_got right before the PROLOGUE_END
note and that way sharing a single load from both?
This looks just as a hack.

> 2014-10-24  Evgeny Stupachenko  <evstupac@gmail.com>
> 
>         PR target/63534
>         * config/i386/i386.c (x86_function_profiler): Add GOT register init
>         for mcount call.
> 
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 6235c4f..2dff29c 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -39124,13 +39124,22 @@ x86_function_profiler (FILE *file, int
> labelno ATTRIBUTE_UNUSED)
>        else
>         x86_print_call_or_nop (file, mcount_name);
>      }
> +  /* At this stage we can't detrmine where GOT register is, as RA can allocate
> +     it to any hard register.  Therefore we need to set it once again.  */
>    else if (flag_pic)
>      {
> +      pic_labels_used |= 1 << BX_REG;
> +      fprintf (file,"\tsub\t$16, %%esp\n");
> +      fprintf (file,"\tmovl\t%%ebx, (%%esp)\n");
> +      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
> +      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
>  #ifndef NO_PROFILE_COUNTERS
>        fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
> PROFILE_COUNT_REGISTER "\n",
>                LPREFIX, labelno);
>  #endif
>        fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
> +      fprintf (file,"\tmovl\t(%%esp), %%ebx\n");
> +      fprintf (file,"\tadd\t$16, %%esp\n");
>      }
>    else
>      {
> 

	Jakub
Evgeny Stupachenko Oct. 24, 2014, 3:19 p.m. UTC | #2
>What is wrong in emitting the set_got right before the PROLOGUE_END
>note and that way sharing a single load from both?
Can you please explain the idea? Now set_got emitted right after
PROLOGUE_END, what is the advantage in emitting it right before?
Which load is going to be shared?

>This looks just as a hack.
Isn't it similar to what was before but just adding additional "prints"?


On Fri, Oct 24, 2014 at 6:29 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Oct 24, 2014 at 06:12:15PM +0400, Evgeny Stupachenko wrote:
>> The following patch align stack for mcount and there should be no
>> problems with unwind as ix86_frame_pointer_required is true when
>> crtl->profile is true and flag_fentry is false (we call mcount after
>> function prolog).
>> When flag_fentry is true it is set to false in 32bit PIC mode:
>>   if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
>>     {
>>       if (opts->x_flag_fentry > 0)
>>         sorry ("-mfentry isn%'t supported for 32-bit in combination "
>>       "with -fpic");
>>       opts->x_flag_fentry = 0;
>>     }
>
> What is wrong in emitting the set_got right before the PROLOGUE_END
> note and that way sharing a single load from both?
> This looks just as a hack.
>
>> 2014-10-24  Evgeny Stupachenko  <evstupac@gmail.com>
>>
>>         PR target/63534
>>         * config/i386/i386.c (x86_function_profiler): Add GOT register init
>>         for mcount call.
>>
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index 6235c4f..2dff29c 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -39124,13 +39124,22 @@ x86_function_profiler (FILE *file, int
>> labelno ATTRIBUTE_UNUSED)
>>        else
>>         x86_print_call_or_nop (file, mcount_name);
>>      }
>> +  /* At this stage we can't detrmine where GOT register is, as RA can allocate
>> +     it to any hard register.  Therefore we need to set it once again.  */
>>    else if (flag_pic)
>>      {
>> +      pic_labels_used |= 1 << BX_REG;
>> +      fprintf (file,"\tsub\t$16, %%esp\n");
>> +      fprintf (file,"\tmovl\t%%ebx, (%%esp)\n");
>> +      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
>> +      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
>>  #ifndef NO_PROFILE_COUNTERS
>>        fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
>> PROFILE_COUNT_REGISTER "\n",
>>                LPREFIX, labelno);
>>  #endif
>>        fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
>> +      fprintf (file,"\tmovl\t(%%esp), %%ebx\n");
>> +      fprintf (file,"\tadd\t$16, %%esp\n");
>>      }
>>    else
>>      {
>>
>
>         Jakub
Jakub Jelinek Oct. 24, 2014, 3:50 p.m. UTC | #3
On Fri, Oct 24, 2014 at 07:19:53PM +0400, Evgeny Stupachenko wrote:
> >What is wrong in emitting the set_got right before the PROLOGUE_END
> >note and that way sharing a single load from both?
> Can you please explain the idea? Now set_got emitted right after
> PROLOGUE_END, what is the advantage in emitting it right before?
> Which load is going to be shared?

I thought I've already explained.
In ix86_init_pic_reg 32-bit part, if crtl->profile, instead of
      rtx insn = emit_insn (gen_set_got (pic_offset_table_rtx));
      RTX_FRAME_RELATED_P (insn) = 1;
do:
      rtx reg = crtl->profile
		? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
		: pic_offset_table_rtx;
      rtx insn = emit_insn (gen_set_got (reg));
      RTX_FRAME_RELATED_P (insn) = 1;
      if (crtl->profile)
	emit_move_insn (pic_offset_table_rtx, reg);
or so.  That will ensure the RA will most likely allocate the pic pseudo
to %ebx at the start of the function, and even if it doesn't, it will still
be loaded into that reg and only then moved to some other reg.

Then, supposedly you need to tweak the condition in ix86_save_reg:
  if (pic_offset_table_rtx
      && !ix86_use_pseudo_pic_reg ()
      && regno == REAL_PIC_OFFSET_TABLE_REGNUM
      && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
          || crtl->profile
          || crtl->calls_eh_return
          || crtl->uses_const_pool
          || cfun->has_nonlocal_label))
    return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
to something like:
  if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
      && pic_offset_table_rtx)
    {
      if (ix86_use_pseudo_pic_reg ())
	{
	  /* %ebx needed by call to _mcount after the prologue.  */
	  if (!TARGET_64BIT && flag_pic && crtl->profile)
	    return true;
	}
      else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
	       || crtl->profile
	       || crtl->calls_eh_return
	       || crtl->uses_const_pool
	       || cfun->has_nonlocal_label))
	return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
    }
which will make sure the prologue/epilogue saves/restores %ebx properly.

And, finally, for the !TARGET_64BIT && flag_pic && crtl->profile case
e.g. at the end of ix86_expand_prologue, check if the prologue is followed
by a series of notes (one of which is the PROLOGUE_END note), but no real
insns, and followed by set_got pattern (perhaps check that it recogs to
CODE_FOR_set_got) that loads into %ebx.  If it does, then fine, and just
move that insn from where it is emitted to before those notes.
If you don't find it there, emit set_got insn to %ebx yourself at the end
of the prologue.  Then no need to change the _mcount call in any way.
The profiler code is emitted on the PROLOGUE_END note, so if you managed
to move the set_got across the PROLOGUE_END note, or if you added an extra
one (e.g. for the case when no set_got was really needed in the rest of the
function), at that point the pic register will be allocated in %ebx.

> >> --- a/gcc/config/i386/i386.c
> >> +++ b/gcc/config/i386/i386.c
> >> @@ -39124,13 +39124,22 @@ x86_function_profiler (FILE *file, int
> >> labelno ATTRIBUTE_UNUSED)
> >>        else
> >>         x86_print_call_or_nop (file, mcount_name);
> >>      }
> >> +  /* At this stage we can't detrmine where GOT register is, as RA can allocate
> >> +     it to any hard register.  Therefore we need to set it once again.  */
> >>    else if (flag_pic)
> >>      {
> >> +      pic_labels_used |= 1 << BX_REG;
> >> +      fprintf (file,"\tsub\t$16, %%esp\n");
> >> +      fprintf (file,"\tmovl\t%%ebx, (%%esp)\n");
> >> +      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
> >> +      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
> >>  #ifndef NO_PROFILE_COUNTERS
> >>        fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
> >> PROFILE_COUNT_REGISTER "\n",
> >>                LPREFIX, labelno);
> >>  #endif
> >>        fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
> >> +      fprintf (file,"\tmovl\t(%%esp), %%ebx\n");
> >> +      fprintf (file,"\tadd\t$16, %%esp\n");

Note, the unwind info is wrong even in this case.  Whenever you are in
between that call\t__x86.get_pc_thunk.bx and movl\t(%%esp), %%ebx,
there is no unwind info telling the debug info consumers that %ebx has been
saved to the stack and where, so any time the debugger or anything else
looks up at outer frames e.g. from _mcount, the %ebx will contain bogus
value in the function that calls the function with _mcount call.

	Jakub
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6235c4f..2dff29c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -39124,13 +39124,22 @@  x86_function_profiler (FILE *file, int
labelno ATTRIBUTE_UNUSED)
       else
        x86_print_call_or_nop (file, mcount_name);
     }
+  /* At this stage we can't detrmine where GOT register is, as RA can allocate
+     it to any hard register.  Therefore we need to set it once again.  */
   else if (flag_pic)
     {
+      pic_labels_used |= 1 << BX_REG;
+      fprintf (file,"\tsub\t$16, %%esp\n");
+      fprintf (file,"\tmovl\t%%ebx, (%%esp)\n");
+      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
+      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
 #ifndef NO_PROFILE_COUNTERS
       fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
PROFILE_COUNT_REGISTER "\n",
               LPREFIX, labelno);
 #endif
       fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
+      fprintf (file,"\tmovl\t(%%esp), %%ebx\n");
+      fprintf (file,"\tadd\t$16, %%esp\n");
     }