diff mbox

[x86,63534] Fix '-p' profile for 32 bit PIC mode

Message ID CAOvf_xzJTmoi=MOa9g65zqhqRfHSG59C_bqUso-CeV9B9SUQeg@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko Oct. 28, 2014, 1:10 p.m. UTC
Thank you, Jakub.

The following patch passed bootstrap, gcc make check and spec2000 with
"-p -m32 -fPIC".
Is it ok?

ChangeLog:

2014-10-28  Evgeny Stupachenko  <evstupac@gmail.com>

gcc/testsuite
        * gcc.target/i386/mcount_pic.c: New.

gcc/
        * config/i386/i386.c (ix86_init_pic_reg): Emit SET_GOT to
        REAL_PIC_OFFSET_TABLE_REGNUM for mcount profiling.
        (ix86_save_reg): Save REAL_PIC_OFFSET_TABLE_REGNUM when profiling
        using mcount in 32bit PIC mode.
        (ix86_elim_set_got): New.
        (ix86_expand_prologue): For the mcount profiling emit new SET_GOT
        in PROLOGUE, delete initial if possible.



On Fri, Oct 24, 2014 at 7:50 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Oct 24, 2014 at 07:19:53PM +0400, Evgeny Stupachenko wrote:
>> >What is wrong in emitting the set_got right before the PROLOGUE_END
>> >note and that way sharing a single load from both?
>> Can you please explain the idea? Now set_got emitted right after
>> PROLOGUE_END, what is the advantage in emitting it right before?
>> Which load is going to be shared?
>
> I thought I've already explained.
> In ix86_init_pic_reg 32-bit part, if crtl->profile, instead of
>       rtx insn = emit_insn (gen_set_got (pic_offset_table_rtx));
>       RTX_FRAME_RELATED_P (insn) = 1;
> do:
>       rtx reg = crtl->profile
>                 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
>                 : pic_offset_table_rtx;
>       rtx insn = emit_insn (gen_set_got (reg));
>       RTX_FRAME_RELATED_P (insn) = 1;
>       if (crtl->profile)
>         emit_move_insn (pic_offset_table_rtx, reg);
> or so.  That will ensure the RA will most likely allocate the pic pseudo
> to %ebx at the start of the function, and even if it doesn't, it will still
> be loaded into that reg and only then moved to some other reg.
>
> Then, supposedly you need to tweak the condition in ix86_save_reg:
>   if (pic_offset_table_rtx
>       && !ix86_use_pseudo_pic_reg ()
>       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
>       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
>           || crtl->profile
>           || crtl->calls_eh_return
>           || crtl->uses_const_pool
>           || cfun->has_nonlocal_label))
>     return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
> to something like:
>   if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
>       && pic_offset_table_rtx)
>     {
>       if (ix86_use_pseudo_pic_reg ())
>         {
>           /* %ebx needed by call to _mcount after the prologue.  */
>           if (!TARGET_64BIT && flag_pic && crtl->profile)
>             return true;
>         }
>       else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
>                || crtl->profile
>                || crtl->calls_eh_return
>                || crtl->uses_const_pool
>                || cfun->has_nonlocal_label))
>         return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
>     }
> which will make sure the prologue/epilogue saves/restores %ebx properly.
>
> And, finally, for the !TARGET_64BIT && flag_pic && crtl->profile case
> e.g. at the end of ix86_expand_prologue, check if the prologue is followed
> by a series of notes (one of which is the PROLOGUE_END note), but no real
> insns, and followed by set_got pattern (perhaps check that it recogs to
> CODE_FOR_set_got) that loads into %ebx.  If it does, then fine, and just
> move that insn from where it is emitted to before those notes.
> If you don't find it there, emit set_got insn to %ebx yourself at the end
> of the prologue.  Then no need to change the _mcount call in any way.
> The profiler code is emitted on the PROLOGUE_END note, so if you managed
> to move the set_got across the PROLOGUE_END note, or if you added an extra
> one (e.g. for the case when no set_got was really needed in the rest of the
> function), at that point the pic register will be allocated in %ebx.
>
>> >> --- a/gcc/config/i386/i386.c
>> >> +++ b/gcc/config/i386/i386.c
>> >> @@ -39124,13 +39124,22 @@ x86_function_profiler (FILE *file, int
>> >> labelno ATTRIBUTE_UNUSED)
>> >>        else
>> >>         x86_print_call_or_nop (file, mcount_name);
>> >>      }
>> >> +  /* At this stage we can't detrmine where GOT register is, as RA can allocate
>> >> +     it to any hard register.  Therefore we need to set it once again.  */
>> >>    else if (flag_pic)
>> >>      {
>> >> +      pic_labels_used |= 1 << BX_REG;
>> >> +      fprintf (file,"\tsub\t$16, %%esp\n");
>> >> +      fprintf (file,"\tmovl\t%%ebx, (%%esp)\n");
>> >> +      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
>> >> +      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
>> >>  #ifndef NO_PROFILE_COUNTERS
>> >>        fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
>> >> PROFILE_COUNT_REGISTER "\n",
>> >>                LPREFIX, labelno);
>> >>  #endif
>> >>        fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
>> >> +      fprintf (file,"\tmovl\t(%%esp), %%ebx\n");
>> >> +      fprintf (file,"\tadd\t$16, %%esp\n");
>
> Note, the unwind info is wrong even in this case.  Whenever you are in
> between that call\t__x86.get_pc_thunk.bx and movl\t(%%esp), %%ebx,
> there is no unwind info telling the debug info consumers that %ebx has been
> saved to the stack and where, so any time the debugger or anything else
> looks up at outer frames e.g. from _mcount, the %ebx will contain bogus
> value in the function that calls the function with _mcount call.
>
>         Jakub

On Fri, Oct 24, 2014 at 6:50 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Oct 24, 2014 at 07:19:53PM +0400, Evgeny Stupachenko wrote:
>> >What is wrong in emitting the set_got right before the PROLOGUE_END
>> >note and that way sharing a single load from both?
>> Can you please explain the idea? Now set_got emitted right after
>> PROLOGUE_END, what is the advantage in emitting it right before?
>> Which load is going to be shared?
>
> I thought I've already explained.
> In ix86_init_pic_reg 32-bit part, if crtl->profile, instead of
>       rtx insn = emit_insn (gen_set_got (pic_offset_table_rtx));
>       RTX_FRAME_RELATED_P (insn) = 1;
> do:
>       rtx reg = crtl->profile
>                 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
>                 : pic_offset_table_rtx;
>       rtx insn = emit_insn (gen_set_got (reg));
>       RTX_FRAME_RELATED_P (insn) = 1;
>       if (crtl->profile)
>         emit_move_insn (pic_offset_table_rtx, reg);
> or so.  That will ensure the RA will most likely allocate the pic pseudo
> to %ebx at the start of the function, and even if it doesn't, it will still
> be loaded into that reg and only then moved to some other reg.
>
> Then, supposedly you need to tweak the condition in ix86_save_reg:
>   if (pic_offset_table_rtx
>       && !ix86_use_pseudo_pic_reg ()
>       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
>       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
>           || crtl->profile
>           || crtl->calls_eh_return
>           || crtl->uses_const_pool
>           || cfun->has_nonlocal_label))
>     return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
> to something like:
>   if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
>       && pic_offset_table_rtx)
>     {
>       if (ix86_use_pseudo_pic_reg ())
>         {
>           /* %ebx needed by call to _mcount after the prologue.  */
>           if (!TARGET_64BIT && flag_pic && crtl->profile)
>             return true;
>         }
>       else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
>                || crtl->profile
>                || crtl->calls_eh_return
>                || crtl->uses_const_pool
>                || cfun->has_nonlocal_label))
>         return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
>     }
> which will make sure the prologue/epilogue saves/restores %ebx properly.
>
> And, finally, for the !TARGET_64BIT && flag_pic && crtl->profile case
> e.g. at the end of ix86_expand_prologue, check if the prologue is followed
> by a series of notes (one of which is the PROLOGUE_END note), but no real
> insns, and followed by set_got pattern (perhaps check that it recogs to
> CODE_FOR_set_got) that loads into %ebx.  If it does, then fine, and just
> move that insn from where it is emitted to before those notes.
> If you don't find it there, emit set_got insn to %ebx yourself at the end
> of the prologue.  Then no need to change the _mcount call in any way.
> The profiler code is emitted on the PROLOGUE_END note, so if you managed
> to move the set_got across the PROLOGUE_END note, or if you added an extra
> one (e.g. for the case when no set_got was really needed in the rest of the
> function), at that point the pic register will be allocated in %ebx.
>
>> >> --- a/gcc/config/i386/i386.c
>> >> +++ b/gcc/config/i386/i386.c
>> >> @@ -39124,13 +39124,22 @@ x86_function_profiler (FILE *file, int
>> >> labelno ATTRIBUTE_UNUSED)
>> >>        else
>> >>         x86_print_call_or_nop (file, mcount_name);
>> >>      }
>> >> +  /* At this stage we can't detrmine where GOT register is, as RA can allocate
>> >> +     it to any hard register.  Therefore we need to set it once again.  */
>> >>    else if (flag_pic)
>> >>      {
>> >> +      pic_labels_used |= 1 << BX_REG;
>> >> +      fprintf (file,"\tsub\t$16, %%esp\n");
>> >> +      fprintf (file,"\tmovl\t%%ebx, (%%esp)\n");
>> >> +      fprintf (file,"\tcall\t__x86.get_pc_thunk.bx\n");
>> >> +      fprintf (file,"\taddl\t$_GLOBAL_OFFSET_TABLE_, %%ebx\n");
>> >>  #ifndef NO_PROFILE_COUNTERS
>> >>        fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%"
>> >> PROFILE_COUNT_REGISTER "\n",
>> >>                LPREFIX, labelno);
>> >>  #endif
>> >>        fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
>> >> +      fprintf (file,"\tmovl\t(%%esp), %%ebx\n");
>> >> +      fprintf (file,"\tadd\t$16, %%esp\n");
>
> Note, the unwind info is wrong even in this case.  Whenever you are in
> between that call\t__x86.get_pc_thunk.bx and movl\t(%%esp), %%ebx,
> there is no unwind info telling the debug info consumers that %ebx has been
> saved to the stack and where, so any time the debugger or anything else
> looks up at outer frames e.g. from _mcount, the %ebx will contain bogus
> value in the function that calls the function with _mcount call.
>
>         Jakub

Comments

Jakub Jelinek Oct. 28, 2014, 1:36 p.m. UTC | #1
On Tue, Oct 28, 2014 at 04:10:12PM +0300, Evgeny Stupachenko wrote:
> Thank you, Jakub.
> 
> The following patch passed bootstrap, gcc make check and spec2000 with
> "-p -m32 -fPIC".
> Is it ok?
> 
> ChangeLog:
> 
> 2014-10-28  Evgeny Stupachenko  <evstupac@gmail.com>
> 
> gcc/testsuite
>         * gcc.target/i386/mcount_pic.c: New.
> 
> gcc/

Please mention
	PR target/63534
in the ChangeLog entry.

> @@ -10818,6 +10833,36 @@ ix86_finalize_stack_realign_flags (void)
>    crtl->stack_realign_finalized = true;
>  }
> 
> +/* Delete first SET_GOT allocated to reg.  */
> +
> +static void
> +ix86_elim_set_got (rtx reg)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      rtx_insn *c_insn;
> +      FOR_BB_INSNS (bb, c_insn)
> +       {
> +         if (GET_CODE (c_insn) == INSN)
> +           {
> +             rtx pat = PATTERN (c_insn);
> +             if (GET_CODE (pat) == PARALLEL)
> +               {
> +                 rtx vec = XVECEXP (pat, 0, 0);
> +                 if (GET_CODE (vec) == SET
> +                    && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
> +                    && REGNO (XEXP (vec, 0)) == REGNO (reg))
> +                   {
> +                     delete_insn (c_insn);
> +                     return;
> +                   }
> +               }
> +           }
> +       }
> +    }
> +}
> +

This is unsafe.  What I meant is to just remove set_got insn
that is at the beginning of the first basic block (successor of
entry bb), perhaps after some notes.  You can perhaps generalize
that a little bit and look at other insns in the first bb, as long
as no NONDEBUG_INSN_P insn preceeding satisfy
modified_in_p (reg, c_insn) or reg_referenced_p (reg, PATTERN (c_insn))
or so.  But certainly stop at the end of the first bb or at any
real insn that might clobber/set %ebx or use it.

Also, instead of GET_CODE (c_insn) == INSN please use
NONJUMP_INSN_P (c_insn).

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mcount_pic.c
> @@ -0,0 +1,12 @@
> +/* Test check correct mcount generation.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target ia32 } */
> +/* { dg-options "-O2 -fpic -p -save-temps" } */
> +
> +int main()
> +{
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler "mcount" } } */
> +/* { dg-final { scan-assembler "get_pc_thunk" } } */

Missing cleanup-save-temps.  Is _mcount the name of the profiling routine
on all i?86 targets?

	Jakub
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6235c4f..3332f36 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -6190,8 +6190,15 @@  ix86_init_pic_reg (void)
     }
   else
     {
-      rtx insn = emit_insn (gen_set_got (pic_offset_table_rtx));
+      /*  If there is future mcount call in the function it is more profitable
+         to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM.  */
+      rtx reg = crtl->profile
+               ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
+               : pic_offset_table_rtx;
+      rtx insn = emit_insn (gen_set_got (reg));
       RTX_FRAME_RELATED_P (insn) = 1;
+      if (crtl->profile)
+        emit_move_insn (pic_offset_table_rtx, reg);
       add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
     }

@@ -9471,15 +9478,23 @@  ix86_select_alt_pic_regnum (void)
 static bool
 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
 {
-  if (pic_offset_table_rtx
-      && !ix86_use_pseudo_pic_reg ()
-      && regno == REAL_PIC_OFFSET_TABLE_REGNUM
-      && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
-         || crtl->profile
-         || crtl->calls_eh_return
-         || crtl->uses_const_pool
-         || cfun->has_nonlocal_label))
-    return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
+  if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
+      && pic_offset_table_rtx)
+    {
+      if (ix86_use_pseudo_pic_reg ())
+       {
+         /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
+         _mcount in prologue.  */
+         if (!TARGET_64BIT && flag_pic && crtl->profile)
+           return true;
+       }
+      else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
+              || crtl->profile
+              || crtl->calls_eh_return
+              || crtl->uses_const_pool
+              || cfun->has_nonlocal_label)
+        return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
+    }

   if (crtl->calls_eh_return && maybe_eh_return)
     {
@@ -10818,6 +10833,36 @@  ix86_finalize_stack_realign_flags (void)
   crtl->stack_realign_finalized = true;
 }

+/* Delete first SET_GOT allocated to reg.  */
+
+static void
+ix86_elim_set_got (rtx reg)
+{
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *c_insn;
+      FOR_BB_INSNS (bb, c_insn)
+       {
+         if (GET_CODE (c_insn) == INSN)
+           {
+             rtx pat = PATTERN (c_insn);
+             if (GET_CODE (pat) == PARALLEL)
+               {
+                 rtx vec = XVECEXP (pat, 0, 0);
+                 if (GET_CODE (vec) == SET
+                    && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
+                    && REGNO (XEXP (vec, 0)) == REGNO (reg))
+                   {
+                     delete_insn (c_insn);
+                     return;
+                   }
+               }
+           }
+       }
+    }
+}
+
 /* Expand the prologue into a bunch of separate insns.  */

 void
@@ -11271,6 +11316,20 @@  ix86_expand_prologue (void)
   if (!sse_registers_saved)
     ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);

+  /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
+     in PROLOGUE.  */
+  if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
+    {
+      rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
+      insn = emit_insn (gen_set_got (pic));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
+      emit_insn (gen_prologue_use (pic));
+      /* Deleting already emmitted SET_GOT if exist and allocated to
+        REAL_PIC_OFFSET_TABLE_REGNUM.  */
+      ix86_elim_set_got (pic);
+    }
+
   if (crtl->drap_reg && !crtl->stack_realign_needed)
     {
       /* vDRAP is setup but after reload it turns out stack realign
diff --git a/gcc/testsuite/gcc.target/i386/mcount_pic.c
b/gcc/testsuite/gcc.target/i386/mcount_pic.c
new file mode 100644
index 0000000..1297265
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mcount_pic.c
@@ -0,0 +1,12 @@ 
+/* Test check correct mcount generation.  */
+/* { dg-do run } */
+/* { dg-require-effective-target ia32 } */
+/* { dg-options "-O2 -fpic -p -save-temps" } */
+
+int main()
+{
+  return 0;
+}
+
+/* { dg-final { scan-assembler "mcount" } } */
+/* { dg-final { scan-assembler "get_pc_thunk" } } */