
[v6,1/9] ppc64 (le): prepare for -mprofile-kernel

Message ID 20160125170723.D2CCE692CE@newverein.lst.de (mailing list archive)
State Superseded

Commit Message

Torsten Duwe Jan. 25, 2016, 3:26 p.m. UTC
The gcc switch -mprofile-kernel, available for ppc64 on gcc > 4.8.5,
makes gcc call _mcount very early in the function, which low-level
ASM code and code-patching functions need to take into account.
In particular, the link register and the parameter registers are still
live and not yet saved into a new stack frame.

Signed-off-by: Torsten Duwe <duwe@suse.de>
---
 arch/powerpc/kernel/entry_64.S  | 45 +++++++++++++++++++++++++++++++++++++++--
 arch/powerpc/kernel/ftrace.c    | 12 +++++++++--
 arch/powerpc/kernel/module_64.c | 14 +++++++++++++
 3 files changed, 67 insertions(+), 4 deletions(-)

Comments

Michael Ellerman Jan. 27, 2016, 10:19 a.m. UTC | #1
Hi Torsten,

On Mon, 2016-01-25 at 16:26 +0100, Torsten Duwe wrote:
> The gcc switch -mprofile-kernel, available for ppc64 on gcc > 4.8.5,
> allows to call _mcount very early in the function, which low-level
> ASM code and code patching functions need to consider.
> Especially the link register and the parameter registers are still
> alive and not yet saved into a new stack frame.
> 
> Signed-off-by: Torsten Duwe <duwe@suse.de>
> ---
>  arch/powerpc/kernel/entry_64.S  | 45 +++++++++++++++++++++++++++++++++++++++--
>  arch/powerpc/kernel/ftrace.c    | 12 +++++++++--
>  arch/powerpc/kernel/module_64.c | 14 +++++++++++++
>  3 files changed, 67 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index a94f155..e7cd043 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -1206,7 +1206,12 @@ _GLOBAL(enter_prom)
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  _GLOBAL(mcount)
>  _GLOBAL(_mcount)
> -	blr
> +	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
> +	mflr	r0
> +	mtctr	r0
> +	ld	r0,LRSAVE(r1)
> +	mtlr	r0
> +	bctr

Can we use r11 instead? eg:

_GLOBAL(_mcount)
	mflr	r11
	mtctr	r11
	mtlr	r0
	bctr

Otherwise I worry the std/ld is going to cause a load-hit-store. And it's just
plain more instructions too.

I don't quite grok the gcc code enough to tell if that's always safe, GCC does
use r11 sometimes, but I don't think it ever expects it to survive across
_mcount()?


> @@ -1262,13 +1267,28 @@ _GLOBAL(ftrace_stub)
>  
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>  _GLOBAL(ftrace_graph_caller)
> +#ifdef CC_USING_MPROFILE_KERNEL
> +	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
> +	std	r10, 104(r1)
> +	std	r9, 96(r1)
> +	std	r8, 88(r1)
> +	std	r7, 80(r1)
> +	std	r6, 72(r1)
> +	std	r5, 64(r1)
> +	std	r4, 56(r1)
> +	std	r3, 48(r1)
> +	mfctr	r4		/* ftrace_caller has moved local addr here */
> +	std	r4, 40(r1)
> +	mflr	r3		/* ftrace_caller has restored LR from stack */
> +#else
>  	/* load r4 with local address */
>  	ld	r4, 128(r1)
> -	subi	r4, r4, MCOUNT_INSN_SIZE
>  
>  	/* Grab the LR out of the caller stack frame */
>  	ld	r11, 112(r1)
>  	ld	r3, 16(r11)
> +#endif
> +	subi	r4, r4, MCOUNT_INSN_SIZE
>  
>  	bl	prepare_ftrace_return
>  	nop

AFAICS these end up being the only instructions shared between the two
versions. Which I don't think is worth the semantic burden of all the #ifdefs.
So please just write it as two separate functions, one for
CC_USING_MPROFILE_KERNEL and one for not.

> @@ -1277,6 +1297,26 @@ _GLOBAL(ftrace_graph_caller)
>  	 * prepare_ftrace_return gives us the address we divert to.
>  	 * Change the LR in the callers stack frame to this.
>  	 */
> +
> +#ifdef CC_USING_MPROFILE_KERNEL
> +	mtlr	r3
> +
> +	ld	r0, 40(r1)
> +	mtctr	r0
> +	ld	r10, 104(r1)
> +	ld	r9, 96(r1)
> +	ld	r8, 88(r1)
> +	ld	r7, 80(r1)
> +	ld	r6, 72(r1)
> +	ld	r5, 64(r1)
> +	ld	r4, 56(r1)
> +	ld	r3, 48(r1)
> +
> +	addi	r1, r1, 112
> +	mflr	r0
> +	std	r0, LRSAVE(r1)
> +	bctr
> +#else
>  	ld	r11, 112(r1)
>  	std	r3, 16(r11)
>  
> @@ -1284,6 +1324,7 @@ _GLOBAL(ftrace_graph_caller)
>  	mtlr	r0
>  	addi	r1, r1, 112
>  	blr
> +#endif
>  
>  _GLOBAL(return_to_handler)
>  	/* need to save return values */
> diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
> index 44d4d8e..080c525 100644
> --- a/arch/powerpc/kernel/ftrace.c
> +++ b/arch/powerpc/kernel/ftrace.c
> @@ -306,11 +306,19 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>  	 * The load offset is different depending on the ABI. For simplicity
>  	 * just mask it out when doing the compare.
>  	 */
> +#ifndef CC_USING_MPROFILE_KERNEL
>  	if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
> -		pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
> +		pr_err("Unexpected call sequence at %p: %x %x\n",
> +		ip, op[0], op[1]);
>  		return -EINVAL;
>  	}
> -
> +#else
> +	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
> +	if (op[0] != 0x60000000) {

That is "PPC_INST_NOP".

> +		pr_err("Unexpected call at %p: %x\n", ip, op[0]);
> +		return -EINVAL;
> +	}
> +#endif

Can you please break that out into a static inline, with separate versions for
the two cases.

We should aim for no #ifdefs inside functions.
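For illustration, the requested shape might look something like this (a
sketch only: the helper name is invented here, and the checks are just the
ones from the hunk above):

#ifdef CC_USING_MPROFILE_KERNEL
static int expected_call_sequence(void *ip, unsigned int op0, unsigned int op1)
{
	/* -mprofile-kernel: the call site has been patched to a single nop */
	if (op0 != PPC_INST_NOP) {
		pr_err("Unexpected call at %p: %x\n", ip, op0);
		return -EINVAL;
	}
	return 0;
}
#else
static int expected_call_sequence(void *ip, unsigned int op0, unsigned int op1)
{
	/* expect "b +8" followed by "ld r2,N(r1)", masking the
	 * ABI-dependent load offset
	 */
	if (op0 != 0x48000008 || (op1 & 0xffff0000) != 0xe8410000) {
		pr_err("Unexpected call sequence at %p: %x %x\n",
		       ip, op0, op1);
		return -EINVAL;
	}
	return 0;
}
#endif

__ftrace_make_call() would then call the helper unconditionally, keeping the
#ifdef at file scope.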

> diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
> index 6838451..30f6be1 100644
> --- a/arch/powerpc/kernel/module_64.c
> +++ b/arch/powerpc/kernel/module_64.c
> @@ -475,6 +475,20 @@ static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
>  static int restore_r2(u32 *instruction, struct module *me)
>  {
>  	if (*instruction != PPC_INST_NOP) {
> +#ifdef CC_USING_MPROFILE_KERNEL
> +		/* -mprofile_kernel sequence starting with
> +		 * mflr r0 and maybe std r0, LRSAVE(r1)
> +		 */
> +		if ((instruction[-3] == 0x7c0802a6 &&
> +		    instruction[-2] == 0xf8010010) ||
> +		    instruction[-2] == 0x7c0802a6) {
> +			/* Nothing to be done here, it's an _mcount
> +			 * call location and r2 will have to be
> +			 * restored in the _mcount function.
> +			 */
> +			return 1;
> +		};
> +#endif

Again I'd rather that was in a helper static inline.

And some #defines for the instructions would also help.

>  		pr_err("%s: Expect noop after relocate, got %08x\n",
>  		       me->name, *instruction);
>  		return 0;
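A sketch of what that could look like (the names are invented; the encodings
are the ones from the hunk above, 0x7c0802a6 being "mflr r0" and 0xf8010010
being "std r0,16(r1)"):

#define PPC_INST_MFLR_R0	0x7c0802a6	/* mflr r0        */
#define PPC_INST_STD_R0_16R1	0xf8010010	/* std  r0,16(r1) */

#ifdef CC_USING_MPROFILE_KERNEL
/* does this nop sit at an -mprofile-kernel _mcount call site? */
static inline int is_mprofile_mcount_callsite(u32 *instruction)
{
	return (instruction[-3] == PPC_INST_MFLR_R0 &&
		instruction[-2] == PPC_INST_STD_R0_16R1) ||
	       instruction[-2] == PPC_INST_MFLR_R0;
}
#else
static inline int is_mprofile_mcount_callsite(u32 *instruction)
{
	return 0;
}
#endif

restore_r2() could then simply return 1 when the helper matches, with no
#ifdef left in its body.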


cheers
Torsten Duwe Jan. 27, 2016, 10:44 a.m. UTC | #2
On Wed, Jan 27, 2016 at 09:19:27PM +1100, Michael Ellerman wrote:
> Hi Torsten,
> 
> > +++ b/arch/powerpc/kernel/entry_64.S
> > @@ -1206,7 +1206,12 @@ _GLOBAL(enter_prom)
> >  #ifdef CONFIG_DYNAMIC_FTRACE
> >  _GLOBAL(mcount)
> >  _GLOBAL(_mcount)
> > -	blr
> > +	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
> > +	mflr	r0
> > +	mtctr	r0
> > +	ld	r0,LRSAVE(r1)
> > +	mtlr	r0
> > +	bctr
> 
> Can we use r11 instead? eg:
> 
> _GLOBAL(_mcount)
> 	mflr	r11
> 	mtctr	r11
> 	mtlr	r0
> 	bctr
> 
> Otherwise I worry the std/ld is going to cause a load-hit-store. And it's just
> plain more instructions too.
> 
> I don't quite grok the gcc code enough to tell if that's always safe, GCC does
> use r11 sometimes, but I don't think it ever expects it to survive across
> _mcount()?

I used r11 in that area once, and it crashed, but I don't recall the details.
We'll see. The performance shouldn't be critical, as the code is only used
during boot-up. With DYNAMIC_FTRACE, the calls will be replaced by
0x600000^W PPC_INST_NOP :)

> >  
> >  	bl	prepare_ftrace_return
> >  	nop
> 
> AFAICS these end up being the only instructions shared between the two
> versions. Which I don't think is worth the semantic burden of all the #ifdefs.
> So please just write it as two separate functions, one for
> CC_USING_MPROFILE_KERNEL and one for not.
> 
> > index 44d4d8e..080c525 100644
> > --- a/arch/powerpc/kernel/ftrace.c
> > +++ b/arch/powerpc/kernel/ftrace.c
> > @@ -306,11 +306,19 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
> >  	 * The load offset is different depending on the ABI. For simplicity
> >  	 * just mask it out when doing the compare.
> >  	 */
> > +#ifndef CC_USING_MPROFILE_KERNEL
> >  	if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
> > -		pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
> > +		pr_err("Unexpected call sequence at %p: %x %x\n",
> > +		ip, op[0], op[1]);
> >  		return -EINVAL;
> >  	}
> > -
> > +#else
> > +	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
> > +	if (op[0] != 0x60000000) {
> 
> That is "PPC_INST_NOP".
> 
> > +		pr_err("Unexpected call at %p: %x\n", ip, op[0]);
> > +		return -EINVAL;
> > +	}
> > +#endif
> 
> Can you please break that out into a static inline, with separate versions for
> the two cases.
> 
> We should aim for no #ifdefs inside functions.

Points taken.

Does this set _work_ for you now? That'd be great to hear.

Stay tuned for v7...

	Torsten
Alan Modra Jan. 27, 2016, 12:58 p.m. UTC | #3
On Wed, Jan 27, 2016 at 09:19:27PM +1100, Michael Ellerman wrote:
> Hi Torsten,
> 
> On Mon, 2016-01-25 at 16:26 +0100, Torsten Duwe wrote:
> > The gcc switch -mprofile-kernel, available for ppc64 on gcc > 4.8.5,
> > allows to call _mcount very early in the function, which low-level
> > ASM code and code patching functions need to consider.
> > Especially the link register and the parameter registers are still
> > alive and not yet saved into a new stack frame.
> > 
> > Signed-off-by: Torsten Duwe <duwe@suse.de>
> > ---
> >  arch/powerpc/kernel/entry_64.S  | 45 +++++++++++++++++++++++++++++++++++++++--
> >  arch/powerpc/kernel/ftrace.c    | 12 +++++++++--
> >  arch/powerpc/kernel/module_64.c | 14 +++++++++++++
> >  3 files changed, 67 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> > index a94f155..e7cd043 100644
> > --- a/arch/powerpc/kernel/entry_64.S
> > +++ b/arch/powerpc/kernel/entry_64.S
> > @@ -1206,7 +1206,12 @@ _GLOBAL(enter_prom)
> >  #ifdef CONFIG_DYNAMIC_FTRACE
> >  _GLOBAL(mcount)
> >  _GLOBAL(_mcount)
> > -	blr
> > +	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
> > +	mflr	r0
> > +	mtctr	r0
> > +	ld	r0,LRSAVE(r1)
> > +	mtlr	r0
> > +	bctr
> 
> Can we use r11 instead? eg:
> 
> _GLOBAL(_mcount)
> 	mflr	r11
> 	mtctr	r11
> 	mtlr	r0
> 	bctr

Depends on what you need to support.  As Torsten says, the code to
call _mcount when -mprofile-kernel is emitted before the prologue of a
function (similar to -m32), but after the ELFv2 global entry point
code.  If you trash r11 here you're killing the static chain pointer,
used by C nested functions or other languages that use a static chain,
eg. Pascal.  r11 has *not* been saved for ELFv2.
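
For reference, nested functions are a GNU C extension; a minimal sketch of
the construct that would break if r11 were clobbered before the prologue:

int outer(int x)
{
	/* inner() reads x from outer()'s frame through the static
	 * chain, which the ELFv2 ABI passes in r11 */
	int inner(int y) { return x + y; }

	return inner(1);
}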

r12 might be a better choice for a temp reg.
Torsten Duwe Jan. 27, 2016, 1:45 p.m. UTC | #4
On Wed, Jan 27, 2016 at 11:28:09PM +1030, Alan Modra wrote:
> On Wed, Jan 27, 2016 at 09:19:27PM +1100, Michael Ellerman wrote:
> > 
> > Can we use r11 instead? eg:
> > 
> > _GLOBAL(_mcount)
> > 	mflr	r11
> > 	mtctr	r11
> > 	mtlr	r0
> > 	bctr
> 
> Depends on what you need to support.  As Torsten says, the code to
> call _mcount when -mprofile-kernel is emitted before the prologue of a
> function (similar to -m32), but after the ELFv2 global entry point
> code.  If you trash r11 here you're killing the static chain pointer,
> used by C nested functions or other languages that use a static chain,
> eg. Pascal.  r11 has *not* been saved for ELFv2.

Even if nested functions aren't supported in the Linux kernel(?), I think
it was an earlier version of mcount where r11 usage ruined my day.

> r12 might be a better choice for a temp reg.

Good idea. r12 holds the function entry point and is used to calculate the
new TOC value just _before_ the call. It should be available.
I'll try, thanks for the hint.

	Torsten
Michael Ellerman Jan. 28, 2016, 3:39 a.m. UTC | #5
On Wed, 2016-01-27 at 23:28 +1030, Alan Modra wrote:
> On Wed, Jan 27, 2016 at 09:19:27PM +1100, Michael Ellerman wrote:
> > Hi Torsten,
> > 
> > On Mon, 2016-01-25 at 16:26 +0100, Torsten Duwe wrote:
> > > The gcc switch -mprofile-kernel, available for ppc64 on gcc > 4.8.5,
> > > allows to call _mcount very early in the function, which low-level
> > > ASM code and code patching functions need to consider.
> > > Especially the link register and the parameter registers are still
> > > alive and not yet saved into a new stack frame.
> > > 
> > > Signed-off-by: Torsten Duwe <duwe@suse.de>
> > > ---
> > >  arch/powerpc/kernel/entry_64.S  | 45 +++++++++++++++++++++++++++++++++++++++--
> > >  arch/powerpc/kernel/ftrace.c    | 12 +++++++++--
> > >  arch/powerpc/kernel/module_64.c | 14 +++++++++++++
> > >  3 files changed, 67 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> > > index a94f155..e7cd043 100644
> > > --- a/arch/powerpc/kernel/entry_64.S
> > > +++ b/arch/powerpc/kernel/entry_64.S
> > > @@ -1206,7 +1206,12 @@ _GLOBAL(enter_prom)
> > >  #ifdef CONFIG_DYNAMIC_FTRACE
> > >  _GLOBAL(mcount)
> > >  _GLOBAL(_mcount)
> > > -	blr
> > > +	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
> > > +	mflr	r0
> > > +	mtctr	r0
> > > +	ld	r0,LRSAVE(r1)
> > > +	mtlr	r0
> > > +	bctr
> > 
> > Can we use r11 instead? eg:
> > 
> > _GLOBAL(_mcount)
> > 	mflr	r11
> > 	mtctr	r11
> > 	mtlr	r0
> > 	bctr
> 
> Depends on what you need to support.  As Torsten says, the code to
> call _mcount when -mprofile-kernel is emitted before the prologue of a
> function (similar to -m32), but after the ELFv2 global entry point
> code.  If you trash r11 here you're killing the static chain pointer,
> used by C nested functions or other languages that use a static chain,
> eg. Pascal.  r11 has *not* been saved for ELFv2.

OK, thanks for clarifying. Pascal is not a big concern :D. But although I
don't think we use nested functions anywhere, someone somewhere could be, or at
least might one day.

> r12 might be a better choice for a temp reg.

Even better.

cheers
Michael Ellerman Jan. 28, 2016, 4:26 a.m. UTC | #6
On Wed, 2016-01-27 at 11:44 +0100, Torsten Duwe wrote:
> On Wed, Jan 27, 2016 at 09:19:27PM +1100, Michael Ellerman wrote:
> > Hi Torsten,
> >
> > > +++ b/arch/powerpc/kernel/entry_64.S
> > > @@ -1206,7 +1206,12 @@ _GLOBAL(enter_prom)
> > >  #ifdef CONFIG_DYNAMIC_FTRACE
> > >  _GLOBAL(mcount)
> > >  _GLOBAL(_mcount)
> > > -	blr
> > > +	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
> > > +	mflr	r0
> > > +	mtctr	r0
> > > +	ld	r0,LRSAVE(r1)
> > > +	mtlr	r0
> > > +	bctr
> >
> > Can we use r11 instead? eg:
> >
> > _GLOBAL(_mcount)
> > 	mflr	r11
> > 	mtctr	r11
> > 	mtlr	r0
> > 	bctr
> >
> > Otherwise I worry the std/ld is going to cause a load-hit-store. And it's just
> > plain more instructions too.
> >
> > I don't quite grok the gcc code enough to tell if that's always safe, GCC does
> > use r11 sometimes, but I don't think it ever expects it to survive across
> > _mcount()?
>
> I used r11 in that area once, and it crashed, but I don't recall the deatils.
> We'll see. The performance shouldn't be critical, as the code is only used
> during boot-up. With DYNAMIC_FTRACE, The calls will be replaced by
> 0x600000^W PPC_INST_NOP :)

True.

That raises an interesting question: how does it work *without* DYNAMIC_FTRACE?

It looks like you haven't updated that version of _mcount at all? Or maybe I'm
missing an #ifdef somewhere?

_GLOBAL_TOC(_mcount)
	/* Taken from output of objdump from lib64/glibc */
	mflr	r3
	ld	r11, 0(r1)
	stdu	r1, -112(r1)
	std	r3, 128(r1)
	ld	r4, 16(r11)

	subi	r3, r3, MCOUNT_INSN_SIZE
	LOAD_REG_ADDR(r5,ftrace_trace_function)
	ld	r5,0(r5)
	ld	r5,0(r5)
	mtctr	r5
	bctrl
	nop

It doesn't look like that will work right with the -mprofile-kernel ABI. And
indeed it doesn't boot.

So we'll need to work that out. I guess the minimum would be to disable
-mprofile-kernel if DYNAMIC_FTRACE is disabled.
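
A crude build-time guard for that minimum could look like this (a sketch,
not part of the patch; both preprocessor symbols already exist in the
series):

#if defined(CC_USING_MPROFILE_KERNEL) && !defined(CONFIG_DYNAMIC_FTRACE)
#error "-mprofile-kernel requires CONFIG_DYNAMIC_FTRACE"
#endif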

Frankly I think we'd be happy to *only* support DYNAMIC_FTRACE, but the generic
code doesn't let us do that at the moment.

> > > index 44d4d8e..080c525 100644
> > > --- a/arch/powerpc/kernel/ftrace.c
> > > +++ b/arch/powerpc/kernel/ftrace.c
> > > @@ -306,11 +306,19 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
> > >  	 * The load offset is different depending on the ABI. For simplicity
> > >  	 * just mask it out when doing the compare.
> > >  	 */
> > > +#ifndef CC_USING_MPROFILE_KERNEL
> > >  	if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
> > > -		pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
> > > +		pr_err("Unexpected call sequence at %p: %x %x\n",
> > > +		ip, op[0], op[1]);
> > >  		return -EINVAL;
> > >  	}
> > > -
> > > +#else
> > > +	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
> > > +	if (op[0] != 0x60000000) {
> >
> > That is "PPC_INST_NOP".
> >

> > > +		pr_err("Unexpected call at %p: %x\n", ip, op[0]);
> > > +		return -EINVAL;
> > > +	}
> > > +#endif
> >
> > Can you please break that out into a static inline, with separate versions for
> > the two cases.
> >
> > We should aim for no #ifdefs inside functions.
>
> Points taken.

Thanks.

> Does this set _work_ for you now? That'd be great to hear.

Sort of, see previous comments.

But it's better than the previous version which didn't boot :)

Also ftracetest fails at step 8:

  ...
  [8] ftrace - function graph filters with stack tracer
  Unable to handle kernel paging request for data at address 0xd0000000033d7f70
  Faulting instruction address: 0xc0000000001b16ec
  Oops: Kernel access of bad area, sig: 11 [#1]
  SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in: virtio_balloon fuse autofs4 virtio_net virtio_pci virtio_ring virtio
  CPU: 15 PID: 0 Comm: swapper/15 Not tainted 4.5.0-rc1-00009-g325e167adf2b #4
  task: c0000001fefe0400 ti: c0000001fff74000 task.ti: c0000001fb0e0000
  NIP: c0000000001b16ec LR: c000000000048abc CTR: d0000000032d0424
  REGS: c0000001fff77aa0 TRAP: 0300   Not tainted  (4.5.0-rc1-00009-g325e167adf2b)
  MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 28002422  XER: 20000000
  CFAR: c0000000004ebbf0 DAR: d0000000033d7f70 DSISR: 40000000 SOFTE: 0
  GPR00: c000000000009f84 c0000001fff77d20 d0000000032d9fb0 c000000000118d70
  GPR04: d0000000032d0420 0000000000000000 0000000000000000 00000001ff170000
  GPR08: 0000000000000000 d0000000033d9fb0 c0000001f36f2c00 d0000000032d1898
  GPR12: c000000000118d70 c00000000fe03c00 c0000001fb0e0000 c000000000d6d3c8
  GPR16: c000000000d59c28 c0000001fb0e0000 0000000000000001 0000000000000008
  GPR20: c0000001fb0e0080 0000000000000001 0000000000000002 0000000000000019
  GPR24: c0000001f36f2c00 0000000000000000 0000000000000000 0000000000000000
  GPR28: 0000000000000000 d0000000032d0420 c000000000118d70 c0000001f3570680
  NIP [c0000000001b16ec] ftrace_graph_is_dead+0xc/0x20
  LR [c000000000048abc] prepare_ftrace_return+0x2c/0x150
  Call Trace:
  [c0000001fff77d20] [0000000000000002] 0x2 (unreliable)
  [c0000001fff77d70] [c000000000009f84] ftrace_graph_caller+0x34/0x74
  [c0000001fff77de0] [c000000000118d70] handle_irq_event_percpu+0x90/0x2b0
  [c0000001fff77ea0] [c000000000118ffc] handle_irq_event+0x6c/0xd0
  [c0000001fff77ed0] [c00000000011e280] handle_fasteoi_irq+0xf0/0x2a0
  [c0000001fff77f00] [c000000000117f40] generic_handle_irq+0x50/0x80
  [c0000001fff77f20] [c000000000011228] __do_irq+0x98/0x1d0
  [c0000001fff77f90] [c000000000024074] call_do_irq+0x14/0x24
  [c0000001fb0e3a20] [c0000000000113f8] do_IRQ+0x98/0x140
  [c0000001fb0e3a60] [c0000000000025d0] hardware_interrupt_common+0x150/0x180
  --- interrupt: 501 at plpar_hcall_norets+0x1c/0x28
      LR = check_and_cede_processor+0x38/0x50
  [c0000001fb0e3d50] [c0000000008008c4] check_and_cede_processor+0x24/0x50 (unreliable)
  [c0000001fb0e3db0] [c000000000800aec] shared_cede_loop+0x6c/0x180
  [c0000001fb0e3df0] [c0000000007fdca4] cpuidle_enter_state+0x174/0x400
  [c0000001fb0e3e50] [c000000000104550] call_cpuidle+0x50/0xa0
  [c0000001fb0e3e70] [c000000000104b58] cpu_startup_entry+0x338/0x440
  [c0000001fb0e3f30] [c00000000004090c] start_secondary+0x35c/0x3a0
  [c0000001fb0e3f90] [c000000000008b6c] start_secondary_prolog+0x10/0x14
  Instruction dump:
  60000000 4bfe6c79 60000000 e8610020 38210030 e8010010 7c0803a6 4bffff30
  60420000 3c4c00ca 3842ff20 3d220010 <8869dfc0> 4e800020 60000000 60000000
  ---[ end trace 129c2895cb584df3 ]---

  Kernel panic - not syncing: Fatal exception in interrupt


That doesn't happen without your series applied, though that doesn't 100% mean
it's your bug. I haven't had time to dig any deeper.

> Stay tuned for v7...

Thanks.

cheers
Torsten Duwe Jan. 28, 2016, 11:50 a.m. UTC | #7
On Thu, Jan 28, 2016 at 03:26:59PM +1100, Michael Ellerman wrote:
> 
> That raises an interesting question, how does it work *without* DYNAMIC_FTRACE?
> 
> It looks like you haven't updated that version of _mcount at all? Or maybe I'm
> missing an #ifdef somewhere?

You didn't, I did. I haven't considered that combination.

> It doesn't look like that will work right with the -mprofile-kernel ABI. And
> indeed it doesn't boot.

The lean _mcount should handle it and boot, had I not misplaced it in
the #ifdefs, but then of course profiling wouldn't work.

> So we'll need to work that out. I guess the minimum would be to disable
> -mprofile-kernel if DYNAMIC_FTRACE is disabled.

I feel that supporting all combinations of ABIv1/ABIv2, FTRACE/DYNAMIC_FTRACE,
-p/-mprofile-kernel will get us into #ifdef hell, and at least one kernel
developer will go insane. That will probably be the one porting this
to ppc64be (ABIv1).

> Frankly I think we'd be happy to *only* support DYNAMIC_FTRACE, but the generic
> code doesn't let us do that at the moment.

Seconded.
I'll have a look at the Kconfigs.

> But it's better than the previous version which didn't boot :)

That's your fault, you picked the wrong compiler ;-)

> Also ftracetest fails at step 8:
>   ...
>   [8] ftrace - function graph filters with stack tracer
>   Unable to handle kernel paging request for data at address 0xd0000000033d7f70
[...]
> That doesn't happen without your series applied, though that doesn't 100% mean
> it's your bug. I haven't had time to dig any deeper.

Will check as well...

	Torsten
AKASHI Takahiro Feb. 3, 2016, 7:23 a.m. UTC | #8
Hi,

On 01/26/2016 12:26 AM, Torsten Duwe wrote:
> The gcc switch -mprofile-kernel, available for ppc64 on gcc > 4.8.5,
> allows to call _mcount very early in the function, which low-level
> ASM code and code patching functions need to consider.
> Especially the link register and the parameter registers are still
> alive and not yet saved into a new stack frame.

I'm thinking of implementing live patch support *for arm64*, and as part of
those efforts, we are proposing[1] a new *generic* gcc option, -fprolog-add=N.
This option will insert N nop instructions at the beginning of each function.
So we have to initialize those instructions at boot time to later utilize
them for FTRACE_WITH_REGS. Other than that, it will work similarly
to -mfentry on x86 (and -mprofile-kernel?).

I'm totally unfamiliar with the ppc architecture, but am wondering
whether this option will also be useful for other architectures.

I would really appreciate it if you could share your thoughts with me.

[1]  https://gcc.gnu.org/ml/gcc/2015-05/msg00267.html, and
      https://gcc.gnu.org/ml/gcc/2015-10/msg00090.html

Thanks,
-Takahiro AKASHI

> Signed-off-by: Torsten Duwe <duwe@suse.de>
> ---
>   arch/powerpc/kernel/entry_64.S  | 45 +++++++++++++++++++++++++++++++++++++++--
>   arch/powerpc/kernel/ftrace.c    | 12 +++++++++--
>   arch/powerpc/kernel/module_64.c | 14 +++++++++++++
>   3 files changed, 67 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index a94f155..e7cd043 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -1206,7 +1206,12 @@ _GLOBAL(enter_prom)
>   #ifdef CONFIG_DYNAMIC_FTRACE
>   _GLOBAL(mcount)
>   _GLOBAL(_mcount)
> -	blr
> +	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
> +	mflr	r0
> +	mtctr	r0
> +	ld	r0,LRSAVE(r1)
> +	mtlr	r0
> +	bctr
>
>   _GLOBAL_TOC(ftrace_caller)
>   	/* Taken from output of objdump from lib64/glibc */
> @@ -1262,13 +1267,28 @@ _GLOBAL(ftrace_stub)
>
>   #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>   _GLOBAL(ftrace_graph_caller)
> +#ifdef CC_USING_MPROFILE_KERNEL
> +	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
> +	std	r10, 104(r1)
> +	std	r9, 96(r1)
> +	std	r8, 88(r1)
> +	std	r7, 80(r1)
> +	std	r6, 72(r1)
> +	std	r5, 64(r1)
> +	std	r4, 56(r1)
> +	std	r3, 48(r1)
> +	mfctr	r4		/* ftrace_caller has moved local addr here */
> +	std	r4, 40(r1)
> +	mflr	r3		/* ftrace_caller has restored LR from stack */
> +#else
>   	/* load r4 with local address */
>   	ld	r4, 128(r1)
> -	subi	r4, r4, MCOUNT_INSN_SIZE
>
>   	/* Grab the LR out of the caller stack frame */
>   	ld	r11, 112(r1)
>   	ld	r3, 16(r11)
> +#endif
> +	subi	r4, r4, MCOUNT_INSN_SIZE
>
>   	bl	prepare_ftrace_return
>   	nop
> @@ -1277,6 +1297,26 @@ _GLOBAL(ftrace_graph_caller)
>   	 * prepare_ftrace_return gives us the address we divert to.
>   	 * Change the LR in the callers stack frame to this.
>   	 */
> +
> +#ifdef CC_USING_MPROFILE_KERNEL
> +	mtlr	r3
> +
> +	ld	r0, 40(r1)
> +	mtctr	r0
> +	ld	r10, 104(r1)
> +	ld	r9, 96(r1)
> +	ld	r8, 88(r1)
> +	ld	r7, 80(r1)
> +	ld	r6, 72(r1)
> +	ld	r5, 64(r1)
> +	ld	r4, 56(r1)
> +	ld	r3, 48(r1)
> +
> +	addi	r1, r1, 112
> +	mflr	r0
> +	std	r0, LRSAVE(r1)
> +	bctr
> +#else
>   	ld	r11, 112(r1)
>   	std	r3, 16(r11)
>
> @@ -1284,6 +1324,7 @@ _GLOBAL(ftrace_graph_caller)
>   	mtlr	r0
>   	addi	r1, r1, 112
>   	blr
> +#endif
>
>   _GLOBAL(return_to_handler)
>   	/* need to save return values */
> diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
> index 44d4d8e..080c525 100644
> --- a/arch/powerpc/kernel/ftrace.c
> +++ b/arch/powerpc/kernel/ftrace.c
> @@ -306,11 +306,19 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>   	 * The load offset is different depending on the ABI. For simplicity
>   	 * just mask it out when doing the compare.
>   	 */
> +#ifndef CC_USING_MPROFILE_KERNEL
>   	if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
> -		pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
> +		pr_err("Unexpected call sequence at %p: %x %x\n",
> +		ip, op[0], op[1]);
>   		return -EINVAL;
>   	}
> -
> +#else
> +	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
> +	if (op[0] != 0x60000000) {
> +		pr_err("Unexpected call at %p: %x\n", ip, op[0]);
> +		return -EINVAL;
> +	}
> +#endif
>   	/* If we never set up a trampoline to ftrace_caller, then bail */
>   	if (!rec->arch.mod->arch.tramp) {
>   		pr_err("No ftrace trampoline\n");
> diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
> index 6838451..30f6be1 100644
> --- a/arch/powerpc/kernel/module_64.c
> +++ b/arch/powerpc/kernel/module_64.c
> @@ -475,6 +475,20 @@ static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
>   static int restore_r2(u32 *instruction, struct module *me)
>   {
>   	if (*instruction != PPC_INST_NOP) {
> +#ifdef CC_USING_MPROFILE_KERNEL
> +		/* -mprofile_kernel sequence starting with
> +		 * mflr r0 and maybe std r0, LRSAVE(r1)
> +		 */
> +		if ((instruction[-3] == 0x7c0802a6 &&
> +		    instruction[-2] == 0xf8010010) ||
> +		    instruction[-2] == 0x7c0802a6) {
> +			/* Nothing to be done here, it's an _mcount
> +			 * call location and r2 will have to be
> +			 * restored in the _mcount function.
> +			 */
> +			return 1;
> +		};
> +#endif
>   		pr_err("%s: Expect noop after relocate, got %08x\n",
>   		       me->name, *instruction);
>   		return 0;
>
Jiri Kosina Feb. 3, 2016, 8:55 a.m. UTC | #9
On Wed, 3 Feb 2016, AKASHI Takahiro wrote:

> > The gcc switch -mprofile-kernel, available for ppc64 on gcc > 4.8.5,
> > allows to call _mcount very early in the function, which low-level
> > ASM code and code patching functions need to consider.
> > Especially the link register and the parameter registers are still
> > alive and not yet saved into a new stack frame.
> 
> I'm thinking of implementing live patch support *for arm64*, and as part of
> those efforts, we are proposing[1] a new *generic* gcc option, -fprolog-add=N.
> This option will insert N nop instructions at the beginning of each function.
> So we have to initialize those codes at the boot time to later utilize
> them for FTRACE_WITH_REGS. Other than that, it will work similarly
> with -mfentry on x86 (and -mprofile-kernel?).
> 
> I'm totally unfamiliar with ppc architecture, but just wondering
> whether this option will also be useful for other architectures.

The interesting part of the story with ppc64 is that you indeed want to 
create the callsite before *most* of the prologue, but not really :) 
The part of the prologue where the TOC pointer is saved needs to happen 
before the fentry/profiling call.

I don't think this will be an issue on ARM64, but it definitely is 
something that should be taken into account in case the gcc option is 
meant to be really generic.

Thanks,
Torsten Duwe Feb. 3, 2016, 11:24 a.m. UTC | #10
On Wed, Feb 03, 2016 at 09:55:11AM +0100, Jiri Kosina wrote:
> On Wed, 3 Feb 2016, AKASHI Takahiro wrote:
> > those efforts, we are proposing[1] a new *generic* gcc option, -fprolog-add=N.
> > This option will insert N nop instructions at the beginning of each function.

> The interesting part of the story with ppc64 is that you indeed want to 
> create the callsite before the *most* of the prologue, but not really :) 

I was silently assuming that GCC would do this right on ppc64le: add the NOPs
right after the TOC load. Or after the TOC load and LR save? ...

> The part of the prologue where TOC pointer is saved needs to happen before 
> the fentry/profiling call.

Yes, any call, to any profiler/tracer/live patcher is potentially global
and needs the _new_ TOC value.

This proposal, if implemented too naively, will worsen the problem we are
currently discussing: a few NOPs _never_ cause any global reference, so GCC
might be even more inclined not to load a new TOC value. That change would
need to be fairly smart on ppc64le.

	Torsten
AKASHI Takahiro Feb. 4, 2016, 9:31 a.m. UTC | #11
Jiri, Torsten

Thank you for your explanation.

On 02/03/2016 08:24 PM, Torsten Duwe wrote:
> On Wed, Feb 03, 2016 at 09:55:11AM +0100, Jiri Kosina wrote:
>> On Wed, 3 Feb 2016, AKASHI Takahiro wrote:
>>> those efforts, we are proposing[1] a new *generic* gcc option, -fprolog-add=N.
>>> This option will insert N nop instructions at the beginning of each function.
>
>> The interesting part of the story with ppc64 is that you indeed want to
>> create the callsite before the *most* of the prologue, but not really :)
>
> I was silently assuming that GCC would do this right on ppc64le; add the NOPs
> right after the TOC load. Or after TOC load and LR save? ...

On arm/arm64, the link register must be saved before any function call. So
we will have to add something, 3 instructions at the minimum, like:
    save lr
    branch _mcount
    restore lr
    <prologue>
    ...
    <body>
    ...

>> The part of the prologue where TOC pointer is saved needs to happen before
>> the fentry/profiling call.
>
> Yes, any call, to any profiler/tracer/live patcher is potentially global
> and needs the _new_ TOC value.

I don't want to bother you, but for my better understanding, could you show me
an example of the asm instructions for a function prologue under -mprofile-kernel?

-Takahiro AKASHI

> This proposal, if implemented in a too naive fashion, will worsen the problem
> we currently discuss: a few NOPs _never_ cause any global reference. GCC might
> be even more inclined to not load a new TOC value. That change would need to be
> fairly smart on ppc64le.
>
> 	Torsten
>
Petr Mladek Feb. 4, 2016, 11:02 a.m. UTC | #12
On Thu 2016-02-04 18:31:40, AKASHI Takahiro wrote:
> Jiri, Torsten
> 
> Thank you for your explanation.
> 
> On 02/03/2016 08:24 PM, Torsten Duwe wrote:
> >On Wed, Feb 03, 2016 at 09:55:11AM +0100, Jiri Kosina wrote:
> >>On Wed, 3 Feb 2016, AKASHI Takahiro wrote:
> >>>those efforts, we are proposing[1] a new *generic* gcc option, -fprolog-add=N.
> >>>This option will insert N nop instructions at the beginning of each function.
> >
> >>The interesting part of the story with ppc64 is that you indeed want to
> >>create the callsite before the *most* of the prologue, but not really :)
> >
> >I was silently assuming that GCC would do this right on ppc64le; add the NOPs
> >right after the TOC load. Or after TOC load and LR save? ...
> 
> On arm/arm64, link register must be saved before any function call. So anyhow
> we will have to add something, 3 instructions at the minimum, like:
>    save lr
>    branch _mcount
>    restore lr
>    <prologue>
>    ...
>    <body>
>    ...

So it is similar to PPC, which has to handle the LR as well.


> >>The part of the prologue where TOC pointer is saved needs to happen before
> >>the fentry/profiling call.
> >
> >Yes, any call, to any profiler/tracer/live patcher is potentially global
> >and needs the _new_ TOC value.

The code below is generated for PPC64LE with -mprofile-kernel using:

$> gcc --version
gcc (SUSE Linux) 6.0.0 20160121 (experimental) [trunk revision 232670]
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


0000000000000050 <cmdline_proc_show>:
  50:   00 00 4c 3c     addis   r2,r12,0
                        50: R_PPC64_REL16_HA    .TOC.
  54:   00 00 42 38     addi    r2,r2,0
                        54: R_PPC64_REL16_LO    .TOC.+0x4
  58:   a6 02 08 7c     mflr    r0
  5c:   01 00 00 48     bl      5c <cmdline_proc_show+0xc>
                        5c: R_PPC64_REL24       _mcount
  60:   a6 02 08 7c     mflr    r0
  64:   10 00 01 f8     std     r0,16(r1)
  68:   a1 ff 21 f8     stdu    r1,-96(r1)
  6c:   00 00 22 3d     addis   r9,r2,0
                        6c: R_PPC64_TOC16_HA    .toc
  70:   00 00 82 3c     addis   r4,r2,0
                        70: R_PPC64_TOC16_HA    .rodata.str1.8
  74:   00 00 29 e9     ld      r9,0(r9)
                        74: R_PPC64_TOC16_LO_DS .toc
  78:   00 00 84 38     addi    r4,r4,0
                        78: R_PPC64_TOC16_LO    .rodata.str1.8
  7c:   00 00 a9 e8     ld      r5,0(r9)
  80:   01 00 00 48     bl      80 <cmdline_proc_show+0x30>
                        80: R_PPC64_REL24       seq_printf
  84:   00 00 00 60     nop
  88:   00 00 60 38     li      r3,0
  8c:   60 00 21 38     addi    r1,r1,96
  90:   10 00 01 e8     ld      r0,16(r1)
  94:   a6 03 08 7c     mtlr    r0
  98:   20 00 80 4e     blr


And the same function compiled using:

$> gcc --version
gcc (SUSE Linux) 4.8.5
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


0000000000000050 <cmdline_proc_show>:
  50:   00 00 4c 3c     addis   r2,r12,0
                        50: R_PPC64_REL16_HA    .TOC.
  54:   00 00 42 38     addi    r2,r2,0
                        54: R_PPC64_REL16_LO    .TOC.+0x4
  58:   a6 02 08 7c     mflr    r0
  5c:   10 00 01 f8     std     r0,16(r1)
  60:   01 00 00 48     bl      60 <cmdline_proc_show+0x10>
                        60: R_PPC64_REL24       _mcount
  64:   a6 02 08 7c     mflr    r0
  68:   10 00 01 f8     std     r0,16(r1)
  6c:   a1 ff 21 f8     stdu    r1,-96(r1)
  70:   00 00 42 3d     addis   r10,r2,0
                        70: R_PPC64_TOC16_HA    .toc
  74:   00 00 82 3c     addis   r4,r2,0
                        74: R_PPC64_TOC16_HA    .rodata.str1.8
  78:   00 00 2a e9     ld      r9,0(r10)
                        78: R_PPC64_TOC16_LO_DS .toc
  7c:   00 00 84 38     addi    r4,r4,0
                        7c: R_PPC64_TOC16_LO    .rodata.str1.8
  80:   00 00 a9 e8     ld      r5,0(r9)
  84:   01 00 00 48     bl      84 <cmdline_proc_show+0x34>
                        84: R_PPC64_REL24       seq_printf
  88:   00 00 00 60     nop
  8c:   00 00 60 38     li      r3,0
  90:   60 00 21 38     addi    r1,r1,96
  94:   10 00 01 e8     ld      r0,16(r1)
  98:   a6 03 08 7c     mtlr    r0
  9c:   20 00 80 4e     blr


Please note that either 3 or 4 instructions are used before the
mcount location, depending on the compiler version.

Best Regards,
Petr
Jiri Kosina Feb. 4, 2016, 9:47 p.m. UTC | #13
On Thu, 4 Feb 2016, AKASHI Takahiro wrote:

> On arm/arm64, link register must be saved before any function call. So anyhow
> we will have to add something, 3 instructions at the minimum, like:
>    save lr
>    branch _mcount
>    restore lr
>    <prologue>
>    ...
>    <body>
>    ...

This means that we have at least two architectures that need one 
instruction before the mcount/mfentry call, and the rest of the prologue 
to follow afterwards. On x86, we don't need any "pre-prologue".

Presumably the corresponding opcodes have different sizes. This nicely 
demonstrates my point -- if this one-gcc-option-to-rule-them-all is to 
exist, it needs to be generic enough to describe these kinds of 
constraints (who knows what other restrictions will pop up when exploring 
other, more exotic, architectures later).

Thanks,
Balbir Singh Feb. 5, 2016, 4:40 a.m. UTC | #14
On Thu, Feb 4, 2016 at 10:02 PM, Petr Mladek <pmladek@suse.com> wrote:
> On Thu 2016-02-04 18:31:40, AKASHI Takahiro wrote:
>> Jiri, Torsten
>>
>> Thank you for your explanation.
>>
>> On 02/03/2016 08:24 PM, Torsten Duwe wrote:
>> >On Wed, Feb 03, 2016 at 09:55:11AM +0100, Jiri Kosina wrote:
>> >>On Wed, 3 Feb 2016, AKASHI Takahiro wrote:
>> >>>those efforts, we are proposing[1] a new *generic* gcc option, -fprolog-add=N.
>> >>>This option will insert N nop instructions at the beginning of each function.
>> >
>> >>The interesting part of the story with ppc64 is that you indeed want to
>> >>create the callsite before the *most* of the prologue, but not really :)
>> >
>> >I was silently assuming that GCC would do this right on ppc64le; add the NOPs
>> >right after the TOC load. Or after TOC load and LR save? ...
>>
>> On arm/arm64, link register must be saved before any function call. So anyhow
>> we will have to add something, 3 instructions at the minimum, like:
>>    save lr
>>    branch _mcount
>>    restore lr
>>    <prologue>
>>    ...
>>    <body>
>>    ...
>
> So, it is similar to PPC that has to handle LR as well.
>
>
>> >>The part of the prologue where TOC pointer is saved needs to happen before
>> >>the fentry/profiling call.
>> >
>> >Yes, any call, to any profiler/tracer/live patcher is potentially global
>> >and needs the _new_ TOC value.
>
> The code below is generated for PPC64LE with -mprofile-kernel using:
>
> $> gcc --version
> gcc (SUSE Linux) 6.0.0 20160121 (experimental) [trunk revision 232670]
> Copyright (C) 2016 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
>
> 0000000000000050 <cmdline_proc_show>:
>   50:   00 00 4c 3c     addis   r2,r12,0
>                         50: R_PPC64_REL16_HA    .TOC.
>   54:   00 00 42 38     addi    r2,r2,0
>                         54: R_PPC64_REL16_LO    .TOC.+0x4
>   58:   a6 02 08 7c     mflr    r0
>   5c:   01 00 00 48     bl      5c <cmdline_proc_show+0xc>
>                         5c: R_PPC64_REL24       _mcount
>   60:   a6 02 08 7c     mflr    r0
>   64:   10 00 01 f8     std     r0,16(r1)
>   68:   a1 ff 21 f8     stdu    r1,-96(r1)
>   6c:   00 00 22 3d     addis   r9,r2,0
>                         6c: R_PPC64_TOC16_HA    .toc
>   70:   00 00 82 3c     addis   r4,r2,0
>                         70: R_PPC64_TOC16_HA    .rodata.str1.8
>   74:   00 00 29 e9     ld      r9,0(r9)
>                         74: R_PPC64_TOC16_LO_DS .toc
>   78:   00 00 84 38     addi    r4,r4,0
>                         78: R_PPC64_TOC16_LO    .rodata.str1.8
>   7c:   00 00 a9 e8     ld      r5,0(r9)
>   80:   01 00 00 48     bl      80 <cmdline_proc_show+0x30>
>                         80: R_PPC64_REL24       seq_printf
>   84:   00 00 00 60     nop
>   88:   00 00 60 38     li      r3,0
>   8c:   60 00 21 38     addi    r1,r1,96
>   90:   10 00 01 e8     ld      r0,16(r1)
>   94:   a6 03 08 7c     mtlr    r0
>   98:   20 00 80 4e     blr
>
>
> And the same function compiled using:
>
> $> gcc --version
> gcc (SUSE Linux) 4.8.5
> Copyright (C) 2015 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
>
> 0000000000000050 <cmdline_proc_show>:
>   50:   00 00 4c 3c     addis   r2,r12,0
>                         50: R_PPC64_REL16_HA    .TOC.
>   54:   00 00 42 38     addi    r2,r2,0
>                         54: R_PPC64_REL16_LO    .TOC.+0x4
>   58:   a6 02 08 7c     mflr    r0
>   5c:   10 00 01 f8     std     r0,16(r1)
>   60:   01 00 00 48     bl      60 <cmdline_proc_show+0x10>
>                         60: R_PPC64_REL24       _mcount
>   64:   a6 02 08 7c     mflr    r0
>   68:   10 00 01 f8     std     r0,16(r1)
>   6c:   a1 ff 21 f8     stdu    r1,-96(r1)
>   70:   00 00 42 3d     addis   r10,r2,0
>                         70: R_PPC64_TOC16_HA    .toc
>   74:   00 00 82 3c     addis   r4,r2,0
>                         74: R_PPC64_TOC16_HA    .rodata.str1.8
>   78:   00 00 2a e9     ld      r9,0(r10)
>                         78: R_PPC64_TOC16_LO_DS .toc
>   7c:   00 00 84 38     addi    r4,r4,0
>                         7c: R_PPC64_TOC16_LO    .rodata.str1.8
>   80:   00 00 a9 e8     ld      r5,0(r9)
>   84:   01 00 00 48     bl      84 <cmdline_proc_show+0x34>
>                         84: R_PPC64_REL24       seq_printf
>   88:   00 00 00 60     nop
>   8c:   00 00 60 38     li      r3,0
>   90:   60 00 21 38     addi    r1,r1,96
>   94:   10 00 01 e8     ld      r0,16(r1)
>   98:   a6 03 08 7c     mtlr    r0
>   9c:   20 00 80 4e     blr
>
>
> Please, note that are used either 3 or 4 instructions before the
> mcount location depending on the compiler version.


Thanks Petr

For big endian builds I saw

Dump of assembler code for function alloc_pages_current:
   0xc000000000256f00 <+0>:    mflr    r0
   0xc000000000256f04 <+4>:    std     r0,16(r1)
   0xc000000000256f08 <+8>:    bl      0xc000000000009e5c <.mcount>
   0xc000000000256f0c <+12>:    mflr    r0

The offset is 8 bytes. Your earlier patch handled this by adding 16; I
suspect it needs revisiting.

Balbir
Petr Mladek Feb. 5, 2016, 10:22 a.m. UTC | #15
On Fri 2016-02-05 15:40:27, Balbir Singh wrote:
> On Thu, Feb 4, 2016 at 10:02 PM, Petr Mladek <pmladek@suse.com> wrote:
> For big endian builds I saw
> 
> Dump of assembler code for function alloc_pages_current:
>    0xc000000000256f00 <+0>:    mflr    r0
>    0xc000000000256f04 <+4>:    std     r0,16(r1)
>    0xc000000000256f08 <+8>:    bl      0xc000000000009e5c <.mcount>
>    0xc000000000256f0c <+12>:    mflr    r0
> 
> The offset is 8 bytes. Your earlier patch handled this by adding 16, I
> suspect it needs revisiting

It seems to be one of the functions that do not access any global
symbol. gcc does not produce TOC handling in this case when
compiled with -mprofile-kernel. I believe that it is a gcc bug.
More details can be found in the thread starting at
http://thread.gmane.org/gmane.linux.kernel/2134759/focus=2141996
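
For illustration, a hypothetical function of the class Petr describes,
which references no global symbol and so gets no TOC setup from gcc before
the profiling call:

static unsigned long mix_bits(unsigned long a, unsigned long b)
{
	/* no global data and no TOC-based references in the body,
	 * so gcc emits no .TOC. setup ahead of the _mcount call */
	return (a << 3) ^ b;
}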

Best Regards,
Petr

Patch

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index a94f155..e7cd043 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -1206,7 +1206,12 @@  _GLOBAL(enter_prom)
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
-	blr
+	std	r0,LRSAVE(r1) /* gcc6 does this _after_ this call _only_ */
+	mflr	r0
+	mtctr	r0
+	ld	r0,LRSAVE(r1)
+	mtlr	r0
+	bctr
 
 _GLOBAL_TOC(ftrace_caller)
 	/* Taken from output of objdump from lib64/glibc */
@@ -1262,13 +1267,28 @@  _GLOBAL(ftrace_stub)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 _GLOBAL(ftrace_graph_caller)
+#ifdef CC_USING_MPROFILE_KERNEL
+	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
+	std	r10, 104(r1)
+	std	r9, 96(r1)
+	std	r8, 88(r1)
+	std	r7, 80(r1)
+	std	r6, 72(r1)
+	std	r5, 64(r1)
+	std	r4, 56(r1)
+	std	r3, 48(r1)
+	mfctr	r4		/* ftrace_caller has moved local addr here */
+	std	r4, 40(r1)
+	mflr	r3		/* ftrace_caller has restored LR from stack */
+#else
 	/* load r4 with local address */
 	ld	r4, 128(r1)
-	subi	r4, r4, MCOUNT_INSN_SIZE
 
 	/* Grab the LR out of the caller stack frame */
 	ld	r11, 112(r1)
 	ld	r3, 16(r11)
+#endif
+	subi	r4, r4, MCOUNT_INSN_SIZE
 
 	bl	prepare_ftrace_return
 	nop
@@ -1277,6 +1297,26 @@  _GLOBAL(ftrace_graph_caller)
 	 * prepare_ftrace_return gives us the address we divert to.
 	 * Change the LR in the callers stack frame to this.
 	 */
+
+#ifdef CC_USING_MPROFILE_KERNEL
+	mtlr	r3
+
+	ld	r0, 40(r1)
+	mtctr	r0
+	ld	r10, 104(r1)
+	ld	r9, 96(r1)
+	ld	r8, 88(r1)
+	ld	r7, 80(r1)
+	ld	r6, 72(r1)
+	ld	r5, 64(r1)
+	ld	r4, 56(r1)
+	ld	r3, 48(r1)
+
+	addi	r1, r1, 112
+	mflr	r0
+	std	r0, LRSAVE(r1)
+	bctr
+#else
 	ld	r11, 112(r1)
 	std	r3, 16(r11)
 
@@ -1284,6 +1324,7 @@  _GLOBAL(ftrace_graph_caller)
 	mtlr	r0
 	addi	r1, r1, 112
 	blr
+#endif
 
 _GLOBAL(return_to_handler)
 	/* need to save return values */
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 44d4d8e..080c525 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -306,11 +306,19 @@  __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	 * The load offset is different depending on the ABI. For simplicity
 	 * just mask it out when doing the compare.
 	 */
+#ifndef CC_USING_MPROFILE_KERNEL
 	if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
-		pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
+		pr_err("Unexpected call sequence at %p: %x %x\n",
+		ip, op[0], op[1]);
 		return -EINVAL;
 	}
-
+#else
+	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
+	if (op[0] != 0x60000000) {
+		pr_err("Unexpected call at %p: %x\n", ip, op[0]);
+		return -EINVAL;
+	}
+#endif
 	/* If we never set up a trampoline to ftrace_caller, then bail */
 	if (!rec->arch.mod->arch.tramp) {
 		pr_err("No ftrace trampoline\n");
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 6838451..30f6be1 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -475,6 +475,20 @@  static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
 static int restore_r2(u32 *instruction, struct module *me)
 {
 	if (*instruction != PPC_INST_NOP) {
+#ifdef CC_USING_MPROFILE_KERNEL
+		/* -mprofile_kernel sequence starting with
+		 * mflr r0 and maybe std r0, LRSAVE(r1)
+		 */
+		if ((instruction[-3] == 0x7c0802a6 &&
+		    instruction[-2] == 0xf8010010) ||
+		    instruction[-2] == 0x7c0802a6) {
+			/* Nothing to be done here, it's an _mcount
+			 * call location and r2 will have to be
+			 * restored in the _mcount function.
+			 */
+			return 1;
+		};
+#endif
 		pr_err("%s: Expect noop after relocate, got %08x\n",
 		       me->name, *instruction);
 		return 0;