Patchwork [v3,1/1] booke/kprobe: make program exception to use one dedicated exception stack

login
register
mail settings
Submitter Tiejun Chen
Date July 11, 2011, 11:31 a.m.
Message ID <1310383915-30543-1-git-send-email-tiejun.chen@windriver.com>
Download mbox | patch
Permalink /patch/104198/
State Rejected
Headers show

Comments

Tiejun Chen - July 11, 2011, 11:31 a.m.
When kprobe these operations such as store-and-update-word for SP(r1),

stwu r1, -A(r1)

The program exception is triggered, and PPC always allocate an exception frame
as shown as the follows:

old r1 ----------
         ...
         nip
         gpr[2] ~ gpr[31]
         gpr[1] <--------- old r1 is stored.
         gpr[0]
       -------- <--------- pr_regs @offset 16 bytes
       padding
       STACK_FRAME_REGS_MARKER
       LR
       back chain
new r1 ----------
Then emulate_step() will emulate this instruction, 'stwu'. Actually its
equivalent to:
1> Update pr_regs->gpr[1] = mem[old r1 + (-A)]
2> stw [old r1], mem[old r1 + (-A)]

Please notice the stack based on new r1 may be covered with mem[old r1
+(-A)] when addr[old r1 + (-A)] < addr[old r1 + sizeof(an exception frame0].
So the above 2# operation will overwirte something to break this exception
frame then unexpected kernel problem will be issued.

So looks we have to implement independed interrupt stack for PPC program
exception when CONFIG_BOOKE is enabled. Here we can use
EXC_LEVEL_EXCEPTION_PROLOG to replace original NORMAL_EXCEPTION_PROLOG
for program exception if CONFIG_BOOKE. Then its always safe for kprobe
with independed exc stack from one pre-allocated and dedicated thread_info.
Actually this is just waht we did for critical/machine check exceptions
on PPC.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/include/asm/irq.h   |    3 +++
 arch/powerpc/include/asm/reg.h   |    4 ++++
 arch/powerpc/kernel/entry_32.S   |   15 +++++++++++++++
 arch/powerpc/kernel/head_booke.h |   15 +++++++++++++--
 arch/powerpc/kernel/irq.c        |   11 +++++++++++
 arch/powerpc/kernel/setup_32.c   |    4 ++++
 6 files changed, 50 insertions(+), 2 deletions(-)
Tiejun Chen - July 12, 2011, 2:35 a.m.
Tiejun Chen wrote:
> When kprobe these operations such as store-and-update-word for SP(r1),
> 
> stwu r1, -A(r1)
> 
> The program exception is triggered, and PPC always allocate an exception frame
> as shown as the follows:
> 
> old r1 ----------
>          ...
>          nip
>          gpr[2] ~ gpr[31]
>          gpr[1] <--------- old r1 is stored.
>          gpr[0]
>        -------- <--------- pr_regs @offset 16 bytes
>        padding
>        STACK_FRAME_REGS_MARKER
>        LR
>        back chain
> new r1 ----------
> Then emulate_step() will emulate this instruction, 'stwu'. Actually its
> equivalent to:
> 1> Update pr_regs->gpr[1] = mem[old r1 + (-A)]
> 2> stw [old r1], mem[old r1 + (-A)]
> 
> Please notice the stack based on new r1 may be covered with mem[old r1
> +(-A)] when addr[old r1 + (-A)] < addr[old r1 + sizeof(an exception frame0].
> So the above 2# operation will overwirte something to break this exception
> frame then unexpected kernel problem will be issued.
> 
> So looks we have to implement independed interrupt stack for PPC program
> exception when CONFIG_BOOKE is enabled. Here we can use
> EXC_LEVEL_EXCEPTION_PROLOG to replace original NORMAL_EXCEPTION_PROLOG
> for program exception if CONFIG_BOOKE. Then its always safe for kprobe
> with independed exc stack from one pre-allocated and dedicated thread_info.
> Actually this is just waht we did for critical/machine check exceptions
> on PPC.
> 
> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
> ---
>  arch/powerpc/include/asm/irq.h   |    3 +++
>  arch/powerpc/include/asm/reg.h   |    4 ++++
>  arch/powerpc/kernel/entry_32.S   |   15 +++++++++++++++
>  arch/powerpc/kernel/head_booke.h |   15 +++++++++++++--
>  arch/powerpc/kernel/irq.c        |   11 +++++++++++
>  arch/powerpc/kernel/setup_32.c   |    4 ++++
>  6 files changed, 50 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
> index 1bff591..6d12169 100644
> --- a/arch/powerpc/include/asm/irq.h
> +++ b/arch/powerpc/include/asm/irq.h
> @@ -313,6 +313,9 @@ struct pt_regs;
>  extern struct thread_info *critirq_ctx[NR_CPUS];
>  extern struct thread_info *dbgirq_ctx[NR_CPUS];
>  extern struct thread_info *mcheckirq_ctx[NR_CPUS];
> +#if defined(CONFIG_KPROBES) && defined(CONFIG_BOOKE)
> +extern struct thread_info *pgirq_ctx[NR_CPUS];
> +#endif
>  extern void exc_lvl_ctx_init(void);
>  #else
>  #define exc_lvl_ctx_init()
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index c5cae0d..34d6178 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -885,6 +885,10 @@
>  #endif
>  #define SPRN_SPRG_RVCPU		SPRN_SPRG1
>  #define SPRN_SPRG_WVCPU		SPRN_SPRG1
> +#ifdef	CONFIG_KPROBES
> +#define	SPRN_SPRG_RSCRATCH_PG	SPRN_SPRG0
> +#define	SPRN_SPRG_WSCRATCH_PG	SPRN_SPRG0
> +#endif
>  #endif
>  
>  #ifdef CONFIG_8xx
> diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
> index 56212bc..a99e209 100644
> --- a/arch/powerpc/kernel/entry_32.S
> +++ b/arch/powerpc/kernel/entry_32.S
> @@ -1122,6 +1122,21 @@ ret_from_mcheck_exc:
>  	RESTORE_xSRR(DSRR0,DSRR1);
>  	RESTORE_MMU_REGS;
>  	RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI)
> +
> +	.globl	ret_from_prog_exc
> +ret_from_prog_exc:
> +	mfspr	r9,SPRN_SPRG_THREAD
> +	lwz	r10,SAVED_KSP_LIMIT(r1)
> +	stw	r10,KSP_LIMIT(r9)
> +	lwz	r9,THREAD_INFO-THREAD(r9)
> +	rlwinm	r10,r1,0,0,(31-THREAD_SHIFT)
> +	lwz	r10,TI_PREEMPT(r10)
> +	stw	r10,TI_PREEMPT(r9)
> +	RESTORE_xSRR(SRR0,SRR1);
> +	RESTORE_xSRR(CSRR0,CSRR1);
> +	RESTORE_xSRR(DSRR0,DSRR1);
> +	RESTORE_MMU_REGS;
> +	RET_FROM_EXC_LEVEL(SPRN_SRR0, SPRN_SRR1, rfi)
>  #endif /* CONFIG_BOOKE */

After a further consideration, to improve the above code fragment with the following
------
+
+       .globl  ret_from_prog_exc
+ret_from_prog_exc:
+#ifdef CONFIG_KPROBES
+       mfspr   r9,SPRN_SPRG_THREAD
+       lwz     r9,THREAD_INFO-THREAD(r9)
+       rlwinm  r10,r1,0,0,(31-THREAD_SHIFT)
+       lwz     r10,TI_PREEMPT(r10)
+       stw     r10,TI_PREEMPT(r9)
+       RET_FROM_EXC_LEVEL(SPRN_SRR0, SPRN_SRR1, rfi)
+#else
+       b       ret_from_except_full
+#endif

Here remove unnecessary restore, and also make sure its still same as normal
program exception when !CONFIG_KPROBES.

Tiejun

>  
>  /*
> diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
> index a0bf158..941be40 100644
> --- a/arch/powerpc/kernel/head_booke.h
> +++ b/arch/powerpc/kernel/head_booke.h
> @@ -79,6 +79,10 @@
>  /* only on e500mc/e200 */
>  #define DBG_STACK_BASE		dbgirq_ctx
>  
> +#if defined(CONFIG_KPROBES)
> +#define PG_STACK_BASE		pgirq_ctx
> +#endif
> +
>  #define EXC_LVL_FRAME_OVERHEAD	(THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE)
>  
>  #ifdef CONFIG_SMP
> @@ -158,6 +162,12 @@
>  		EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1)
>  #define MCHECK_EXCEPTION_PROLOG \
>  		EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1)
> +#if defined(CONFIG_KPROBES)
> +#define	PROGRAM_EXCEPTION_PROLOG \
> +		EXC_LEVEL_EXCEPTION_PROLOG(PG, SPRN_SRR0, SPRN_SRR1)
> +#else
> +#define	PROGRAM_EXCEPTION_PROLOG	NORMAL_EXCEPTION_PROLOG
> +#endif
>  
>  /*
>   * Exception vectors.
> @@ -370,11 +380,12 @@ label:
>  
>  #define PROGRAM_EXCEPTION						      \
>  	START_EXCEPTION(Program)					      \
> -	NORMAL_EXCEPTION_PROLOG;					      \
> +	PROGRAM_EXCEPTION_PROLOG;					      \
>  	mfspr	r4,SPRN_ESR;		/* Grab the ESR and save it */	      \
>  	stw	r4,_ESR(r11);						      \
>  	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
> -	EXC_XFER_STD(0x0700, program_check_exception)
> +	EXC_XFER_TEMPLATE(program_check_exception, 0x0700, MSR_KERNEL, NOCOPY,\
> +		transfer_to_handler_full, ret_from_prog_exc)
>  
>  #define DECREMENTER_EXCEPTION						      \
>  	START_EXCEPTION(Decrementer)					      \
> diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
> index 5b428e3..ff5b8dd 100644
> --- a/arch/powerpc/kernel/irq.c
> +++ b/arch/powerpc/kernel/irq.c
> @@ -397,6 +397,10 @@ struct thread_info   *critirq_ctx[NR_CPUS] __read_mostly;
>  struct thread_info    *dbgirq_ctx[NR_CPUS] __read_mostly;
>  struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
>  
> +#if defined(CONFIG_KPROBES) && defined(CONFIG_BOOKE)
> +struct thread_info    *pgirq_ctx[NR_CPUS] __read_mostly;
> +#endif
> +
>  void exc_lvl_ctx_init(void)
>  {
>  	struct thread_info *tp;
> @@ -423,6 +427,13 @@ void exc_lvl_ctx_init(void)
>  		tp = mcheckirq_ctx[cpu_nr];
>  		tp->cpu = cpu_nr;
>  		tp->preempt_count = HARDIRQ_OFFSET;
> +
> +#if defined(CONFIG_KPROBES)
> +		memset((void *)pgirq_ctx[i], 0, THREAD_SIZE);
> +		tp = pgirq_ctx[i];
> +		tp->cpu = i;
> +		tp->preempt_count = 0;
> +#endif
>  #endif
>  	}
>  }
> diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
> index 620d792..b872564 100644
> --- a/arch/powerpc/kernel/setup_32.c
> +++ b/arch/powerpc/kernel/setup_32.c
> @@ -272,6 +272,10 @@ static void __init exc_lvl_early_init(void)
>  			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
>  		mcheckirq_ctx[hw_cpu] = (struct thread_info *)
>  			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
> +#ifdef CONFIG_KPROBES
> +		pgirq_ctx[hw_cpu] = (struct thread_info *)
> +			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
> +#endif
>  #endif
>  	}
>  }
Tiejun Chen - July 14, 2011, 11:56 a.m.
Tiejun Chen wrote:
> v1 -> v2: when allocate pgirq_ctx, use 'hw_cpu' to identify cpu ID in exc_lvl_early_init().
> v2 -> v3: add that specific return-from-program-exc to restore necessary thread info then
> we can withdraw the original patch #2.

Any feedback?

Thanks
Tiejun

> 
> Tiejun
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
Kumar Gala - July 14, 2011, 1:27 p.m.
On Jul 11, 2011, at 6:31 AM, Tiejun Chen wrote:

> When kprobe these operations such as store-and-update-word for SP(r1),
> 
> stwu r1, -A(r1)
> 
> The program exception is triggered, and PPC always allocate an exception frame
> as shown as the follows:
> 
> old r1 ----------
>         ...
>         nip
>         gpr[2] ~ gpr[31]
>         gpr[1] <--------- old r1 is stored.
>         gpr[0]
>       -------- <--------- pr_regs @offset 16 bytes
>       padding
>       STACK_FRAME_REGS_MARKER
>       LR
>       back chain
> new r1 ----------
> Then emulate_step() will emulate this instruction, 'stwu'. Actually its
> equivalent to:
> 1> Update pr_regs->gpr[1] = mem[old r1 + (-A)]
> 2> stw [old r1], mem[old r1 + (-A)]
> 
> Please notice the stack based on new r1 may be covered with mem[old r1
> +(-A)] when addr[old r1 + (-A)] < addr[old r1 + sizeof(an exception frame0].
> So the above 2# operation will overwirte something to break this exception
> frame then unexpected kernel problem will be issued.
> 
> So looks we have to implement independed interrupt stack for PPC program
> exception when CONFIG_BOOKE is enabled. Here we can use
> EXC_LEVEL_EXCEPTION_PROLOG to replace original NORMAL_EXCEPTION_PROLOG
> for program exception if CONFIG_BOOKE. Then its always safe for kprobe
> with independed exc stack from one pre-allocated and dedicated thread_info.
> Actually this is just waht we did for critical/machine check exceptions
> on PPC.
> 
> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
> ---

I'm still very confused why we need a unique stack frame for kprobe/program exceptions on book-e devices.

Can you explain this further.

- k
Scott Wood - July 14, 2011, 3:53 p.m.
On Thu, 14 Jul 2011 08:27:44 -0500
Kumar Gala <galak@kernel.crashing.org> wrote:

> 
> On Jul 11, 2011, at 6:31 AM, Tiejun Chen wrote:
> 
> > When kprobe these operations such as store-and-update-word for SP(r1),
> > 
> > stwu r1, -A(r1)
> > 
> > The program exception is triggered, and PPC always allocate an exception frame
> > as shown as the follows:
> > 
> > old r1 ----------
> >         ...
> >         nip
> >         gpr[2] ~ gpr[31]
> >         gpr[1] <--------- old r1 is stored.
> >         gpr[0]
> >       -------- <--------- pr_regs @offset 16 bytes
> >       padding
> >       STACK_FRAME_REGS_MARKER
> >       LR
> >       back chain
> > new r1 ----------
> > Then emulate_step() will emulate this instruction, 'stwu'. Actually its
> > equivalent to:
> > 1> Update pr_regs->gpr[1] = mem[old r1 + (-A)]
> > 2> stw [old r1], mem[old r1 + (-A)]
> > 
> > Please notice the stack based on new r1 may be covered with mem[old r1
> > +(-A)] when addr[old r1 + (-A)] < addr[old r1 + sizeof(an exception frame0].
> > So the above 2# operation will overwirte something to break this exception
> > frame then unexpected kernel problem will be issued.
> > 
> > So looks we have to implement independed interrupt stack for PPC program
> > exception when CONFIG_BOOKE is enabled. Here we can use
> > EXC_LEVEL_EXCEPTION_PROLOG to replace original NORMAL_EXCEPTION_PROLOG
> > for program exception if CONFIG_BOOKE. Then its always safe for kprobe
> > with independed exc stack from one pre-allocated and dedicated thread_info.
> > Actually this is just waht we did for critical/machine check exceptions
> > on PPC.
> > 
> > Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
> > ---
> 
> I'm still very confused why we need a unique stack frame for kprobe/program exceptions on book-e devices.

I don't know why it's booke-specific (or why they're emulating rather than
using single-step), but the problem is trying to emulate an instruction
that's expanding the non-exception stack into the area that the exception
handler is sitting on, and writing to that new stack area.

-Scott
Tiejun Chen - July 15, 2011, 5:28 a.m.
Kumar Gala wrote:
> On Jul 11, 2011, at 6:31 AM, Tiejun Chen wrote:
> 
>> When kprobe these operations such as store-and-update-word for SP(r1),
>>
>> stwu r1, -A(r1)
>>
>> The program exception is triggered, and PPC always allocate an exception frame
>> as shown as the follows:
>>
>> old r1 ----------
>>         ...
>>         nip
>>         gpr[2] ~ gpr[31]
>>         gpr[1] <--------- old r1 is stored.
>>         gpr[0]
>>       -------- <--------- pr_regs @offset 16 bytes
>>       padding
>>       STACK_FRAME_REGS_MARKER
>>       LR
>>       back chain
>> new r1 ----------
>> Then emulate_step() will emulate this instruction, 'stwu'. Actually its
>> equivalent to:
>> 1> Update pr_regs->gpr[1] = mem[old r1 + (-A)]
>> 2> stw [old r1], mem[old r1 + (-A)]
>>
>> Please notice the stack based on new r1 may be covered with mem[old r1
>> +(-A)] when addr[old r1 + (-A)] < addr[old r1 + sizeof(an exception frame0].
>> So the above 2# operation will overwirte something to break this exception
>> frame then unexpected kernel problem will be issued.
>>
>> So looks we have to implement independed interrupt stack for PPC program
>> exception when CONFIG_BOOKE is enabled. Here we can use
>> EXC_LEVEL_EXCEPTION_PROLOG to replace original NORMAL_EXCEPTION_PROLOG
>> for program exception if CONFIG_BOOKE. Then its always safe for kprobe
>> with independed exc stack from one pre-allocated and dedicated thread_info.
>> Actually this is just waht we did for critical/machine check exceptions
>> on PPC.
>>
>> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
>> ---
> 
> I'm still very confused why we need a unique stack frame for kprobe/program exceptions on book-e devices.

Its a bug at least for Book-E. And if you'd like to check another topic thread,
"[BUG?]3.0-rc4+ftrace+kprobe: set kprobe at instruction 'stwu' lead to system
crash/freeze", you can know this story completely :)

This bug should not be reproduced on PPC64 with the exception prolog/endlog
dedicated to PPC64. But I have no enough time to check other PPC32 & !BOOKE so
I'm not sure if we should extend this modification.

> 
> Can you explain this further.

I can show one of those issued examples.

Here we kprobe the entry point of show_interrupts().

(gdb) disassemble show_interrupts
Dump of assembler code for function show_interrupts:
   0xc0004ff4 <+0>:	stwu    r1,-48(r1)
   0xc0004ff8 <+4>:	mflr    r0

I add some printk() inside pre_handler() to show pt_regs->gpr[1] and pt_regs->nip.
------
......
Planted kprobe at c0004ff4
pre_handler: p->addr = 0xc0004ff4, nip = 0xc0004ff4, msr = 0x29000
gpr[1] = de767e50.
nip = c0004ff4.

When hit this instruction, emulate_step() would emulate this instruction as follows:
------
#1> current pr_regs->gpr[1] = 0xde767e50 - 48 = 0xde767e20;
#2> stw (previous pr_regs->gpr[1]), @(current pr_regs->gpr[1])
	==> stw (0xde767e50), 0xde767e20

But after this kprobe process something would be rewrite incorrectly:
------
......
post_handler: p->addr = 0xc0004ff4, msr = 0x29000
gpr[1] = de767e20.
nip = de767e54.
	^
	If everything is good nip should equal to (0xc0004ff4 + 0x4). But looks its
reset with (0xde767e50 + 0x4) via the above #2 operation. So why?

As I understand kprobe use 'trap' to enter the program exception. At now PR = 0
so the kernel allocate an exception frame as normal.

        ---------------- old r1[0xde767e50]
	 1  pt_regs->result
	 2  pt_regs->dsisr
	 3  pt_regs->dar
	 4  pt_regs->trap
	 5  pt_regs->mq
	 6  pt_regs->ccr
	 7  pt_regs->xer
	 8  pt_regs->link
	 9  pt_regs->ctr
	 10 pt_regs->orig_gpr3
	 11 pt_regs->msr
         12 pt_regs->nip  <-- @ 0xde767e50 - 12 x 4 = 0xde767e20
		......
	----------------- new r1[0xde767e50 - INT_FRAME_SIZE]

I think you can understand why pt_regs->nip is broken :) So the root cause is an
exception frame and the kprobed function stack frame are always overlap. And
then someone member inside an exception frame may be corrupted by that emulated
stw operation.

So I think we have to use one unique stack frame to avoid this when enable
CONFIG_KPROBES. Especially for Book-E we can refer easily machine
check/critical/debug exception implementation to do this like my patch.

Tiejun

> 
> - k
> 
>
Scott Wood - July 15, 2011, 6:42 p.m.
On Fri, 15 Jul 2011 13:28:15 +0800
tiejun.chen <tiejun.chen@windriver.com> wrote:

> Kumar Gala wrote:
> > I'm still very confused why we need a unique stack frame for kprobe/program exceptions on book-e devices.
> 
> Its a bug at least for Book-E.

But why only booke?  There's nothing booke-specific about the stwu
instruction.

> And if you'd like to check another topic thread,
> "[BUG?]3.0-rc4+ftrace+kprobe: set kprobe at instruction 'stwu' lead to system
> crash/freeze", you can know this story completely :)
> 
> This bug should not be reproduced on PPC64 with the exception prolog/endlog
> dedicated to PPC64.

What if the function you're stepping through is creating a stack frame that
is larger than the 288 bytes that get skipped?

> But I have no enough time to check other PPC32 & !BOOKE so
> I'm not sure if we should extend this modification.

Someone should.

-Scott
Tiejun Chen - July 16, 2011, 3:25 a.m.
> -----Original Message-----
> From: Scott Wood [mailto:scottwood@freescale.com] 
> Sent: Saturday, July 16, 2011 2:43 AM
> To: Chen, Tiejun
> Cc: Kumar Gala; linuxppc-dev@ozlabs.org
> Subject: Re: [v3 PATCH 1/1] booke/kprobe: make program 
> exception to use one dedicated exception stack
> 
> On Fri, 15 Jul 2011 13:28:15 +0800
> tiejun.chen <tiejun.chen@windriver.com> wrote:
> 
> > Kumar Gala wrote:
> > > I'm still very confused why we need a unique stack frame 
> for kprobe/program exceptions on book-e devices.
> > 
> > Its a bug at least for Book-E.
> 
> But why only booke?  There's nothing booke-specific about the 

I don't mean this is reproduced only on booke, so I use 'at least' carefully to notice we really see this problem on booke.

> stwu instruction.

Please note this root cause to this bug is not related to how to emulate stwu instruction. That should be issued from the overlap between an exception frame and the kprobed function stack frame on booke. Would you like to see that example I showed?

> 
> > And if you'd like to check another topic thread,
> > "[BUG?]3.0-rc4+ftrace+kprobe: set kprobe at instruction 
> 'stwu' lead to 
> > system crash/freeze", you can know this story completely :)
> > 
> > This bug should not be reproduced on PPC64 with the exception 
> > prolog/endlog dedicated to PPC64.
> 
> What if the function you're stepping through is creating a 
> stack frame that is larger than the 288 bytes that get skipped?

From that bug email looks PPC64 should be good from some feedbacks. So here I think guys who are very familiar with PPC64 can reply this rather than me :)

> 
> > But I have no enough time to check other PPC32 & !BOOKE so I'm not 
> > sure if we should extend this modification.
> 
> Someone should.

This is fine to me.

But currently we have to fix this based on booke firstly.

Tiejun 

> 
> -Scott
> 
>
Scott Wood - July 18, 2011, 3:56 p.m.
On Sat, 16 Jul 2011 03:25:47 +0000
"Chen, Tiejun" <Tiejun.Chen@windriver.com> wrote:

> > -----Original Message-----
> > From: Scott Wood [mailto:scottwood@freescale.com] 
> > Sent: Saturday, July 16, 2011 2:43 AM
> > To: Chen, Tiejun
> > Cc: Kumar Gala; linuxppc-dev@ozlabs.org
> > Subject: Re: [v3 PATCH 1/1] booke/kprobe: make program 
> > exception to use one dedicated exception stack
> > 
> > On Fri, 15 Jul 2011 13:28:15 +0800
> > tiejun.chen <tiejun.chen@windriver.com> wrote:
> > 
> > > Kumar Gala wrote:
> > > > I'm still very confused why we need a unique stack frame 
> > for kprobe/program exceptions on book-e devices.
> > > 
> > > Its a bug at least for Book-E.
> > 
> > But why only booke?  There's nothing booke-specific about the 
> 
> I don't mean this is reproduced only on booke, so I use 'at least' carefully to notice we really see this problem on booke.
> 
> > stwu instruction.
> 
> Please note this root cause to this bug is not related to how to emulate stwu instruction. That should be issued from the overlap between an exception frame and the kprobed function stack frame on booke. Would you like to see that example I showed?

As I understand it, the problem comes from the fact that stwu combines the
creation of a stack frame with storing into that stack frame.  If they were
separate instructions you'd have a new exception frame at a lower address
by the time you actually store to the non-exception frame.

-Scott
Tiejun Chen - July 19, 2011, 10:52 a.m.
Scott Wood wrote:
> On Sat, 16 Jul 2011 03:25:47 +0000
> "Chen, Tiejun" <Tiejun.Chen@windriver.com> wrote:
> 
>>> -----Original Message-----
>>> From: Scott Wood [mailto:scottwood@freescale.com] 
>>> Sent: Saturday, July 16, 2011 2:43 AM
>>> To: Chen, Tiejun
>>> Cc: Kumar Gala; linuxppc-dev@ozlabs.org
>>> Subject: Re: [v3 PATCH 1/1] booke/kprobe: make program 
>>> exception to use one dedicated exception stack
>>>
>>> On Fri, 15 Jul 2011 13:28:15 +0800
>>> tiejun.chen <tiejun.chen@windriver.com> wrote:
>>>
>>>> Kumar Gala wrote:
>>>>> I'm still very confused why we need a unique stack frame 
>>> for kprobe/program exceptions on book-e devices.
>>>> Its a bug at least for Book-E.
>>> But why only booke?  There's nothing booke-specific about the 
>> I don't mean this is reproduced only on booke, so I use 'at least' carefully to notice we really see this problem on booke.
>>
>>> stwu instruction.
>> Please note this root cause to this bug is not related to how to emulate stwu instruction. That should be issued from the overlap between an exception frame and the kprobed function stack frame on booke. Would you like to see that example I showed?
> 
> As I understand it, the problem comes from the fact that stwu combines the
> creation of a stack frame with storing into that stack frame.  If they were

Yes.

> separate instructions you'd have a new exception frame at a lower address
> by the time you actually store to the non-exception frame.

So when kprobe we should use a unique stack frame to skip that stack frame the
kprobed stwu want to create.

Tiejun

> 
> -Scott
> 
>
Tiejun Chen - July 21, 2011, 9:32 a.m.
tiejun.chen wrote:
> Kumar Gala wrote:
>> On Jul 11, 2011, at 6:31 AM, Tiejun Chen wrote:
>>
>>> When kprobe these operations such as store-and-update-word for SP(r1),
>>>
>>> stwu r1, -A(r1)
>>>
>>> The program exception is triggered, and PPC always allocate an exception frame
>>> as shown as the follows:
>>>
>>> old r1 ----------
>>>         ...
>>>         nip
>>>         gpr[2] ~ gpr[31]
>>>         gpr[1] <--------- old r1 is stored.
>>>         gpr[0]
>>>       -------- <--------- pr_regs @offset 16 bytes
>>>       padding
>>>       STACK_FRAME_REGS_MARKER
>>>       LR
>>>       back chain
>>> new r1 ----------
>>> Then emulate_step() will emulate this instruction, 'stwu'. Actually its
>>> equivalent to:
>>> 1> Update pr_regs->gpr[1] = mem[old r1 + (-A)]
>>> 2> stw [old r1], mem[old r1 + (-A)]
>>>
>>> Please notice the stack based on new r1 may be covered with mem[old r1
>>> +(-A)] when addr[old r1 + (-A)] < addr[old r1 + sizeof(an exception frame0].
>>> So the above 2# operation will overwirte something to break this exception
>>> frame then unexpected kernel problem will be issued.
>>>
>>> So looks we have to implement independed interrupt stack for PPC program
>>> exception when CONFIG_BOOKE is enabled. Here we can use
>>> EXC_LEVEL_EXCEPTION_PROLOG to replace original NORMAL_EXCEPTION_PROLOG
>>> for program exception if CONFIG_BOOKE. Then its always safe for kprobe
>>> with independed exc stack from one pre-allocated and dedicated thread_info.
>>> Actually this is just waht we did for critical/machine check exceptions
>>> on PPC.
>>>
>>> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
>>> ---
>> I'm still very confused why we need a unique stack frame for kprobe/program exceptions on book-e devices.
> 
> Its a bug at least for Book-E. And if you'd like to check another topic thread,
> "[BUG?]3.0-rc4+ftrace+kprobe: set kprobe at instruction 'stwu' lead to system
> crash/freeze", you can know this story completely :)
> 
> This bug should not be reproduced on PPC64 with the exception prolog/endlog
> dedicated to PPC64. But I have no enough time to check other PPC32 & !BOOKE so
> I'm not sure if we should extend this modification.
> 
>> Can you explain this further.
> 
> I can show one of those issued examples.
> 
> Here we kprobe the entry point of show_interrupts().
> 
> (gdb) disassemble show_interrupts
> Dump of assembler code for function show_interrupts:
>    0xc0004ff4 <+0>:	stwu    r1,-48(r1)
>    0xc0004ff8 <+4>:	mflr    r0
> 
> I add some printk() inside pre_handler() to show pt_regs->gpr[1] and pt_regs->nip.
> ------
> ......
> Planted kprobe at c0004ff4
> pre_handler: p->addr = 0xc0004ff4, nip = 0xc0004ff4, msr = 0x29000
> gpr[1] = de767e50.
> nip = c0004ff4.
> 
> When hit this instruction, emulate_step() would emulate this instruction as follows:
> ------
> #1> current pr_regs->gpr[1] = 0xde767e50 - 48 = 0xde767e20;
> #2> stw (previous pr_regs->gpr[1]), @(current pr_regs->gpr[1])
> 	==> stw (0xde767e50), 0xde767e20
> 
> But after this kprobe process something would be rewrite incorrectly:
> ------
> ......
> post_handler: p->addr = 0xc0004ff4, msr = 0x29000
> gpr[1] = de767e20.
> nip = de767e54.
> 	^
> 	If everything is good nip should equal to (0xc0004ff4 + 0x4). But looks its
> reset with (0xde767e50 + 0x4) via the above #2 operation. So why?
> 
> As I understand kprobe use 'trap' to enter the program exception. At now PR = 0
> so the kernel allocate an exception frame as normal.
> 
>         ---------------- old r1[0xde767e50]
> 	 1  pt_regs->result
> 	 2  pt_regs->dsisr
> 	 3  pt_regs->dar
> 	 4  pt_regs->trap
> 	 5  pt_regs->mq
> 	 6  pt_regs->ccr
> 	 7  pt_regs->xer
> 	 8  pt_regs->link
> 	 9  pt_regs->ctr
> 	 10 pt_regs->orig_gpr3
> 	 11 pt_regs->msr
>          12 pt_regs->nip  <-- @ 0xde767e50 - 12 x 4 = 0xde767e20
> 		......
> 	----------------- new r1[0xde767e50 - INT_FRAME_SIZE]
> 
> I think you can understand why pt_regs->nip is broken :) So the root cause is an
> exception frame and the kprobed function stack frame are always overlap. And
> then someone member inside an exception frame may be corrupted by that emulated
> stw operation.
> 
> So I think we have to use one unique stack frame to avoid this when enable
> CONFIG_KPROBES. Especially for Book-E we can refer easily machine
> check/critical/debug exception implementation to do this like my patch.
> 

More questions or suggestions?

Tiejun

> Tiejun
> 
>> - k
Benjamin Herrenschmidt - Aug. 30, 2011, 5:44 a.m.
> > As I understand it, the problem comes from the fact that stwu combines the
> > creation of a stack frame with storing into that stack frame.  If they were
> 
> Yes.
> 
> > separate instructions you'd have a new exception frame at a lower address
> > by the time you actually store to the non-exception frame.
> 
> So when kprobe we should use a unique stack frame to skip that stack frame the
> kprobed stwu want to create.

I still don't like that patch. Potentially the problem exist for all
variants of powerpc, not just booke, and I'm not sure I like adding yet
another exception stack.

Another (non-great) approach would be to special case stwu to the stack,
and instead of doing the store while emulating the instruction, keep the
store address around and do it later, after the stack has been unwound,
in the exit path (a TIF flag to hit the slow path and then do it in the
slow path).

It sounds hackish but it makes it easier to fix everybody at once, there
are "issues" with changing stacks especially on ppc64 and it would
definitely be affected as well if the stack frame created is larger than
our gap.

Cheers,
Ben.
Tiejun Chen - Aug. 31, 2011, 9:17 a.m.
Benjamin Herrenschmidt wrote:
>>> As I understand it, the problem comes from the fact that stwu combines the
>>> creation of a stack frame with storing into that stack frame.  If they were
>> Yes.
>>
>>> separate instructions you'd have a new exception frame at a lower address
>>> by the time you actually store to the non-exception frame.
>> So when kprobe we should use a unique stack frame to skip that stack frame the
>> kprobed stwu want to create.
> 
> I still don't like that patch. Potentially the problem exist for all
> variants of powerpc, not just booke, and I'm not sure I like adding yet

Yes.

> another exception stack.

But I think we should extend easily this for other powerpc variants. And only
when enable CONFIG_KPROBES that dedicated exception stack is valid, so its not
such a big risk :)

> 
> Another (non-great) approach would be to special case stwu to the stack,
> and instead of doing the store while emulating the instruction, keep the
> store address around and do it later, after the stack has been unwound,
> in the exit path (a TIF flag to hit the slow path and then do it in the
> slow path).

Actually I also considered one idea that we do stw-update in the exit path like
your proposal. But I'm not sure if its worth intruding a new TIF flag only for
'stwu'. And if I understand what your exit path means properly, we should do
this on ret_from_except_full,

...
exc_exit_restart:
        lwz     r11,_NIP(r1)
        lwz     r12,_MSR(r1)
	
	Looks we have to add something to update as 'stwu' since _NIP/_MSR are also
corrupted potentially. So I feel we'll make this complicated if we really do here.

exc_exit_start:
        mtspr   SPRN_SRR0,r11
        mtspr   SPRN_SRR1,r12
        REST_2GPRS(11, r1)
        lwz     r1,GPR1(r1)
        .globl exc_exit_restart_end
exc_exit_restart_end:
        PPC405_ERR77_SYNC
        rfi
        b       .                       /* prevent prefetch past rfi */

If I'm wrong please correct me.

> 
> It sounds hackish but it makes it easier to fix everybody at once, there
> are "issues" with changing stacks especially on ppc64 and it would
> definitely be affected as well if the stack frame created is larger than
> our gap.

If we provide another exception stack like we did debug exception on ppc64, are
there those "issues" you said?

Thanks
Tiejun

> 
> Cheers,
> Ben.
> 
>
Benjamin Herrenschmidt - Aug. 31, 2011, 9:32 p.m.
On Wed, 2011-08-31 at 17:17 +0800, tiejun.chen wrote:

> > It sounds hackish but it makes it easier to fix everybody at once, there
> > are "issues" with changing stacks especially on ppc64 and it would
> > definitely be affected as well if the stack frame created is larger than
> > our gap.
> 
> If we provide another exception stack like we did debug exception on ppc64, are
> there those "issues" you said?

Actually if we allocate it like the irq stacks, we should be ok, but
that's yet another 16K per CPU that needs to be covered by the first
segment, nasty....

In any case, please try to come up with a patch that covers all
variants.

Cheers,
Ben.

Patch

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 1bff591..6d12169 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -313,6 +313,9 @@  struct pt_regs;
 extern struct thread_info *critirq_ctx[NR_CPUS];
 extern struct thread_info *dbgirq_ctx[NR_CPUS];
 extern struct thread_info *mcheckirq_ctx[NR_CPUS];
+#if defined(CONFIG_KPROBES) && defined(CONFIG_BOOKE)
+extern struct thread_info *pgirq_ctx[NR_CPUS];
+#endif
 extern void exc_lvl_ctx_init(void);
 #else
 #define exc_lvl_ctx_init()
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c5cae0d..34d6178 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -885,6 +885,10 @@ 
 #endif
 #define SPRN_SPRG_RVCPU		SPRN_SPRG1
 #define SPRN_SPRG_WVCPU		SPRN_SPRG1
+#ifdef	CONFIG_KPROBES
+#define	SPRN_SPRG_RSCRATCH_PG	SPRN_SPRG0
+#define	SPRN_SPRG_WSCRATCH_PG	SPRN_SPRG0
+#endif
 #endif
 
 #ifdef CONFIG_8xx
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 56212bc..a99e209 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1122,6 +1122,21 @@  ret_from_mcheck_exc:
 	RESTORE_xSRR(DSRR0,DSRR1);
 	RESTORE_MMU_REGS;
 	RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI)
+
+	.globl	ret_from_prog_exc
+ret_from_prog_exc:
+	mfspr	r9,SPRN_SPRG_THREAD
+	lwz	r10,SAVED_KSP_LIMIT(r1)
+	stw	r10,KSP_LIMIT(r9)
+	lwz	r9,THREAD_INFO-THREAD(r9)
+	rlwinm	r10,r1,0,0,(31-THREAD_SHIFT)
+	lwz	r10,TI_PREEMPT(r10)
+	stw	r10,TI_PREEMPT(r9)
+	RESTORE_xSRR(SRR0,SRR1);
+	RESTORE_xSRR(CSRR0,CSRR1);
+	RESTORE_xSRR(DSRR0,DSRR1);
+	RESTORE_MMU_REGS;
+	RET_FROM_EXC_LEVEL(SPRN_SRR0, SPRN_SRR1, rfi)
 #endif /* CONFIG_BOOKE */
 
 /*
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index a0bf158..941be40 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -79,6 +79,10 @@ 
 /* only on e500mc/e200 */
 #define DBG_STACK_BASE		dbgirq_ctx
 
+#if defined(CONFIG_KPROBES)
+#define PG_STACK_BASE		pgirq_ctx
+#endif
+
 #define EXC_LVL_FRAME_OVERHEAD	(THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE)
 
 #ifdef CONFIG_SMP
@@ -158,6 +162,12 @@ 
 		EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1)
 #define MCHECK_EXCEPTION_PROLOG \
 		EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1)
+#if defined(CONFIG_KPROBES)
+#define	PROGRAM_EXCEPTION_PROLOG \
+		EXC_LEVEL_EXCEPTION_PROLOG(PG, SPRN_SRR0, SPRN_SRR1)
+#else
+#define	PROGRAM_EXCEPTION_PROLOG	NORMAL_EXCEPTION_PROLOG
+#endif
 
 /*
  * Exception vectors.
@@ -370,11 +380,12 @@  label:
 
 #define PROGRAM_EXCEPTION						      \
 	START_EXCEPTION(Program)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	PROGRAM_EXCEPTION_PROLOG;					      \
 	mfspr	r4,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r4,_ESR(r11);						      \
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
-	EXC_XFER_STD(0x0700, program_check_exception)
+	EXC_XFER_TEMPLATE(program_check_exception, 0x0700, MSR_KERNEL, NOCOPY,\
+		transfer_to_handler_full, ret_from_prog_exc)
 
 #define DECREMENTER_EXCEPTION						      \
 	START_EXCEPTION(Decrementer)					      \
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 5b428e3..ff5b8dd 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -397,6 +397,10 @@  struct thread_info   *critirq_ctx[NR_CPUS] __read_mostly;
 struct thread_info    *dbgirq_ctx[NR_CPUS] __read_mostly;
 struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
 
+#if defined(CONFIG_KPROBES) && defined(CONFIG_BOOKE)
+struct thread_info    *pgirq_ctx[NR_CPUS] __read_mostly;
+#endif
+
 void exc_lvl_ctx_init(void)
 {
 	struct thread_info *tp;
@@ -423,6 +427,13 @@  void exc_lvl_ctx_init(void)
 		tp = mcheckirq_ctx[cpu_nr];
 		tp->cpu = cpu_nr;
 		tp->preempt_count = HARDIRQ_OFFSET;
+
+#if defined(CONFIG_KPROBES)
+		memset((void *)pgirq_ctx[i], 0, THREAD_SIZE);
+		tp = pgirq_ctx[i];
+		tp->cpu = i;
+		tp->preempt_count = 0;
+#endif
 #endif
 	}
 }
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 620d792..b872564 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -272,6 +272,10 @@  static void __init exc_lvl_early_init(void)
 			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
 		mcheckirq_ctx[hw_cpu] = (struct thread_info *)
 			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+#ifdef CONFIG_KPROBES
+		pgirq_ctx[hw_cpu] = (struct thread_info *)
+			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+#endif
 #endif
 	}
 }