diff mbox

linux-next ppc64: RCU mods cause __might_sleep BUGs

Message ID alpine.LSU.2.00.1205021324430.24246@eggly.anvils (mailing list archive)
State Not Applicable
Headers show

Commit Message

Hugh Dickins May 2, 2012, 8:25 p.m. UTC
On Tue, 1 May 2012, Paul E. McKenney wrote:
> > > > > On Mon, 2012-04-30 at 15:37 -0700, Hugh Dickins wrote:
> > > > > > 
> > > > > > BUG: sleeping function called from invalid context at include/linux/pagemap.h:354
> > > > > > in_atomic(): 0, irqs_disabled(): 0, pid: 6886, name: cc1
> > > > > > Call Trace:
> > > > > > [c0000001a99f78e0] [c00000000000f34c] .show_stack+0x6c/0x16c (unreliable)
> > > > > > [c0000001a99f7990] [c000000000077b40] .__might_sleep+0x11c/0x134
> > > > > > [c0000001a99f7a10] [c0000000000c6228] .filemap_fault+0x1fc/0x494
> > > > > > [c0000001a99f7af0] [c0000000000e7c9c] .__do_fault+0x120/0x684
> > > > > > [c0000001a99f7c00] [c000000000025790] .do_page_fault+0x458/0x664
> > > > > > [c0000001a99f7e30] [c000000000005868] handle_page_fault+0x10/0x30

Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
__rcu_read_unlock() are not safe to be using __this_cpu operations,
the cpu may change in between the rmw's read and write: they should
be using this_cpu operations (or, I put preempt_disable/enable in the
__rcu_read_unlock below).  __this_cpus there work out fine on x86,
which was given good instructions to use; but not so well on PowerPC.

I've been running successfully for an hour now with the patch below;
but I expect you'll want to consider the tradeoffs, and may choose a
different solution.

Hugh

Comments

Paul E. McKenney May 2, 2012, 8:49 p.m. UTC | #1
On Wed, May 02, 2012 at 01:25:30PM -0700, Hugh Dickins wrote:
> On Tue, 1 May 2012, Paul E. McKenney wrote:
> > > > > > On Mon, 2012-04-30 at 15:37 -0700, Hugh Dickins wrote:
> > > > > > > 
> > > > > > > BUG: sleeping function called from invalid context at include/linux/pagemap.h:354
> > > > > > > in_atomic(): 0, irqs_disabled(): 0, pid: 6886, name: cc1
> > > > > > > Call Trace:
> > > > > > > [c0000001a99f78e0] [c00000000000f34c] .show_stack+0x6c/0x16c (unreliable)
> > > > > > > [c0000001a99f7990] [c000000000077b40] .__might_sleep+0x11c/0x134
> > > > > > > [c0000001a99f7a10] [c0000000000c6228] .filemap_fault+0x1fc/0x494
> > > > > > > [c0000001a99f7af0] [c0000000000e7c9c] .__do_fault+0x120/0x684
> > > > > > > [c0000001a99f7c00] [c000000000025790] .do_page_fault+0x458/0x664
> > > > > > > [c0000001a99f7e30] [c000000000005868] handle_page_fault+0x10/0x30
> 
> Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> __rcu_read_unlock() are not safe to be using __this_cpu operations,
> the cpu may change in between the rmw's read and write: they should
> be using this_cpu operations (or, I put preempt_disable/enable in the
> __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> which was given good instructions to use; but not so well on PowerPC.

Thank you very much for tracking this down!!!

> I've been running successfully for an hour now with the patch below;
> but I expect you'll want to consider the tradeoffs, and may choose a
> different solution.

The thing that puzzles me about this is that the normal path through
the scheduler would save and restore these per-CPU variables to and
from the task structure.  There must be a sneak path through the
scheduler that I failed to account for.

But given your good work, this should be easy for me to chase down
even on my x86-based laptop -- just convert from __this_cpu_inc() to a
read-inc-delay-write sequence.  And check that the underlying variable
didn't change in the meantime.  And dump an ftrace if it did.  ;-)

Thank you again, Hugh!

							Thanx, Paul

> Hugh
> 
> --- 3.4-rc4-next-20120427/include/linux/rcupdate.h	2012-04-28 09:26:38.000000000 -0700
> +++ testing/include/linux/rcupdate.h	2012-05-02 11:46:06.000000000 -0700
> @@ -159,7 +159,7 @@ DECLARE_PER_CPU(struct task_struct *, rc
>   */
>  static inline void __rcu_read_lock(void)
>  {
> -	__this_cpu_inc(rcu_read_lock_nesting);
> +	this_cpu_inc(rcu_read_lock_nesting);
>  	barrier(); /* Keep code within RCU read-side critical section. */
>  }
> 
> --- 3.4-rc4-next-20120427/kernel/rcupdate.c	2012-04-28 09:26:40.000000000 -0700
> +++ testing/kernel/rcupdate.c	2012-05-02 11:44:13.000000000 -0700
> @@ -72,6 +72,7 @@ DEFINE_PER_CPU(struct task_struct *, rcu
>   */
>  void __rcu_read_unlock(void)
>  {
> +	preempt_disable();
>  	if (__this_cpu_read(rcu_read_lock_nesting) != 1)
>  		__this_cpu_dec(rcu_read_lock_nesting);
>  	else {
> @@ -83,13 +84,14 @@ void __rcu_read_unlock(void)
>  		barrier();  /* ->rcu_read_unlock_special load before assign */
>  		__this_cpu_write(rcu_read_lock_nesting, 0);
>  	}
> -#ifdef CONFIG_PROVE_LOCKING
> +#if 1 /* CONFIG_PROVE_LOCKING */
>  	{
>  		int rln = __this_cpu_read(rcu_read_lock_nesting);
> 
> -		WARN_ON_ONCE(rln < 0 && rln > INT_MIN / 2);
> +		BUG_ON(rln < 0 && rln > INT_MIN / 2);
>  	}
>  #endif /* #ifdef CONFIG_PROVE_LOCKING */
> +	preempt_enable();
>  }
>  EXPORT_SYMBOL_GPL(__rcu_read_unlock);
> 
> --- 3.4-rc4-next-20120427/kernel/sched/core.c	2012-04-28 09:26:40.000000000 -0700
> +++ testing/kernel/sched/core.c	2012-05-01 22:40:46.000000000 -0700
> @@ -2024,7 +2024,7 @@ asmlinkage void schedule_tail(struct tas
>  {
>  	struct rq *rq = this_rq();
> 
> -	rcu_switch_from(prev);
> +	/* rcu_switch_from(prev); */
>  	rcu_switch_to();
>  	finish_task_switch(rq, prev);
> 
> @@ -7093,6 +7093,10 @@ void __might_sleep(const char *file, int
>  		"BUG: sleeping function called from invalid context at %s:%d\n",
>  			file, line);
>  	printk(KERN_ERR
> +		"cpu=%d preempt_count=%x preempt_offset=%x rcu_nesting=%x nesting_save=%x\n",
> +		raw_smp_processor_id(), preempt_count(), preempt_offset,
> +		rcu_preempt_depth(), current->rcu_read_lock_nesting_save); 
> +	printk(KERN_ERR
>  		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
>  			in_atomic(), irqs_disabled(),
>  			current->pid, current->comm);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
Benjamin Herrenschmidt May 2, 2012, 9:20 p.m. UTC | #2
On Wed, 2012-05-02 at 13:25 -0700, Hugh Dickins wrote:
> Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> __rcu_read_unlock() are not safe to be using __this_cpu operations,
> the cpu may change in between the rmw's read and write: they should
> be using this_cpu operations (or, I put preempt_disable/enable in the
> __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> which was given good instructions to use; but not so well on PowerPC.
> 
> I've been running successfully for an hour now with the patch below;
> but I expect you'll want to consider the tradeoffs, and may choose a
> different solution.

Didn't Linus recently rant about these __this_cpu vs this_cpu nonsense ?

I thought that was going out..

Cheers,
Ben.
Paul E. McKenney May 2, 2012, 9:32 p.m. UTC | #3
On Wed, May 02, 2012 at 01:49:54PM -0700, Paul E. McKenney wrote:
> On Wed, May 02, 2012 at 01:25:30PM -0700, Hugh Dickins wrote:
> > On Tue, 1 May 2012, Paul E. McKenney wrote:
> > > > > > > On Mon, 2012-04-30 at 15:37 -0700, Hugh Dickins wrote:
> > > > > > > > 
> > > > > > > > BUG: sleeping function called from invalid context at include/linux/pagemap.h:354
> > > > > > > > in_atomic(): 0, irqs_disabled(): 0, pid: 6886, name: cc1
> > > > > > > > Call Trace:
> > > > > > > > [c0000001a99f78e0] [c00000000000f34c] .show_stack+0x6c/0x16c (unreliable)
> > > > > > > > [c0000001a99f7990] [c000000000077b40] .__might_sleep+0x11c/0x134
> > > > > > > > [c0000001a99f7a10] [c0000000000c6228] .filemap_fault+0x1fc/0x494
> > > > > > > > [c0000001a99f7af0] [c0000000000e7c9c] .__do_fault+0x120/0x684
> > > > > > > > [c0000001a99f7c00] [c000000000025790] .do_page_fault+0x458/0x664
> > > > > > > > [c0000001a99f7e30] [c000000000005868] handle_page_fault+0x10/0x30
> > 
> > Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> > __rcu_read_unlock() are not safe to be using __this_cpu operations,
> > the cpu may change in between the rmw's read and write: they should
> > be using this_cpu operations (or, I put preempt_disable/enable in the
> > __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> > which was given good instructions to use; but not so well on PowerPC.
> 
> Thank you very much for tracking this down!!!
> 
> > I've been running successfully for an hour now with the patch below;
> > but I expect you'll want to consider the tradeoffs, and may choose a
> > different solution.
> 
> The thing that puzzles me about this is that the normal path through
> the scheduler would save and restore these per-CPU variables to and
> from the task structure.  There must be a sneak path through the
> scheduler that I failed to account for.

Sigh...  I am slow today, I guess.  The preemption could of course
happen between the time that the task calculated the address of the
per-CPU variable and the time that it modified it.  If this happens,
we are modifying some other CPU's per-CPU variable.

Given that Linus saw no performance benefit from this patchset, I am
strongly tempted to just drop this inlinable-__rcu_read_lock() series
at this point.

I suppose that the other option is to move preempt_count() to a
per-CPU variable, then use the space in the task_info struct.
But that didn't generate anywhere near as good of code...

							Thanx, Paul

> But given your good work, this should be easy for me to chase down
> even on my x86-based laptop -- just convert from __this_cpu_inc() to a
> read-inc-delay-write sequence.  And check that the underlying variable
> didn't change in the meantime.  And dump an ftrace if it did.  ;-)
> 
> Thank you again, Hugh!
> 
> 							Thanx, Paul
> 
> > Hugh
> > 
> > --- 3.4-rc4-next-20120427/include/linux/rcupdate.h	2012-04-28 09:26:38.000000000 -0700
> > +++ testing/include/linux/rcupdate.h	2012-05-02 11:46:06.000000000 -0700
> > @@ -159,7 +159,7 @@ DECLARE_PER_CPU(struct task_struct *, rc
> >   */
> >  static inline void __rcu_read_lock(void)
> >  {
> > -	__this_cpu_inc(rcu_read_lock_nesting);
> > +	this_cpu_inc(rcu_read_lock_nesting);
> >  	barrier(); /* Keep code within RCU read-side critical section. */
> >  }
> > 
> > --- 3.4-rc4-next-20120427/kernel/rcupdate.c	2012-04-28 09:26:40.000000000 -0700
> > +++ testing/kernel/rcupdate.c	2012-05-02 11:44:13.000000000 -0700
> > @@ -72,6 +72,7 @@ DEFINE_PER_CPU(struct task_struct *, rcu
> >   */
> >  void __rcu_read_unlock(void)
> >  {
> > +	preempt_disable();
> >  	if (__this_cpu_read(rcu_read_lock_nesting) != 1)
> >  		__this_cpu_dec(rcu_read_lock_nesting);
> >  	else {
> > @@ -83,13 +84,14 @@ void __rcu_read_unlock(void)
> >  		barrier();  /* ->rcu_read_unlock_special load before assign */
> >  		__this_cpu_write(rcu_read_lock_nesting, 0);
> >  	}
> > -#ifdef CONFIG_PROVE_LOCKING
> > +#if 1 /* CONFIG_PROVE_LOCKING */
> >  	{
> >  		int rln = __this_cpu_read(rcu_read_lock_nesting);
> > 
> > -		WARN_ON_ONCE(rln < 0 && rln > INT_MIN / 2);
> > +		BUG_ON(rln < 0 && rln > INT_MIN / 2);
> >  	}
> >  #endif /* #ifdef CONFIG_PROVE_LOCKING */
> > +	preempt_enable();
> >  }
> >  EXPORT_SYMBOL_GPL(__rcu_read_unlock);
> > 
> > --- 3.4-rc4-next-20120427/kernel/sched/core.c	2012-04-28 09:26:40.000000000 -0700
> > +++ testing/kernel/sched/core.c	2012-05-01 22:40:46.000000000 -0700
> > @@ -2024,7 +2024,7 @@ asmlinkage void schedule_tail(struct tas
> >  {
> >  	struct rq *rq = this_rq();
> > 
> > -	rcu_switch_from(prev);
> > +	/* rcu_switch_from(prev); */
> >  	rcu_switch_to();
> >  	finish_task_switch(rq, prev);
> > 
> > @@ -7093,6 +7093,10 @@ void __might_sleep(const char *file, int
> >  		"BUG: sleeping function called from invalid context at %s:%d\n",
> >  			file, line);
> >  	printk(KERN_ERR
> > +		"cpu=%d preempt_count=%x preempt_offset=%x rcu_nesting=%x nesting_save=%x\n",
> > +		raw_smp_processor_id(), preempt_count(), preempt_offset,
> > +		rcu_preempt_depth(), current->rcu_read_lock_nesting_save); 
> > +	printk(KERN_ERR
> >  		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
> >  			in_atomic(), irqs_disabled(),
> >  			current->pid, current->comm);
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
Paul E. McKenney May 2, 2012, 9:36 p.m. UTC | #4
On Wed, May 02, 2012 at 02:32:38PM -0700, Paul E. McKenney wrote:
> On Wed, May 02, 2012 at 01:49:54PM -0700, Paul E. McKenney wrote:
> > On Wed, May 02, 2012 at 01:25:30PM -0700, Hugh Dickins wrote:
> > > On Tue, 1 May 2012, Paul E. McKenney wrote:
> > > > > > > > On Mon, 2012-04-30 at 15:37 -0700, Hugh Dickins wrote:
> > > > > > > > > 
> > > > > > > > > BUG: sleeping function called from invalid context at include/linux/pagemap.h:354
> > > > > > > > > in_atomic(): 0, irqs_disabled(): 0, pid: 6886, name: cc1
> > > > > > > > > Call Trace:
> > > > > > > > > [c0000001a99f78e0] [c00000000000f34c] .show_stack+0x6c/0x16c (unreliable)
> > > > > > > > > [c0000001a99f7990] [c000000000077b40] .__might_sleep+0x11c/0x134
> > > > > > > > > [c0000001a99f7a10] [c0000000000c6228] .filemap_fault+0x1fc/0x494
> > > > > > > > > [c0000001a99f7af0] [c0000000000e7c9c] .__do_fault+0x120/0x684
> > > > > > > > > [c0000001a99f7c00] [c000000000025790] .do_page_fault+0x458/0x664
> > > > > > > > > [c0000001a99f7e30] [c000000000005868] handle_page_fault+0x10/0x30
> > > 
> > > Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> > > __rcu_read_unlock() are not safe to be using __this_cpu operations,
> > > the cpu may change in between the rmw's read and write: they should
> > > be using this_cpu operations (or, I put preempt_disable/enable in the
> > > __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> > > which was given good instructions to use; but not so well on PowerPC.
> > 
> > Thank you very much for tracking this down!!!
> > 
> > > I've been running successfully for an hour now with the patch below;
> > > but I expect you'll want to consider the tradeoffs, and may choose a
> > > different solution.
> > 
> > The thing that puzzles me about this is that the normal path through
> > the scheduler would save and restore these per-CPU variables to and
> > from the task structure.  There must be a sneak path through the
> > scheduler that I failed to account for.
> 
> Sigh...  I am slow today, I guess.  The preemption could of course
> happen between the time that the task calculated the address of the
> per-CPU variable and the time that it modified it.  If this happens,
> we are modifying some other CPU's per-CPU variable.
> 
> Given that Linus saw no performance benefit from this patchset, I am
> strongly tempted to just drop this inlinable-__rcu_read_lock() series
> at this point.
> 
> I suppose that the other option is to move preempt_count() to a
> per-CPU variable, then use the space in the task_info struct.
> But that didn't generate anywhere near as good of code...

But preempt_count() would suffer exactly the same problem.  The address
is calculated, the task moves to some other CPU, and then the task
is messing with some other CPU's preemption counter.  Blech.

							Thanx, Paul

> > But given your good work, this should be easy for me to chase down
> > even on my x86-based laptop -- just convert from __this_cpu_inc() to a
> > read-inc-delay-write sequence.  And check that the underlying variable
> > didn't change in the meantime.  And dump an ftrace if it did.  ;-)
> > 
> > Thank you again, Hugh!
> > 
> > 							Thanx, Paul
> > 
> > > Hugh
> > > 
> > > --- 3.4-rc4-next-20120427/include/linux/rcupdate.h	2012-04-28 09:26:38.000000000 -0700
> > > +++ testing/include/linux/rcupdate.h	2012-05-02 11:46:06.000000000 -0700
> > > @@ -159,7 +159,7 @@ DECLARE_PER_CPU(struct task_struct *, rc
> > >   */
> > >  static inline void __rcu_read_lock(void)
> > >  {
> > > -	__this_cpu_inc(rcu_read_lock_nesting);
> > > +	this_cpu_inc(rcu_read_lock_nesting);
> > >  	barrier(); /* Keep code within RCU read-side critical section. */
> > >  }
> > > 
> > > --- 3.4-rc4-next-20120427/kernel/rcupdate.c	2012-04-28 09:26:40.000000000 -0700
> > > +++ testing/kernel/rcupdate.c	2012-05-02 11:44:13.000000000 -0700
> > > @@ -72,6 +72,7 @@ DEFINE_PER_CPU(struct task_struct *, rcu
> > >   */
> > >  void __rcu_read_unlock(void)
> > >  {
> > > +	preempt_disable();
> > >  	if (__this_cpu_read(rcu_read_lock_nesting) != 1)
> > >  		__this_cpu_dec(rcu_read_lock_nesting);
> > >  	else {
> > > @@ -83,13 +84,14 @@ void __rcu_read_unlock(void)
> > >  		barrier();  /* ->rcu_read_unlock_special load before assign */
> > >  		__this_cpu_write(rcu_read_lock_nesting, 0);
> > >  	}
> > > -#ifdef CONFIG_PROVE_LOCKING
> > > +#if 1 /* CONFIG_PROVE_LOCKING */
> > >  	{
> > >  		int rln = __this_cpu_read(rcu_read_lock_nesting);
> > > 
> > > -		WARN_ON_ONCE(rln < 0 && rln > INT_MIN / 2);
> > > +		BUG_ON(rln < 0 && rln > INT_MIN / 2);
> > >  	}
> > >  #endif /* #ifdef CONFIG_PROVE_LOCKING */
> > > +	preempt_enable();
> > >  }
> > >  EXPORT_SYMBOL_GPL(__rcu_read_unlock);
> > > 
> > > --- 3.4-rc4-next-20120427/kernel/sched/core.c	2012-04-28 09:26:40.000000000 -0700
> > > +++ testing/kernel/sched/core.c	2012-05-01 22:40:46.000000000 -0700
> > > @@ -2024,7 +2024,7 @@ asmlinkage void schedule_tail(struct tas
> > >  {
> > >  	struct rq *rq = this_rq();
> > > 
> > > -	rcu_switch_from(prev);
> > > +	/* rcu_switch_from(prev); */
> > >  	rcu_switch_to();
> > >  	finish_task_switch(rq, prev);
> > > 
> > > @@ -7093,6 +7093,10 @@ void __might_sleep(const char *file, int
> > >  		"BUG: sleeping function called from invalid context at %s:%d\n",
> > >  			file, line);
> > >  	printk(KERN_ERR
> > > +		"cpu=%d preempt_count=%x preempt_offset=%x rcu_nesting=%x nesting_save=%x\n",
> > > +		raw_smp_processor_id(), preempt_count(), preempt_offset,
> > > +		rcu_preempt_depth(), current->rcu_read_lock_nesting_save); 
> > > +	printk(KERN_ERR
> > >  		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
> > >  			in_atomic(), irqs_disabled(),
> > >  			current->pid, current->comm);
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > Please read the FAQ at  http://www.tux.org/lkml/
> > >
Paul E. McKenney May 2, 2012, 9:54 p.m. UTC | #5
On Thu, May 03, 2012 at 07:20:15AM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2012-05-02 at 13:25 -0700, Hugh Dickins wrote:
> > Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> > __rcu_read_unlock() are not safe to be using __this_cpu operations,
> > the cpu may change in between the rmw's read and write: they should
> > be using this_cpu operations (or, I put preempt_disable/enable in the
> > __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> > which was given good instructions to use; but not so well on PowerPC.
> > 
> > I've been running successfully for an hour now with the patch below;
> > but I expect you'll want to consider the tradeoffs, and may choose a
> > different solution.
> 
> Didn't Linus recently rant about these __this_cpu vs this_cpu nonsense ?
> 
> I thought that was going out..

Linus did rant about __raw_get_cpu_var() because it cannot use the x86
%fs segment overrides a bit more than a month ago.  The __this_cpu
stuff is useful if you have preemption disabled -- avoids the extra
layer of preempt_disable().

Or was this a different rant?

							Thanx, Paul
Hugh Dickins May 2, 2012, 10:54 p.m. UTC | #6
On Wed, May 2, 2012 at 2:54 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Thu, May 03, 2012 at 07:20:15AM +1000, Benjamin Herrenschmidt wrote:
>> On Wed, 2012-05-02 at 13:25 -0700, Hugh Dickins wrote:
>> > Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
>> > __rcu_read_unlock() are not safe to be using __this_cpu operations,
>> > the cpu may change in between the rmw's read and write: they should
>> > be using this_cpu operations (or, I put preempt_disable/enable in the
>> > __rcu_read_unlock below).  __this_cpus there work out fine on x86,
>> > which was given good instructions to use; but not so well on PowerPC.
>> >
>> > I've been running successfully for an hour now with the patch below;
>> > but I expect you'll want to consider the tradeoffs, and may choose a
>> > different solution.
>>
>> Didn't Linus recently rant about these __this_cpu vs this_cpu nonsense ?
>>
>> I thought that was going out..
>
> Linus did rant about __raw_get_cpu_var() because it cannot use the x86
> %fs segement overrides a bit more than a month ago.  The __this_cpu
> stuff is useful if you have preemption disabled -- avoids the extra
> layer of preempt_disable().
>
> Or was this a different rant?

https://lkml.org/lkml/2011/11/29/321

I think it ended up with Christoph removing the more egregious
variants, but this_cpu_that and __this_cpu_the_other remaining.

Hugh
Paul E. McKenney May 3, 2012, 12:14 a.m. UTC | #7
On Wed, May 02, 2012 at 03:54:24PM -0700, Hugh Dickins wrote:
> On Wed, May 2, 2012 at 2:54 PM, Paul E. McKenney
> <paulmck@linux.vnet.ibm.com> wrote:
> > On Thu, May 03, 2012 at 07:20:15AM +1000, Benjamin Herrenschmidt wrote:
> >> On Wed, 2012-05-02 at 13:25 -0700, Hugh Dickins wrote:
> >> > Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> >> > __rcu_read_unlock() are not safe to be using __this_cpu operations,
> >> > the cpu may change in between the rmw's read and write: they should
> >> > be using this_cpu operations (or, I put preempt_disable/enable in the
> >> > __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> >> > which was given good instructions to use; but not so well on PowerPC.
> >> >
> >> > I've been running successfully for an hour now with the patch below;
> >> > but I expect you'll want to consider the tradeoffs, and may choose a
> >> > different solution.
> >>
> >> Didn't Linus recently rant about these __this_cpu vs this_cpu nonsense ?
> >>
> >> I thought that was going out..
> >
> > Linus did rant about __raw_get_cpu_var() because it cannot use the x86
> > %fs segement overrides a bit more than a month ago.  The __this_cpu
> > stuff is useful if you have preemption disabled -- avoids the extra
> > layer of preempt_disable().
> >
> > Or was this a different rant?
> 
> https://lkml.org/lkml/2011/11/29/321
> 
> I think it ended up with Christoph removing the more egregious
> variants, but this_cpu_that and __this_cpu_the_other remaining.

Ah, thank you for the pointer.

It would be nice to have the CPU transparency of x86 on other
architectures, but from what I can see, that would require dedicating
a register to this purpose -- and even then requires that the arch
have indexed addressing modes.  There are some other approaches, for
example, having __this_cpu_that() be located at a special address that
the scheduler treated as implicitly preempt_disable().  Or I suppose
that the arch-specific trap-handling code could fake it.  A little
bit messy, but the ability to access a given CPU's per-CPU variable
while running on that CPU does appear to have at least a couple of
uses -- inlining RCU and also making preempt_disable() use per-CPU
variables.

In any case, I must confess that I feel quite silly about my series
of patches.  I have reverted them aside from a couple that did useful
optimizations, and they should show up in -next shortly.

							Thanx, Paul
Hugh Dickins May 3, 2012, 12:24 a.m. UTC | #8
On Wed, 2 May 2012, Paul E. McKenney wrote:
> On Wed, May 02, 2012 at 03:54:24PM -0700, Hugh Dickins wrote:
> > On Wed, May 2, 2012 at 2:54 PM, Paul E. McKenney
> > <paulmck@linux.vnet.ibm.com> wrote:
> > > On Thu, May 03, 2012 at 07:20:15AM +1000, Benjamin Herrenschmidt wrote:
> > >> On Wed, 2012-05-02 at 13:25 -0700, Hugh Dickins wrote:
> > >> > Got it at last.  Embarrassingly obvious.  __rcu_read_lock() and
> > >> > __rcu_read_unlock() are not safe to be using __this_cpu operations,
> > >> > the cpu may change in between the rmw's read and write: they should
> > >> > be using this_cpu operations (or, I put preempt_disable/enable in the
> > >> > __rcu_read_unlock below).  __this_cpus there work out fine on x86,
> > >> > which was given good instructions to use; but not so well on PowerPC.
> > >> >
> > >> > I've been running successfully for an hour now with the patch below;
> > >> > but I expect you'll want to consider the tradeoffs, and may choose a
> > >> > different solution.
> > >>
> > >> Didn't Linus recently rant about these __this_cpu vs this_cpu nonsense ?
> > >>
> > >> I thought that was going out..
> > >
> > > Linus did rant about __raw_get_cpu_var() because it cannot use the x86
> > > %fs segement overrides a bit more than a month ago.  The __this_cpu
> > > stuff is useful if you have preemption disabled -- avoids the extra
> > > layer of preempt_disable().
> > >
> > > Or was this a different rant?
> > 
> > https://lkml.org/lkml/2011/11/29/321
> > 
> > I think it ended up with Christoph removing the more egregious
> > variants, but this_cpu_that and __this_cpu_the_other remaining.
> 
> Ah, thank you for the pointer.
> 
> It would be nice to have the CPU transparency of x86 on other
> architectures, but from what I can see, that would require dedicating
> a register to this purpose -- and even then requires that the arch
> have indexed addressing modes.  There are some other approaches, for
> example, having __this_cpu_that() be located at a special address that
> the scheduler treated as implicitly preempt_disable().  Or I suppose
> that the arch-specific trap-handling code could fake it.  A little
> bit messy, but the ability to access a given CPU's per-CPU variable
> while running on that CPU does appear to have at least a couple of
> uses -- inlining RCU and also making preempt_disable() use per-CPU
> variables.
> 
> In any case, I must confess that I feel quite silly about my series
> of patches.  I have reverted them aside from a couple that did useful
> optimizations, and they should show up in -next shortly.

A wee bit sad, but thank you - it was an experiment worth trying,
and perhaps there will be reason to come back to it in future.

Hugh
Hugh Dickins May 7, 2012, 4:21 p.m. UTC | #9
On Wed, 2 May 2012, Hugh Dickins wrote:
> On Wed, 2 May 2012, Paul E. McKenney wrote:
> > 
> > In any case, I must confess that I feel quite silly about my series
> > of patches.  I have reverted them aside from a couple that did useful
> > optimizations, and they should show up in -next shortly.
> 
> A wee bit sad, but thank you - it was an experiment worth trying,
> and perhaps there will be reason to come back to it future.

The revert indeed showed up in next-20120504: thanks, no problem now.

But although it's just history, and not worth anyone's time to
investigate, I shouldn't let this thread die without an epilogue.

Although the patch I posted (this_cpu_inc in __rcu_read_lock,
preempt_disable and enable in __rcu_read_unlock) ran well until
I killed the test after 70 hours, it did not _entirely_ eliminate
the sleeping function BUG messages.

In 70 hours I got six isolated messages like the below (but from
different __might_sleep callsites) - where before I'd have flurries
of hundreds(?) and freeze within the hour.

And the "rcu_nesting" debug line I'd added to the message was different:
where before it was showing ffffffff on some tasks and 1 on others i.e.
increment or decrement had been applied to the wrong task, these messages
now all showed 0s throughout i.e. by the time the message was printed,
there was no longer any justification for the message.

As if a memory barrier were missing somewhere, perhaps.

BUG: sleeping function called from invalid context at arch/powerpc/mm/fault.c:305
cpu=2 preempt_count=0 preempt_offset=0 rcu_nesting=0 nesting_save=0
in_atomic(): 0, irqs_disabled(): 0, pid: 12266, name: cc1
Call Trace:
[c000000003affac0] [c00000000000f36c] .show_stack+0x6c/0x16c (unreliable)
[c000000003affb70] [c000000000078788] .__might_sleep+0x150/0x170
[c000000003affc00] [c0000000000255f4] .do_page_fault+0x288/0x664
[c000000003affe30] [c000000000005868] handle_page_fault+0x10/0x30

Hugh
Paul E. McKenney May 7, 2012, 6:50 p.m. UTC | #10
On Mon, May 07, 2012 at 09:21:54AM -0700, Hugh Dickins wrote:
> On Wed, 2 May 2012, Hugh Dickins wrote:
> > On Wed, 2 May 2012, Paul E. McKenney wrote:
> > > 
> > > In any case, I must confess that I feel quite silly about my series
> > > of patches.  I have reverted them aside from a couple that did useful
> > > optimizations, and they should show up in -next shortly.
> > 
> > A wee bit sad, but thank you - it was an experiment worth trying,
> > and perhaps there will be reason to come back to it future.
> 
> The revert indeed showed up in next-20120504: thanks, no problem now.
> 
> But although it's just history, and not worth anyone's time to
> investigate, I shouldn't let this thread die without an epilogue.
> 
> Although the patch I posted (this_cpu_inc in __rcu_read_lock,
> preempt_disable and enable in __rcu_read_unlock) ran well until
> I killed the test after 70 hours, it did not _entirely_ eliminate
> the sleeping function BUG messages.
> 
> In 70 hours I got six isolated messages like the below (but from
> different __might_sleep callsites) - where before I'd have flurries
> of hundreds(?) and freeze within the hour.
> 
> And the "rcu_nesting" debug line I'd added to the message was different:
> where before it was showing ffffffff on some tasks and 1 on others i.e.
> increment or decrement had been applied to the wrong task, these messages
> now all showed 0s throughout i.e. by the time the message was printed,
> there was no longer any justification for the message.
> 
> As if a memory barrier were missing somewhere, perhaps.

These fields should be updated only by the corresponding CPU, so
if memory barriers are needed, it seems to me that the cross-CPU
access is the bug, not the lack of a memory barrier.

Ah...  Is preemption disabled across the access to RCU's nesting level
when printing out the message?  If not, a preemption at that point
could result in the value printed being inaccurate.

							Thanx, Paul

> BUG: sleeping function called from invalid context at arch/powerpc/mm/fault.c:305
> cpu=2 preempt_count=0 preempt_offset=0 rcu_nesting=0 nesting_save=0
> in_atomic(): 0, irqs_disabled(): 0, pid: 12266, name: cc1
> Call Trace:
> [c000000003affac0] [c00000000000f36c] .show_stack+0x6c/0x16c (unreliable)
> [c000000003affb70] [c000000000078788] .__might_sleep+0x150/0x170
> [c000000003affc00] [c0000000000255f4] .do_page_fault+0x288/0x664
> [c000000003affe30] [c000000000005868] handle_page_fault+0x10/0x30
> 
> Hugh
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
Hugh Dickins May 7, 2012, 9:38 p.m. UTC | #11
On Mon, 7 May 2012, Paul E. McKenney wrote:
> On Mon, May 07, 2012 at 09:21:54AM -0700, Hugh Dickins wrote:
> > 
> > In 70 hours I got six isolated messages like the below (but from
> > different __might_sleep callsites) - where before I'd have flurries
> > of hundreds(?) and freeze within the hour.
> > 
> > And the "rcu_nesting" debug line I'd added to the message was different:
> > where before it was showing ffffffff on some tasks and 1 on others i.e.
> > increment or decrement had been applied to the wrong task, these messages
> > now all showed 0s throughout i.e. by the time the message was printed,
> > there was no longer any justification for the message.
> > 
> > As if a memory barrier were missing somewhere, perhaps.
> 
> These fields should be updated only by the corresponding CPU, so
> if memory barriers are needed, it seems to me that the cross-CPU
> access is the bug, not the lack of a memory barrier.

Yes: the code you added appeared to be using local CPU accesses only
(very much intentionally), and the context switch should already have
provided all the memory barriers needed there.

> 
> Ah...  Is preemption disabled across the access to RCU's nesting level
> when printing out the message?  If not, a preeemption at that point
> could result in the value printed being inaccurate.

Preemption was enabled in the cases I saw.  So you're pointing out that
#define rcu_preempt_depth() (__this_cpu_read(rcu_read_lock_nesting))
should have been
#define rcu_preempt_depth() (this_cpu_read(rcu_read_lock_nesting))
to avoid the danger of spurious __might_sleep() warnings.

Yes, I believe you've got it - thanks.

Hugh
diff mbox

Patch

--- 3.4-rc4-next-20120427/include/linux/rcupdate.h	2012-04-28 09:26:38.000000000 -0700
+++ testing/include/linux/rcupdate.h	2012-05-02 11:46:06.000000000 -0700
@@ -159,7 +159,7 @@  DECLARE_PER_CPU(struct task_struct *, rc
  */
 static inline void __rcu_read_lock(void)
 {
-	__this_cpu_inc(rcu_read_lock_nesting);
+	this_cpu_inc(rcu_read_lock_nesting);
 	barrier(); /* Keep code within RCU read-side critical section. */
 }
 
--- 3.4-rc4-next-20120427/kernel/rcupdate.c	2012-04-28 09:26:40.000000000 -0700
+++ testing/kernel/rcupdate.c	2012-05-02 11:44:13.000000000 -0700
@@ -72,6 +72,7 @@  DEFINE_PER_CPU(struct task_struct *, rcu
  */
 void __rcu_read_unlock(void)
 {
+	preempt_disable();
 	if (__this_cpu_read(rcu_read_lock_nesting) != 1)
 		__this_cpu_dec(rcu_read_lock_nesting);
 	else {
@@ -83,13 +84,14 @@  void __rcu_read_unlock(void)
 		barrier();  /* ->rcu_read_unlock_special load before assign */
 		__this_cpu_write(rcu_read_lock_nesting, 0);
 	}
-#ifdef CONFIG_PROVE_LOCKING
+#if 1 /* CONFIG_PROVE_LOCKING */
 	{
 		int rln = __this_cpu_read(rcu_read_lock_nesting);
 
-		WARN_ON_ONCE(rln < 0 && rln > INT_MIN / 2);
+		BUG_ON(rln < 0 && rln > INT_MIN / 2);
 	}
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 
--- 3.4-rc4-next-20120427/kernel/sched/core.c	2012-04-28 09:26:40.000000000 -0700
+++ testing/kernel/sched/core.c	2012-05-01 22:40:46.000000000 -0700
@@ -2024,7 +2024,7 @@  asmlinkage void schedule_tail(struct tas
 {
 	struct rq *rq = this_rq();
 
-	rcu_switch_from(prev);
+	/* rcu_switch_from(prev); */
 	rcu_switch_to();
 	finish_task_switch(rq, prev);
 
@@ -7093,6 +7093,10 @@  void __might_sleep(const char *file, int
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
 	printk(KERN_ERR
+		"cpu=%d preempt_count=%x preempt_offset=%x rcu_nesting=%x nesting_save=%x\n",
+		raw_smp_processor_id(), preempt_count(), preempt_offset,
+		rcu_preempt_depth(), current->rcu_read_lock_nesting_save); 
+	printk(KERN_ERR
 		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
 			in_atomic(), irqs_disabled(),
 			current->pid, current->comm);