
bpf: convert hashtab lock to raw lock

Message ID 1446243386-26582-1-git-send-email-yang.shi@linaro.org
State Accepted, archived
Delegated to: David Miller

Commit Message

Yang Shi Oct. 30, 2015, 10:16 p.m. UTC
When running the bpf samples on an -rt kernel, the following warning is reported:

BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228

CPU: 3 PID: 477 Comm: ping Not tainted 4.1.10-rt8 #4
Hardware name: Freescale Layerscape 2085a RDB Board (DT)
Call trace:
[<ffff80000008a5b0>] dump_backtrace+0x0/0x128
[<ffff80000008a6f8>] show_stack+0x20/0x30
[<ffff8000007da90c>] dump_stack+0x7c/0xa0
[<ffff8000000e4830>] ___might_sleep+0x188/0x1a0
[<ffff8000007e2200>] rt_spin_lock+0x28/0x40
[<ffff80000018bf9c>] htab_map_update_elem+0x124/0x320
[<ffff80000018c718>] bpf_map_update_elem+0x40/0x58
[<ffff800000187658>] __bpf_prog_run+0xd48/0x1640
[<ffff80000017ca6c>] trace_call_bpf+0x8c/0x100
[<ffff80000017db58>] kprobe_perf_func+0x30/0x228
[<ffff80000017dd84>] kprobe_dispatcher+0x34/0x58
[<ffff8000007e399c>] kprobe_handler+0x114/0x250
[<ffff8000007e3bf4>] kprobe_breakpoint_handler+0x1c/0x30
[<ffff800000085b80>] brk_handler+0x88/0x98
[<ffff8000000822f0>] do_debug_exception+0x50/0xb8
Exception stack(0xffff808349687460 to 0xffff808349687580)
7460: 4ca2b600 ffff8083 4a3a7000 ffff8083 49687620 ffff8083 0069c5f8 ffff8000
7480: 00000001 00000000 007e0628 ffff8000 496874b0 ffff8083 007e1de8 ffff8000
74a0: 496874d0 ffff8083 0008e04c ffff8000 00000001 00000000 4ca2b600 ffff8083
74c0: 00ba2e80 ffff8000 49687528 ffff8083 49687510 ffff8083 000e5c70 ffff8000
74e0: 00c22348 ffff8000 00000000 ffff8083 49687510 ffff8083 000e5c74 ffff8000
7500: 4ca2b600 ffff8083 49401800 ffff8083 00000001 00000000 00000000 00000000
7520: 496874d0 ffff8083 00000000 00000000 00000000 00000000 00000000 00000000
7540: 2f2e2d2c 33323130 00000000 00000000 4c944500 ffff8083 00000000 00000000
7560: 00000000 00000000 008751e0 ffff8000 00000001 00000000 124e2d1d 00107b77

Convert the hashtab lock to a raw lock to avoid such a warning.

Signed-off-by: Yang Shi <yang.shi@linaro.org>
---
This patch is applicable to the mainline kernel too.

 kernel/bpf/hashtab.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

Comments

Alexei Starovoitov Oct. 31, 2015, 12:03 a.m. UTC | #1
On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
> When running bpf samples on rt kernel, it reports the below warning:
> 
> BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
> in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
> Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228
...
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 83c209d..972b76b 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -17,7 +17,7 @@
>  struct bpf_htab {
>  	struct bpf_map map;
>  	struct hlist_head *buckets;
> -	spinlock_t lock;
> +	raw_spinlock_t lock;

How do we address such things in general?
I bet there are tons of places around the kernel that
call spin_lock from atomic context.
I'd hate to lose the benefits of lockdep on the non-raw spin_lock
just to make rt happy.

Steven Rostedt Oct. 31, 2015, 1:47 p.m. UTC | #2
On Fri, 30 Oct 2015 17:03:58 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
> > When running bpf samples on rt kernel, it reports the below warning:
> > 
> > BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
> > in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
> > Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228  
> ...
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 83c209d..972b76b 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -17,7 +17,7 @@
> >  struct bpf_htab {
> >  	struct bpf_map map;
> >  	struct hlist_head *buckets;
> > -	spinlock_t lock;
> > +	raw_spinlock_t lock;  
> 
> How do we address such things in general?
> I bet there are tons of places around the kernel that
> call spin_lock from atomic.
> I'd hate to lose the benefits of lockdep of non-raw spin_lock
> just to make rt happy.

You won't lose any benefits of lockdep. Lockdep still checks
raw_spin_lock(). The only difference between raw_spin_lock and
spin_lock is that in -rt spin_lock turns into an rt_mutex() and
raw_spin_lock stays a spin lock.

The error is that in -rt, you called a mutex and not a spin lock while
atomic.
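
In other words, a minimal sketch of the rule (illustrative only, assuming
PREEMPT_RT semantics where spinlock_t maps to an rt_mutex):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(sleeping_lock);    /* becomes an rt_mutex on -rt, may sleep */
static DEFINE_RAW_SPINLOCK(atomic_lock);  /* stays a real spin lock everywhere */

static void called_with_preemption_disabled(void)
{
	unsigned long flags;

	/* triggers the above BUG on -rt: takes an rt_mutex while atomic */
	spin_lock_irqsave(&sleeping_lock, flags);
	spin_unlock_irqrestore(&sleeping_lock, flags);

	/* fine on both -rt and mainline; lockdep still checks it */
	raw_spin_lock_irqsave(&atomic_lock, flags);
	raw_spin_unlock_irqrestore(&atomic_lock, flags);
}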

-- Steve

Daniel Borkmann Oct. 31, 2015, 6:37 p.m. UTC | #3
On 10/31/2015 02:47 PM, Steven Rostedt wrote:
> On Fri, 30 Oct 2015 17:03:58 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>> On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
>>> When running bpf samples on rt kernel, it reports the below warning:
>>>
>>> BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
>>> in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
>>> Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228
>> ...
>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> index 83c209d..972b76b 100644
>>> --- a/kernel/bpf/hashtab.c
>>> +++ b/kernel/bpf/hashtab.c
>>> @@ -17,7 +17,7 @@
>>>   struct bpf_htab {
>>>   	struct bpf_map map;
>>>   	struct hlist_head *buckets;
>>> -	spinlock_t lock;
>>> +	raw_spinlock_t lock;
>>
>> How do we address such things in general?
>> I bet there are tons of places around the kernel that
>> call spin_lock from atomic.
>> I'd hate to lose the benefits of lockdep of non-raw spin_lock
>> just to make rt happy.
>
> You wont lose any benefits of lockdep. Lockdep still checks
> raw_spin_lock(). The only difference between raw_spin_lock and
> spin_lock is that in -rt spin_lock turns into an rt_mutex() and
> raw_spin_lock stays a spin lock.

( Btw, Yang, it would have been nice if your commit description had
   already included such info: not only that you convert it, but also why
   it's okay to do so. )

> The error is that in -rt, you called a mutex and not a spin lock while
> atomic.

You are right, I think this happens due to the preempt_disable() in the
trace_call_bpf() handler. So, I think the patch seems okay. The dep_map
is, btw, union'ed in the struct spinlock case to the same offset as the
dep_map in raw_spinlock.
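
For reference, the spinlock_t layout in include/linux/spinlock_types.h
looks roughly like this (with CONFIG_DEBUG_LOCK_ALLOC enabled):

typedef struct spinlock {
	union {
		struct raw_spinlock rlock;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
		struct {
			u8 __padding[LOCK_PADSIZE];
			/* lands at the same offset as raw_spinlock's dep_map */
			struct lockdep_map dep_map;
		};
#endif
	};
} spinlock_t;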

It's a bit inconvenient, though, when we add other library code as maps
in the future, e.g. things like rhashtable, as they would first need to be
converted to raw_spinlock_t as well, but judging from the git log, it
looks like common practice.

Thanks,
Daniel
Alexei Starovoitov Nov. 1, 2015, 10:56 p.m. UTC | #4
On Sat, Oct 31, 2015 at 09:47:36AM -0400, Steven Rostedt wrote:
> On Fri, 30 Oct 2015 17:03:58 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
> > > When running bpf samples on rt kernel, it reports the below warning:
> > > 
> > > BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
> > > in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
> > > Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228  
> > ...
> > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > index 83c209d..972b76b 100644
> > > --- a/kernel/bpf/hashtab.c
> > > +++ b/kernel/bpf/hashtab.c
> > > @@ -17,7 +17,7 @@
> > >  struct bpf_htab {
> > >  	struct bpf_map map;
> > >  	struct hlist_head *buckets;
> > > -	spinlock_t lock;
> > > +	raw_spinlock_t lock;  
> > 
> > How do we address such things in general?
> > I bet there are tons of places around the kernel that
> > call spin_lock from atomic.
> > I'd hate to lose the benefits of lockdep of non-raw spin_lock
> > just to make rt happy.
> 
> You wont lose any benefits of lockdep. Lockdep still checks
> raw_spin_lock(). The only difference between raw_spin_lock and
> spin_lock is that in -rt spin_lock turns into an rt_mutex() and
> raw_spin_lock stays a spin lock.

I see. The patch makes sense then.
Would be good to document this peculiarity of spin_lock.

Thomas Gleixner Nov. 2, 2015, 8:59 a.m. UTC | #5
On Sun, 1 Nov 2015, Alexei Starovoitov wrote:
> On Sat, Oct 31, 2015 at 09:47:36AM -0400, Steven Rostedt wrote:
> > On Fri, 30 Oct 2015 17:03:58 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > 
> > > On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
> > > > When running bpf samples on rt kernel, it reports the below warning:
> > > > 
> > > > BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
> > > > in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
> > > > Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228  
> > > ...
> > > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > > index 83c209d..972b76b 100644
> > > > --- a/kernel/bpf/hashtab.c
> > > > +++ b/kernel/bpf/hashtab.c
> > > > @@ -17,7 +17,7 @@
> > > >  struct bpf_htab {
> > > >  	struct bpf_map map;
> > > >  	struct hlist_head *buckets;
> > > > -	spinlock_t lock;
> > > > +	raw_spinlock_t lock;  
> > > 
> > > How do we address such things in general?
> > > I bet there are tons of places around the kernel that
> > > call spin_lock from atomic.
> > > I'd hate to lose the benefits of lockdep of non-raw spin_lock
> > > just to make rt happy.
> > 
> > You wont lose any benefits of lockdep. Lockdep still checks
> > raw_spin_lock(). The only difference between raw_spin_lock and
> > spin_lock is that in -rt spin_lock turns into an rt_mutex() and
> > raw_spin_lock stays a spin lock.
> 
> I see. The patch makes sense then.
> Would be good to document this peculiarity of spin_lock.

I'm working on a document.

Thanks,

	tglx
Yang Shi Nov. 2, 2015, 5:09 p.m. UTC | #6
On 11/2/2015 12:59 AM, Thomas Gleixner wrote:
> On Sun, 1 Nov 2015, Alexei Starovoitov wrote:
>> On Sat, Oct 31, 2015 at 09:47:36AM -0400, Steven Rostedt wrote:
>>> On Fri, 30 Oct 2015 17:03:58 -0700
>>> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>
>>>> On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
>>>>> When running bpf samples on rt kernel, it reports the below warning:
>>>>>
>>>>> BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
>>>>> in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
>>>>> Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228
>>>> ...
>>>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>>>> index 83c209d..972b76b 100644
>>>>> --- a/kernel/bpf/hashtab.c
>>>>> +++ b/kernel/bpf/hashtab.c
>>>>> @@ -17,7 +17,7 @@
>>>>>   struct bpf_htab {
>>>>>   	struct bpf_map map;
>>>>>   	struct hlist_head *buckets;
>>>>> -	spinlock_t lock;
>>>>> +	raw_spinlock_t lock;
>>>>
>>>> How do we address such things in general?
>>>> I bet there are tons of places around the kernel that
>>>> call spin_lock from atomic.
>>>> I'd hate to lose the benefits of lockdep of non-raw spin_lock
>>>> just to make rt happy.
>>>
>>> You wont lose any benefits of lockdep. Lockdep still checks
>>> raw_spin_lock(). The only difference between raw_spin_lock and
>>> spin_lock is that in -rt spin_lock turns into an rt_mutex() and
>>> raw_spin_lock stays a spin lock.
>>
>> I see. The patch makes sense then.
>> Would be good to document this peculiarity of spin_lock.
>
> I'm working on a document.

Thanks, Steven and Thomas, for your elaboration and comments.

Yang

>
> Thanks,
>
> 	tglx
>

Yang Shi Nov. 2, 2015, 5:12 p.m. UTC | #7
On 10/31/2015 11:37 AM, Daniel Borkmann wrote:
> On 10/31/2015 02:47 PM, Steven Rostedt wrote:
>> On Fri, 30 Oct 2015 17:03:58 -0700
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> On Fri, Oct 30, 2015 at 03:16:26PM -0700, Yang Shi wrote:
>>>> When running bpf samples on rt kernel, it reports the below warning:
>>>>
>>>> BUG: sleeping function called from invalid context at
>>>> kernel/locking/rtmutex.c:917
>>>> in_atomic(): 1, irqs_disabled(): 128, pid: 477, name: ping
>>>> Preemption disabled at:[<ffff80000017db58>] kprobe_perf_func+0x30/0x228
>>> ...
>>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>>> index 83c209d..972b76b 100644
>>>> --- a/kernel/bpf/hashtab.c
>>>> +++ b/kernel/bpf/hashtab.c
>>>> @@ -17,7 +17,7 @@
>>>>   struct bpf_htab {
>>>>       struct bpf_map map;
>>>>       struct hlist_head *buckets;
>>>> -    spinlock_t lock;
>>>> +    raw_spinlock_t lock;
>>>
>>> How do we address such things in general?
>>> I bet there are tons of places around the kernel that
>>> call spin_lock from atomic.
>>> I'd hate to lose the benefits of lockdep of non-raw spin_lock
>>> just to make rt happy.
>>
>> You wont lose any benefits of lockdep. Lockdep still checks
>> raw_spin_lock(). The only difference between raw_spin_lock and
>> spin_lock is that in -rt spin_lock turns into an rt_mutex() and
>> raw_spin_lock stays a spin lock.
>
> ( Btw, Yang, would have been nice if your commit description would have
>    already included such info, not only that you convert it, but also why
>    it's okay to do so. )

I think Thomas's document will include all the information about rt spin 
lock/raw spin lock, etc.

Alexei & Daniel,

If you think such info is necessary, I definitely could add it to the 
commit log in v2.

>
>> The error is that in -rt, you called a mutex and not a spin lock while
>> atomic.
>
> You are right, I think this happens due to the preempt_disable() in the
> trace_call_bpf() handler. So, I think the patch seems okay. The dep_map
> is btw union'ed in the struct spinlock case to the same offset of the
> dep_map from raw_spinlock.
>
> It's a bit inconvenient, though, when we add other library code as maps
> in future, f.e. things like rhashtable as they would first need to be
> converted to raw_spinlock_t as well, but judging from the git log, it
> looks like common practice.

Yes, it is common practice to convert a sleepable spin lock to a raw 
spin lock in -rt to avoid a scheduling-while-atomic bug.

Thanks,
Yang

>
> Thanks,
> Daniel

Steven Rostedt Nov. 2, 2015, 5:24 p.m. UTC | #8
On Mon, 02 Nov 2015 09:12:29 -0800
"Shi, Yang" <yang.shi@linaro.org> wrote:

> Yes, it is common practice for converting sleepable spin lock to raw 
> spin lock in -rt to avoid scheduling in atomic context bug.

Note, in a lot of cases we don't just convert spin_locks to raw because
of atomic context. There are times we need to change the design so that
the lock is not taken in atomic context (switching preempt_disable() to
a local_lock(), for example).
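
A rough sketch of that alternative pattern (hypothetical names; this uses
the local_lock API that mainline later grew, the -rt patchset's own local
locks are spelled a bit differently):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct trace_ctx {
	local_lock_t lock;
	/* per-cpu state that used to rely on preempt_disable() */
};

static DEFINE_PER_CPU(struct trace_ctx, trace_ctx) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void do_work(void)
{
	/* preempt_disable() on mainline, a per-cpu sleeping lock on -rt */
	local_lock(&trace_ctx.lock);
	/* ... touch this_cpu state here ... */
	local_unlock(&trace_ctx.lock);
}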

But bpf is much like ftrace and kprobes, where they can be taken almost
anywhere, and they do indeed need to be raw.

-- Steve

Daniel Borkmann Nov. 2, 2015, 5:28 p.m. UTC | #9
On 11/02/2015 06:12 PM, Shi, Yang wrote:
...
> If you think such info is necessary, I definitely could add it into the commit log in v2.

As this is going to be documented anyway (thanks! ;)), and the discussion
on this patch can be found in the archives for those wondering, I'm good:

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Thanks for the fix, Yang!

I presume this should go to net-next then ...
Yang Shi Nov. 2, 2015, 5:31 p.m. UTC | #10
On 11/2/2015 9:24 AM, Steven Rostedt wrote:
> On Mon, 02 Nov 2015 09:12:29 -0800
> "Shi, Yang" <yang.shi@linaro.org> wrote:
>
>> Yes, it is common practice for converting sleepable spin lock to raw
>> spin lock in -rt to avoid scheduling in atomic context bug.
>
> Note, in a lot of cases we don't just convert spin_locks to raw because
> of atomic context. There's times we need to change the design where the
> lock is not taken in atomic context (switching preempt_disable() to a
> local_lock() for example).

Yes, definitely. Understood.

Thanks,
Yang

>
> But bpf is much like ftrace and kprobes where they can be taken almost
> anywhere, and the do indeed need to be raw.
>
> -- Steve
>

David Miller Nov. 2, 2015, 8:47 p.m. UTC | #11
From: Yang Shi <yang.shi@linaro.org>
Date: Fri, 30 Oct 2015 15:16:26 -0700

> When running bpf samples on rt kernel, it reports the below warning:
 ...
> Convert hashtab lock to raw lock to avoid such warning.
> 
> Signed-off-by: Yang Shi <yang.shi@linaro.org>

Applied to net-next, thanks.

Patch

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d..972b76b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@ 
 struct bpf_htab {
 	struct bpf_map map;
 	struct hlist_head *buckets;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	u32 count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
@@ -82,7 +82,7 @@  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	for (i = 0; i < htab->n_buckets; i++)
 		INIT_HLIST_HEAD(&htab->buckets[i]);
 
-	spin_lock_init(&htab->lock);
+	raw_spin_lock_init(&htab->lock);
 	htab->count = 0;
 
 	htab->elem_size = sizeof(struct htab_elem) +
@@ -230,7 +230,7 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	l_new->hash = htab_map_hash(l_new->key, key_size);
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	spin_lock_irqsave(&htab->lock, flags);
+	raw_spin_lock_irqsave(&htab->lock, flags);
 
 	head = select_bucket(htab, l_new->hash);
 
@@ -266,11 +266,11 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	} else {
 		htab->count++;
 	}
-	spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&htab->lock, flags);
 
 	return 0;
 err:
-	spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&htab->lock, flags);
 	kfree(l_new);
 	return ret;
 }
@@ -291,7 +291,7 @@  static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	hash = htab_map_hash(key, key_size);
 
-	spin_lock_irqsave(&htab->lock, flags);
+	raw_spin_lock_irqsave(&htab->lock, flags);
 
 	head = select_bucket(htab, hash);
 
@@ -304,7 +304,7 @@  static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&htab->lock, flags);
 	return ret;
 }