diff mbox

Crash due to mutex genl_lock called from RCU context

Message ID 1480136078.8455.589.camel@edumazet-glaptop3.roam.corp.google.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Nov. 26, 2016, 4:54 a.m. UTC
On Fri, 2016-11-25 at 20:11 -0800, Eric Dumazet wrote:
> On Fri, 2016-11-25 at 19:15 -0700, subashab@codeaurora.org wrote:
> > We are seeing a crash due to gen_lock mutex being acquired in RCU 
> > context.
> > Crash is seen on a 4.4 based kernel ARM64 device. This occurred in a
> > regression rack, so unfortunately I don't have steps for a reproducer.
> > 
> > It looks like freeing socket in RCU was brought in through commit
> > 21e4902aea80ef35afc00ee8d2abdea4f519b7f7 ("netlink: Lockless lookup with
> > RCU grace period in socket release").
> > I am not very familiar with generic netlink sockets so I am not sure
> > if there is any other way to fix this apart from reverting this patch.
> > 
> > Any pointers to debug this would be appreciated.
> > 
> > Here is the call stack -
> > 
> > BUG: sleeping function called from invalid context 
> > kernel/locking/mutex.c:98
> > in_atomic(): 1, irqs_disabled(): 0, pid: 16400, name: busybox
> > [<ffffff80080cad20>] ___might_sleep+0x134/0x144
> > [<ffffff80080cadac>] __might_sleep+0x7c/0x8c
> > [<ffffff8008ef09a8>] mutex_lock+0x2c/0x4c
> > [<ffffff8008d307f0>] genl_lock+0x1c/0x24
> > [<ffffff8008d30848>] genl_lock_done+0x2c/0x50
> > [<ffffff8008d2ccac>] netlink_sock_destruct+0x30/0x94
> > [<ffffff8008cdef44>] sk_destruct+0x2c/0x150
> > [<ffffff8008cdf104>] __sk_free+0x9c/0xc4
> > [<ffffff8008cdf16c>] sk_free+0x40/0x4c
> > [<ffffff8008d2c7fc>] deferred_put_nlk_sk+0x40/0x4c
> > [<ffffff800810b104>] rcu_process_callbacks+0x4d4/0x644
> > [<ffffff80080a6598>] __do_softirq+0x1b8/0x3c4
> > [<ffffff80080a6a60>] irq_exit+0x80/0xd4
> > [<ffffff800808e554>] handle_IPI+0x1c0/0x364
> > [<ffffff80080817f8>] gic_handle_irq+0x154/0x1a4
> 
> Right, Thomas commit looks buggy.
> 
> Unfortunately the proper infra was added later in commit 
> a4298e4522d687a79af8 ("net: add SOCK_RCU_FREE socket flag")
> 
> I guess we should backport it, then apply following (untested) fix
> 
> Could you test this solution ? Thanks !
> 
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 62bea4591054..fe0d43314198 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -456,8 +456,9 @@ static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
>  
>  	rcu_read_lock();
>  	sk = __netlink_lookup(table, portid, net);
> -	if (sk)
> -		sock_hold(sk);
> +	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
> +		sk = NULL;
> +
>  	rcu_read_unlock();
>  
>  	return sk;
> @@ -581,6 +582,7 @@ static int __netlink_create(struct net *net, struct socket *sock,
>  	}
>  	init_waitqueue_head(&nlk->wait);
>  
> +	sock_set_flag(sk, SOCK_RCU_FREE);
>  	sk->sk_destruct = netlink_sock_destruct;
>  	sk->sk_protocol = protocol;

Oh well, this wont work, since sk->sk_destruct will be called from RCU
callback.

Grabbing the mutex should not be done from netlink_sock_destruct() but
from netlink_release()

Maybe this patch would be better :

Comments

Subash Abhinov Kasiviswanathan Nov. 26, 2016, 5:59 a.m. UTC | #1
> Oh well, this wont work, since sk->sk_destruct will be called from RCU
> callback.
> 
> Grabbing the mutex should not be done from netlink_sock_destruct() but
> from netlink_release()
> 
> Maybe this patch would be better :
> 
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 62bea4591054..cce10e3c9b68 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -324,16 +324,6 @@ static void netlink_skb_set_owner_r(struct
> sk_buff *skb, struct sock *sk)
> 
>  static void netlink_sock_destruct(struct sock *sk)
>  {
> -	struct netlink_sock *nlk = nlk_sk(sk);
> -
> -	if (nlk->cb_running) {
> -		if (nlk->cb.done)
> -			nlk->cb.done(&nlk->cb);
> -
> -		module_put(nlk->cb.module);
> -		kfree_skb(nlk->cb.skb);
> -	}
> -
>  	skb_queue_purge(&sk->sk_receive_queue);
> 
>  	if (!sock_flag(sk, SOCK_DEAD)) {
> @@ -456,8 +446,9 @@ static struct sock *netlink_lookup(struct net
> *net, int protocol, u32 portid)
> 
>  	rcu_read_lock();
>  	sk = __netlink_lookup(table, portid, net);
> -	if (sk)
> -		sock_hold(sk);
> +	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
> +		sk = NULL;
> +
>  	rcu_read_unlock();
> 
>  	return sk;
> @@ -581,6 +572,7 @@ static int __netlink_create(struct net *net,
> struct socket *sock,
>  	}
>  	init_waitqueue_head(&nlk->wait);
> 
> +	sock_set_flag(sk, SOCK_RCU_FREE);
>  	sk->sk_destruct = netlink_sock_destruct;
>  	sk->sk_protocol = protocol;
>  	return 0;
> @@ -645,13 +637,6 @@ static int netlink_create(struct net *net, struct
> socket *sock, int protocol,
>  	goto out;
>  }
> 
> -static void deferred_put_nlk_sk(struct rcu_head *head)
> -{
> -	struct netlink_sock *nlk = container_of(head, struct netlink_sock, 
> rcu);
> -
> -	sock_put(&nlk->sk);
> -}
> -
>  static int netlink_release(struct socket *sock)
>  {
>  	struct sock *sk = sock->sk;
> @@ -724,7 +709,19 @@ static int netlink_release(struct socket *sock)
>  	local_bh_disable();
>  	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
>  	local_bh_enable();
> -	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
> +	if (nlk->cb_running) {
> +		mutex_lock(nlk->cb_mutex);
> +		if (nlk->cb_running) {
> +			if (nlk->cb.done)
> +				nlk->cb.done(&nlk->cb);
> +
> +			module_put(nlk->cb.module);
> +			kfree_skb(nlk->cb.skb);
> +			nlk->cb_running = false;
> +		}
> +		mutex_unlock(nlk->cb_mutex);
> +	}
> +	sock_put(sk);
>  	return 0;
>  }
> 
> diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
> index 3cfd6cc60504..5dc08a7b0a2b 100644
> --- a/net/netlink/af_netlink.h
> +++ b/net/netlink/af_netlink.h
> @@ -32,7 +32,6 @@ struct netlink_sock {
>  	struct module		*module;
> 
>  	struct rhash_head	node;
> -	struct rcu_head		rcu;
>  };
> 
>  static inline struct netlink_sock *nlk_sk(struct sock *sk)

Thanks Eric! I'll try this and get back with results over this weekend.

--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project
Cong Wang Nov. 27, 2016, 2:08 a.m. UTC | #2
On Fri, Nov 25, 2016 at 8:54 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Oh well, this wont work, since sk->sk_destruct will be called from RCU
> callback.
>
> Grabbing the mutex should not be done from netlink_sock_destruct() but
> from netlink_release()

But you also change the behavior of cb.done(), currently it is called when the
last sock ref is gone, with your patch it is called when the first
sock is closed.
No?

I don't see why we need to get genl_lock in ->done() here, because we are
already the last sock using it and module ref protects the ops from being
removed via module, seems we are pretty safe without any lock.
Eric Dumazet Nov. 27, 2016, 2:26 a.m. UTC | #3
On Sat, 2016-11-26 at 18:08 -0800, Cong Wang wrote:
> On Fri, Nov 25, 2016 at 8:54 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Oh well, this wont work, since sk->sk_destruct will be called from RCU
> > callback.
> >
> > Grabbing the mutex should not be done from netlink_sock_destruct() but
> > from netlink_release()
> 
> But you also change the behavior of cb.done(), currently it is called when the
> last sock ref is gone, with your patch it is called when the first
> sock is closed.

No. It will be called when last refcount on the socket is released,
sk_refcnt transitions to final 0.


My patch changes the sock_hold() to the variant that makes sure
sk_refcnt is not 0 before increase, otherwise a race can happen and
release could be called twice.

Classic refcounting stuff coupled to rcu rules.

> No?


Are you telling me inet_release() is called when we close() the first
file descriptor ?

fd1 = socket()
fd2 = dup(fd1);
close(fd2) -> release() ???


> 
> I don't see why we need to get genl_lock in ->done() here, because we are
> already the last sock using it and module ref protects the ops from being
> removed via module, seems we are pretty safe without any lock.

Well, at least this exposes a real bug in Thomas patch.

Removing the lock might be done for net-next, not stable branches.
Cong Wang Nov. 27, 2016, 6:28 a.m. UTC | #4
On Sat, Nov 26, 2016 at 6:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Are you telling me inet_release() is called when we close() the first
> file descriptor ?
>
> fd1 = socket()
> fd2 = dup(fd1);
> close(fd2) -> release() ???

Sorry, I didn't express myself clearly, I meant your change,
if exclude the SOCK_RCU_FREE part, basically reverts this commit:

commit 3f660d66dfbc13ea4b61d3865851b348444c24b4
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Thu May 3 03:17:14 2007 -0700

    [NETLINK]: Kill CB only when socket is unused

IOW, ->release() is called when the last sock fd ref is gone, but ->destructor()
is called with the last sock ref is gone. They are very different.


>> I don't see why we need to get genl_lock in ->done() here, because we are
>> already the last sock using it and module ref protects the ops from being
>> removed via module, seems we are pretty safe without any lock.
>
> Well, at least this exposes a real bug in Thomas patch.
>
> Removing the lock might be done for net-next, not stable branches.

I am confused, what Subash reported is a kernel warning which can
surely be fixed by removing genl lock (if it is correct, I need to double
check), so why for net-next?
Eric Dumazet Nov. 27, 2016, 4:23 p.m. UTC | #5
On Sat, 2016-11-26 at 22:28 -0800, Cong Wang wrote:
> On Sat, Nov 26, 2016 at 6:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Are you telling me inet_release() is called when we close() the first
> > file descriptor ?
> >
> > fd1 = socket()
> > fd2 = dup(fd1);
> > close(fd2) -> release() ???
> 
> Sorry, I didn't express myself clearly, I meant your change,
> if exclude the SOCK_RCU_FREE part, basically reverts this commit:
> 
> commit 3f660d66dfbc13ea4b61d3865851b348444c24b4
> Author: Herbert Xu <herbert@gondor.apana.org.au>
> Date:   Thu May 3 03:17:14 2007 -0700
> 
>     [NETLINK]: Kill CB only when socket is unused
> 
> IOW, ->release() is called when the last sock fd ref is gone, but ->destructor()
> is called with the last sock ref is gone. They are very different.

Hmm...


> I am confused, what Subash reported is a kernel warning which can
> surely be fixed by removing genl lock (if it is correct, I need to double
> check), so why for net-next?

Because Subash pointed to a buggy commit.

We want to fix all issues bring by this commit, not only the immediate
problem about mutex.

I have no idea if we can safely remove the mutex from genl_lock_done() :

The genl_lock() is not only protecting the socket itself, it might
protect global data as well, or protect some kind of lock ordering among
multiple mutexes.

Have you checked all genl users, down to linux-4.0 , point where commit
21e4902aea80ef35a was added ?

Herbert, Thomas, your help would be appreciated, thanks.
Cong Wang Nov. 28, 2016, 6:53 a.m. UTC | #6
On Sun, Nov 27, 2016 at 8:23 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Sat, 2016-11-26 at 22:28 -0800, Cong Wang wrote:
>> On Sat, Nov 26, 2016 at 6:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> >
>> > Are you telling me inet_release() is called when we close() the first
>> > file descriptor ?
>> >
>> > fd1 = socket()
>> > fd2 = dup(fd1);
>> > close(fd2) -> release() ???
>>
>> Sorry, I didn't express myself clearly, I meant your change,
>> if exclude the SOCK_RCU_FREE part, basically reverts this commit:
>>
>> commit 3f660d66dfbc13ea4b61d3865851b348444c24b4
>> Author: Herbert Xu <herbert@gondor.apana.org.au>
>> Date:   Thu May 3 03:17:14 2007 -0700
>>
>>     [NETLINK]: Kill CB only when socket is unused
>>
>> IOW, ->release() is called when the last sock fd ref is gone, but ->destructor()
>> is called with the last sock ref is gone. They are very different.
>
> Hmm...
>
>
>> I am confused, what Subash reported is a kernel warning which can
>> surely be fixed by removing genl lock (if it is correct, I need to double
>> check), so why for net-next?
>
> Because Subash pointed to a buggy commit.
>
> We want to fix all issues bring by this commit, not only the immediate
> problem about mutex.
>
> I have no idea if we can safely remove the mutex from genl_lock_done() :

I meant removing it only for the destructor case, we definitely can't remove
it for the dump case.

>
> The genl_lock() is not only protecting the socket itself, it might
> protect global data as well, or protect some kind of lock ordering among
> multiple mutexes.
>
> Have you checked all genl users, down to linux-4.0 , point where commit
> 21e4902aea80ef35a was added ?
>

I just took a deeper look, some user calls rhashtable_destroy() in ->done(),
so even removing that genl lock is not enough, perhaps we should just
move it to a work struct like what Daniel does for the tcf_proto, but that is
ugly... I don't know if RCU provides any API to execute the callback in process
context.
diff mbox

Patch

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 62bea4591054..cce10e3c9b68 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -324,16 +324,6 @@  static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 
 static void netlink_sock_destruct(struct sock *sk)
 {
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (nlk->cb_running) {
-		if (nlk->cb.done)
-			nlk->cb.done(&nlk->cb);
-
-		module_put(nlk->cb.module);
-		kfree_skb(nlk->cb.skb);
-	}
-
 	skb_queue_purge(&sk->sk_receive_queue);
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
@@ -456,8 +446,9 @@  static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
 
 	rcu_read_lock();
 	sk = __netlink_lookup(table, portid, net);
-	if (sk)
-		sock_hold(sk);
+	if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+		sk = NULL;
+
 	rcu_read_unlock();
 
 	return sk;
@@ -581,6 +572,7 @@  static int __netlink_create(struct net *net, struct socket *sock,
 	}
 	init_waitqueue_head(&nlk->wait);
 
+	sock_set_flag(sk, SOCK_RCU_FREE);
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
 	return 0;
@@ -645,13 +637,6 @@  static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	goto out;
 }
 
-static void deferred_put_nlk_sk(struct rcu_head *head)
-{
-	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
-
-	sock_put(&nlk->sk);
-}
-
 static int netlink_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -724,7 +709,19 @@  static int netlink_release(struct socket *sock)
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
-	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
+	if (nlk->cb_running) {
+		mutex_lock(nlk->cb_mutex);
+		if (nlk->cb_running) {
+			if (nlk->cb.done)
+				nlk->cb.done(&nlk->cb);
+	
+			module_put(nlk->cb.module);
+			kfree_skb(nlk->cb.skb);
+			nlk->cb_running = false;
+		}
+		mutex_unlock(nlk->cb_mutex);
+	}
+	sock_put(sk);
 	return 0;
 }
 
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 3cfd6cc60504..5dc08a7b0a2b 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -32,7 +32,6 @@  struct netlink_sock {
 	struct module		*module;
 
 	struct rhash_head	node;
-	struct rcu_head		rcu;
 };
 
 static inline struct netlink_sock *nlk_sk(struct sock *sk)