diff mbox

[RFC,net-next,00/11] netns: don't switch namespace while creating kernel sockets

Message ID 20150508140733.GA13325@gondor.apana.org.au
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Herbert Xu May 8, 2015, 2:07 p.m. UTC
On Thu, May 07, 2015 at 11:14:13AM -0500, Eric W. Biederman wrote:
>
> The following change shows how it is possible to always know that your
> network namespace has a non-zero reference count in the network
> namespace initialization methods.  My implementation of
> lock_network_namespaces is problematic in that it does not sleep
> while network namespaces are unregistering.  But it is enough to show
> how the locking and reference counting can be fixed.

If name spaces are created/deleted constantly this could take
a long time.  How about forcibly cleaning up entries when we
register a new op?

---8<---
Subject: netns: Do not init dead nets when registering new op

Currently when a new op is registered we will initialise all
current namespaces through that op, even those that are zombies
(zero ref count).  This is bad because it puts the onus on each
op implementor to deal with this and they are bound to slip up.

This patch fixes this by only initialising live namespaces.

This however brings about a new problem as when those zombies
are eventually cleaned up we will call op->exit even on them
even though they have not been initialised.

This is fixed by bringing forward parts of the work of the cleanup
thread and performing them during registration.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Comments

Eric W. Biederman May 8, 2015, 5:36 p.m. UTC | #1
Herbert Xu <herbert@gondor.apana.org.au> writes:

> On Thu, May 07, 2015 at 11:14:13AM -0500, Eric W. Biederman wrote:
>>
>> The following change shows how it is possible to always know that your
>> network namespace has a non-zero reference count in the network
>> namespace initialization methods.  My implementation of
>> lock_network_namespaces is problematic in that it does not sleep
>> while network namespaces are unregistering.  But it is enough to show
>> how the locking and reference counting can be fixed.
>
> If name spaces are created/deleted constantly this could take
> a long time.  How about forcibly cleaning up entries when we
> register a new op?

That certainly seems more sensible than waiting for an indefinite amount
of time, and the code looks reasonably sound.  Placing on things on
lists and then not removing them, and then later stomping the list
pointers gives me flashbacks of other bugs I have fought, but we already
do that in this code, and in this context it appears harmless.

So overall I think I like it.

At the same time while a network namespace remains discoverable we might
be using it in small ways.  So I am wondering if this is the right
approach.

It really is invalid for a network namespace init routine to grab the
reference count of it's network namespace (thus making the network
namespace unfreeable).  So I am wondering if perhaps all we need to do
is find a clean refactoring of the socket code so this case does not
come up at all.

Perhaps just a flag that says this is a kernel socket so don't get/put
the refcount on the network namespace.  

> ---8<---
> Subject: netns: Do not init dead nets when registering new op
>
> Currently when a new op is registered we will initialise all
> current namespaces through that op, even those that are zombies
> (zero ref count).  This is bad because it puts the onus on each
> op implementor to deal with this and they are bound to slip up.
>
> This patch fixes this by only initialising live namespaces.
>
> This however brings about a new problem as when those zombies
> are eventually cleaned up we will call op->exit even on them
> even though they have not been initialised.
>
> This is fixed by bringing forward parts of the work of the cleanup
> thread and performing them during registration.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index f733656..4931100 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -133,6 +133,7 @@ struct net {
>  #endif
>  	struct sock		*diag_nlsk;
>  	atomic_t		fnhe_genid;
> +	bool			dead;
>  };
>  
>  #include <linux/seq_file_net.h>
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 78fc04a..1f888a7 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -339,41 +339,38 @@ struct net *copy_net_ns(unsigned long flags,
>  	return net;
>  }
>  
> -static DEFINE_SPINLOCK(cleanup_list_lock);
> -static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
> -
> -static void cleanup_net(struct work_struct *work)
> +static void unlink_net(struct net *net, struct list_head *net_exit_list)
>  {
> -	const struct pernet_operations *ops;
> -	struct net *net, *tmp;
> -	struct list_head net_kill_list;
> -	LIST_HEAD(net_exit_list);
> +	struct net *tmp;
>  
> -	/* Atomically snapshot the list of namespaces to cleanup */
> -	spin_lock_irq(&cleanup_list_lock);
> -	list_replace_init(&cleanup_list, &net_kill_list);
> -	spin_unlock_irq(&cleanup_list_lock);
> +	if (net->dead)
> +		return;
>  
> -	mutex_lock(&net_mutex);
> +	net->dead = true;
> +
> +	list_add_tail(&net->exit_list, net_exit_list);
>  
>  	/* Don't let anyone else find us. */
>  	rtnl_lock();
> -	list_for_each_entry(net, &net_kill_list, cleanup_list) {
> -		list_del_rcu(&net->list);
> -		list_add_tail(&net->exit_list, &net_exit_list);
> -		for_each_net(tmp) {
> -			int id = __peernet2id(tmp, net, false);
> +	list_del_rcu(&net->list);
>  
> -			if (id >= 0) {
> -				rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
> -				idr_remove(&tmp->netns_ids, id);
> -			}
> -		}
> -		idr_destroy(&net->netns_ids);
> +	for_each_net(tmp) {
> +		int id = __peernet2id(tmp, net, false);
>  
> +		if (id >= 0) {
> +			rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
> +			idr_remove(&tmp->netns_ids, id);
> +		}
>  	}
>  	rtnl_unlock();
>  
> +	idr_destroy(&net->netns_ids);
> +}
> +
> +static void exit_all_ops(struct list_head *net_exit_list)
> +{
> +	const struct pernet_operations *ops;
> +
>  	/*
>  	 * Another CPU might be rcu-iterating the list, wait for it.
>  	 * This needs to be before calling the exit() notifiers, so
> @@ -383,11 +380,33 @@ static void cleanup_net(struct work_struct *work)
>  
>  	/* Run all of the network namespace exit methods */
>  	list_for_each_entry_reverse(ops, &pernet_list, list)
> -		ops_exit_list(ops, &net_exit_list);
> +		ops_exit_list(ops, net_exit_list);
>  
>  	/* Free the net generic variables */
>  	list_for_each_entry_reverse(ops, &pernet_list, list)
> -		ops_free_list(ops, &net_exit_list);
> +		ops_free_list(ops, net_exit_list);
> +}
> +
> +static DEFINE_SPINLOCK(cleanup_list_lock);
> +static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
> +
> +static void cleanup_net(struct work_struct *work)
> +{
> +	struct list_head net_kill_list;
> +	struct net *net, *tmp;
> +	LIST_HEAD(net_exit_list);
> +
> +	/* Atomically snapshot the list of namespaces to cleanup */
> +	spin_lock_irq(&cleanup_list_lock);
> +	list_replace_init(&cleanup_list, &net_kill_list);
> +	spin_unlock_irq(&cleanup_list_lock);
> +
> +	mutex_lock(&net_mutex);
> +
> +	list_for_each_entry(net, &net_kill_list, cleanup_list)
> +		unlink_net(net, &net_exit_list);
> +
> +	exit_all_ops(&net_exit_list);
>  
>  	mutex_unlock(&net_mutex);
>  
> @@ -397,8 +416,7 @@ static void cleanup_net(struct work_struct *work)
>  	rcu_barrier();
>  
>  	/* Finally it is safe to free my network namespace structure */
> -	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
> -		list_del_init(&net->exit_list);
> +	list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) {
>  		put_user_ns(net->user_ns);
>  		net_drop_ns(net);
>  	}
> @@ -727,30 +745,59 @@ static int __init net_ns_init(void)
>  pure_initcall(net_ns_init);
>  
>  #ifdef CONFIG_NET_NS
> +static void grab_all_nets(struct list_head *all_net_list)
> +{
> +	struct net *net;
> +	LIST_HEAD(net_exit_list);
> +
> +	for_each_net(net) {
> +		if (maybe_get_net(net)) 
> +			list_add_tail(&net->cleanup_list, all_net_list);
> +		else
> +			unlink_net(net, &net_exit_list);
> +	}
> +
> +	exit_all_ops(&net_exit_list);
> +}
> +
> +static void drop_all_nets(struct list_head *all_net_list)
> +{
> +	struct net *net, *next;
> +
> +	list_for_each_entry_safe(net, next, all_net_list, cleanup_list)
> +		put_net(net);
> +}
> +
>  static int __register_pernet_operations(struct list_head *list,
>  					struct pernet_operations *ops)
>  {
>  	struct net *net;
> -	int error;
> +	int error = 0;
> +	LIST_HEAD(all_net_list);
>  	LIST_HEAD(net_exit_list);
>  
> +	grab_all_nets(&all_net_list);
> +
>  	list_add_tail(&ops->list, list);
>  	if (ops->init || (ops->id && ops->size)) {
> -		for_each_net(net) {
> +		list_for_each_entry(net, &all_net_list, cleanup_list) {
>  			error = ops_init(ops, net);
>  			if (error)
>  				goto out_undo;
>  			list_add_tail(&net->exit_list, &net_exit_list);
>  		}
>  	}
> -	return 0;
> +
> +out_drop_nets:
> +	drop_all_nets(&all_net_list);
> +	return error;
>  
>  out_undo:
>  	/* If I have an error cleanup all namespaces I initialized */
>  	list_del(&ops->list);
>  	ops_exit_list(ops, &net_exit_list);
>  	ops_free_list(ops, &net_exit_list);
> -	return error;
> +	goto out_drop_nets;
>  }
>  
>  static void __unregister_pernet_operations(struct pernet_operations *ops)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cong Wang May 8, 2015, 8:27 p.m. UTC | #2
On Fri, May 8, 2015 at 10:36 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> It really is invalid for a network namespace init routine to grab the
> reference count of it's network namespace (thus making the network
> namespace unfreeable).  So I am wondering if perhaps all we need to do
> is find a clean refactoring of the socket code so this case does not
> come up at all.


Good point!

I _guess_ the reason is these kernel sockets have to exist longer than
netns' life-time, it could be due to on-flying skb's?

On the other hand, we do create some fb_tunnel netdevice in netns init
too, but we don't take a refcnt there, probably because we wait
for netdevice refcnt goes to zero when unregistering.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cong Wang May 8, 2015, 9:13 p.m. UTC | #3
On Fri, May 8, 2015 at 1:27 PM, Cong Wang <cwang@twopensource.com> wrote:
> On Fri, May 8, 2015 at 10:36 AM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>>
>> It really is invalid for a network namespace init routine to grab the
>> reference count of it's network namespace (thus making the network
>> namespace unfreeable).  So I am wondering if perhaps all we need to do
>> is find a clean refactoring of the socket code so this case does not
>> come up at all.
>
>
> Good point!
>
> I _guess_ the reason is these kernel sockets have to exist longer than
> netns' life-time, it could be due to on-flying skb's?
>
> On the other hand, we do create some fb_tunnel netdevice in netns init
> too, but we don't take a refcnt there, probably because we wait
> for netdevice refcnt goes to zero when unregistering.

Answer myself, it looks like we don't need to hold the refcnt at all,
since we create the socket at init and release it at uninit, so they
should have the same life-time?

The reason why user-space sockets need this refcnt is they
could have longer life-time than the netns?

This seems the right direction to solve this problem for me,
I am going to try this way to see how far I can go. ;)

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 8, 2015, 10:08 p.m. UTC | #4
Cong Wang <cwang@twopensource.com> writes:

> On Fri, May 8, 2015 at 1:27 PM, Cong Wang <cwang@twopensource.com> wrote:
>> On Fri, May 8, 2015 at 10:36 AM, Eric W. Biederman
>> <ebiederm@xmission.com> wrote:
>>>
>>> It really is invalid for a network namespace init routine to grab the
>>> reference count of it's network namespace (thus making the network
>>> namespace unfreeable).  So I am wondering if perhaps all we need to do
>>> is find a clean refactoring of the socket code so this case does not
>>> come up at all.
>>
>>
>> Good point!
>>
>> I _guess_ the reason is these kernel sockets have to exist longer than
>> netns' life-time, it could be due to on-flying skb's?
>>
>> On the other hand, we do create some fb_tunnel netdevice in netns init
>> too, but we don't take a refcnt there, probably because we wait
>> for netdevice refcnt goes to zero when unregistering.
>
> Answer myself, it looks like we don't need to hold the refcnt at all,
> since we create the socket at init and release it at uninit, so they
> should have the same life-time?
>
> The reason why user-space sockets need this refcnt is they
> could have longer life-time than the netns?

User space sockets by design keep the network namespace alive.

> This seems the right direction to solve this problem for me,
> I am going to try this way to see how far I can go. ;)

I will be interested to see what you come up with.

Eric

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu May 9, 2015, 1:13 a.m. UTC | #5
On Fri, May 08, 2015 at 12:36:56PM -0500, Eric W. Biederman wrote:
>
> At the same time while a network namespace remains discoverable we might
> be using it in small ways.  So I am wondering if this is the right
> approach.
> 
> It really is invalid for a network namespace init routine to grab the
> reference count of it's network namespace (thus making the network
> namespace unfreeable).  So I am wondering if perhaps all we need to do
> is find a clean refactoring of the socket code so this case does not
> come up at all.
> 
> Perhaps just a flag that says this is a kernel socket so don't get/put
> the refcount on the network namespace.  

No this line of thinking is what led us into the hole in the
first place.

Really it's perfectly valid for the init function to want to
take a reference on net.  For all we know it might be a temporary
reference taken by a third party.  It doesn't even have to be a
socket.

We must hide this subtlety from ops implementors since they have
no knowledge of our implementation.  Expecting them to deal with
this is going to result in bugs, and we have already had multiple
bugs in this area.

Cheers,
Eric W. Biederman May 9, 2015, 1:53 a.m. UTC | #6
Herbert Xu <herbert@gondor.apana.org.au> writes:

> On Fri, May 08, 2015 at 12:36:56PM -0500, Eric W. Biederman wrote:
>>
>> At the same time while a network namespace remains discoverable we might
>> be using it in small ways.  So I am wondering if this is the right
>> approach.
>> 
>> It really is invalid for a network namespace init routine to grab the
>> reference count of it's network namespace (thus making the network
>> namespace unfreeable).  So I am wondering if perhaps all we need to do
>> is find a clean refactoring of the socket code so this case does not
>> come up at all.
>> 
>> Perhaps just a flag that says this is a kernel socket so don't get/put
>> the refcount on the network namespace.  
>
> No this line of thinking is what led us into the hole in the
> first place.
>
> Really it's perfectly valid for the init function to want to
> take a reference on net.  For all we know it might be a temporary
> reference taken by a third party.  It doesn't even have to be a
> socket.
>
> We must hide this subtlety from ops implementors since they have
> no knowledge of our implementation.  Expecting them to deal with
> this is going to result in bugs, and we have already had multiple
> bugs in this area.

I won't fundamentally argue.  However the worst hack and subtlety come
from the way we deal with kernel sockets and that is much easier to
clean up so we might as well get that first.

Eric

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 9, 2015, 2:05 a.m. UTC | #7
Right now the situtation for allocating kernel sockets is a mess.
- sock_create_kern does not take a namespace parameter.
- kernel sockets must not reference count a network namespace and keep
  it alive or else we will have a reference counting loop.
- The way we avoid the reference counting loop with sk_change_net
  and sk_release_kernel are major hacks.

This patchset addresses this mess by fixing sock_create_kern to do
everything necessary to create a kernel socket.  None of the current
users of kernel sockets need the network namespace reference counted.
Either kernel sockets are network namespace aware (and using the current
hacks) or kernel sockets are limited to the initial network namespace
in which case it does not matter.

This patchset starts by addressing tun which should be using normal
userspace sockets like macvtap.

Then sock_create_kern is fixed to take a network namespace.
Then the in kernel status of sockets are passed through to sk_alloc.
Then sk_alloc is fixed to not reference count the network namespace
     of kernel sockets.
Then the callers of sock_create_kern are fixed up to stop using hacks.
Then netlink which uses it's own flavor of sock_create_kern is fixed.

Finally the hacks that are sk_change_net and sk_release_kernel are removed.

When it is all done the code is easier to follow, easier to use, easier
to maintain and shorter by about 70 lines.

Reported-by: Ying Xue <ying.xue@windriver.com>

Eric W. Biederman (6):
      tun: Utilize the normal socket network namespace refcounting.
      net: Add a struct net parameter to sock_create_kern
      net: Pass kern from net_proto_family.create to sk_alloc
      net: Modify sk_alloc to not reference count the netns of kernel sockets.
      netlink: Create kernel netlink sockets in the proper network namespace
      net: kill sk_change_net and sk_release_kernel

 crypto/af_alg.c                    |  4 ++--
 drivers/block/drbd/drbd_receiver.c |  4 ++--
 drivers/isdn/mISDN/socket.c        | 12 ++++++------
 drivers/net/macvtap.c              |  2 +-
 drivers/net/ppp/pppoe.c            |  4 ++--
 drivers/net/ppp/pppox.c            |  2 +-
 drivers/net/ppp/pptp.c             |  4 ++--
 drivers/net/tun.c                  | 26 +++++---------------------
 fs/afs/rxrpc.c                     |  2 +-
 fs/dlm/lowcomms.c                  | 16 ++++++++--------
 include/linux/if_pppox.h           |  2 +-
 include/linux/net.h                |  3 +--
 include/net/af_vsock.h             |  2 +-
 include/net/inet_common.h          |  2 +-
 include/net/llc_conn.h             |  2 +-
 include/net/sock.h                 | 21 +++------------------
 net/appletalk/ddp.c                |  2 +-
 net/atm/common.c                   |  4 ++--
 net/atm/common.h                   |  2 +-
 net/atm/pvc.c                      |  2 +-
 net/atm/svc.c                      |  2 +-
 net/ax25/af_ax25.c                 |  4 ++--
 net/bluetooth/bnep/sock.c          |  2 +-
 net/bluetooth/cmtp/sock.c          |  2 +-
 net/bluetooth/hci_sock.c           |  2 +-
 net/bluetooth/hidp/sock.c          |  2 +-
 net/bluetooth/l2cap_sock.c         | 10 +++++-----
 net/bluetooth/rfcomm/core.c        |  2 +-
 net/bluetooth/rfcomm/sock.c        |  8 ++++----
 net/bluetooth/sco.c                |  8 ++++----
 net/caif/caif_socket.c             |  2 +-
 net/can/af_can.c                   |  2 +-
 net/ceph/messenger.c               |  4 ++--
 net/core/sock.c                    | 30 ++++++++----------------------
 net/decnet/af_decnet.c             |  8 ++++----
 net/ieee802154/socket.c            |  2 +-
 net/ipv4/af_inet.c                 |  6 ++----
 net/ipv4/udp_tunnel.c              |  8 +++-----
 net/ipv6/af_inet6.c                |  2 +-
 net/ipv6/ip6_udp_tunnel.c          |  6 ++----
 net/ipx/af_ipx.c                   |  2 +-
 net/irda/af_irda.c                 |  2 +-
 net/iucv/af_iucv.c                 | 10 +++++-----
 net/key/af_key.c                   |  2 +-
 net/l2tp/l2tp_core.c               | 15 ++++++---------
 net/l2tp/l2tp_ppp.c                |  4 ++--
 net/llc/af_llc.c                   |  2 +-
 net/llc/llc_conn.c                 |  6 +++---
 net/netfilter/ipvs/ip_vs_sync.c    | 30 +++++++++---------------------
 net/netlink/af_netlink.c           | 21 +++++++++------------
 net/netrom/af_netrom.c             |  4 ++--
 net/nfc/af_nfc.c                   |  2 +-
 net/nfc/llcp.h                     |  2 +-
 net/nfc/llcp_core.c                |  2 +-
 net/nfc/llcp_sock.c                |  8 ++++----
 net/nfc/nfc.h                      |  2 +-
 net/nfc/rawsock.c                  |  4 ++--
 net/packet/af_packet.c             |  2 +-
 net/phonet/af_phonet.c             |  2 +-
 net/phonet/pep.c                   |  2 +-
 net/rds/af_rds.c                   |  2 +-
 net/rose/af_rose.c                 |  4 ++--
 net/rxrpc/af_rxrpc.c               |  2 +-
 net/rxrpc/ar-local.c               |  4 ++--
 net/sctp/ipv6.c                    |  2 +-
 net/sctp/protocol.c                |  2 +-
 net/socket.c                       |  7 ++-----
 net/tipc/socket.c                  |  2 +-
 net/unix/af_unix.c                 |  8 ++++----
 net/vmw_vsock/af_vsock.c           |  7 ++++---
 net/vmw_vsock/vmci_transport.c     |  2 +-
 net/x25/af_x25.c                   |  8 ++++----
 72 files changed, 166 insertions(+), 238 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu May 9, 2015, 2:38 a.m. UTC | #8
On Fri, May 08, 2015 at 09:05:33PM -0500, Eric W. Biederman wrote:
> 
> Right now the situtation for allocating kernel sockets is a mess.
> - sock_create_kern does not take a namespace parameter.
> - kernel sockets must not reference count a network namespace and keep
>   it alive or else we will have a reference counting loop.
> - The way we avoid the reference counting loop with sk_change_net
>   and sk_release_kernel are major hacks.
> 
> This patchset addresses this mess by fixing sock_create_kern to do
> everything necessary to create a kernel socket.  None of the current
> users of kernel sockets need the network namespace reference counted.
> Either kernel sockets are network namespace aware (and using the current
> hacks) or kernel sockets are limited to the initial network namespace
> in which case it does not matter.
> 
> This patchset starts by addressing tun which should be using normal
> userspace sockets like macvtap.
> 
> Then sock_create_kern is fixed to take a network namespace.
> Then the in kernel status of sockets are passed through to sk_alloc.
> Then sk_alloc is fixed to not reference count the network namespace
>      of kernel sockets.
> Then the callers of sock_create_kern are fixed up to stop using hacks.
> Then netlink which uses it's own flavor of sock_create_kern is fixed.
> 
> Finally the hacks that are sk_change_net and sk_release_kernel are removed.
> 
> When it is all done the code is easier to follow, easier to use, easier
> to maintain and shorter by about 70 lines.
> 
> Reported-by: Ying Xue <ying.xue@windriver.com>

Looks good to me.

Thanks,
David Miller May 11, 2015, 2:53 p.m. UTC | #9
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Fri, 08 May 2015 21:05:33 -0500

> Right now the situtation for allocating kernel sockets is a mess.
> - sock_create_kern does not take a namespace parameter.
> - kernel sockets must not reference count a network namespace and keep
>   it alive or else we will have a reference counting loop.
> - The way we avoid the reference counting loop with sk_change_net
>   and sk_release_kernel are major hacks.
> 
> This patchset addresses this mess by fixing sock_create_kern to do
> everything necessary to create a kernel socket.  None of the current
> users of kernel sockets need the network namespace reference counted.
> Either kernel sockets are network namespace aware (and using the current
> hacks) or kernel sockets are limited to the initial network namespace
> in which case it does not matter.
> 
> This patchset starts by addressing tun which should be using normal
> userspace sockets like macvtap.
> 
> Then sock_create_kern is fixed to take a network namespace.
> Then the in kernel status of sockets are passed through to sk_alloc.
> Then sk_alloc is fixed to not reference count the network namespace
>      of kernel sockets.
> Then the callers of sock_create_kern are fixed up to stop using hacks.
> Then netlink which uses it's own flavor of sock_create_kern is fixed.
> 
> Finally the hacks that are sk_change_net and sk_release_kernel are removed.
> 
> When it is all done the code is easier to follow, easier to use, easier
> to maintain and shorter by about 70 lines.
> 
> Reported-by: Ying Xue <ying.xue@windriver.com>

Looks good, applied to net-next, thanks Eric.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f733656..4931100 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -133,6 +133,7 @@  struct net {
 #endif
 	struct sock		*diag_nlsk;
 	atomic_t		fnhe_genid;
+	bool			dead;
 };
 
 #include <linux/seq_file_net.h>
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 78fc04a..1f888a7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -339,41 +339,38 @@  struct net *copy_net_ns(unsigned long flags,
 	return net;
 }
 
-static DEFINE_SPINLOCK(cleanup_list_lock);
-static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
-
-static void cleanup_net(struct work_struct *work)
+static void unlink_net(struct net *net, struct list_head *net_exit_list)
 {
-	const struct pernet_operations *ops;
-	struct net *net, *tmp;
-	struct list_head net_kill_list;
-	LIST_HEAD(net_exit_list);
+	struct net *tmp;
 
-	/* Atomically snapshot the list of namespaces to cleanup */
-	spin_lock_irq(&cleanup_list_lock);
-	list_replace_init(&cleanup_list, &net_kill_list);
-	spin_unlock_irq(&cleanup_list_lock);
+	if (net->dead)
+		return;
 
-	mutex_lock(&net_mutex);
+	net->dead = true;
+
+	list_add_tail(&net->exit_list, net_exit_list);
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_for_each_entry(net, &net_kill_list, cleanup_list) {
-		list_del_rcu(&net->list);
-		list_add_tail(&net->exit_list, &net_exit_list);
-		for_each_net(tmp) {
-			int id = __peernet2id(tmp, net, false);
+	list_del_rcu(&net->list);
 
-			if (id >= 0) {
-				rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
-				idr_remove(&tmp->netns_ids, id);
-			}
-		}
-		idr_destroy(&net->netns_ids);
+	for_each_net(tmp) {
+		int id = __peernet2id(tmp, net, false);
 
+		if (id >= 0) {
+			rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
+			idr_remove(&tmp->netns_ids, id);
+		}
 	}
 	rtnl_unlock();
 
+	idr_destroy(&net->netns_ids);
+}
+
+static void exit_all_ops(struct list_head *net_exit_list)
+{
+	const struct pernet_operations *ops;
+
 	/*
 	 * Another CPU might be rcu-iterating the list, wait for it.
 	 * This needs to be before calling the exit() notifiers, so
@@ -383,11 +380,33 @@  static void cleanup_net(struct work_struct *work)
 
 	/* Run all of the network namespace exit methods */
 	list_for_each_entry_reverse(ops, &pernet_list, list)
-		ops_exit_list(ops, &net_exit_list);
+		ops_exit_list(ops, net_exit_list);
 
 	/* Free the net generic variables */
 	list_for_each_entry_reverse(ops, &pernet_list, list)
-		ops_free_list(ops, &net_exit_list);
+		ops_free_list(ops, net_exit_list);
+}
+
+static DEFINE_SPINLOCK(cleanup_list_lock);
+static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
+
+static void cleanup_net(struct work_struct *work)
+{
+	struct list_head net_kill_list;
+	struct net *net, *tmp;
+	LIST_HEAD(net_exit_list);
+
+	/* Atomically snapshot the list of namespaces to cleanup */
+	spin_lock_irq(&cleanup_list_lock);
+	list_replace_init(&cleanup_list, &net_kill_list);
+	spin_unlock_irq(&cleanup_list_lock);
+
+	mutex_lock(&net_mutex);
+
+	list_for_each_entry(net, &net_kill_list, cleanup_list)
+		unlink_net(net, &net_exit_list);
+
+	exit_all_ops(&net_exit_list);
 
 	mutex_unlock(&net_mutex);
 
@@ -397,8 +416,7 @@  static void cleanup_net(struct work_struct *work)
 	rcu_barrier();
 
 	/* Finally it is safe to free my network namespace structure */
-	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
-		list_del_init(&net->exit_list);
+	list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) {
 		put_user_ns(net->user_ns);
 		net_drop_ns(net);
 	}
@@ -727,30 +745,59 @@  static int __init net_ns_init(void)
 pure_initcall(net_ns_init);
 
 #ifdef CONFIG_NET_NS
+static void grab_all_nets(struct list_head *all_net_list)
+{
+	struct net *net;
+	LIST_HEAD(net_exit_list);
+
+	for_each_net(net) {
+		if (maybe_get_net(net)) 
+			list_add_tail(&net->cleanup_list, all_net_list);
+		else
+			unlink_net(net, &net_exit_list);
+	}
+
+	exit_all_ops(&net_exit_list);
+}
+
+static void drop_all_nets(struct list_head *all_net_list)
+{
+	struct net *net, *next;
+
+	list_for_each_entry_safe(net, next, all_net_list, cleanup_list)
+		put_net(net);
+}
+
 static int __register_pernet_operations(struct list_head *list,
 					struct pernet_operations *ops)
 {
 	struct net *net;
-	int error;
+	int error = 0;
+	LIST_HEAD(all_net_list);
 	LIST_HEAD(net_exit_list);
 
+	grab_all_nets(&all_net_list);
+
 	list_add_tail(&ops->list, list);
 	if (ops->init || (ops->id && ops->size)) {
-		for_each_net(net) {
+		list_for_each_entry(net, &all_net_list, cleanup_list) {
 			error = ops_init(ops, net);
 			if (error)
 				goto out_undo;
 			list_add_tail(&net->exit_list, &net_exit_list);
 		}
 	}
-	return 0;
+
+out_drop_nets:
+	drop_all_nets(&all_net_list);
+	return error;
 
 out_undo:
 	/* If I have an error cleanup all namespaces I initialized */
 	list_del(&ops->list);
 	ops_exit_list(ops, &net_exit_list);
 	ops_free_list(ops, &net_exit_list);
-	return error;
+	goto out_drop_nets;
 }
 
 static void __unregister_pernet_operations(struct pernet_operations *ops)