Message ID | 20150508140733.GA13325@gondor.apana.org.au |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
Herbert Xu <herbert@gondor.apana.org.au> writes: > On Thu, May 07, 2015 at 11:14:13AM -0500, Eric W. Biederman wrote: >> >> The following change shows how it is possible to always know that your >> network namespace has a non-zero reference count in the network >> namespace initialization methods. My implementation of >> lock_network_namespaces is problematic in that it does not sleep >> while network namespaces are unregistering. But it is enough to show >> how the locking and reference counting can be fixed. > > If name spaces are created/deleted constantly this could take > a long time. How about forcibly cleaning up entries when we > register a new op? That certainly seems more sensible than waiting for an indefinite amount of time, and the code looks reasonably sound. Placing on things on lists and then not removing them, and then later stomping the list pointers gives me flashbacks of other bugs I have fought, but we already do that in this code, and in this context it appears harmless. So overall I think I like it. At the same time while a network namespace remains discoverable we might be using it in small ways. So I am wondering if this is the right approach. It really is invalid for a network namespace init routine to grab the reference count of it's network namespace (thus making the network namespace unfreeable). So I am wondering if perhaps all we need to do is find a clean refactoring of the socket code so this case does not come up at all. Perhaps just a flag that says this is a kernel socket so don't get/put the refcount on the network namespace. > ---8<--- > Subject: netns: Do not init dead nets when registering new op > > Currently when a new op is registered we will initialise all > current namespaces through that op, even those that are zombies > (zero ref count). This is bad because it puts the onus on each > op implementor to deal with this and they are bound to slip up. > > This patch fixes this by only initialising live namespaces. > > This however brings about a new problem as when those zombies > are eventually cleaned up we will call op->exit even on them > even though they have not been initialised. > > This is fixed by bringing forward parts of the work of the cleanup > thread and performing them during registration. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h > index f733656..4931100 100644 > --- a/include/net/net_namespace.h > +++ b/include/net/net_namespace.h > @@ -133,6 +133,7 @@ struct net { > #endif > struct sock *diag_nlsk; > atomic_t fnhe_genid; > + bool dead; > }; > > #include <linux/seq_file_net.h> > diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c > index 78fc04a..1f888a7 100644 > --- a/net/core/net_namespace.c > +++ b/net/core/net_namespace.c > @@ -339,41 +339,38 @@ struct net *copy_net_ns(unsigned long flags, > return net; > } > > -static DEFINE_SPINLOCK(cleanup_list_lock); > -static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ > - > -static void cleanup_net(struct work_struct *work) > +static void unlink_net(struct net *net, struct list_head *net_exit_list) > { > - const struct pernet_operations *ops; > - struct net *net, *tmp; > - struct list_head net_kill_list; > - LIST_HEAD(net_exit_list); > + struct net *tmp; > > - /* Atomically snapshot the list of namespaces to cleanup */ > - spin_lock_irq(&cleanup_list_lock); > - list_replace_init(&cleanup_list, &net_kill_list); > - spin_unlock_irq(&cleanup_list_lock); > + if (net->dead) > + return; > > - mutex_lock(&net_mutex); > + net->dead = true; > + > + list_add_tail(&net->exit_list, net_exit_list); > > /* Don't let anyone else find us. */ > rtnl_lock(); > - list_for_each_entry(net, &net_kill_list, cleanup_list) { > - list_del_rcu(&net->list); > - list_add_tail(&net->exit_list, &net_exit_list); > - for_each_net(tmp) { > - int id = __peernet2id(tmp, net, false); > + list_del_rcu(&net->list); > > - if (id >= 0) { > - rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); > - idr_remove(&tmp->netns_ids, id); > - } > - } > - idr_destroy(&net->netns_ids); > + for_each_net(tmp) { > + int id = __peernet2id(tmp, net, false); > > + if (id >= 0) { > + rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); > + idr_remove(&tmp->netns_ids, id); > + } > } > rtnl_unlock(); > > + idr_destroy(&net->netns_ids); > +} > + > +static void exit_all_ops(struct list_head *net_exit_list) > +{ > + const struct pernet_operations *ops; > + > /* > * Another CPU might be rcu-iterating the list, wait for it. > * This needs to be before calling the exit() notifiers, so > @@ -383,11 +380,33 @@ static void cleanup_net(struct work_struct *work) > > /* Run all of the network namespace exit methods */ > list_for_each_entry_reverse(ops, &pernet_list, list) > - ops_exit_list(ops, &net_exit_list); > + ops_exit_list(ops, net_exit_list); > > /* Free the net generic variables */ > list_for_each_entry_reverse(ops, &pernet_list, list) > - ops_free_list(ops, &net_exit_list); > + ops_free_list(ops, net_exit_list); > +} > + > +static DEFINE_SPINLOCK(cleanup_list_lock); > +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ > + > +static void cleanup_net(struct work_struct *work) > +{ > + struct list_head net_kill_list; > + struct net *net, *tmp; > + LIST_HEAD(net_exit_list); > + > + /* Atomically snapshot the list of namespaces to cleanup */ > + spin_lock_irq(&cleanup_list_lock); > + list_replace_init(&cleanup_list, &net_kill_list); > + spin_unlock_irq(&cleanup_list_lock); > + > + mutex_lock(&net_mutex); > + > + list_for_each_entry(net, &net_kill_list, cleanup_list) > + unlink_net(net, &net_exit_list); > + > + exit_all_ops(&net_exit_list); > > mutex_unlock(&net_mutex); > > @@ -397,8 +416,7 @@ static void cleanup_net(struct work_struct *work) > rcu_barrier(); > > /* Finally it is safe to free my network namespace structure */ > - list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { > - list_del_init(&net->exit_list); > + list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) { > put_user_ns(net->user_ns); > net_drop_ns(net); > } > @@ -727,30 +745,59 @@ static int __init net_ns_init(void) > pure_initcall(net_ns_init); > > #ifdef CONFIG_NET_NS > +static void grab_all_nets(struct list_head *all_net_list) > +{ > + struct net *net; > + LIST_HEAD(net_exit_list); > + > + for_each_net(net) { > + if (maybe_get_net(net)) > + list_add_tail(&net->cleanup_list, all_net_list); > + else > + unlink_net(net, &net_exit_list); > + } > + > + exit_all_ops(&net_exit_list); > +} > + > +static void drop_all_nets(struct list_head *all_net_list) > +{ > + struct net *net, *next; > + > + list_for_each_entry_safe(net, next, all_net_list, cleanup_list) > + put_net(net); > +} > + > static int __register_pernet_operations(struct list_head *list, > struct pernet_operations *ops) > { > struct net *net; > - int error; > + int error = 0; > + LIST_HEAD(all_net_list); > LIST_HEAD(net_exit_list); > > + grab_all_nets(&all_net_list); > + > list_add_tail(&ops->list, list); > if (ops->init || (ops->id && ops->size)) { > - for_each_net(net) { > + list_for_each_entry(net, &all_net_list, cleanup_list) { > error = ops_init(ops, net); > if (error) > goto out_undo; > list_add_tail(&net->exit_list, &net_exit_list); > } > } > - return 0; > + > +out_drop_nets: > + drop_all_nets(&all_net_list); > + return error; > > out_undo: > /* If I have an error cleanup all namespaces I initialized */ > list_del(&ops->list); > ops_exit_list(ops, &net_exit_list); > ops_free_list(ops, &net_exit_list); > - return error; > + goto out_drop_nets; > } > > static void __unregister_pernet_operations(struct pernet_operations *ops) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 8, 2015 at 10:36 AM, Eric W. Biederman <ebiederm@xmission.com> wrote: > > It really is invalid for a network namespace init routine to grab the > reference count of it's network namespace (thus making the network > namespace unfreeable). So I am wondering if perhaps all we need to do > is find a clean refactoring of the socket code so this case does not > come up at all. Good point! I _guess_ the reason is these kernel sockets have to exist longer than netns' life-time, it could be due to on-flying skb's? On the other hand, we do create some fb_tunnel netdevice in netns init too, but we don't take a refcnt there, probably because we wait for netdevice refcnt goes to zero when unregistering. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 8, 2015 at 1:27 PM, Cong Wang <cwang@twopensource.com> wrote: > On Fri, May 8, 2015 at 10:36 AM, Eric W. Biederman > <ebiederm@xmission.com> wrote: >> >> It really is invalid for a network namespace init routine to grab the >> reference count of it's network namespace (thus making the network >> namespace unfreeable). So I am wondering if perhaps all we need to do >> is find a clean refactoring of the socket code so this case does not >> come up at all. > > > Good point! > > I _guess_ the reason is these kernel sockets have to exist longer than > netns' life-time, it could be due to on-flying skb's? > > On the other hand, we do create some fb_tunnel netdevice in netns init > too, but we don't take a refcnt there, probably because we wait > for netdevice refcnt goes to zero when unregistering. Answer myself, it looks like we don't need to hold the refcnt at all, since we create the socket at init and release it at uninit, so they should have the same life-time? The reason why user-space sockets need this refcnt is they could have longer life-time than the netns? This seems the right direction to solve this problem for me, I am going to try this way to see how far I can go. ;) Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Cong Wang <cwang@twopensource.com> writes: > On Fri, May 8, 2015 at 1:27 PM, Cong Wang <cwang@twopensource.com> wrote: >> On Fri, May 8, 2015 at 10:36 AM, Eric W. Biederman >> <ebiederm@xmission.com> wrote: >>> >>> It really is invalid for a network namespace init routine to grab the >>> reference count of it's network namespace (thus making the network >>> namespace unfreeable). So I am wondering if perhaps all we need to do >>> is find a clean refactoring of the socket code so this case does not >>> come up at all. >> >> >> Good point! >> >> I _guess_ the reason is these kernel sockets have to exist longer than >> netns' life-time, it could be due to on-flying skb's? >> >> On the other hand, we do create some fb_tunnel netdevice in netns init >> too, but we don't take a refcnt there, probably because we wait >> for netdevice refcnt goes to zero when unregistering. > > Answer myself, it looks like we don't need to hold the refcnt at all, > since we create the socket at init and release it at uninit, so they > should have the same life-time? > > The reason why user-space sockets need this refcnt is they > could have longer life-time than the netns? User space sockets by design keep the network namespace alive. > This seems the right direction to solve this problem for me, > I am going to try this way to see how far I can go. ;) I will be interested to see what you come up with. Eric -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 08, 2015 at 12:36:56PM -0500, Eric W. Biederman wrote: > > At the same time while a network namespace remains discoverable we might > be using it in small ways. So I am wondering if this is the right > approach. > > It really is invalid for a network namespace init routine to grab the > reference count of it's network namespace (thus making the network > namespace unfreeable). So I am wondering if perhaps all we need to do > is find a clean refactoring of the socket code so this case does not > come up at all. > > Perhaps just a flag that says this is a kernel socket so don't get/put > the refcount on the network namespace. No this line of thinking is what led us into the hole in the first place. Really it's perfectly valid for the init function to want to take a reference on net. For all we know it might be a temporary reference taken by a third party. It doesn't even have to be a socket. We must hide this subtlety from ops implementors since they have no knowledge of our implementation. Expecting them to deal with this is going to result in bugs, and we have already had multiple bugs in this area. Cheers,
Herbert Xu <herbert@gondor.apana.org.au> writes: > On Fri, May 08, 2015 at 12:36:56PM -0500, Eric W. Biederman wrote: >> >> At the same time while a network namespace remains discoverable we might >> be using it in small ways. So I am wondering if this is the right >> approach. >> >> It really is invalid for a network namespace init routine to grab the >> reference count of it's network namespace (thus making the network >> namespace unfreeable). So I am wondering if perhaps all we need to do >> is find a clean refactoring of the socket code so this case does not >> come up at all. >> >> Perhaps just a flag that says this is a kernel socket so don't get/put >> the refcount on the network namespace. > > No this line of thinking is what led us into the hole in the > first place. > > Really it's perfectly valid for the init function to want to > take a reference on net. For all we know it might be a temporary > reference taken by a third party. It doesn't even have to be a > socket. > > We must hide this subtlety from ops implementors since they have > no knowledge of our implementation. Expecting them to deal with > this is going to result in bugs, and we have already had multiple > bugs in this area. I won't fundamentally argue. However the worst hack and subtlety come from the way we deal with kernel sockets and that is much easier to clean up so we might as well get that first. Eric -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Right now the situtation for allocating kernel sockets is a mess. - sock_create_kern does not take a namespace parameter. - kernel sockets must not reference count a network namespace and keep it alive or else we will have a reference counting loop. - The way we avoid the reference counting loop with sk_change_net and sk_release_kernel are major hacks. This patchset addresses this mess by fixing sock_create_kern to do everything necessary to create a kernel socket. None of the current users of kernel sockets need the network namespace reference counted. Either kernel sockets are network namespace aware (and using the current hacks) or kernel sockets are limited to the initial network namespace in which case it does not matter. This patchset starts by addressing tun which should be using normal userspace sockets like macvtap. Then sock_create_kern is fixed to take a network namespace. Then the in kernel status of sockets are passed through to sk_alloc. Then sk_alloc is fixed to not reference count the network namespace of kernel sockets. Then the callers of sock_create_kern are fixed up to stop using hacks. Then netlink which uses it's own flavor of sock_create_kern is fixed. Finally the hacks that are sk_change_net and sk_release_kernel are removed. When it is all done the code is easier to follow, easier to use, easier to maintain and shorter by about 70 lines. Reported-by: Ying Xue <ying.xue@windriver.com> Eric W. Biederman (6): tun: Utilize the normal socket network namespace refcounting. net: Add a struct net parameter to sock_create_kern net: Pass kern from net_proto_family.create to sk_alloc net: Modify sk_alloc to not reference count the netns of kernel sockets. netlink: Create kernel netlink sockets in the proper network namespace net: kill sk_change_net and sk_release_kernel crypto/af_alg.c | 4 ++-- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/isdn/mISDN/socket.c | 12 ++++++------ drivers/net/macvtap.c | 2 +- drivers/net/ppp/pppoe.c | 4 ++-- drivers/net/ppp/pppox.c | 2 +- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/tun.c | 26 +++++--------------------- fs/afs/rxrpc.c | 2 +- fs/dlm/lowcomms.c | 16 ++++++++-------- include/linux/if_pppox.h | 2 +- include/linux/net.h | 3 +-- include/net/af_vsock.h | 2 +- include/net/inet_common.h | 2 +- include/net/llc_conn.h | 2 +- include/net/sock.h | 21 +++------------------ net/appletalk/ddp.c | 2 +- net/atm/common.c | 4 ++-- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/bnep/sock.c | 2 +- net/bluetooth/cmtp/sock.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/hidp/sock.c | 2 +- net/bluetooth/l2cap_sock.c | 10 +++++----- net/bluetooth/rfcomm/core.c | 2 +- net/bluetooth/rfcomm/sock.c | 8 ++++---- net/bluetooth/sco.c | 8 ++++---- net/caif/caif_socket.c | 2 +- net/can/af_can.c | 2 +- net/ceph/messenger.c | 4 ++-- net/core/sock.c | 30 ++++++++---------------------- net/decnet/af_decnet.c | 8 ++++---- net/ieee802154/socket.c | 2 +- net/ipv4/af_inet.c | 6 ++---- net/ipv4/udp_tunnel.c | 8 +++----- net/ipv6/af_inet6.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 6 ++---- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/iucv/af_iucv.c | 10 +++++----- net/key/af_key.c | 2 +- net/l2tp/l2tp_core.c | 15 ++++++--------- net/l2tp/l2tp_ppp.c | 4 ++-- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 6 +++--- net/netfilter/ipvs/ip_vs_sync.c | 30 +++++++++--------------------- net/netlink/af_netlink.c | 21 +++++++++------------ net/netrom/af_netrom.c | 4 ++-- net/nfc/af_nfc.c | 2 +- net/nfc/llcp.h | 2 +- net/nfc/llcp_core.c | 2 +- net/nfc/llcp_sock.c | 8 ++++---- net/nfc/nfc.h | 2 +- net/nfc/rawsock.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/af_phonet.c | 2 +- net/phonet/pep.c | 2 +- net/rds/af_rds.c | 2 +- net/rose/af_rose.c | 4 ++-- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-local.c | 4 ++-- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/socket.c | 7 ++----- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 8 ++++---- net/vmw_vsock/af_vsock.c | 7 ++++--- net/vmw_vsock/vmci_transport.c | 2 +- net/x25/af_x25.c | 8 ++++---- 72 files changed, 166 insertions(+), 238 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 08, 2015 at 09:05:33PM -0500, Eric W. Biederman wrote: > > Right now the situtation for allocating kernel sockets is a mess. > - sock_create_kern does not take a namespace parameter. > - kernel sockets must not reference count a network namespace and keep > it alive or else we will have a reference counting loop. > - The way we avoid the reference counting loop with sk_change_net > and sk_release_kernel are major hacks. > > This patchset addresses this mess by fixing sock_create_kern to do > everything necessary to create a kernel socket. None of the current > users of kernel sockets need the network namespace reference counted. > Either kernel sockets are network namespace aware (and using the current > hacks) or kernel sockets are limited to the initial network namespace > in which case it does not matter. > > This patchset starts by addressing tun which should be using normal > userspace sockets like macvtap. > > Then sock_create_kern is fixed to take a network namespace. > Then the in kernel status of sockets are passed through to sk_alloc. > Then sk_alloc is fixed to not reference count the network namespace > of kernel sockets. > Then the callers of sock_create_kern are fixed up to stop using hacks. > Then netlink which uses it's own flavor of sock_create_kern is fixed. > > Finally the hacks that are sk_change_net and sk_release_kernel are removed. > > When it is all done the code is easier to follow, easier to use, easier > to maintain and shorter by about 70 lines. > > Reported-by: Ying Xue <ying.xue@windriver.com> Looks good to me. Thanks,
From: ebiederm@xmission.com (Eric W. Biederman) Date: Fri, 08 May 2015 21:05:33 -0500 > Right now the situtation for allocating kernel sockets is a mess. > - sock_create_kern does not take a namespace parameter. > - kernel sockets must not reference count a network namespace and keep > it alive or else we will have a reference counting loop. > - The way we avoid the reference counting loop with sk_change_net > and sk_release_kernel are major hacks. > > This patchset addresses this mess by fixing sock_create_kern to do > everything necessary to create a kernel socket. None of the current > users of kernel sockets need the network namespace reference counted. > Either kernel sockets are network namespace aware (and using the current > hacks) or kernel sockets are limited to the initial network namespace > in which case it does not matter. > > This patchset starts by addressing tun which should be using normal > userspace sockets like macvtap. > > Then sock_create_kern is fixed to take a network namespace. > Then the in kernel status of sockets are passed through to sk_alloc. > Then sk_alloc is fixed to not reference count the network namespace > of kernel sockets. > Then the callers of sock_create_kern are fixed up to stop using hacks. > Then netlink which uses it's own flavor of sock_create_kern is fixed. > > Finally the hacks that are sk_change_net and sk_release_kernel are removed. > > When it is all done the code is easier to follow, easier to use, easier > to maintain and shorter by about 70 lines. > > Reported-by: Ying Xue <ying.xue@windriver.com> Looks good, applied to net-next, thanks Eric. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f733656..4931100 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -133,6 +133,7 @@ struct net { #endif struct sock *diag_nlsk; atomic_t fnhe_genid; + bool dead; }; #include <linux/seq_file_net.h> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 78fc04a..1f888a7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -339,41 +339,38 @@ struct net *copy_net_ns(unsigned long flags, return net; } -static DEFINE_SPINLOCK(cleanup_list_lock); -static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ - -static void cleanup_net(struct work_struct *work) +static void unlink_net(struct net *net, struct list_head *net_exit_list) { - const struct pernet_operations *ops; - struct net *net, *tmp; - struct list_head net_kill_list; - LIST_HEAD(net_exit_list); + struct net *tmp; - /* Atomically snapshot the list of namespaces to cleanup */ - spin_lock_irq(&cleanup_list_lock); - list_replace_init(&cleanup_list, &net_kill_list); - spin_unlock_irq(&cleanup_list_lock); + if (net->dead) + return; - mutex_lock(&net_mutex); + net->dead = true; + + list_add_tail(&net->exit_list, net_exit_list); /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) { - list_del_rcu(&net->list); - list_add_tail(&net->exit_list, &net_exit_list); - for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); + list_del_rcu(&net->list); - if (id >= 0) { - rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); - idr_remove(&tmp->netns_ids, id); - } - } - idr_destroy(&net->netns_ids); + for_each_net(tmp) { + int id = __peernet2id(tmp, net, false); + if (id >= 0) { + rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); + idr_remove(&tmp->netns_ids, id); + } } rtnl_unlock(); + idr_destroy(&net->netns_ids); +} + +static void exit_all_ops(struct list_head *net_exit_list) +{ + const struct pernet_operations *ops; + /* * Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so @@ -383,11 +380,33 @@ static void cleanup_net(struct work_struct *work) /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); + ops_exit_list(ops, net_exit_list); /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_free_list(ops, net_exit_list); +} + +static DEFINE_SPINLOCK(cleanup_list_lock); +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ + +static void cleanup_net(struct work_struct *work) +{ + struct list_head net_kill_list; + struct net *net, *tmp; + LIST_HEAD(net_exit_list); + + /* Atomically snapshot the list of namespaces to cleanup */ + spin_lock_irq(&cleanup_list_lock); + list_replace_init(&cleanup_list, &net_kill_list); + spin_unlock_irq(&cleanup_list_lock); + + mutex_lock(&net_mutex); + + list_for_each_entry(net, &net_kill_list, cleanup_list) + unlink_net(net, &net_exit_list); + + exit_all_ops(&net_exit_list); mutex_unlock(&net_mutex); @@ -397,8 +416,7 @@ static void cleanup_net(struct work_struct *work) rcu_barrier(); /* Finally it is safe to free my network namespace structure */ - list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { - list_del_init(&net->exit_list); + list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) { put_user_ns(net->user_ns); net_drop_ns(net); } @@ -727,30 +745,59 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); #ifdef CONFIG_NET_NS +static void grab_all_nets(struct list_head *all_net_list) +{ + struct net *net; + LIST_HEAD(net_exit_list); + + for_each_net(net) { + if (maybe_get_net(net)) + list_add_tail(&net->cleanup_list, all_net_list); + else + unlink_net(net, &net_exit_list); + } + + exit_all_ops(&net_exit_list); +} + +static void drop_all_nets(struct list_head *all_net_list) +{ + struct net *net, *next; + + list_for_each_entry_safe(net, next, all_net_list, cleanup_list) + put_net(net); +} + static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { struct net *net; - int error; + int error = 0; + LIST_HEAD(all_net_list); LIST_HEAD(net_exit_list); + grab_all_nets(&all_net_list); + list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { - for_each_net(net) { + list_for_each_entry(net, &all_net_list, cleanup_list) { error = ops_init(ops, net); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); } } - return 0; + +out_drop_nets: + drop_all_nets(&all_net_list); + return error; out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); ops_exit_list(ops, &net_exit_list); ops_free_list(ops, &net_exit_list); - return error; + goto out_drop_nets; } static void __unregister_pernet_operations(struct pernet_operations *ops)