[nf,1/2] netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine

Message ID 20181005164242.9682-1-ap420073@gmail.com
State Changes Requested
Delegated to: Pablo Neira
Headers show
Series
  • netfilter: ipt_CLUSTERIP: fix bugs in ipt_CLUSTERIP
Related show

Commit Message

Taehee Yoo Oct. 5, 2018, 4:42 p.m.
When network namespace is destroyed, cleanup_net() is called.
cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback.
So that clusterip_tg_destroy() is called by cleanup_net().
And clusterip_tg_destroy() calls unregister_netdevice_notifier().

But both cleanup_net() and clusterip_tg_destroy() hold same
lock(pernet_ops_rwsem). hence deadlock occurrs.

After this patch, only 1 notifier is registered when module is inserted.
And all of configs are added to per-net list.

test commands:
   %ip netns add vm1
   %ip netns exec vm1 bash
   %ip link set lo up
   %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \
	 -j CLUSTERIP --new --hashmode sourceip \
	 --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1
   %exit
   %ip netns del vm1

splat looks like:
[  341.809674] ============================================
[  341.809674] WARNING: possible recursive locking detected
[  341.809674] 4.19.0-rc5+ #16 Tainted: G        W
[  341.809674] --------------------------------------------
[  341.809674] kworker/u4:2/87 is trying to acquire lock:
[  341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460
[  341.809674]
[  341.809674] but task is already holding lock:
[  341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900
[  341.809674]
[  341.809674] other info that might help us debug this:
[  341.809674]  Possible unsafe locking scenario:
[  341.809674]
[  341.809674]        CPU0
[  341.809674]        ----
[  341.809674]   lock(pernet_ops_rwsem);
[  341.809674]   lock(pernet_ops_rwsem);
[  341.809674]
[  341.809674]  *** DEADLOCK ***
[  341.809674]
[  341.809674]  May be due to missing lock nesting notation
[  341.809674]
[  341.809674] 3 locks held by kworker/u4:2/87:
[  341.809674]  #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0
[  341.809674]  #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0
[  341.809674]  #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900
[  341.809674]
[  341.809674] stack backtrace:
[  341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G        W         4.19.0-rc5+ #16
[  341.809674] Workqueue: netns cleanup_net
[  341.809674] Call Trace:
[ ... ]
[  342.070196]  down_write+0x93/0x160
[  342.070196]  ? unregister_netdevice_notifier+0x8c/0x460
[  342.070196]  ? down_read+0x1e0/0x1e0
[  342.070196]  ? sched_clock_cpu+0x126/0x170
[  342.070196]  ? find_held_lock+0x39/0x1c0
[  342.070196]  unregister_netdevice_notifier+0x8c/0x460
[  342.070196]  ? register_netdevice_notifier+0x790/0x790
[  342.070196]  ? __local_bh_enable_ip+0xe9/0x1b0
[  342.070196]  ? __local_bh_enable_ip+0xe9/0x1b0
[  342.070196]  ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP]
[  342.070196]  ? trace_hardirqs_on+0x93/0x210
[  342.070196]  ? __bpf_trace_preemptirq_template+0x10/0x10
[  342.070196]  ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP]
[  342.123094]  clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP]
[  342.123094]  ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP]
[  342.123094]  ? cleanup_match+0x17d/0x200 [ip_tables]
[  342.123094]  ? xt_unregister_table+0x215/0x300 [x_tables]
[  342.123094]  ? kfree+0xe2/0x2a0
[  342.123094]  cleanup_entry+0x1d5/0x2f0 [ip_tables]
[  342.123094]  ? cleanup_match+0x200/0x200 [ip_tables]
[  342.123094]  __ipt_unregister_table+0x9b/0x1a0 [ip_tables]
[  342.123094]  iptable_filter_net_exit+0x43/0x80 [iptable_filter]
[  342.123094]  ops_exit_list.isra.10+0x94/0x140
[  342.123094]  cleanup_net+0x45b/0x900
[ ... ]

Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 71 +++++++++++++++++-------------
 1 file changed, 40 insertions(+), 31 deletions(-)

Comments

Pablo Neira Ayuso Oct. 10, 2018, 5:32 p.m. | #1
On Sat, Oct 06, 2018 at 01:42:42AM +0900, Taehee Yoo wrote:
> diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
> index 2c8d313ae216..6ccabe6f74a6 100644
> --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
> +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
> @@ -59,7 +59,6 @@ struct clusterip_config {
>  	struct rcu_head rcu;
>  
>  	char ifname[IFNAMSIZ];			/* device ifname */
> -	struct notifier_block notifier;		/* refresh c->ifindex in it */
>  };
>  
>  #ifdef CONFIG_PROC_FS
> @@ -118,8 +117,6 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
>  		spin_unlock(&cn->lock);
>  		local_bh_enable();
>  
> -		unregister_netdevice_notifier(&c->notifier);
> -
>  		return;
>  	}
>  	local_bh_enable();
> @@ -181,32 +178,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event,
>  		       void *ptr)
>  {
>  	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +	struct net *net = dev_net(dev);
> +	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
>  	struct clusterip_config *c;
>  
> -	c = container_of(this, struct clusterip_config, notifier);
> -	switch (event) {
> -	case NETDEV_REGISTER:
> -		if (!strcmp(dev->name, c->ifname)) {
> -			c->ifindex = dev->ifindex;
> -			dev_mc_add(dev, c->clustermac);
> -		}
> -		break;
> -	case NETDEV_UNREGISTER:
> -		if (dev->ifindex == c->ifindex) {
> -			dev_mc_del(dev, c->clustermac);
> -			c->ifindex = -1;
> -		}
> -		break;
> -	case NETDEV_CHANGENAME:
> -		if (!strcmp(dev->name, c->ifname)) {
> -			c->ifindex = dev->ifindex;
> -			dev_mc_add(dev, c->clustermac);
> -		} else if (dev->ifindex == c->ifindex) {
> -			dev_mc_del(dev, c->clustermac);
> -			c->ifindex = -1;
> +	spin_lock(&cn->lock);

Do we need spin_lock_bh() here?
Taehee Yoo Oct. 11, 2018, 11:15 a.m. | #2
On Thu, 11 Oct 2018 at 02:32, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>

Hi Pablo,

Thank you for review!

> On Sat, Oct 06, 2018 at 01:42:42AM +0900, Taehee Yoo wrote:
> > diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
> > index 2c8d313ae216..6ccabe6f74a6 100644
> > --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
> > +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
> > @@ -59,7 +59,6 @@ struct clusterip_config {
> >       struct rcu_head rcu;
> >
> >       char ifname[IFNAMSIZ];                  /* device ifname */
> > -     struct notifier_block notifier;         /* refresh c->ifindex in it */
> >  };
> >
> >  #ifdef CONFIG_PROC_FS
> > @@ -118,8 +117,6 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
> >               spin_unlock(&cn->lock);
> >               local_bh_enable();
> >
> > -             unregister_netdevice_notifier(&c->notifier);
> > -
> >               return;
> >       }
> >       local_bh_enable();
> > @@ -181,32 +178,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event,
> >                      void *ptr)
> >  {
> >       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> > +     struct net *net = dev_net(dev);
> > +     struct clusterip_net *cn = net_generic(net, clusterip_net_id);
> >       struct clusterip_config *c;
> >
> > -     c = container_of(this, struct clusterip_config, notifier);
> > -     switch (event) {
> > -     case NETDEV_REGISTER:
> > -             if (!strcmp(dev->name, c->ifname)) {
> > -                     c->ifindex = dev->ifindex;
> > -                     dev_mc_add(dev, c->clustermac);
> > -             }
> > -             break;
> > -     case NETDEV_UNREGISTER:
> > -             if (dev->ifindex == c->ifindex) {
> > -                     dev_mc_del(dev, c->clustermac);
> > -                     c->ifindex = -1;
> > -             }
> > -             break;
> > -     case NETDEV_CHANGENAME:
> > -             if (!strcmp(dev->name, c->ifname)) {
> > -                     c->ifindex = dev->ifindex;
> > -                     dev_mc_add(dev, c->clustermac);
> > -             } else if (dev->ifindex == c->ifindex) {
> > -                     dev_mc_del(dev, c->clustermac);
> > -                     c->ifindex = -1;
> > +     spin_lock(&cn->lock);
>
> Do we need spin_lock_bh() here?

I checked that, you're right.
config is modified in the BH. so, that should be spin_lock_bh().
I will send v2 patch

Thanks!

Patch

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2c8d313ae216..6ccabe6f74a6 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -59,7 +59,6 @@  struct clusterip_config {
 	struct rcu_head rcu;
 
 	char ifname[IFNAMSIZ];			/* device ifname */
-	struct notifier_block notifier;		/* refresh c->ifindex in it */
 };
 
 #ifdef CONFIG_PROC_FS
@@ -118,8 +117,6 @@  clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
 		spin_unlock(&cn->lock);
 		local_bh_enable();
 
-		unregister_netdevice_notifier(&c->notifier);
-
 		return;
 	}
 	local_bh_enable();
@@ -181,32 +178,37 @@  clusterip_netdev_event(struct notifier_block *this, unsigned long event,
 		       void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 	struct clusterip_config *c;
 
-	c = container_of(this, struct clusterip_config, notifier);
-	switch (event) {
-	case NETDEV_REGISTER:
-		if (!strcmp(dev->name, c->ifname)) {
-			c->ifindex = dev->ifindex;
-			dev_mc_add(dev, c->clustermac);
-		}
-		break;
-	case NETDEV_UNREGISTER:
-		if (dev->ifindex == c->ifindex) {
-			dev_mc_del(dev, c->clustermac);
-			c->ifindex = -1;
-		}
-		break;
-	case NETDEV_CHANGENAME:
-		if (!strcmp(dev->name, c->ifname)) {
-			c->ifindex = dev->ifindex;
-			dev_mc_add(dev, c->clustermac);
-		} else if (dev->ifindex == c->ifindex) {
-			dev_mc_del(dev, c->clustermac);
-			c->ifindex = -1;
+	spin_lock(&cn->lock);
+	list_for_each_entry_rcu(c, &cn->configs, list) {
+		switch (event) {
+		case NETDEV_REGISTER:
+			if (!strcmp(dev->name, c->ifname)) {
+				c->ifindex = dev->ifindex;
+				dev_mc_add(dev, c->clustermac);
+			}
+			break;
+		case NETDEV_UNREGISTER:
+			if (dev->ifindex == c->ifindex) {
+				dev_mc_del(dev, c->clustermac);
+				c->ifindex = -1;
+			}
+			break;
+		case NETDEV_CHANGENAME:
+			if (!strcmp(dev->name, c->ifname)) {
+				c->ifindex = dev->ifindex;
+				dev_mc_add(dev, c->clustermac);
+			} else if (dev->ifindex == c->ifindex) {
+				dev_mc_del(dev, c->clustermac);
+				c->ifindex = -1;
+			}
+			break;
 		}
-		break;
 	}
+	spin_unlock(&cn->lock);
 
 	return NOTIFY_DONE;
 }
@@ -260,12 +262,8 @@  clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
 	}
 #endif
 
-	c->notifier.notifier_call = clusterip_netdev_event;
-	err = register_netdevice_notifier(&c->notifier);
-	if (!err) {
-		refcount_set(&c->entries, 1);
-		return c;
-	}
+	refcount_set(&c->entries, 1);
+	return c;
 
 #ifdef CONFIG_PROC_FS
 	proc_remove(c->pde);
@@ -847,6 +845,10 @@  static struct pernet_operations clusterip_net_ops = {
 	.size = sizeof(struct clusterip_net),
 };
 
+struct notifier_block cip_netdev_notifier = {
+	.notifier_call = clusterip_netdev_event
+};
+
 static int __init clusterip_tg_init(void)
 {
 	int ret;
@@ -859,11 +861,17 @@  static int __init clusterip_tg_init(void)
 	if (ret < 0)
 		goto cleanup_subsys;
 
+	ret = register_netdevice_notifier(&cip_netdev_notifier);
+	if (ret < 0)
+		goto unregister_target;
+
 	pr_info("ClusterIP Version %s loaded successfully\n",
 		CLUSTERIP_VERSION);
 
 	return 0;
 
+unregister_target:
+	xt_unregister_target(&clusterip_tg_reg);
 cleanup_subsys:
 	unregister_pernet_subsys(&clusterip_net_ops);
 	return ret;
@@ -873,6 +881,7 @@  static void __exit clusterip_tg_exit(void)
 {
 	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
 
+	unregister_netdevice_notifier(&cip_netdev_notifier);
 	xt_unregister_target(&clusterip_tg_reg);
 	unregister_pernet_subsys(&clusterip_net_ops);