[v3] ipset: list:set: Decrease refcount synchronously on deletion and replace

Message ID b34e8a8d09e0cf72fee8d0cbe1aa931ca1e43661.1531597806.git.sbrivio@redhat.com
State Accepted
Delegated to: Jozsef Kadlecsik
Headers show
Series
  • [v3] ipset: list:set: Decrease refcount synchronously on deletion and replace
Related show

Commit Message

Stefano Brivio July 14, 2018, 7:59 p.m.
Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash
when flush/dump set in parallel") postponed decreasing set
reference counters to the RCU callback.

An 'ipset del' command can terminate before the RCU grace period
is elapsed, and if sets are listed before then, the reference
counter shown in userspace will be wrong:

 # ipset create h hash:ip; ipset create l list:set; ipset add l
 # ipset del l h; ipset list h
 Name: h
 Type: hash:ip
 Revision: 4
 Header: family inet hashsize 1024 maxelem 65536
 Size in memory: 88
 References: 1
 Number of entries: 0
 Members:
 # sleep 1; ipset list h
 Name: h
 Type: hash:ip
 Revision: 4
 Header: family inet hashsize 1024 maxelem 65536
 Size in memory: 88
 References: 0
 Number of entries: 0
 Members:

Fix this by making the reference count update synchronous again.

As a result, when sets are listed, ip_set_name_byindex() might
now fetch a set whose reference count is already zero. Instead
of relying on the reference count to protect against concurrent
set renaming, grab ip_set_ref_lock as reader and copy the name,
while holding the same lock in ip_set_rename() as writer
instead.

Reported-by: Li Shuang <shuali@redhat.com>
Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
v3:
  - As suggested by Jozsef, don't hold ref_netlink in
    ip_set_name_byindex(), grabbing ip_set_ref_lock as reader is
    enough, if we hold ip_set_ref_lock as writer in ip_set_rename()
  - Don't add __ip_set_get_netlink() back, we don't need it anymore
  - Also as pointed out by Jozsef, in list_set_del() and
    list_set_replace(), we should decrease the reference counter only
    after we're done removing the set from or swapping it in the list

v2: As pointed out by Jozsef, we can't assume that the nfnl mutex
    is taken while dumping, so we need some explicit locking
    to protect names of list:set members against changes

 include/linux/netfilter/ipset/ip_set.h |  2 +-
 net/netfilter/ipset/ip_set_core.c      | 23 +++++++++++------------
 net/netfilter/ipset/ip_set_list_set.c  | 17 +++++++++++------
 3 files changed, 23 insertions(+), 19 deletions(-)

Comments

Jozsef Kadlecsik July 16, 2018, 7:08 p.m. | #1
Hi Stefano,

On Sat, 14 Jul 2018, Stefano Brivio wrote:

> Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash
> when flush/dump set in parallel") postponed decreasing set
> reference counters to the RCU callback.
> 
> An 'ipset del' command can terminate before the RCU grace period
> is elapsed, and if sets are listed before then, the reference
> counter shown in userspace will be wrong:
> 
>  # ipset create h hash:ip; ipset create l list:set; ipset add l
>  # ipset del l h; ipset list h
>  Name: h
>  Type: hash:ip
>  Revision: 4
>  Header: family inet hashsize 1024 maxelem 65536
>  Size in memory: 88
>  References: 1
>  Number of entries: 0
>  Members:
>  # sleep 1; ipset list h
>  Name: h
>  Type: hash:ip
>  Revision: 4
>  Header: family inet hashsize 1024 maxelem 65536
>  Size in memory: 88
>  References: 0
>  Number of entries: 0
>  Members:
> 
> Fix this by making the reference count update synchronous again.
> 
> As a result, when sets are listed, ip_set_name_byindex() might
> now fetch a set whose reference count is already zero. Instead
> of relying on the reference count to protect against concurrent
> set renaming, grab ip_set_ref_lock as reader and copy the name,
> while holding the same lock in ip_set_rename() as writer
> instead.

Thanks, patch is applied in the ipset git tree.

Best regards,
Jozsef

> Reported-by: Li Shuang <shuali@redhat.com>
> Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel")
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
> ---
> v3:
>   - As suggested by Jozsef, don't hold ref_netlink in
>     ip_set_name_byindex(), grabbing ip_set_ref_lock as reader is
>     enough, if we hold ip_set_ref_lock as writer in ip_set_rename()
>   - Don't add __ip_set_get_netlink() back, we don't need it anymore
>   - Also as pointed out by Jozsef, in list_set_del() and
>     list_set_replace(), we should decrease the reference counter only
>     after we're done removing the set from or swapping it in the list
> 
> v2: As pointed out by Jozsef, we can't assume that the nfnl mutex
>     is taken while dumping, so we need some explicit locking
>     to protect names of list:set members against changes
> 
>  include/linux/netfilter/ipset/ip_set.h |  2 +-
>  net/netfilter/ipset/ip_set_core.c      | 23 +++++++++++------------
>  net/netfilter/ipset/ip_set_list_set.c  | 17 +++++++++++------
>  3 files changed, 23 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
> index 34fc80f3eb90..1d100efe74ec 100644
> --- a/include/linux/netfilter/ipset/ip_set.h
> +++ b/include/linux/netfilter/ipset/ip_set.h
> @@ -314,7 +314,7 @@ enum {
>  extern ip_set_id_t ip_set_get_byname(struct net *net,
>  				     const char *name, struct ip_set **set);
>  extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
> -extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
> +extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name);
>  extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
>  extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
>  
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index bc4bd247bb7d..fa15a831aeee 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
>  EXPORT_SYMBOL_GPL(ip_set_put_byindex);
>  
>  /* Get the name of a set behind a set index.
> - * We assume the set is referenced, so it does exist and
> - * can't be destroyed. The set cannot be renamed due to
> - * the referencing either.
> - *
> + * Set itself is protected by RCU, but its name isn't: to protect against
> + * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the
> + * name.
>   */
> -const char *
> -ip_set_name_byindex(struct net *net, ip_set_id_t index)
> +void
> +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
>  {
> -	const struct ip_set *set = ip_set_rcu_get(net, index);
> +	struct ip_set *set = ip_set_rcu_get(net, index);
>  
>  	BUG_ON(!set);
> -	BUG_ON(set->ref == 0);
>  
> -	/* Referenced, so it's safe */
> -	return set->name;
> +	read_lock_bh(&ip_set_ref_lock);
> +	strncpy(name, set->name, IPSET_MAXNAMELEN);
> +	read_unlock_bh(&ip_set_ref_lock);
>  }
>  EXPORT_SYMBOL_GPL(ip_set_name_byindex);
>  
> @@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
>  	if (!set)
>  		return -ENOENT;
>  
> -	read_lock_bh(&ip_set_ref_lock);
> +	write_lock_bh(&ip_set_ref_lock);
>  	if (set->ref != 0) {
>  		ret = -IPSET_ERR_REFERENCED;
>  		goto out;
> @@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
>  	strncpy(set->name, name2, IPSET_MAXNAMELEN);
>  
>  out:
> -	read_unlock_bh(&ip_set_ref_lock);
> +	write_unlock_bh(&ip_set_ref_lock);
>  	return ret;
>  }
>  
> diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
> index 072a658fde04..4eef55da0878 100644
> --- a/net/netfilter/ipset/ip_set_list_set.c
> +++ b/net/netfilter/ipset/ip_set_list_set.c
> @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
>  {
>  	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
>  	struct ip_set *set = e->set;
> -	struct list_set *map = set->data;
>  
> -	ip_set_put_byindex(map->net, e->id);
>  	ip_set_ext_destroy(set, e);
>  	kfree(e);
>  }
> @@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu)
>  static inline void
>  list_set_del(struct ip_set *set, struct set_elem *e)
>  {
> +	struct list_set *map = set->data;
> +
>  	set->elements--;
>  	list_del_rcu(&e->list);
> +	ip_set_put_byindex(map->net, e->id);
>  	call_rcu(&e->rcu, __list_set_del_rcu);
>  }
>  
>  static inline void
> -list_set_replace(struct set_elem *e, struct set_elem *old)
> +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
>  {
> +	struct list_set *map = set->data;
> +
>  	list_replace_rcu(&old->list, &e->list);
> +	ip_set_put_byindex(map->net, old->id);
>  	call_rcu(&old->rcu, __list_set_del_rcu);
>  }
>  
> @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
>  	INIT_LIST_HEAD(&e->list);
>  	list_set_init_extensions(set, ext, e);
>  	if (n)
> -		list_set_replace(e, n);
> +		list_set_replace(set, e, n);
>  	else if (next)
>  		list_add_tail_rcu(&e->list, &next->list);
>  	else if (prev)
> @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set,
>  	const struct list_set *map = set->data;
>  	struct nlattr *atd, *nested;
>  	u32 i = 0, first = cb->args[IPSET_CB_ARG0];
> +	char name[IPSET_MAXNAMELEN];
>  	struct set_elem *e;
>  	int ret = 0;
>  
> @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set,
>  		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
>  		if (!nested)
>  			goto nla_put_failure;
> -		if (nla_put_string(skb, IPSET_ATTR_NAME,
> -				   ip_set_name_byindex(map->net, e->id)))
> +		ip_set_name_byindex(map->net, e->id, name);
> +		if (nla_put_string(skb, IPSET_ATTR_NAME, name))
>  			goto nla_put_failure;
>  		if (ip_set_put_extensions(skb, set, e, true))
>  			goto nla_put_failure;
> -- 
> 2.15.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefano Brivio July 17, 2018, 3:32 p.m. | #2
On Mon, 16 Jul 2018 21:08:55 +0200 (CEST)
Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> wrote:

> Thanks, patch is applied in the ipset git tree.

Hi Jozsef,

thanks!

Sorry for the dumb question, I'm not really used to the ipset git
workflow: I've seen it's in the kernel/ directory of your netfilter.org
repository. Will this go into the nf git tree next?
Jozsef Kadlecsik July 18, 2018, 7:57 a.m. | #3
On Tue, 17 Jul 2018, Stefano Brivio wrote:

> > Thanks, patch is applied in the ipset git tree.
> 
> Sorry for the dumb question, I'm not really used to the ipset git 
> workflow: I've seen it's in the kernel/ directory of your netfilter.org 
> repository. Will this go into the nf git tree next?

The usual workflow is that I collect several ipset patches and then send 
the batch for the nf or nf-next tree, depending on the content of the 
patches. So yes, it'll go into nf.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 34fc80f3eb90..1d100efe74ec 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -314,7 +314,7 @@  enum {
 extern ip_set_id_t ip_set_get_byname(struct net *net,
 				     const char *name, struct ip_set **set);
 extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
-extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
+extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name);
 extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
 extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
 
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index bc4bd247bb7d..fa15a831aeee 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -693,21 +693,20 @@  ip_set_put_byindex(struct net *net, ip_set_id_t index)
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
 /* Get the name of a set behind a set index.
- * We assume the set is referenced, so it does exist and
- * can't be destroyed. The set cannot be renamed due to
- * the referencing either.
- *
+ * Set itself is protected by RCU, but its name isn't: to protect against
+ * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the
+ * name.
  */
-const char *
-ip_set_name_byindex(struct net *net, ip_set_id_t index)
+void
+ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
 {
-	const struct ip_set *set = ip_set_rcu_get(net, index);
+	struct ip_set *set = ip_set_rcu_get(net, index);
 
 	BUG_ON(!set);
-	BUG_ON(set->ref == 0);
 
-	/* Referenced, so it's safe */
-	return set->name;
+	read_lock_bh(&ip_set_ref_lock);
+	strncpy(name, set->name, IPSET_MAXNAMELEN);
+	read_unlock_bh(&ip_set_ref_lock);
 }
 EXPORT_SYMBOL_GPL(ip_set_name_byindex);
 
@@ -1153,7 +1152,7 @@  static int ip_set_rename(struct net *net, struct sock *ctnl,
 	if (!set)
 		return -ENOENT;
 
-	read_lock_bh(&ip_set_ref_lock);
+	write_lock_bh(&ip_set_ref_lock);
 	if (set->ref != 0) {
 		ret = -IPSET_ERR_REFERENCED;
 		goto out;
@@ -1170,7 +1169,7 @@  static int ip_set_rename(struct net *net, struct sock *ctnl,
 	strncpy(set->name, name2, IPSET_MAXNAMELEN);
 
 out:
-	read_unlock_bh(&ip_set_ref_lock);
+	write_unlock_bh(&ip_set_ref_lock);
 	return ret;
 }
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 072a658fde04..4eef55da0878 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -148,9 +148,7 @@  __list_set_del_rcu(struct rcu_head * rcu)
 {
 	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
 	struct ip_set *set = e->set;
-	struct list_set *map = set->data;
 
-	ip_set_put_byindex(map->net, e->id);
 	ip_set_ext_destroy(set, e);
 	kfree(e);
 }
@@ -158,15 +156,21 @@  __list_set_del_rcu(struct rcu_head * rcu)
 static inline void
 list_set_del(struct ip_set *set, struct set_elem *e)
 {
+	struct list_set *map = set->data;
+
 	set->elements--;
 	list_del_rcu(&e->list);
+	ip_set_put_byindex(map->net, e->id);
 	call_rcu(&e->rcu, __list_set_del_rcu);
 }
 
 static inline void
-list_set_replace(struct set_elem *e, struct set_elem *old)
+list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
 {
+	struct list_set *map = set->data;
+
 	list_replace_rcu(&old->list, &e->list);
+	ip_set_put_byindex(map->net, old->id);
 	call_rcu(&old->rcu, __list_set_del_rcu);
 }
 
@@ -298,7 +302,7 @@  list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	INIT_LIST_HEAD(&e->list);
 	list_set_init_extensions(set, ext, e);
 	if (n)
-		list_set_replace(e, n);
+		list_set_replace(set, e, n);
 	else if (next)
 		list_add_tail_rcu(&e->list, &next->list);
 	else if (prev)
@@ -486,6 +490,7 @@  list_set_list(const struct ip_set *set,
 	const struct list_set *map = set->data;
 	struct nlattr *atd, *nested;
 	u32 i = 0, first = cb->args[IPSET_CB_ARG0];
+	char name[IPSET_MAXNAMELEN];
 	struct set_elem *e;
 	int ret = 0;
 
@@ -504,8 +509,8 @@  list_set_list(const struct ip_set *set,
 		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 		if (!nested)
 			goto nla_put_failure;
-		if (nla_put_string(skb, IPSET_ATTR_NAME,
-				   ip_set_name_byindex(map->net, e->id)))
+		ip_set_name_byindex(map->net, e->id, name);
+		if (nla_put_string(skb, IPSET_ATTR_NAME, name))
 			goto nla_put_failure;
 		if (ip_set_put_extensions(skb, set, e, true))
 			goto nla_put_failure;