Patchwork netlink: convert DIY reader/writer to mutex and RCU (v2)

login
register
mail settings
Submitter stephen hemminger
Date March 8, 2010, 10 p.m.
Message ID <20100308140021.234a120a@nehalam>
Download mbox | patch
Permalink /patch/47133/
State Changes Requested
Delegated to: David Miller
Headers show

Comments

stephen hemminger - March 8, 2010, 10 p.m.
The netlink table locking was open coded version of reader/writer
sleeping lock.  Change to using mutex and RCU which makes
code clearer, shorter, and simpler.

Could use sk_list nulls but then would have to have kmem_cache
for netlink handles and that seems like unnecessary bloat.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v1 -> v2 do RCU correctly...
  * use spinlock not mutex (not safe to sleep in normal RCU)
  * use _rcu variants of add/delete


 include/linux/netlink.h  |    5 -
 include/net/sock.h       |   12 +++
 net/netlink/af_netlink.c |  161 +++++++++++++++--------------------------------
 net/netlink/genetlink.c  |    9 --
 4 files changed, 66 insertions(+), 121 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

--- a/net/netlink/af_netlink.c	2010-03-08 09:07:15.104547875 -0800
+++ b/net/netlink/af_netlink.c	2010-03-08 13:43:46.026097658 -0800
@@ -128,15 +128,11 @@  struct netlink_table {
 };
 
 static struct netlink_table *nl_table;
-
-static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
+static DEFINE_SPINLOCK(nltable_lock);
 
 static int netlink_dump(struct sock *sk);
 static void netlink_destroy_callback(struct netlink_callback *cb);
 
-static DEFINE_RWLOCK(nl_table_lock);
-static atomic_t nl_table_users = ATOMIC_INIT(0);
-
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static u32 netlink_group_mask(u32 group)
@@ -171,61 +167,6 @@  static void netlink_sock_destruct(struct
 	WARN_ON(nlk_sk(sk)->groups);
 }
 
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
- * SMP. Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines.
- */
-
-void netlink_table_grab(void)
-	__acquires(nl_table_lock)
-{
-	might_sleep();
-
-	write_lock_irq(&nl_table_lock);
-
-	if (atomic_read(&nl_table_users)) {
-		DECLARE_WAITQUEUE(wait, current);
-
-		add_wait_queue_exclusive(&nl_table_wait, &wait);
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (atomic_read(&nl_table_users) == 0)
-				break;
-			write_unlock_irq(&nl_table_lock);
-			schedule();
-			write_lock_irq(&nl_table_lock);
-		}
-
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&nl_table_wait, &wait);
-	}
-}
-
-void netlink_table_ungrab(void)
-	__releases(nl_table_lock)
-{
-	write_unlock_irq(&nl_table_lock);
-	wake_up(&nl_table_wait);
-}
-
-static inline void
-netlink_lock_table(void)
-{
-	/* read_lock() synchronizes us to netlink_table_grab */
-
-	read_lock(&nl_table_lock);
-	atomic_inc(&nl_table_users);
-	read_unlock(&nl_table_lock);
-}
-
-static inline void
-netlink_unlock_table(void)
-{
-	if (atomic_dec_and_test(&nl_table_users))
-		wake_up(&nl_table_wait);
-}
-
 static inline struct sock *netlink_lookup(struct net *net, int protocol,
 					  u32 pid)
 {
@@ -234,9 +175,9 @@  static inline struct sock *netlink_looku
 	struct sock *sk;
 	struct hlist_node *node;
 
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 	head = nl_pid_hashfn(hash, pid);
-	sk_for_each(sk, node, head) {
+	sk_for_each_rcu(sk, node, head) {
 		if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) {
 			sock_hold(sk);
 			goto found;
@@ -244,7 +185,7 @@  static inline struct sock *netlink_looku
 	}
 	sk = NULL;
 found:
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 	return sk;
 }
 
@@ -299,7 +240,8 @@  static int nl_pid_hash_rehash(struct nl_
 		struct hlist_node *node, *tmp;
 
 		sk_for_each_safe(sk, node, tmp, &otable[i])
-			__sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid));
+			__sk_add_node_rcu(sk,
+					  nl_pid_hashfn(hash, nlk_sk(sk)->pid));
 	}
 
 	nl_pid_hash_free(otable, osize);
@@ -353,7 +295,7 @@  static int netlink_insert(struct sock *s
 	struct hlist_node *node;
 	int len;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	head = nl_pid_hashfn(hash, pid);
 	len = 0;
 	sk_for_each(osk, node, head) {
@@ -376,22 +318,22 @@  static int netlink_insert(struct sock *s
 		head = nl_pid_hashfn(hash, pid);
 	hash->entries++;
 	nlk_sk(sk)->pid = pid;
-	sk_add_node(sk, head);
+	sk_add_node_rcu(sk, head);
 	err = 0;
 
 err:
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 	return err;
 }
 
 static void netlink_remove(struct sock *sk)
 {
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	if (sk_del_node_init(sk))
 		nl_table[sk->sk_protocol].hash.entries--;
 	if (nlk_sk(sk)->subscriptions)
-		__sk_del_bind_node(sk);
-	netlink_table_ungrab();
+		__sk_del_bind_node_rcu(sk);
+	spin_unlock(&nltable_lock);
 }
 
 static struct proto netlink_proto = {
@@ -444,12 +386,14 @@  static int netlink_create(struct net *ne
 	if (protocol < 0 || protocol >= MAX_LINKS)
 		return -EPROTONOSUPPORT;
 
-	netlink_lock_table();
+	spin_lock(&nltable_lock);
 #ifdef CONFIG_MODULES
 	if (!nl_table[protocol].registered) {
-		netlink_unlock_table();
+		spin_unlock(&nltable_lock);
+
 		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
-		netlink_lock_table();
+
+		spin_lock(&nltable_lock);
 	}
 #endif
 	if (nl_table[protocol].registered &&
@@ -458,7 +402,7 @@  static int netlink_create(struct net *ne
 	else
 		err = -EPROTONOSUPPORT;
 	cb_mutex = nl_table[protocol].cb_mutex;
-	netlink_unlock_table();
+	spin_unlock(&nltable_lock);
 
 	if (err < 0)
 		goto out;
@@ -515,7 +459,7 @@  static int netlink_release(struct socket
 
 	module_put(nlk->module);
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	if (netlink_is_kernel(sk)) {
 		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
 		if (--nl_table[sk->sk_protocol].registered == 0) {
@@ -525,7 +469,7 @@  static int netlink_release(struct socket
 		}
 	} else if (nlk->subscriptions)
 		netlink_update_listeners(sk);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	kfree(nlk->groups);
 	nlk->groups = NULL;
@@ -533,6 +477,8 @@  static int netlink_release(struct socket
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
 	local_bh_enable();
+
+	synchronize_rcu();
 	sock_put(sk);
 	return 0;
 }
@@ -551,7 +497,7 @@  static int netlink_autobind(struct socke
 
 retry:
 	cond_resched();
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	head = nl_pid_hashfn(hash, pid);
 	sk_for_each(osk, node, head) {
 		if (!net_eq(sock_net(osk), net))
@@ -561,11 +507,11 @@  retry:
 			pid = rover--;
 			if (rover > -4097)
 				rover = -4097;
-			netlink_table_ungrab();
+			spin_unlock(&nltable_lock);
 			goto retry;
 		}
 	}
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	err = netlink_insert(sk, net, pid);
 	if (err == -EADDRINUSE)
@@ -590,9 +536,9 @@  netlink_update_subscriptions(struct sock
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (nlk->subscriptions && !subscriptions)
-		__sk_del_bind_node(sk);
+		__sk_del_bind_node_rcu(sk);
 	else if (!nlk->subscriptions && subscriptions)
-		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+		sk_add_bind_node_rcu(sk, &nl_table[sk->sk_protocol].mc_list);
 	nlk->subscriptions = subscriptions;
 }
 
@@ -603,7 +549,7 @@  static int netlink_realloc_groups(struct
 	unsigned long *new_groups;
 	int err = 0;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 
 	groups = nl_table[sk->sk_protocol].groups;
 	if (!nl_table[sk->sk_protocol].registered) {
@@ -625,7 +571,7 @@  static int netlink_realloc_groups(struct
 	nlk->groups = new_groups;
 	nlk->ngroups = groups;
  out_unlock:
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 	return err;
 }
 
@@ -664,13 +610,13 @@  static int netlink_bind(struct socket *s
 	if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
 		return 0;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	netlink_update_subscriptions(sk, nlk->subscriptions +
 					 hweight32(nladdr->nl_groups) -
 					 hweight32(nlk->groups[0]));
 	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
 	netlink_update_listeners(sk);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	return 0;
 }
@@ -1059,15 +1005,12 @@  int netlink_broadcast(struct sock *ssk, 
 
 	/* While we sleep in clone, do not allow to change socket list */
 
-	netlink_lock_table();
-
-	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+	rcu_read_lock();
+	sk_for_each_bound_rcu(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_broadcast(sk, &info);
+	rcu_read_unlock();
 
 	kfree_skb(skb);
-
-	netlink_unlock_table();
-
 	kfree_skb(info.skb2);
 
 	if (info.delivery_failure)
@@ -1129,12 +1072,12 @@  void netlink_set_err(struct sock *ssk, u
 	/* sk->sk_err wants a positive error value */
 	info.code = -code;
 
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 
-	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+	sk_for_each_bound_rcu(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_set_err(sk, &info);
 
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(netlink_set_err);
 
@@ -1187,10 +1130,10 @@  static int netlink_setsockopt(struct soc
 			return err;
 		if (!val || val - 1 >= nlk->ngroups)
 			return -EINVAL;
-		netlink_table_grab();
+		spin_lock(&nltable_lock);
 		netlink_update_socket_mc(nlk, val,
 					 optname == NETLINK_ADD_MEMBERSHIP);
-		netlink_table_ungrab();
+		spin_unlock(&nltable_lock);
 		err = 0;
 		break;
 	}
@@ -1515,7 +1458,7 @@  netlink_kernel_create(struct net *net, i
 	nlk = nlk_sk(sk);
 	nlk->flags |= NETLINK_KERNEL_SOCKET;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	if (!nl_table[unit].registered) {
 		nl_table[unit].groups = groups;
 		nl_table[unit].listeners = listeners;
@@ -1526,7 +1469,7 @@  netlink_kernel_create(struct net *net, i
 		kfree(listeners);
 		nl_table[unit].registered++;
 	}
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 	return sk;
 
 out_sock_release:
@@ -1557,7 +1500,7 @@  static void netlink_free_old_listeners(s
 	kfree(lrh->ptr);
 }
 
-int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
+static int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
 {
 	unsigned long *listeners, *old = NULL;
 	struct listeners_rcu_head *old_rcu_head;
@@ -1608,14 +1551,14 @@  int netlink_change_ngroups(struct sock *
 {
 	int err;
 
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	err = __netlink_change_ngroups(sk, groups);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 
 	return err;
 }
 
-void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+static void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
 	struct sock *sk;
 	struct hlist_node *node;
@@ -1635,9 +1578,9 @@  void __netlink_clear_multicast_users(str
  */
 void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
 {
-	netlink_table_grab();
+	spin_lock(&nltable_lock);
 	__netlink_clear_multicast_users(ksk, group);
-	netlink_table_ungrab();
+	spin_unlock(&nltable_lock);
 }
 
 void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1902,7 +1845,7 @@  static struct sock *netlink_seq_socket_i
 		struct nl_pid_hash *hash = &nl_table[i].hash;
 
 		for (j = 0; j <= hash->mask; j++) {
-			sk_for_each(s, node, &hash->table[j]) {
+			sk_for_each_rcu(s, node, &hash->table[j]) {
 				if (sock_net(s) != seq_file_net(seq))
 					continue;
 				if (off == pos) {
@@ -1918,9 +1861,9 @@  static struct sock *netlink_seq_socket_i
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(nl_table_lock)
+	__acquires(RCU)
 {
-	read_lock(&nl_table_lock);
+	rcu_read_lock();
 	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
@@ -1967,9 +1910,9 @@  static void *netlink_seq_next(struct seq
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
-	__releases(nl_table_lock)
+	__releases(RCU)
 {
-	read_unlock(&nl_table_lock);
+	rcu_read_unlock();
 }
 
 
--- a/include/net/sock.h	2010-03-08 10:52:34.113924084 -0800
+++ b/include/net/sock.h	2010-03-08 13:54:27.815939404 -0800
@@ -451,6 +451,10 @@  static __inline__ void __sk_add_node(str
 {
 	hlist_add_head(&sk->sk_node, list);
 }
+static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+{
+	hlist_add_head_rcu(&sk->sk_node, list);
+}
 
 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 {
@@ -479,12 +483,18 @@  static __inline__ void __sk_del_bind_nod
 {
 	__hlist_del(&sk->sk_bind_node);
 }
+#define __sk_del_bind_node_rcu(sk) __sk_del_bind_node(sk)
 
 static __inline__ void sk_add_bind_node(struct sock *sk,
 					struct hlist_head *list)
 {
 	hlist_add_head(&sk->sk_bind_node, list);
 }
+static __inline__ void sk_add_bind_node_rcu(struct sock *sk,
+					struct hlist_head *list)
+{
+	hlist_add_head_rcu(&sk->sk_bind_node, list);
+}
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
@@ -507,6 +517,8 @@  static __inline__ void sk_add_bind_node(
 	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_bind_node)
+#define sk_for_each_bound_rcu(__sk, node, list) \
+	hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
 
 /* Sock flags */
 enum sock_flags {
--- a/include/linux/netlink.h	2010-03-08 10:56:35.314235687 -0800
+++ b/include/linux/netlink.h	2010-03-08 11:04:33.414547616 -0800
@@ -170,18 +170,13 @@  struct netlink_skb_parms {
 #define NETLINK_CREDS(skb)	(&NETLINK_CB((skb)).creds)
 
 
-extern void netlink_table_grab(void);
-extern void netlink_table_ungrab(void);
-
 extern struct sock *netlink_kernel_create(struct net *net,
 					  int unit,unsigned int groups,
 					  void (*input)(struct sk_buff *skb),
 					  struct mutex *cb_mutex,
 					  struct module *module);
 extern void netlink_kernel_release(struct sock *sk);
-extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
-extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
--- a/net/netlink/genetlink.c	2010-03-08 10:56:02.194547526 -0800
+++ b/net/netlink/genetlink.c	2010-03-08 11:01:05.865177057 -0800
@@ -168,10 +168,9 @@  int genl_register_mc_group(struct genl_f
 	if (family->netnsok) {
 		struct net *net;
 
-		netlink_table_grab();
 		rcu_read_lock();
 		for_each_net_rcu(net) {
-			err = __netlink_change_ngroups(net->genl_sock,
+			err = netlink_change_ngroups(net->genl_sock,
 					mc_groups_longs * BITS_PER_LONG);
 			if (err) {
 				/*
@@ -181,12 +180,10 @@  int genl_register_mc_group(struct genl_f
 				 * increased on some sockets which is ok.
 				 */
 				rcu_read_unlock();
-				netlink_table_ungrab();
 				goto out;
 			}
 		}
 		rcu_read_unlock();
-		netlink_table_ungrab();
 	} else {
 		err = netlink_change_ngroups(init_net.genl_sock,
 					     mc_groups_longs * BITS_PER_LONG);
@@ -212,12 +209,10 @@  static void __genl_unregister_mc_group(s
 	struct net *net;
 	BUG_ON(grp->family != family);
 
-	netlink_table_grab();
 	rcu_read_lock();
 	for_each_net_rcu(net)
-		__netlink_clear_multicast_users(net->genl_sock, grp->id);
+		netlink_clear_multicast_users(net->genl_sock, grp->id);
 	rcu_read_unlock();
-	netlink_table_ungrab();
 
 	clear_bit(grp->id, mc_groups);
 	list_del(&grp->list);