[RFC,net-next-2.6] udp: Don't use lock_sock()/release_sock() in rx path

Message ID 4AD4F858.2040200@gmail.com
State RFC, archived
Delegated to: David Miller

Commit Message

Eric Dumazet Oct. 13, 2009, 9:59 p.m. UTC
Eric Dumazet wrote:
> Sure, will do, but first I want to remove the lock_sock()/release_sock() in the
> rx path, which was added for the sk_forward_alloc accounting. This really hurts,
> because of the backlog handling.
> 
> I have a preliminary patch that restores the UDP latencies we had in the past ;)
> 
> The trick is that for UDP, sk_forward_alloc is not updated by both tx and rx, only by rx.
> So we can use the sk_receive_queue.lock to forbid concurrent updates.
> 
> As this lock is already hot and only used by rx, we won't have to
> dirty the sk_lock, which will only be used by the tx path.
> 
> Then we can carefully reorder struct sock to lower the number of cache lines
> needed for each path.
> 

[RFC net-next-2.6] udp: Don't use lock_sock()/release_sock() in rx path

Two years ago we added memory accounting for UDP, and some people complain about
increased latencies, especially on multicast.

Because sk_forward_alloc is not atomic, we duplicated the protection we use for TCP,
i.e. lock_sock()/release_sock(), to guard sk_forward_alloc against concurrent updates.

When a frame is received by the NIC, the softirq handler has to lock the socket, and may
have to queue the frame into the backlog instead of the receive queue.

Then the user application also has to lock the socket to dequeue a frame from the receive
queue, and possibly process the backlog queue, leading to high latencies.
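
For reference, here is a condensed sketch of the rx pattern this patch removes (the real
code is visible in the udp_queue_rcv_skb and __udp6_lib_rcv hunks below); the function
name is made up for illustration:

/*
 * Pre-patch rx pattern (simplified): the softirq side must take the
 * socket lock, and falls back to the backlog queue whenever the user
 * currently owns the socket. The backlog is then drained by the user
 * context in release_sock(), which is where the extra latency comes from.
 */
static int udp_rx_old_sketch(struct sock *sk, struct sk_buff *skb)
{
	int rc = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		rc = __udp_queue_rcv_skb(sk, skb);	/* straight to receive queue */
	else
		sk_add_backlog(sk, skb);		/* deferred to release_sock() */
	bh_unlock_sock(sk);

	return rc;
}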

The socket lock is also used in the tx path to guard the cork structure against concurrent updates.

This causes false sharing of the socket lock between several CPUs that could be avoided,
considering UDP touches sk_forward_alloc only on rx. (TCP uses it for both rx and tx.)

Instead of using the socket lock, we can use the sk_receive_queue.lock that we have to
take anyway to queue a frame at softirq time (or dequeue it in the user application).

This avoids two atomic ops per packet in the softirq handler and one in the user
application doing recvmsg(), and we don't touch the backlog anymore.
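
A minimal sketch of the new queueing scheme, assuming the helpers shown in the sock.c
hunk below (error stats and the data-ready wakeup are omitted here):

/*
 * New scheme (simplified): the memory accounting done by
 * sk_rmem_schedule(), which updates sk_forward_alloc, now happens under
 * sk_receive_queue.lock -- a lock we must take anyway to queue the skb.
 * No socket lock, no backlog.
 */
static int udp_rx_new_sketch(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;
	unsigned long flags;
	int err = 0;

	skb_orphan(skb);
	spin_lock_irqsave(&list->lock, flags);
	if (!sk_rmem_schedule(sk, skb->truesize)) {
		err = -ENOBUFS;			/* charge refused */
	} else {
		skb_set_owner_r(skb, sk);	/* account rmem against this socket */
		__skb_queue_tail(list, skb);
	}
	spin_unlock_irqrestore(&list->lock, flags);
	/* the sk_data_ready() wakeup happens after the unlock, as before */
	return err;
}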

This way the socket lock is only used in the tx path, as in the past, and we can improve
things further by reordering struct sock fields into separate rx/tx groups in a followup patch.
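
Purely as an illustration of that followup idea (this is not the actual struct sock
layout, just a hypothetical shape), the rx and tx fields would end up grouped on
separate cache lines:

/*
 * Hypothetical example only -- field names and layout are made up to
 * show the rx/tx grouping idea, not the real struct sock.
 */
struct sock_layout_example {
	/* rx group: touched by softirq and recvmsg() */
	struct sk_buff_head	rx_queue;
	int			rx_forward_alloc;

	/* tx group: touched by sendmsg(), starts on its own cache line */
	socket_lock_t		tx_lock ____cacheline_aligned_in_smp;
	struct sk_buff_head	tx_queue;
	int			tx_sndbuf;
};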

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 core/sock.c |    6 +++++-
 ipv4/udp.c  |   34 ++++++++--------------------------
 ipv6/udp.c  |   36 ++++++++++--------------------------
 3 files changed, 23 insertions(+), 53 deletions(-)

Patch

diff --git a/net/core/sock.c b/net/core/sock.c
index 43ca2c9..d06b1a0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -292,8 +292,13 @@  int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		goto out;
 
+	skb_orphan(skb);
+
+	spin_lock_irqsave(&list->lock, flags);
+
 	if (!sk_rmem_schedule(sk, skb->truesize)) {
 		err = -ENOBUFS;
+		spin_unlock_irqrestore(&list->lock, flags);
 		goto out;
 	}
 
@@ -307,7 +312,6 @@  int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
-	spin_lock_irqsave(&list->lock, flags);
 	skb->dropcount = atomic_read(&sk->sk_drops);
 	__skb_queue_tail(list, skb);
 	spin_unlock_irqrestore(&list->lock, flags);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ee61b3f..f62cec3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -855,29 +855,21 @@  out:
  */
 static unsigned int first_packet_length(struct sock *sk)
 {
-	struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+	struct sk_buff_head *rcvq = &sk->sk_receive_queue;
 	struct sk_buff *skb;
 	unsigned int res;
 
-	__skb_queue_head_init(&list_kill);
-
 	spin_lock_bh(&rcvq->lock);
 	while ((skb = skb_peek(rcvq)) != NULL &&
 		udp_lib_checksum_complete(skb)) {
 		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
 				 IS_UDPLITE(sk));
 		__skb_unlink(skb, rcvq);
-		__skb_queue_tail(&list_kill, skb);
+		skb_kill_datagram(sk, skb, 0);
 	}
 	res = skb ? skb->len : 0;
 	spin_unlock_bh(&rcvq->lock);
 
-	if (!skb_queue_empty(&list_kill)) {
-		lock_sock(sk);
-		__skb_queue_purge(&list_kill);
-		sk_mem_reclaim_partial(sk);
-		release_sock(sk);
-	}
 	return res;
 }
 
@@ -1003,17 +995,17 @@  try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (!skb_kill_datagram(sk, skb, flags))
-		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	release_sock(sk);
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (noblock)
 		return -EAGAIN;
@@ -1095,7 +1087,6 @@  drop:
 int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-	int rc;
 	int is_udplite = IS_UDPLITE(sk);
 
 	/*
@@ -1175,16 +1166,7 @@  int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 	}
 
-	rc = 0;
-
-	bh_lock_sock(sk);
-	if (!sock_owned_by_user(sk))
-		rc = __udp_queue_rcv_skb(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
-
-	return rc;
+	return __udp_queue_rcv_skb(sk, skb);
 
 drop:
 	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1f8e2af..07468aa 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -288,23 +288,23 @@  try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (!skb_kill_datagram(sk, skb, flags)) {
 		if (is_udp4)
-			UDP_INC_STATS_USER(sock_net(sk),
+			UDP_INC_STATS_BH(sock_net(sk),
 					UDP_MIB_INERRORS, is_udplite);
 		else
-			UDP6_INC_STATS_USER(sock_net(sk),
+			UDP6_INC_STATS_BH(sock_net(sk),
 					UDP_MIB_INERRORS, is_udplite);
 	}
-	release_sock(sk);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (flags & MSG_DONTWAIT)
 		return -EAGAIN;
@@ -468,21 +468,10 @@  static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	while ((sk2 = udp_v6_mcast_next(net, sk_nulls_next(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
-		if (buff) {
-			bh_lock_sock(sk2);
-			if (!sock_owned_by_user(sk2))
-				udpv6_queue_rcv_skb(sk2, buff);
-			else
-				sk_add_backlog(sk2, buff);
-			bh_unlock_sock(sk2);
-		}
+		if (buff)
+			udpv6_queue_rcv_skb(sk2, buff);
 	}
-	bh_lock_sock(sk);
-	if (!sock_owned_by_user(sk))
-		udpv6_queue_rcv_skb(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+	udpv6_queue_rcv_skb(sk, skb);
 out:
 	spin_unlock(&hslot->lock);
 	return 0;
@@ -597,12 +586,7 @@  int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 
 	/* deliver */
 
-	bh_lock_sock(sk);
-	if (!sock_owned_by_user(sk))
-		udpv6_queue_rcv_skb(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
+	udpv6_queue_rcv_skb(sk, skb);
 	sock_put(sk);
 	return 0;