
[5/5] tcp: ipv4 listen state scaled

Message ID AANLkTikRsOevLBHn0xb0S_YvfPMWpAdw373bxQUc+xbV@mail.gmail.com
State Rejected, archived
Delegated to: David Miller

Commit Message

Dmitry Popov Oct. 27, 2010, 1:32 p.m. UTC
From: Dmitry Popov <dp@highloadlab.com>

Add a fast path for TCP_LISTEN state processing.

tcp_v4_rcv_listen() is called from tcp_v4_rcv() without the socket lock.
However, it may take the main socket lock in three cases:
1) To check syn_table in tcp_v4_hnd_req().
2) To check syn_table and modify the accept queue in tcp_v4_conn_request().
3) To modify the accept queue in get_cookie_sock().

In cases 1 and 2 we check for the user lock and add the skb to
sk_backlog if the socket is locked.
In case 3 we do not check for the user lock, which could lead to wrong
behavior; that is why socket locking is needed in tcp_set_state(sk,
TCP_CLOSE).

An additional state in sk->sk_lock.owned is needed to prevent an
infinite loop during backlog processing.
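
For reference, a minimal sketch of the sk->sk_lock.owned states this
patch relies on; it only restates the include/net/sock.h and
net/core/sock.c hunks below, as illustration:

	/*
	 * sk_lock.owned ==  0   lock not owned
	 * sk_lock.owned ==  1   owned by a user process: listen-path
	 *                       packets are queued to sk_backlog
	 * sk_lock.owned == -1   release_sock() is draining the backlog:
	 *                       packets are processed in place and never
	 *                       re-queued, so the backlog always drains
	 */
	#define sock_owned_by_user(sk)		((sk)->sk_lock.owned)
	#define sock_owned_by_backlog(sk)	((sk)->sk_lock.owned < 0)
	#define __sock_owned_by_user(sk)	((sk)->sk_lock.owned > 0)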

Signed-off-by: Dmitry Popov <dp@highloadlab.com>
---
 include/net/sock.h    |    6 ++-
 net/core/sock.c       |    4 +-
 net/ipv4/syncookies.c |   20 +++++-
 net/ipv4/tcp.c        |    5 ++
 net/ipv4/tcp_ipv4.c   |  159 +++++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 162 insertions(+), 32 deletions(-)
@@ -1353,6 +1370,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			syn_flood_warning(skb);
 		if (sysctl_tcp_syncookies) {
 			tcp_inc_syncookie_stats(&tp->syncookie_stats);
+			bh_unlock_sock(sk);
 			want_cookie = 1;
 		} else
 #else
@@ -1405,9 +1423,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		while (l-- > 0)
 			*c++ ^= *hash_location++;

-#ifdef CONFIG_SYN_COOKIES
-		want_cookie = 0;	/* not our kind of cookie */
-#endif
 		tmp_ext.cookie_out_never = 0; /* false */
 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
 		tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
@@ -1494,6 +1509,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_free;

 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+	bh_unlock_sock(sk);
 	return 0;

 drop_and_release:
@@ -1501,6 +1517,8 @@ drop_and_release:
 drop_and_free:
 	reqsk_free(req);
 drop:
+	if (!want_cookie)
+		bh_unlock_sock(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1588,10 +1606,35 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	struct sock *nsk;
 	struct request_sock **prev;
 	/* Find possible connection requests. */
-	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+	struct request_sock *req;
+
+	bh_lock_sock_nested(sk);
+
+	if (__sock_owned_by_user(sk)) {
+		if (likely(!sk_add_backlog(sk, skb)))
+			skb_get(skb);
+		else
+			NET_INC_STATS_BH(dev_net(skb->dev),
+					 LINUX_MIB_TCPBACKLOGDROP);
+		bh_unlock_sock(sk);
+		return NULL;
+	}
+
+	if (inet_csk(sk)->icsk_accept_queue.listen_opt == NULL) {
+		/* socket is closing */
+		bh_unlock_sock(sk);
+		return NULL;
+	}
+
+	req = inet_csk_search_req(sk, &prev, th->source,
 						       iph->saddr, iph->daddr);
-	if (req)
-		return tcp_check_req(sk, skb, req, prev);
+	if (req) {
+		nsk = tcp_check_req(sk, skb, req, prev);
+		bh_unlock_sock(sk);
+		return nsk;
+	} else {
+		bh_unlock_sock(sk);
+	}

 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
 			th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1633,6 +1676,72 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
 	return 0;
 }

+/* Beware! This may be called without socket lock.
+ * TCP Checksum should be checked before this call.
+ */
+int tcp_v4_rcv_listen(struct sock *sk, struct sk_buff *skb)
+{
+	struct sock *nsk;
+	struct sock *rsk;
+	struct tcphdr *th = tcp_hdr(skb);
+
+	nsk = tcp_v4_hnd_req(sk, skb);
+
+	if (!nsk)
+		goto discard;
+
+	if (nsk != sk) {
+		/* Probable SYN-ACK */
+		if (tcp_child_process(sk, nsk, skb)) {
+			rsk = nsk;
+			goto reset;
+		}
+		return 0;
+	}
+
+	/* Probable SYN */
+	TCP_CHECK_TIMER(sk);
+
+	if (th->ack) {
+		rsk = sk;
+		goto reset;
+	}
+
+	if (!th->rst && th->syn) {
+		if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) {
+			rsk = sk;
+			goto reset;
+		}
+		/* Now we have several options: In theory there is
+		 * nothing else in the frame. KA9Q has an option to
+		 * send data with the syn, BSD accepts data with the
+		 * syn up to the [to be] advertised window and
+		 * Solaris 2.1 gives you a protocol error. For now
+		 * we just ignore it, that fits the spec precisely
+		 * and avoids incompatibilities. It would be nice in
+		 * future to drop through and process the data.
+		 *
+		 * Now that TTCP is starting to be used we ought to
+		 * queue this data.
+		 * But, this leaves one open to an easy denial of
+		 * service attack, and SYN cookies can't defend
+		 * against this problem. So, we drop the data
+		 * in the interest of security over speed unless
+		 * it's still in use.
+		 */
+	}
+
+	TCP_CHECK_TIMER(sk);
+
+discard:
+	kfree_skb(skb);
+	return 0;
+
+reset:
+	tcp_v4_send_reset(rsk, skb);
+	goto discard;
+}
+

 /* The socket must have it's spinlock held when we get
  * here.
@@ -1644,15 +1753,11 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
  */
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
-	struct sock *rsk;
-
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
 		sock_rps_save_rxhash(sk, skb->rxhash);
 		TCP_CHECK_TIMER(sk);
-		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
-			rsk = sk;
+		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
 			goto reset;
-		}
 		TCP_CHECK_TIMER(sk);
 		return 0;
 	}
@@ -1660,32 +1765,23 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
 		goto csum_err;

-	if (sk->sk_state == TCP_LISTEN) {
-		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
-		if (!nsk)
-			goto discard;
-
-		if (nsk != sk) {
-			if (tcp_child_process(sk, nsk, skb)) {
-				rsk = nsk;
-				goto reset;
-			}
-			return 0;
-		}
-	} else
+	if (sk->sk_state == TCP_LISTEN)
+		/* This is for IPv4-mapped IPv6 addresses
+		   and backlog processing */
+		return tcp_v4_rcv_listen(sk, skb);
+	else
 		sock_rps_save_rxhash(sk, skb->rxhash);


 	TCP_CHECK_TIMER(sk);
 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
-		rsk = sk;
 		goto reset;
 	}
 	TCP_CHECK_TIMER(sk);
 	return 0;

 reset:
-	tcp_v4_send_reset(rsk, skb);
+	tcp_v4_send_reset(sk, skb);
 discard:
 	kfree_skb(skb);
 	/* Be careful here. If this function gets more complicated and
@@ -1779,6 +1875,17 @@ process:
 		goto discard_and_relse;
 #endif

+	if (sk->sk_state == TCP_LISTEN) {
+		/* Fast path for listening socket */
+		if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) {
+			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+			goto discard_and_relse;
+		}
+		tcp_v4_rcv_listen(sk, skb);
+		sock_put(sk);
+		return 0;
+	}
+
 	bh_lock_sock_nested(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {

Comments

Alexey Kuznetsov Oct. 27, 2010, 3:04 p.m. UTC | #1
Hello!

It looks like there is at least one hole here.

You take the lock, check the syn table and drop the lock in tcp_v4_hnd_req().
Then you immediately enter tcp_v4_conn_request() and grab the lock again.
Oops, in the tiny window while the lock was dropped the request may already
have been created (even funnier, the whole socket may already have been
created and even accepted).
So, if you drop the lock, you have to restart the whole tcp_v4_rcv_listen()
(which seems to be impossible without additional tricks).
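
Roughly, condensing the code from your patch (CPU labels are mine, for
illustration only):

	/* CPU0: tcp_v4_rcv_listen() -> tcp_v4_hnd_req() */
	bh_lock_sock_nested(sk);
	req = inet_csk_search_req(sk, &prev, th->source,
				  iph->saddr, iph->daddr);	/* nothing found */
	bh_unlock_sock(sk);				/* lock dropped here */

	/* CPU1: a retransmit of the same SYN is processed here and
	 * creates the request (or even the complete, accepted socket).
	 */

	/* CPU0: tcp_v4_conn_request() */
	bh_lock_sock_nested(sk);			/* lock re-taken here */
	/* a second request for the same connection gets queued */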

Alexey
Dmitry Popov Oct. 27, 2010, 4:44 p.m. UTC | #2
On Wed, Oct 27, 2010 at 7:04 PM, Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:
> Hello!
>
> It looks like there is at least one hole here.
>
> You take the lock, check the syn table and drop the lock in tcp_v4_hnd_req().
> Then you immediately enter tcp_v4_conn_request() and grab the lock again.
> Oops, in the tiny window while the lock was dropped the request may already
> have been created (even funnier, the whole socket may already have been
> created and even accepted).
> So, if you drop the lock, you have to restart the whole tcp_v4_rcv_listen()
> (which seems to be impossible without additional tricks).
>
> Alexey
>

Hello, Alexey!

Yes, that may happen, but I don't see a real problem. For two identical
SYN packets we would add two requests to the syn table. It's not ideal,
but nothing criminal, no?

Regards,
Dmitry.

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index adab9dc..b6d0ca1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -994,7 +994,11 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
  * Since ~2.3.5 it is also exclusive sleep lock serializing
  * accesses from user process context.
  */
-#define sock_owned_by_user(sk)	((sk)->sk_lock.owned)
+#define sock_owned_by_user(sk)		((sk)->sk_lock.owned)
+/* backlog processing, see __release_sock(sk) */
+#define sock_owned_by_backlog(sk)	((sk)->sk_lock.owned < 0)
+/* sock owned by user, but not for backlog processing */
+#define __sock_owned_by_user(sk)	((sk)->sk_lock.owned > 0)

 /*
  * Macro so as to not evaluate some arguments when
diff --git a/net/core/sock.c b/net/core/sock.c
index e73dfe3..f4233c7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2015,8 +2015,10 @@  void release_sock(struct sock *sk)
 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

 	spin_lock_bh(&sk->sk_lock.slock);
-	if (sk->sk_backlog.tail)
+	if (sk->sk_backlog.tail) {
+		sk->sk_lock.owned = -1;
 		__release_sock(sk);
+	}
 	sk->sk_lock.owned = 0;
 	if (waitqueue_active(&sk->sk_lock.wq))
 		wake_up(&sk->sk_lock.wq);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace..a37f8e8 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -211,10 +211,22 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;

-	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
-	if (child)
-		inet_csk_reqsk_queue_add(sk, req, child);
-	else
+	bh_lock_sock_nested(sk);
+	/* TODO: move syn_recv_sock before this lock */
+	spin_lock(&icsk->icsk_accept_queue.rskq_accept_lock);
+
+	if (likely(icsk->icsk_accept_queue.rskq_active)) {
+		child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+		if (child)
+			inet_csk_reqsk_queue_do_add(sk, req, child);
+	} else {
+		child = NULL;
+	}
+
+	spin_unlock(&icsk->icsk_accept_queue.rskq_accept_lock);
+	bh_unlock_sock(sk);
+
+	if (unlikely(child == NULL))
 		reqsk_free(req);

 	return child;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ebb9d80..417f2d9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1812,10 +1812,15 @@  void tcp_set_state(struct sock *sk, int state)
 		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
 			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

+		if (oldstate == TCP_LISTEN)
+			/* We have to prevent race condition in syn_recv_sock */
+			bh_lock_sock_nested(sk);
 		sk->sk_prot->unhash(sk);
 		if (inet_csk(sk)->icsk_bind_hash &&
 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
 			inet_put_port(sk);
+		if (oldstate == TCP_LISTEN)
+			bh_unlock_sock(sk);
 		/* fall through */
 	default:
 		if (oldstate == TCP_ESTABLISHED)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e641b0..f22931d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1338,7 +1338,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)

 	/* Never answer to SYNs send to broadcast or multicast */
 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return 0;
+
+	bh_lock_sock_nested(sk);
+
+	if (__sock_owned_by_user(sk)) {
+		/* Some inefficiency: it leads to double syn_table lookup */
+		if (likely(!sk_add_backlog(sk, skb)))
+			skb_get(skb);
+		else
+			NET_INC_STATS_BH(dev_net(skb->dev),
+					 LINUX_MIB_TCPBACKLOGDROP);
 		goto drop;
+	}
+
+	if (inet_csk(sk)->icsk_accept_queue.listen_opt == NULL) {
+		/* socket is closing */
+		goto drop;
+	}

 	/* TW buckets are converted to open requests without