Patchwork [RFC,TPROXY] kick out TIME_WAIT sockets in case a new connection comes in with the same tuple

login
register
mail settings
Submitter Balazs Scheidler
Date Dec. 10, 2008, 10:15 a.m.
Message ID <1228904130.7542.57.camel@bzorp.balabit>
Download mbox | patch
Permalink /patch/13147/
State RFC
Delegated to: David Miller
Headers show

Comments

Balazs Scheidler - Dec. 10, 2008, 10:15 a.m.
On Wed, 2008-12-10 at 00:57 -0800, David Miller wrote:
> From: Balazs Scheidler <bazsi@balabit.hu>
> Date: Wed, 10 Dec 2008 09:52:22 +0100
> 
> > I understand. Here are the alternatives I considered:
> >  1) the patch above, by extending the socket structures
> >  2) expand skb, of course I felt this is worse than the patch I posted
> >  3) call inet_twsk_deschedule() from the prerouting hook
> > 
> > The 3rd one does not require any expansion of the related structures,
> > however it'd mean that the TCP state is not only looked up, but also
> > changed from the TPROXY target. I felt this ugly, but the ugliness would
> > be constrained to the tproxy code. Shall I post a patch implementing
> > option #3 above?
> 
> It would be interesting to see what it looks like, so if you
> don't mind then yes please toss that together so we can have
> a look.
> 

Updated patch below. The SYN validation could still be improved to match
the one in tcp_timewait_state_process(), the current one is very
simplistic, but the approach is already visible.

[TPROXY] kick out TIME_WAIT sockets in case a new connection comes in with the same tuple

Without tproxy redirections an incoming SYN kicks out conflicting
TIME_WAIT sockets, in order to handle clients that reuse ports
within the TIME_WAIT period.

The same mechanism didn't work in case TProxy is involved in finding
the proper socket, as the time_wait processing code looked up the
listening socket assuming that the listener addr/port matches those
of the established connection.

This is not the case with TProxy as the listener addr/port is possibly
changed with the tproxy rule.

An earlier version of this patch used expanded inet_sock and
inet_timewait_sock structs in order to store the original listener
address. This time, the TIME_WAIT socket is removed from the hashes right in
the TPROXY target so by the time the packet reaches the TCP stack, the
TIME_WAIT socket will not be there.

This patch is currently only a draft, I'm still thinking about
changing the prototype of nf_tproxy_get_sock_v4(), so please
do NOT apply.
---

 include/net/inet_timewait_sock.h       |    2 +-
 include/net/netfilter/nf_tproxy_core.h |    6 +++++-
 net/netfilter/nf_tproxy_core.c         |   29 +++++++++++++++++++++--------
 net/netfilter/xt_TPROXY.c              |   29 ++++++++++++++++++++++++-----
 net/netfilter/xt_socket.c              |    2 +-
 5 files changed, 52 insertions(+), 16 deletions(-)
David Miller - Dec. 10, 2008, 11:21 p.m.
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Wed, 10 Dec 2008 11:15:30 +0100

> Updated patch below. The SYN validation could still be improved to match
> the one in tcp_timewait_state_process(), the current one is very
> simplistic, but the approach is already visible.
> 
> [TPROXY] kick out TIME_WAIT sockets in case a new connection comes in with the same tuple

This looks fine to me.  Let me know when you have a final version
for me to apply to net-next-2.6
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4b8ece2..2f579f4 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -130,7 +130,7 @@  struct inet_timewait_sock {
 	/* And these are ours. */
 	__u8			tw_ipv6only:1,
 				tw_transparent:1;
-	/* 15 bits hole, try to pack */
+	/* 14 bits hole, try to pack */
 	__u16			tw_ipv6_offset;
 	unsigned long		tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 208b46f..b3a8942 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -8,12 +8,16 @@ 
 #include <net/inet_sock.h>
 #include <net/tcp.h>
 
+#define NFT_LOOKUP_ANY         0
+#define NFT_LOOKUP_LISTENER    1
+#define NFT_LOOKUP_ESTABLISHED 2
+
 /* look up and get a reference to a matching socket */
 extern struct sock *
 nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, bool listening);
+		      const struct net_device *in, int lookup_type);
 
 static inline void
 nf_tproxy_put_sock(struct sock *sk)
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index cdc97f3..1858da0 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -22,21 +22,34 @@  struct sock *
 nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, bool listening_only)
+		      const struct net_device *in, int lookup_type)
 {
 	struct sock *sk;
 
 	/* look up socket */
 	switch (protocol) {
 	case IPPROTO_TCP:
-		if (listening_only)
-			sk = __inet_lookup_listener(net, &tcp_hashinfo,
-						    daddr, ntohs(dport),
-						    in->ifindex);
-		else
+		switch (lookup_type) {
+		case NFT_LOOKUP_ANY:
 			sk = __inet_lookup(net, &tcp_hashinfo,
 					   saddr, sport, daddr, dport,
 					   in->ifindex);
+			break;
+		case NFT_LOOKUP_LISTENER:
+			sk = inet_lookup_listener(net, &tcp_hashinfo,
+						    daddr, dport,
+						    in->ifindex);
+			break;
+		case NFT_LOOKUP_ESTABLISHED:
+			sk = inet_lookup_established(net, &tcp_hashinfo,
+						    saddr, sport, daddr, dport,
+						    in->ifindex);
+			break;
+		default:
+			WARN_ON(1);
+			sk = NULL;
+			break;
+		}
 		break;
 	case IPPROTO_UDP:
 		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
@@ -47,8 +60,8 @@  nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 		sk = NULL;
 	}
 
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
+	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
 
 	return sk;
 }
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 1340c2f..92a911e 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -29,7 +29,7 @@  tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	const struct xt_tproxy_target_info *tgi = par->targinfo;
-	struct udphdr _hdr, *hp;
+	struct tcphdr _hdr, *hp;
 	struct sock *sk;
 
 	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
@@ -37,10 +37,29 @@  tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par)
 		return NF_DROP;
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-				   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-				   hp->source, tgi->lport ? tgi->lport : hp->dest,
-				   par->in, true);
-
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+				   
+	if (sk->sk_state == TCP_TIME_WAIT && hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		struct sock *sk2;
+		 
+		sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
+					   hp->source, tgi->lport ? tgi->lport : hp->dest,
+					   par->in, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+	else if (!sk) {
+		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
+					   hp->source, tgi->lport ? tgi->lport : hp->dest,
+					   par->in, NFT_LOOKUP_LISTENER);
+	}
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && nf_tproxy_assign_sock(skb, sk)) {
 		/* This should be in a separate target, but we don't do multiple
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 1d521a5..ae8233d 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -142,7 +142,7 @@  socket_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 #endif
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
-				   saddr, daddr, sport, dport, par->in, false);
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
 	if (sk != NULL) {
 		bool wildcard = (sk->sk_state != TCP_TIME_WAIT && inet_sk(sk)->rcv_saddr == 0);