diff mbox

[RFC] ipv6: dst_allfrag() not taken into account by TCP

Message ID 1326831948.2606.2.camel@edumazet-laptop
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Jan. 17, 2012, 8:25 p.m. UTC
Le mardi 17 janvier 2012 à 21:03 +0100, Tore Anderson a écrit :
> * Eric Dumazet
> 
> > Bugzilla reference :
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=42572
> 
> Hi, and thanks for taking an interest in this issue!
> 
> I've got some general comments regarding running IPv6-only Linux servers
> behind stateless IPv4/IPv6 translators. (They are not strictly related
> to the above bug, but not completely off-topic either I hope.)
> 
> 1) The Linux kernel doesn't allow reducing the effective IPv6 link MTU
> (as recorded in the routing cache) to anything less than 1280. This
> means that it can end up in a situation where the effective IPv6 link
> MTU is greater than the actual IPv6 Path MTU. In the PCAP in the
> bugzilla, they are 1280 and 1279, respectively. However, the kernel
> doesn't appear to record the actual Path MTU anywhere, instead setting
> the allfrag feature.
> 
> While this is perfectly legal behaviour according to the RFC, from an
> operational point of view it would have been nice if there were some way
> (e.g. a sysctl) to tell the kernel to also actually allow an ICMPv6 PTB
> to reduce the effective IPv6 link MTU to values less than 1280 (at least
> down to the minimum IPv4 MTU + 20 bytes). That would have avoided the
> need for the allfrag feature to come into play completely.
> 
> The RFC allows for this behaviour, too.
> 
> 2) Since the kernel doesn't keep track of the actual Path MTU (if it's
> lower than 1280), when the allfrag feature gets set on a route, *every*
> packet gets a fragmentation header. (Which is to be expected, really,
> given it's name.) However, this means that even tiny packets such as a
> TCP SYN/ACK gets the fragmentation header added. This is clearly not
> particularly useful.
> 
> If the kernel had kept track of the effective Path MTU, and only
> included the IPv6 Fragmentation header on packets that were larger than
> it *only*, this wouldn't have been a problem. (Alternatively, if it
> allowed the effective link MTU to drop below 1280 that would also have
> avoided this problem.)
> 
> 3) There seems to be a bug related to generating the TCP checksum of
> SYN/ACK packets to destinations with the allfrag features set. I just
> submitted a bug report about this:
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=42595
> 
> This makes the allfrag feature pretty much useless for me, as I can only
> successfully establish a single TCP session from a client behind a <1280
> MTU link for the entire lifetime of the routing cache entry.
> 
> Best regards,

Thanks Tore, we'll take a look at this second problem.

Could you please test following patch ?

It should apply on current 'net' or 'linux' tree

 include/net/inet_connection_sock.h |    1 +
 include/net/tcp.h                  |    4 ++--
 net/ipv4/tcp_output.c              |   19 +++++++++++++++++--
 net/ipv6/tcp_ipv6.c                |    1 +
 net/sctp/ipv6.c                    |    1 +
 5 files changed, 22 insertions(+), 4 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Tore Anderson Jan. 18, 2012, 12:42 p.m. UTC | #1
* Eric Dumazet

> Could you please test following patch ?

Hi,

It seems to work fine, now all the outgoing packets that includes a
Fragmentation header all are "atomic" or "non-fragmented" fragments,
i.e., that they both have Offset=0 and More Fragments=0.

I'm uploaded a tcpdump showing the traffic with the patch here:

https://bugzilla.kernel.org/attachment.cgi?id=72106

Best regards,
diff mbox

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index dbf9aab..8297691 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -45,6 +45,7 @@  struct inet_connection_sock_af_ops {
 				      struct dst_entry *dst);
 	struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it);
 	u16	    net_header_len;
+	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
 	int	    (*setsockopt)(struct sock *sk, int level, int optname, 
 				  char __user *optval, unsigned int optlen);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0118ea9..86e5ef4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -531,8 +531,8 @@  extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
-extern int tcp_mtu_to_mss(const struct sock *sk, int pmtu);
-extern int tcp_mss_to_mtu(const struct sock *sk, int mss);
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
 extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8de27..5d27b68 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1151,7 +1151,7 @@  int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 }
 
 /* Calculate MSS. Not accounting for SACKs here.  */
-int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1162,6 +1162,14 @@  int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
 	 */
 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 
+	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+	if (icsk->icsk_af_ops->net_frag_header_len) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_allfrag(dst))
+			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+	}
+
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
 		mss_now = tp->rx_opt.mss_clamp;
@@ -1180,7 +1188,7 @@  int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
 }
 
 /* Inverse of above */
-int tcp_mss_to_mtu(const struct sock *sk, int mss)
+int tcp_mss_to_mtu(struct sock *sk, int mss)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1191,6 +1199,13 @@  int tcp_mss_to_mtu(const struct sock *sk, int mss)
 	      icsk->icsk_ext_hdr_len +
 	      icsk->icsk_af_ops->net_header_len;
 
+	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+	if (icsk->icsk_af_ops->net_frag_header_len) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_allfrag(dst))
+			mtu += icsk->icsk_af_ops->net_frag_header_len;
+	}
 	return mtu;
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 906c7ca..d90c007 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1883,6 +1883,7 @@  static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
 	.get_peer	   = tcp_v6_get_peer,
 	.net_header_len	   = sizeof(struct ipv6hdr),
+	.net_frag_header_len = sizeof(struct frag_hdr),
 	.setsockopt	   = ipv6_setsockopt,
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 91f4791..13174e5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1000,6 +1000,7 @@  static struct sctp_af sctp_af_inet6 = {
 	.seq_dump_addr	   = sctp_v6_seq_dump_addr,
 	.ecn_capable	   = sctp_v6_ecn_capable,
 	.net_header_len	   = sizeof(struct ipv6hdr),
+	.net_frag_header_len = sizeof(struct frag_hdr),
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,