diff mbox

[net-next] ipv6: RTAX_FEATURE_ALLFRAG causes inefficient TCP segment sizing

Message ID 1335289058.5205.165.camel@edumazet-glaptop
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet April 24, 2012, 5:37 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

Quoting Tore Anderson from :
https://bugzilla.kernel.org/show_bug.cgi?id=42572

When RTAX_FEATURE_ALLFRAG is set on a route, the effective TCP segment
size does not take into account the size of the IPv6 Fragmentation
header that needs to be included in outbound packets, causing every
transmitted TCP segment to be fragmented across two IPv6 packets, the
latter of which will only contain 8 bytes of actual payload.

RTAX_FEATURE_ALLFRAG is typically set on a route in response to
receving a ICMPv6 Packet Too Big message indicating a Path MTU of less
than 1280 bytes. 1280 bytes is the minimum IPv6 MTU, however ICMPv6
PTBs with MTU < 1280 are still valid, in particular when an IPv6
packet is sent to an IPv4 destination through a stateless translator.
Any ICMPv4 Need To Fragment packets originated from the IPv4 part of
the path will be translated to ICMPv6 PTB which may then indicate an
MTU of less than 1280.

The Linux kernel refuses to reduce the effective MTU to anything below
1280 bytes, instead it sets it to exactly 1280 bytes, and
RTAX_FEATURE_ALLFRAG is also set. However, the TCP segment size appears
to be set to 1240 bytes (1280 Path MTU - 40 bytes of IPv6 header),
instead of 1232 (additionally taking into account the 8 bytes required
by the IPv6 Fragmentation extension header).

This in turn results in rather inefficient transmission, as every 
transmitted TCP segment now is split in two fragments containing
1232+8 bytes of payload.

After this patch, all the outgoing packets that includes a
Fragmentation header all are "atomic" or "non-fragmented" fragments,
i.e., they both have Offset=0 and More Fragments=0.

With help from David S. Miller

Reported-by: Tore Anderson <tore@fud.no>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Tom Herbert <therbert@google.com>
Tested-by: Tore Anderson <tore@fud.no>
---
 include/net/inet_connection_sock.h |    1 +
 include/net/tcp.h                  |    4 ++--
 net/ipv4/tcp_output.c              |   19 +++++++++++++++++--
 net/ipv6/tcp_ipv6.c                |    1 +
 4 files changed, 21 insertions(+), 4 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Maciej Żenczykowski April 24, 2012, 7:49 p.m. UTC | #1
Why do we refuse to set ipv6 mtu's below 1280?
how is what we do any better?

On Tue, Apr 24, 2012 at 10:37 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Quoting Tore Anderson from :
> https://bugzilla.kernel.org/show_bug.cgi?id=42572
>
> When RTAX_FEATURE_ALLFRAG is set on a route, the effective TCP segment
> size does not take into account the size of the IPv6 Fragmentation
> header that needs to be included in outbound packets, causing every
> transmitted TCP segment to be fragmented across two IPv6 packets, the
> latter of which will only contain 8 bytes of actual payload.
>
> RTAX_FEATURE_ALLFRAG is typically set on a route in response to
> receving a ICMPv6 Packet Too Big message indicating a Path MTU of less
> than 1280 bytes. 1280 bytes is the minimum IPv6 MTU, however ICMPv6
> PTBs with MTU < 1280 are still valid, in particular when an IPv6
> packet is sent to an IPv4 destination through a stateless translator.
> Any ICMPv4 Need To Fragment packets originated from the IPv4 part of
> the path will be translated to ICMPv6 PTB which may then indicate an
> MTU of less than 1280.
>
> The Linux kernel refuses to reduce the effective MTU to anything below
> 1280 bytes, instead it sets it to exactly 1280 bytes, and
> RTAX_FEATURE_ALLFRAG is also set. However, the TCP segment size appears
> to be set to 1240 bytes (1280 Path MTU - 40 bytes of IPv6 header),
> instead of 1232 (additionally taking into account the 8 bytes required
> by the IPv6 Fragmentation extension header).
>
> This in turn results in rather inefficient transmission, as every
> transmitted TCP segment now is split in two fragments containing
> 1232+8 bytes of payload.
>
> After this patch, all the outgoing packets that includes a
> Fragmentation header all are "atomic" or "non-fragmented" fragments,
> i.e., they both have Offset=0 and More Fragments=0.
>
> With help from David S. Miller
>
> Reported-by: Tore Anderson <tore@fud.no>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Maciej Żenczykowski <maze@google.com>
> Cc: Tom Herbert <therbert@google.com>
> Tested-by: Tore Anderson <tore@fud.no>
> ---
>  include/net/inet_connection_sock.h |    1 +
>  include/net/tcp.h                  |    4 ++--
>  net/ipv4/tcp_output.c              |   19 +++++++++++++++++--
>  net/ipv6/tcp_ipv6.c                |    1 +
>  4 files changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 46c9e2c..7d83f90 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -45,6 +45,7 @@ struct inet_connection_sock_af_ops {
>                                      struct dst_entry *dst);
>        struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it);
>        u16         net_header_len;
> +       u16         net_frag_header_len;
>        u16         sockaddr_len;
>        int         (*setsockopt)(struct sock *sk, int level, int optname,
>                                  char __user *optval, unsigned int optlen);
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc880e9..0fb84de 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -544,8 +544,8 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
>
>  extern void tcp_initialize_rcv_mss(struct sock *sk);
>
> -extern int tcp_mtu_to_mss(const struct sock *sk, int pmtu);
> -extern int tcp_mss_to_mtu(const struct sock *sk, int mss);
> +extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
> +extern int tcp_mss_to_mtu(struct sock *sk, int mss);
>  extern void tcp_mtup_init(struct sock *sk);
>  extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 7b7cf38..834e89f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1150,7 +1150,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
>  }
>
>  /* Calculate MSS. Not accounting for SACKs here.  */
> -int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
> +int tcp_mtu_to_mss(struct sock *sk, int pmtu)
>  {
>        const struct tcp_sock *tp = tcp_sk(sk);
>        const struct inet_connection_sock *icsk = inet_csk(sk);
> @@ -1161,6 +1161,14 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
>         */
>        mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
>
> +       /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
> +       if (icsk->icsk_af_ops->net_frag_header_len) {
> +               const struct dst_entry *dst = __sk_dst_get(sk);
> +
> +               if (dst && dst_allfrag(dst))
> +                       mss_now -= icsk->icsk_af_ops->net_frag_header_len;
> +       }
> +
>        /* Clamp it (mss_clamp does not include tcp options) */
>        if (mss_now > tp->rx_opt.mss_clamp)
>                mss_now = tp->rx_opt.mss_clamp;
> @@ -1179,7 +1187,7 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
>  }
>
>  /* Inverse of above */
> -int tcp_mss_to_mtu(const struct sock *sk, int mss)
> +int tcp_mss_to_mtu(struct sock *sk, int mss)
>  {
>        const struct tcp_sock *tp = tcp_sk(sk);
>        const struct inet_connection_sock *icsk = inet_csk(sk);
> @@ -1190,6 +1198,13 @@ int tcp_mss_to_mtu(const struct sock *sk, int mss)
>              icsk->icsk_ext_hdr_len +
>              icsk->icsk_af_ops->net_header_len;
>
> +       /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
> +       if (icsk->icsk_af_ops->net_frag_header_len) {
> +               const struct dst_entry *dst = __sk_dst_get(sk);
> +
> +               if (dst && dst_allfrag(dst))
> +                       mtu += icsk->icsk_af_ops->net_frag_header_len;
> +       }
>        return mtu;
>  }
>
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index cdbf292..57b2109 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1778,6 +1778,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
>        .syn_recv_sock     = tcp_v6_syn_recv_sock,
>        .get_peer          = tcp_v6_get_peer,
>        .net_header_len    = sizeof(struct ipv6hdr),
> +       .net_frag_header_len = sizeof(struct frag_hdr),
>        .setsockopt        = ipv6_setsockopt,
>        .getsockopt        = ipv6_getsockopt,
>        .addr2sockaddr     = inet6_csk_addr2sockaddr,
>
>
Eric Dumazet April 24, 2012, 8:10 p.m. UTC | #2
On Tue, 2012-04-24 at 12:49 -0700, Maciej Żenczykowski wrote:
> Why do we refuse to set ipv6 mtu's below 1280?
> how is what we do any better?

I guess you didnt read Tore use case.

Thats the standard : http://tools.ietf.org/html/rfc2460#section-5

   In response to an IPv6 packet that is sent to an IPv4 destination
   (i.e., a packet that undergoes translation from IPv6 to IPv4), the
   originating IPv6 node may receive an ICMP Packet Too Big message
   reporting a Next-Hop MTU less than 1280.  In that case, the IPv6 node
   is not required to reduce the size of subsequent packets to less than
   1280, but must include a Fragment header in those packets so that the
   IPv6-to-IPv4 translating router can obtain a suitable Identification
   value to use in resulting IPv4 fragments.  Note that this means the
   payload may have to be reduced to 1232 octets (1280 minus 40 for the
   IPv6 header and 8 for the Fragment header), and smaller still if
   additional extension headers are used.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 24, 2012, 9:50 p.m. UTC | #3
On Tue, Apr 24, 2012 at 1:10 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2012-04-24 at 12:49 -0700, Maciej Żenczykowski wrote:
>> Why do we refuse to set ipv6 mtu's below 1280?
>> how is what we do any better?
>
> I guess you didnt read Tore use case.
>
> Thats the standard : http://tools.ietf.org/html/rfc2460#section-5
>
>   In response to an IPv6 packet that is sent to an IPv4 destination
>   (i.e., a packet that undergoes translation from IPv6 to IPv4), the
>   originating IPv6 node may receive an ICMP Packet Too Big message
>   reporting a Next-Hop MTU less than 1280.  In that case, the IPv6 node
>   is not required to reduce the size of subsequent packets to less than

... is not required to reduce...

but doesn't that mean it _may_ reduce anyway?
and thus make life easier on the guy who will have to fragment (and
thus he won't have to fragment).

[of course you'd still need to lose 8 bytes for the frag header...]

>   1280, but must include a Fragment header in those packets so that the
>   IPv6-to-IPv4 translating router can obtain a suitable Identification
>   value to use in resulting IPv4 fragments.  Note that this means the
>   payload may have to be reduced to 1232 octets (1280 minus 40 for the
>   IPv6 header and 8 for the Fragment header), and smaller still if
>   additional extension headers are used.

I just don't see what not decreasing mtu below 1280 buys us.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 24, 2012, 9:51 p.m. UTC | #4
> I just don't see what not decreasing mtu below 1280 buys us.

In particular it seems like it would be a good idea to drop tcp mss...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 25, 2012, 5:32 a.m. UTC | #5
On Tue, 2012-04-24 at 14:50 -0700, Maciej Żenczykowski wrote:
> On Tue, Apr 24, 2012 at 1:10 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Tue, 2012-04-24 at 12:49 -0700, Maciej Żenczykowski wrote:
> >> Why do we refuse to set ipv6 mtu's below 1280?
> >> how is what we do any better?
> >
> > I guess you didnt read Tore use case.
> >
> > Thats the standard : http://tools.ietf.org/html/rfc2460#section-5
> >
> >   In response to an IPv6 packet that is sent to an IPv4 destination
> >   (i.e., a packet that undergoes translation from IPv6 to IPv4), the
> >   originating IPv6 node may receive an ICMP Packet Too Big message
> >   reporting a Next-Hop MTU less than 1280.  In that case, the IPv6 node
> >   is not required to reduce the size of subsequent packets to less than
> 
> ... is not required to reduce...
> 
> but doesn't that mean it _may_ reduce anyway?
> and thus make life easier on the guy who will have to fragment (and
> thus he won't have to fragment).
> 
> [of course you'd still need to lose 8 bytes for the frag header...]
> 
> >   1280, but must include a Fragment header in those packets so that the
> >   IPv6-to-IPv4 translating router can obtain a suitable Identification
> >   value to use in resulting IPv4 fragments.  Note that this means the
> >   payload may have to be reduced to 1232 octets (1280 minus 40 for the
> >   IPv6 header and 8 for the Fragment header), and smaller still if
> >   additional extension headers are used.
> 
> I just don't see what not decreasing mtu below 1280 buys us.

But we chose to _not_ decrease mtu and adhere to the specs.

Current linux chose to implement the allfragfeature, so match RFC 2460
specs.

If you want to reduce size of subsequent packets, its a lot more work in
linux stack, for small gain, if any.

We only need to properly generate the fragment header, that means
reducing the _payload_ by 8 bytes. Not reducing the _mtu_ that still is
1280, as allowed.

IPv6 must cohabit with IPv4 for the next years, there is no hope
thinking it can ignore tunnelings issues, since tunneling is part of the
global IPv6 transition that is currently happening.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 25, 2012, 7:34 a.m. UTC | #6
> But we chose to _not_ decrease mtu and adhere to the specs.

I get that we _choose_ to behave such, and I agree this adheres to specs.

But I'm not convinced that (even though this is allowed per RFC) this
is the right choice.

> Current linux chose to implement the allfragfeature, so match RFC 2460
> specs.
>
> If you want to reduce size of subsequent packets, its a lot more work in
> linux stack, for small gain, if any.

True, but this is work at the source host (and indeed might possibly
even be done by tso hardware or at least by gso),
which is reducing work at whatever the actual tunneling point is.
It's not actually affecting how much
work the receiver has to do (indeed it might even decrease it, imagine
sending 3840 bytes, getting that
fragmented into 6 packets: 1000, 280, 1000, 280, 1000, 280 instead of
just 4 packets: 1000, 1000,
1000, 840 [obviously would need to correct for header overhead, but
the basis stands])

[side note: indeed 1280 should never be split 1000 + 280, but should
rather be split into even pieces 640 + 640, to help prevent need for
further fragmentation further down the line]

Tunneling points are traditionally cpu starved.

Also note that IPv6 prefers to see fragmentation happen at the end
hosts, and not at the routers.
Although of course it doesn't treat a tunnel end point as a router.

Also packet loss is better handled by tcp losing segments than by the
tunnel losing a fragment and thus losing the entire packet.

I guess I simply think we're making the wrong trade off here.

> We only need to properly generate the fragment header, that means
> reducing the _payload_ by 8 bytes. Not reducing the _mtu_ that still is
> 1280, as allowed.
>
> IPv6 must cohabit with IPv4 for the next years, there is no hope
> thinking it can ignore tunnelings issues, since tunneling is part of the
> global IPv6 transition that is currently happening.

To be fair this probably isn't super-related to the vast majority of
IPv4, the vast majority of the IPv4 world out there has mtu's easily
in excess of 1280+tunneling overhead bytes.
I wonder if anyone has any statistics, but I'd guess 99.99% of the
IPv4 internet is 1400+ MTU.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tore Anderson April 25, 2012, 9:20 a.m. UTC | #7
* Maciej Żenczykowski

>> But we chose to _not_ decrease mtu and adhere to the specs.
>
> I get that we _choose_ to behave such, and I agree this adheres to
> specs.

"Chose" (past), not "choose" (present). ;-)

This patch does not make this choice. This patch merely fixes a bug in
the implementation of the choice that was made a long time ago.

> But I'm not convinced that (even though this is allowed per RFC) this
> is the right choice.

That is a different issue entirely, but I don't disagree with you. A
"min_pmtu" sysctl or something like that would be useful.

> Also note that IPv6 prefers to see fragmentation happen at the end
> hosts, and not at the routers.
> Although of course it doesn't treat a tunnel end point as a router.

Actually, in IPv6, fragmentation *must* be performed by end hosts,
routers (including tunnel end points) *cannot* fragment.

However, the use case for the allfrag feature is not handling tunnels,
but IPv4<->IPv6 translation. The issue is that a IPv6 host may very 
well
receive an ICMPv6 Packet Too Big indicating a PMTU of <1280 that was
originally transmitted by an IPv4 router (as an ICMPv4 Need To 
Fragment)
and underwent translation to IPv6.

In this case, the IPv6 node does not need to reduce the PMTU to <1280
(Linux does not), but it is not invalid to have a <1280 MTU link in the
IPv4 internet either, so something else must be done for the
communication to work. The solution is then to include the IPv6 
Fragment
extension header, so that the translator have a suitable Identification
value to copy into the translated IPv4 header, and may therefore clear
the Don't Fragment flag, so that the IPv4 router will fragment the
packet as it is forwarded onto the low-MTU link.

In case you're interested, I have a slide deck below that explains the
use case for IPv4<->IPv6 translation. Slide 25 is about the particular
corner case where the allfrag feature is necessary. URL:

http://fud.no/talks/20120417-RIPE64-The_Case_for_IPv6_Only_Data_Centres.pdf

Tore
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 25, 2012, 9:38 a.m. UTC | #8
On Wed, 2012-04-25 at 11:20 +0200, Tore Anderson wrote:
> * Maciej Żenczykowski
> 
> >> But we chose to _not_ decrease mtu and adhere to the specs.
> >
> > I get that we _choose_ to behave such, and I agree this adheres to
> > specs.
> 
> "Chose" (past), not "choose" (present). ;-)
> 
> This patch does not make this choice. This patch merely fixes a bug in
> the implementation of the choice that was made a long time ago.
> 
> > But I'm not convinced that (even though this is allowed per RFC) this
> > is the right choice.
> 
> That is a different issue entirely, but I don't disagree with you. A
> "min_pmtu" sysctl or something like that would be useful.
> 
> > Also note that IPv6 prefers to see fragmentation happen at the end
> > hosts, and not at the routers.
> > Although of course it doesn't treat a tunnel end point as a router.
> 
> Actually, in IPv6, fragmentation *must* be performed by end hosts,
> routers (including tunnel end points) *cannot* fragment.
> 
> However, the use case for the allfrag feature is not handling tunnels,
> but IPv4<->IPv6 translation. The issue is that a IPv6 host may very 
> well
> receive an ICMPv6 Packet Too Big indicating a PMTU of <1280 that was
> originally transmitted by an IPv4 router (as an ICMPv4 Need To 
> Fragment)
> and underwent translation to IPv6.
> 
> In this case, the IPv6 node does not need to reduce the PMTU to <1280
> (Linux does not), but it is not invalid to have a <1280 MTU link in the
> IPv4 internet either, so something else must be done for the
> communication to work. The solution is then to include the IPv6 
> Fragment
> extension header, so that the translator have a suitable Identification
> value to copy into the translated IPv4 header, and may therefore clear
> the Don't Fragment flag, so that the IPv4 router will fragment the
> packet as it is forwarded onto the low-MTU link.
> 
> In case you're interested, I have a slide deck below that explains the
> use case for IPv4<->IPv6 translation. Slide 25 is about the particular
> corner case where the allfrag feature is necessary. URL:
> 
> http://fud.no/talks/20120417-RIPE64-The_Case_for_IPv6_Only_Data_Centres.pdf


Hmm, but what if we change linux to choice a) instead of b) ?

That is, not cap mtu to minimum value 1280 (and not use anymore
RTAX_FEATURE_ALLFRAG) : dst_allfrag() would be always false.

In this case, do we still need to send the frag header ?

I ask this because some TSO6 implementations probably dont cope very
well with this added header (untested path)





--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 25, 2012, 9:48 a.m. UTC | #9
>> I get that we _choose_ to behave such, and I agree this adheres to
>> specs.
>
> "Chose" (past), not "choose" (present). ;-)

Details, but until we 'choose' to change it we continuously 'choose'
to have the current behaviour. ;-)

> This patch does not make this choice. This patch merely fixes a bug in
> the implementation of the choice that was made a long time ago.

yes, I wasn't saying this patch was bad, I was just wanting to point
that we should perhaps revisit the design choice.

>> But I'm not convinced that (even though this is allowed per RFC) this
>> is the right choice.
>
> That is a different issue entirely, but I don't disagree with you. A
> "min_pmtu" sysctl or something like that would be useful.

I don't really know what the default value should be?  Something around 500?
[to handle IPv4s min mtu of 576?]

Do we have any idea what values of small mtu actually show up in practice?

> Actually, in IPv6, fragmentation *must* be performed by end hosts,
> routers (including tunnel end points) *cannot* fragment.

Yes, I miss-phrased that, that's what I meant.

> However, the use case for the allfrag feature is not handling tunnels,
> but IPv4<->IPv6 translation. The issue is that a IPv6 host may very well
> receive an ICMPv6 Packet Too Big indicating a PMTU of <1280 that was
> originally transmitted by an IPv4 router (as an ICMPv4 Need To Fragment)
> and underwent translation to IPv6.

Very good point, although that's basically kind of like half a tunnel ;-)

> In case you're interested, I have a slide deck below that explains the
> use case for IPv4<->IPv6 translation. Slide 25 is about the particular
> corner case where the allfrag feature is necessary. URL:
>
> http://fud.no/talks/20120417-RIPE64-The_Case_for_IPv6_Only_Data_Centres.pdf

Yes, I've seen this slide set a couple days ago - very good.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tore Anderson April 25, 2012, 9:51 a.m. UTC | #10
* Eric Dumazet

> Hmm, but what if we change linux to choice a) instead of b) ?
>
> That is, not cap mtu to minimum value 1280 (and not use anymore
> RTAX_FEATURE_ALLFRAG) : dst_allfrag() would be always false.

Yep.

> In this case, do we still need to send the frag header ?

No - the (translated) IPv4 packet should then fit onto the small-MTU
IPv4 link without any fragmentation (by the IPv4 router) required. So
you end up with perfectly regular end-to-end Path MTU Discovery.

Tore
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 25, 2012, 9:52 a.m. UTC | #11
> Hmm, but what if we change linux to choice a) instead of b) ?
>
> That is, not cap mtu to minimum value 1280 (and not use anymore
> RTAX_FEATURE_ALLFRAG) : dst_allfrag() would be always false.
>
> In this case, do we still need to send the frag header ?

Yeah, I was wondering about that myself.

By my reading of the relevant RFC it's not quite clear whether you
truly must include the frag header even if you choose to obey the
lower than 1280 mtu.
Although I don't see any reason why you would need to...
So long as there's a decent minimum pmtu we're willing to obey.

> I ask this because some TSO6 implementations probably dont cope very
> well with this added header (untested path)

Yes, I was thinking the same thing, hence why I mentioned GSO.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tore Anderson April 25, 2012, 10:04 a.m. UTC | #12
* Maciej Żenczykowski

>> That is a different issue entirely, but I don't disagree with you. A
>> "min_pmtu" sysctl or something like that would be useful.
>
> I don't really know what the default value should be?  Something 
> around 500?
> [to handle IPv4s min mtu of 576?]

The sensible default would be either 1280 (and keep allfrag feature), 
or
the minimum IPv4 PMTU currently enforced by the kernel + 20 bytes (to
compensate for the larger IPv6 header size). I don't know what the 
current
minimum PMTU is.

Also, I'm not really sure if the IPv4 minimum PMTU is defined as 576 or
68 bytes. There are some conflicting information out there, and nothing 
really
authoritative either way (that I've found at least).

> Do we have any idea what values of small mtu actually show up in 
> practice?

I have no data on this, I'm afraid. But I believe small MTUs (<1260, 
which
currently triggers the need for allfrag), are very rare - at least 
where I'm
from. Anectdotal, but - we've been running our corporate web site 
IPv6-only
with IPv4 access through stateless translation on a Linux server with 
the
buggy allfrag feature for several months, and there has been no 
complaints.

>> However, the use case for the allfrag feature is not handling 
>> tunnels,
>> but IPv4<->IPv6 translation. The issue is that a IPv6 host may very 
>> well
>> receive an ICMPv6 Packet Too Big indicating a PMTU of <1280 that was
>> originally transmitted by an IPv4 router (as an ICMPv4 Need To 
>> Fragment)
>> and underwent translation to IPv6.
>
> Very good point, although that's basically kind of like half a tunnel 
> ;-)

Yup, only difference is that an IPv6 tunnel is guaranteed to have a MTU 
of
1280, so no need for allfrag or dropping PMTU below 1280. No such 
guarantees
exist for IPv4 links.

Tore
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 25, 2012, 10:15 a.m. UTC | #13
On Wed, 2012-04-25 at 12:04 +0200, Tore Anderson wrote:

> 
> Also, I'm not really sure if the IPv4 minimum PMTU is defined as 576 or
> 68 bytes. There are some conflicting information out there, and nothing 
> really
> authoritative either way (that I've found at least).

576 certainly not.

68 seems to be allowed



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 25, 2012, 10:30 a.m. UTC | #14
> The sensible default would be either 1280 (and keep allfrag feature), or
> the minimum IPv4 PMTU currently enforced by the kernel + 20 bytes (to
> compensate for the larger IPv6 header size). I don't know what the current
> minimum PMTU is.

I'd actually go with min IPv4 PMTU - 20, not + 20, see below for why.

Hmm, it may be best to honour PMTUs below 1280 to some small but not
too small value (512?), and below that give up, say mtu remains 1280
but still add the frag header.

> Also, I'm not really sure if the IPv4 minimum PMTU is defined as 576 or
> 68 bytes. There are some conflicting information out there, and nothing
> really
> authoritative either way (that I've found at least).

Yeah, I've seen conflicting information.
I wonder if there's some tiny embedded devices out there with
miniscule mtus, because of tiny amounts of ram.

> Yup, only difference is that an IPv6 tunnel is guaranteed to have a MTU of
> 1280, so no need for allfrag or dropping PMTU below 1280. No such guarantees
> exist for IPv4 links.

Hmm, I thought one way to implement an IPv6 over IPv4 tunnel was to basically
rely on IPv4 fragmentation, hence you would actually be using the same mechanism
as for an IPv6-IPv4 translator, then you would be tunneling the IPv6
packet over IPv4,
so your IPv6 mtu would be 20 less than the v4 one, not 20 more which
you get if you
replace the v6 header with a smaller v4 one.

Anyway, this certainly seems like an ipv6-in-ipv4 tunneling mechanism
which would currently work, wouldn't it?

(re: Eric's patch, I think it should protect itself against malicious
PMTU messages with too small MTUs, like 0 or 1 or 68 [not enough for
timestamped ipv6/tcp)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 25, 2012, 10:44 a.m. UTC | #15
On Wed, 2012-04-25 at 03:30 -0700, Maciej Żenczykowski wrote:

> Hmm, it may be best to honour PMTUs below 1280 to some small but not
> too small value (512?), and below that give up, say mtu remains 1280
> but still add the frag header.
> 

Thats a good idea.


> (re: Eric's patch, I think it should protect itself against malicious
> PMTU messages with too small MTUs, like 0 or 1 or 68 [not enough for
> timestamped ipv6/tcp)

Yes, in fact we are going to keep the allfrag feature, and arm it if we
receive an mtu < 512.

I'll send a v3 patch soon.

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tore Anderson April 25, 2012, 10:45 a.m. UTC | #16
* Maciej Żenczykowski

>> The sensible default would be either 1280 (and keep allfrag 
>> feature), or
>> the minimum IPv4 PMTU currently enforced by the kernel + 20 bytes 
>> (to
>> compensate for the larger IPv6 header size). I don't know what the 
>> current
>> minimum PMTU is.
>
> I'd actually go with min IPv4 PMTU - 20, not + 20, see below for why.

I think you forgot to include the explanation why. :-)

> Hmm, it may be best to honour PMTUs below 1280 to some small but not
> too small value (512?), and below that give up, say mtu remains 1280
> but still add the frag header.

That is also possible, yes.

> Hmm, I thought one way to implement an IPv6 over IPv4 tunnel was to 
> basically
> rely on IPv4 fragmentation, hence you would actually be using the 
> same mechanism
> as for an IPv6-IPv4 translator, then you would be tunneling the IPv6 
> packet over IPv4,
> so your IPv6 mtu would be 20 less than the v4 one, not 20 more which 
> you get if you
> replace the v6 header with a smaller v4 one.
>
> Anyway, this certainly seems like an ipv6-in-ipv4 tunneling mechanism
> which would currently work, wouldn't it?

I suppose. This would be invisible to IPv6, though - the fragmentation 
and reassembly
happens at a lower layer than IPv6. Same as ATM for example. Situation 
is described
by RFC 2460:

«On any link that cannot convey a 1280-octet packet in one piece, 
link-specific
fragmentation and reassembly must be provided at a layer below IPv6.»

> (re: Eric's patch, I think it should protect itself against malicious
> PMTU messages with too small MTUs, like 0 or 1 or 68 [not enough for
> timestamped ipv6/tcp)

Does this happen for IPv4, I wonder? IMHO, it makes sense to keep the 
the minimum
PMTUDs allowed in sync. If PMTUD=1 is allowed in IPv4, and this is not 
problematic,
I don't see why it couldn't be allowed in IPv6 either.

Tore


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 25, 2012, 11:02 a.m. UTC | #17
> I think you forgot to include the explanation why. :-)

I did try, I just didn't do a very good job.

> I suppose. This would be invisible to IPv6, though - the fragmentation and
> reassembly happens at a lower layer than IPv6. Same as ATM for example. Situation is
> described by RFC 2460:

It would be invisible (*), and you probably wouldn't really need the
frag header in the ipv6 packet,
but it would still be desirable to have ipv6 already have packets
smaller than ipv4
mtu - 20, rather than have to frag/unfrag at the tunnel endpoint.
Since it is always more efficient to have fragmented correctly in the
first place.

(*) Would it be legal for a tunnel endpoint to support ipv6 packets up
to 1280 bytes in size
but still send back a 'packet to big please use 1K mtu' message?

> «On any link that cannot convey a 1280-octet packet in one piece,
> link-specific fragmentation and reassembly must be provided at a layer below IPv6.»

True...  I wonder how far we should bend over, just because it'll do
the work for us,
doesn't mean it isn't more efficient to do it ourselves...

Eh, not sure it's really worth the bother, I've never seen ipv6
tunneled over something with a small (<1280) but not tiny (>200) mtu.

>> (re: Eric's patch, I think it should protect itself against malicious
>> PMTU messages with too small MTUs, like 0 or 1 or 68 [not enough for
>> timestamped ipv6/tcp)
>
>
> Does this happen for IPv4, I wonder? IMHO, it makes sense to keep the the
> minimum
> PMTUDs allowed in sync. If PMTUD=1 is allowed in IPv4, and this is not
> problematic,
> I don't see why it couldn't be allowed in IPv6 either.
>
> Tore
>
>
Tore Anderson April 25, 2012, 11:49 a.m. UTC | #18
* Maciej Żenczykowski

>> I think you forgot to include the explanation why. :-)
>
> I did try, I just didn't do a very good job.

Oh, because of IPv6-in-IPv4 tunnel encap overhead, I get it now.

>> I suppose. This would be invisible to IPv6, though - the 
>> fragmentation and
>> reassembly happens at a lower layer than IPv6. Same as ATM for 
>> example. Situation is
>> described by RFC 2460:
>
> It would be invisible (*), and you probably wouldn't really need the
> frag header in the ipv6 packet,
> but it would still be desirable to have ipv6 already have packets
> smaller than ipv4
> mtu - 20, rather than have to frag/unfrag at the tunnel endpoint.
> Since it is always more efficient to have fragmented correctly in the
> first place.
>
> (*) Would it be legal for a tunnel endpoint to support ipv6 packets 
> up
> to 1280 bytes in size
> but still send back a 'packet to big please use 1K mtu' message?

I don't think this is a valid thing to do - either the tunnel server 
would
forward the packet through the tunnel (fragmenting the underlaying IPv4 
if
necessary), and *not* send back a ICMPv6 PTB error at the same time, 
OR: it
would need to drop the packet and *do* send back the ICMPv6 PTB. Not 
both
at the same time.

However, if it is going to drop the large packets and reply with the 
PTB, it
will cause a blackhole with current Linux, because if it get the PTB 
with
MTU=1000, it will include the frag header, but *not* reduce the actual 
packet
size. And since this is pure IPv6 routing, the tunnel server will not 
be able
to fragment the IPv6 packets, since IPv6 routers don't do that (the 
presense
of the Fragmentation header does not change this fact).

So the tunnel routers pretty much *must* support an IPv6 MTU of 1280, 
either
by ensuring the outer IPv4 MTU is 1300 or larger, or by performing IPv4
fragmentation and reassembly "under the hood".

I don't think it is worth while (or even appropriate) for the Linux 
IPv6 stack
to try to optimize for this.

Tore

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maciej Żenczykowski April 25, 2012, 11:55 a.m. UTC | #19
>> (*) Would it be legal for a tunnel endpoint to support ipv6 packets up
>> to 1280 bytes in size
>> but still send back a 'packet to big please use 1K mtu' message?
>
>
> I don't think this is a valid thing to do - either the tunnel server would
> forward the packet through the tunnel (fragmenting the underlaying IPv4 if
> necessary), and *not* send back a ICMPv6 PTB error at the same time, OR: it
> would need to drop the packet and *do* send back the ICMPv6 PTB. Not both
> at the same time.
>
> However, if it is going to drop the large packets and reply with the PTB, it
> will cause a blackhole with current Linux, because if it get the PTB with
> MTU=1000, it will include the frag header, but *not* reduce the actual
> packet
> size. And since this is pure IPv6 routing, the tunnel server will not be
> able
> to fragment the IPv6 packets, since IPv6 routers don't do that (the presense
> of the Fragmentation header does not change this fact).
>
> So the tunnel routers pretty much *must* support an IPv6 MTU of 1280, either
> by ensuring the outer IPv4 MTU is 1300 or larger, or by performing IPv4
> fragmentation and reassembly "under the hood".

I think you could still accept 1280 mtu packets, but for packets >= 1281 mtu
you could return packet to big, please use mtu=1000.

>
> I don't think it is worth while (or even appropriate) for the Linux IPv6
> stack
> to try to optimize for this.

Agreed.  Let's ignore this.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tore Anderson April 26, 2012, 10:32 a.m. UTC | #20
* Eric Dumazet

> Yes, in fact we are going to keep the allfrag feature, and arm it if 
> we
> receive an mtu < 512.

One thing to think about if you go down this path is to keep track of 
the PMTU received in the ICMPv6 PTB. If you activate the allfrag feature 
on a destination because you received a PTB with PMTU=400, it's not 
useful to include the Fragmentation header in packets <400 bytes in 
size, for example.

With the current code, even tiny packets without any L4 payload (e.g. 
TCP SYN/ACK) will have the Fragmentation header added. That's not 
optimal, but necessary since the current code does not keep track of the 
actual PMTU if it is <1280 as far as I can tell (instead it discards the 
received PMTU value, clamps the effective PMTU to 1280, and enables 
allfrag).

Tore
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 27, 2012, 4:03 a.m. UTC | #21
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 24 Apr 2012 19:37:38 +0200

> From: Eric Dumazet <edumazet@google.com>
> 
> Quoting Tore Anderson from :
> https://bugzilla.kernel.org/show_bug.cgi?id=42572
> 
> When RTAX_FEATURE_ALLFRAG is set on a route, the effective TCP segment
> size does not take into account the size of the IPv6 Fragmentation
> header that needs to be included in outbound packets, causing every
> transmitted TCP segment to be fragmented across two IPv6 packets, the
> latter of which will only contain 8 bytes of actual payload.
> 
> RTAX_FEATURE_ALLFRAG is typically set on a route in response to
> receving a ICMPv6 Packet Too Big message indicating a Path MTU of less
> than 1280 bytes. 1280 bytes is the minimum IPv6 MTU, however ICMPv6
> PTBs with MTU < 1280 are still valid, in particular when an IPv6
> packet is sent to an IPv4 destination through a stateless translator.
> Any ICMPv4 Need To Fragment packets originated from the IPv4 part of
> the path will be translated to ICMPv6 PTB which may then indicate an
> MTU of less than 1280.
> 
> The Linux kernel refuses to reduce the effective MTU to anything below
> 1280 bytes, instead it sets it to exactly 1280 bytes, and
> RTAX_FEATURE_ALLFRAG is also set. However, the TCP segment size appears
> to be set to 1240 bytes (1280 Path MTU - 40 bytes of IPv6 header),
> instead of 1232 (additionally taking into account the 8 bytes required
> by the IPv6 Fragmentation extension header).
> 
> This in turn results in rather inefficient transmission, as every 
> transmitted TCP segment now is split in two fragments containing
> 1232+8 bytes of payload.
> 
> After this patch, all the outgoing packets that includes a
> Fragmentation header all are "atomic" or "non-fragmented" fragments,
> i.e., they both have Offset=0 and More Fragments=0.
> 
> With help from David S. Miller
> 
> Reported-by: Tore Anderson <tore@fud.no>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Maciej Żenczykowski <maze@google.com>
> Cc: Tom Herbert <therbert@google.com>
> Tested-by: Tore Anderson <tore@fud.no>

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 46c9e2c..7d83f90 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -45,6 +45,7 @@  struct inet_connection_sock_af_ops {
 				      struct dst_entry *dst);
 	struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it);
 	u16	    net_header_len;
+	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
 	int	    (*setsockopt)(struct sock *sk, int level, int optname, 
 				  char __user *optval, unsigned int optlen);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index fc880e9..0fb84de 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -544,8 +544,8 @@  extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
-extern int tcp_mtu_to_mss(const struct sock *sk, int pmtu);
-extern int tcp_mss_to_mtu(const struct sock *sk, int mss);
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
 extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7b7cf38..834e89f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1150,7 +1150,7 @@  int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 }
 
 /* Calculate MSS. Not accounting for SACKs here.  */
-int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1161,6 +1161,14 @@  int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
 	 */
 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 
+	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+	if (icsk->icsk_af_ops->net_frag_header_len) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_allfrag(dst))
+			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+	}
+
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
 		mss_now = tp->rx_opt.mss_clamp;
@@ -1179,7 +1187,7 @@  int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
 }
 
 /* Inverse of above */
-int tcp_mss_to_mtu(const struct sock *sk, int mss)
+int tcp_mss_to_mtu(struct sock *sk, int mss)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1190,6 +1198,13 @@  int tcp_mss_to_mtu(const struct sock *sk, int mss)
 	      icsk->icsk_ext_hdr_len +
 	      icsk->icsk_af_ops->net_header_len;
 
+	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+	if (icsk->icsk_af_ops->net_frag_header_len) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_allfrag(dst))
+			mtu += icsk->icsk_af_ops->net_frag_header_len;
+	}
 	return mtu;
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cdbf292..57b2109 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1778,6 +1778,7 @@  static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
 	.get_peer	   = tcp_v6_get_peer,
 	.net_header_len	   = sizeof(struct ipv6hdr),
+	.net_frag_header_len = sizeof(struct frag_hdr),
 	.setsockopt	   = ipv6_setsockopt,
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,