diff mbox series

[RFC,netnext] mptcp: Use 32-bit DATA_ACK when possible

Message ID 20200513053257.8836-1-cpaasch@apple.com
State Superseded, archived
Delegated to: Paolo Abeni
Headers show
Series [RFC,netnext] mptcp: Use 32-bit DATA_ACK when possible | expand

Commit Message

Christoph Paasch May 13, 2020, 5:32 a.m. UTC
RFC8684 allows to send 32-bit DATA_ACKs as long as the peer is not
sending 64-bit data-sequence numbers. The 64-bit DSN is only there for
extreme scenarios when a very high throughput subflow is combined with a
long-RTT subflow such that the high-throughput subflow wraps around the
32-bit sequence number space within an RTT of the high-RTT subflow.

It is thus a rare scenario and we should try to use the 32-bit DATA_ACK
instead as long as possible. It allows to reduce the TCP-option overhead
by 4 bytes, thus makes space for an additional SACK-block. It also makes
tcpdumps much easier to read when the DSN and DATA_ACK are both either
32 or 64-bit.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
---
 include/net/mptcp.h  |  1 +
 net/mptcp/options.c  | 33 ++++++++++++++++++++++++---------
 net/mptcp/protocol.h |  1 +
 net/mptcp/subflow.c  |  2 ++
 4 files changed, 28 insertions(+), 9 deletions(-)

Comments

Paolo Abeni May 13, 2020, 7:16 a.m. UTC | #1
On Tue, 2020-05-12 at 22:32 -0700, Christoph Paasch wrote:
> RFC8684 allows to send 32-bit DATA_ACKs as long as the peer is not
> sending 64-bit data-sequence numbers. The 64-bit DSN is only there for
> extreme scenarios when a very high throughput subflow is combined with a
> long-RTT subflow such that the high-throughput subflow wraps around the
> 32-bit sequence number space within an RTT of the high-RTT subflow.
> 
> It is thus a rare scenario and we should try to use the 32-bit DATA_ACK
> instead as long as possible. It allows to reduce the TCP-option overhead
> by 4 bytes, thus makes space for an additional SACK-block. It also makes
> tcpdumps much easier to read when the DSN and DATA_ACK are both either
> 32 or 64-bit.
> 
> Signed-off-by: Christoph Paasch <cpaasch@apple.com>
> ---
>  include/net/mptcp.h  |  1 +
>  net/mptcp/options.c  | 33 ++++++++++++++++++++++++---------
>  net/mptcp/protocol.h |  1 +
>  net/mptcp/subflow.c  |  2 ++
>  4 files changed, 28 insertions(+), 9 deletions(-)
> 
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index e60275659de6..25d8b36ed94f 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -17,6 +17,7 @@ struct seq_file;
>  /* MPTCP sk_buff extension data */
>  struct mptcp_ext {
>  	u64		data_ack;
> +	u32		data_ack32;

I think the above 2 fields could be "union-ed".
I think it's preferrable to avoid growing mptcp_ext size.

>  	u64		data_seq;
>  	u32		subflow_seq;
>  	u16		data_len;
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 45497af23906..ece6f92cf7d1 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -516,7 +516,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  		return ret;
>  	}
>  
> -	ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> +	if (subflow->use_64bit_ack) {
> +		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> +		opts->ext_copy.data_ack = msk->ack_seq;
> +		opts->ext_copy.ack64 = 1;
> +	} else {
> +		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
> +		opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq);
> +		opts->ext_copy.ack64 = 0;
> +	}
> +	opts->ext_copy.use_ack = 1;

If I read correctly, 'use_64bit_ack' is initialized with 0 and only set
if the peer sends 64bits sequence number. When interacting with impl
using 32bits DSN, our stack will flip the behavior at least once for
passive socket:

MPC+data is internally translated into DSS with 64bit DSN, so first
passive socket ack will use 64 bits.

Later, if the client uses 32 bits acks, it will switch to 32 bit acks.

Is that ok?

>  	/* Add kind/length/subtype/flag overhead if mapping is not populated */
>  	if (dss_size == 0)
> @@ -524,10 +533,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  
>  	dss_size += ack_size;
>  
> -	opts->ext_copy.data_ack = msk->ack_seq;
> -	opts->ext_copy.ack64 = 1;
> -	opts->ext_copy.use_ack = 1;
> -
>  	*size = ALIGN(dss_size, 4);
>  	return true;
>  }
> @@ -986,8 +991,13 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
>  		u8 flags = 0;
>  
>  		if (mpext->use_ack) {
> -			len += TCPOLEN_MPTCP_DSS_ACK64;
> -			flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
> +			flags = MPTCP_DSS_HAS_ACK;
> +			if (mpext->ack64) {
> +				len += TCPOLEN_MPTCP_DSS_ACK64;
> +				flags |= MPTCP_DSS_ACK64;
> +			} else {
> +				len += TCPOLEN_MPTCP_DSS_ACK32;
> +			}
>  		}
>  
>  		if (mpext->use_map) {
> @@ -1004,8 +1014,13 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
>  		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
>  
>  		if (mpext->use_ack) {
> -			put_unaligned_be64(mpext->data_ack, ptr);
> -			ptr += 2;
> +			if (mpext->ack64) {
> +				put_unaligned_be64(mpext->data_ack, ptr);
> +				ptr += 2;
> +			} else {
> +				put_unaligned_be32(mpext->data_ack32, ptr);
> +				ptr += 1;
> +			}
>  		}
>  
>  		if (mpext->use_map) {
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index e4ca6320ce76..f5adca93e8fb 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -290,6 +290,7 @@ struct mptcp_subflow_context {
>  		data_avail : 1,
>  		rx_eof : 1,
>  		data_fin_tx_enable : 1,
> +		use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
>  		can_ack : 1;	    /* only after processing the remote a key */
>  	u64	data_fin_tx_seq;
>  	u32	remote_nonce;

Possibly with a wider scope than this patch, but the above bitfields
contains both fields that are rarely changed (data_fin_tx_enable,
can_ack, request_mptcp, etc...) and fields that are set on every packet
/every DSS (map_vaild, data_avail, the newly added use_64bit_ack).

At least 'data_avail' is accessed without holding the ssk socket lock
by mptcp_subflow_recv_lookup(). Perhaps we could revisit the binary
layout of this struct?

Other than the above LGTM!

Cheers,

Paolo
Christoph Paasch May 13, 2020, 6:01 p.m. UTC | #2
On 13/05/20 - 09:16:54, Paolo Abeni wrote:
> On Tue, 2020-05-12 at 22:32 -0700, Christoph Paasch wrote:
> > RFC8684 allows to send 32-bit DATA_ACKs as long as the peer is not
> > sending 64-bit data-sequence numbers. The 64-bit DSN is only there for
> > extreme scenarios when a very high throughput subflow is combined with a
> > long-RTT subflow such that the high-throughput subflow wraps around the
> > 32-bit sequence number space within an RTT of the high-RTT subflow.
> > 
> > It is thus a rare scenario and we should try to use the 32-bit DATA_ACK
> > instead as long as possible. It allows to reduce the TCP-option overhead
> > by 4 bytes, thus makes space for an additional SACK-block. It also makes
> > tcpdumps much easier to read when the DSN and DATA_ACK are both either
> > 32 or 64-bit.
> > 
> > Signed-off-by: Christoph Paasch <cpaasch@apple.com>
> > ---
> >  include/net/mptcp.h  |  1 +
> >  net/mptcp/options.c  | 33 ++++++++++++++++++++++++---------
> >  net/mptcp/protocol.h |  1 +
> >  net/mptcp/subflow.c  |  2 ++
> >  4 files changed, 28 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> > index e60275659de6..25d8b36ed94f 100644
> > --- a/include/net/mptcp.h
> > +++ b/include/net/mptcp.h
> > @@ -17,6 +17,7 @@ struct seq_file;
> >  /* MPTCP sk_buff extension data */
> >  struct mptcp_ext {
> >  	u64		data_ack;
> > +	u32		data_ack32;
> 
> I think the above 2 fields could be "union-ed".
> I think it's preferrable to avoid growing mptcp_ext size.

Sounds good!

> 
> >  	u64		data_seq;
> >  	u32		subflow_seq;
> >  	u16		data_len;
> > diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> > index 45497af23906..ece6f92cf7d1 100644
> > --- a/net/mptcp/options.c
> > +++ b/net/mptcp/options.c
> > @@ -516,7 +516,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> >  		return ret;
> >  	}
> >  
> > -	ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> > +	if (subflow->use_64bit_ack) {
> > +		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
> > +		opts->ext_copy.data_ack = msk->ack_seq;
> > +		opts->ext_copy.ack64 = 1;
> > +	} else {
> > +		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
> > +		opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq);
> > +		opts->ext_copy.ack64 = 0;
> > +	}
> > +	opts->ext_copy.use_ack = 1;
> 
> If I read correctly, 'use_64bit_ack' is initialized with 0 and only set
> if the peer sends 64bits sequence number. When interacting with impl
> using 32bits DSN, our stack will flip the behavior at least once for
> passive socket:
> 
> MPC+data is internally translated into DSS with 64bit DSN, so first
> passive socket ack will use 64 bits.
> 
> Later, if the client uses 32 bits acks, it will switch to 32 bit acks.
> 
> Is that ok?

Yes, I saw that behavior in my tests. I think it is ok. For the initial data
there won't be a SACK-block, so nothing is lost there.

> >  	/* Add kind/length/subtype/flag overhead if mapping is not populated */
> >  	if (dss_size == 0)
> > @@ -524,10 +533,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> >  
> >  	dss_size += ack_size;
> >  
> > -	opts->ext_copy.data_ack = msk->ack_seq;
> > -	opts->ext_copy.ack64 = 1;
> > -	opts->ext_copy.use_ack = 1;
> > -
> >  	*size = ALIGN(dss_size, 4);
> >  	return true;
> >  }
> > @@ -986,8 +991,13 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
> >  		u8 flags = 0;
> >  
> >  		if (mpext->use_ack) {
> > -			len += TCPOLEN_MPTCP_DSS_ACK64;
> > -			flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
> > +			flags = MPTCP_DSS_HAS_ACK;
> > +			if (mpext->ack64) {
> > +				len += TCPOLEN_MPTCP_DSS_ACK64;
> > +				flags |= MPTCP_DSS_ACK64;
> > +			} else {
> > +				len += TCPOLEN_MPTCP_DSS_ACK32;
> > +			}
> >  		}
> >  
> >  		if (mpext->use_map) {
> > @@ -1004,8 +1014,13 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
> >  		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
> >  
> >  		if (mpext->use_ack) {
> > -			put_unaligned_be64(mpext->data_ack, ptr);
> > -			ptr += 2;
> > +			if (mpext->ack64) {
> > +				put_unaligned_be64(mpext->data_ack, ptr);
> > +				ptr += 2;
> > +			} else {
> > +				put_unaligned_be32(mpext->data_ack32, ptr);
> > +				ptr += 1;
> > +			}
> >  		}
> >  
> >  		if (mpext->use_map) {
> > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > index e4ca6320ce76..f5adca93e8fb 100644
> > --- a/net/mptcp/protocol.h
> > +++ b/net/mptcp/protocol.h
> > @@ -290,6 +290,7 @@ struct mptcp_subflow_context {
> >  		data_avail : 1,
> >  		rx_eof : 1,
> >  		data_fin_tx_enable : 1,
> > +		use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
> >  		can_ack : 1;	    /* only after processing the remote a key */
> >  	u64	data_fin_tx_seq;
> >  	u32	remote_nonce;
> 
> Possibly with a wider scope than this patch, but the above bitfields
> contains both fields that are rarely changed (data_fin_tx_enable,
> can_ack, request_mptcp, etc...) and fields that are set on every packet
> /every DSS (map_vaild, data_avail, the newly added use_64bit_ack).
> 
> At least 'data_avail' is accessed without holding the ssk socket lock
> by mptcp_subflow_recv_lookup(). Perhaps we could revisit the binary
> layout of this struct?

Yes, that might be good: https://github.com/multipath-tcp/mptcp_net-next/issues/24

> Other than the above LGTM!

Cool! I will make the union and send to netnext if there are no other
comments here on the list.


Thanks,
Christoph
diff mbox series

Patch

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index e60275659de6..25d8b36ed94f 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -17,6 +17,7 @@  struct seq_file;
 /* MPTCP sk_buff extension data */
 struct mptcp_ext {
 	u64		data_ack;
+	u32		data_ack32;
 	u64		data_seq;
 	u32		subflow_seq;
 	u16		data_len;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 45497af23906..ece6f92cf7d1 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -516,7 +516,16 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 		return ret;
 	}
 
-	ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+	if (subflow->use_64bit_ack) {
+		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+		opts->ext_copy.data_ack = msk->ack_seq;
+		opts->ext_copy.ack64 = 1;
+	} else {
+		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
+		opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq);
+		opts->ext_copy.ack64 = 0;
+	}
+	opts->ext_copy.use_ack = 1;
 
 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
 	if (dss_size == 0)
@@ -524,10 +533,6 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 
 	dss_size += ack_size;
 
-	opts->ext_copy.data_ack = msk->ack_seq;
-	opts->ext_copy.ack64 = 1;
-	opts->ext_copy.use_ack = 1;
-
 	*size = ALIGN(dss_size, 4);
 	return true;
 }
@@ -986,8 +991,13 @@  void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 		u8 flags = 0;
 
 		if (mpext->use_ack) {
-			len += TCPOLEN_MPTCP_DSS_ACK64;
-			flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
+			flags = MPTCP_DSS_HAS_ACK;
+			if (mpext->ack64) {
+				len += TCPOLEN_MPTCP_DSS_ACK64;
+				flags |= MPTCP_DSS_ACK64;
+			} else {
+				len += TCPOLEN_MPTCP_DSS_ACK32;
+			}
 		}
 
 		if (mpext->use_map) {
@@ -1004,8 +1014,13 @@  void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
 
 		if (mpext->use_ack) {
-			put_unaligned_be64(mpext->data_ack, ptr);
-			ptr += 2;
+			if (mpext->ack64) {
+				put_unaligned_be64(mpext->data_ack, ptr);
+				ptr += 2;
+			} else {
+				put_unaligned_be32(mpext->data_ack32, ptr);
+				ptr += 1;
+			}
 		}
 
 		if (mpext->use_map) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e4ca6320ce76..f5adca93e8fb 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -290,6 +290,7 @@  struct mptcp_subflow_context {
 		data_avail : 1,
 		rx_eof : 1,
 		data_fin_tx_enable : 1,
+		use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
 		can_ack : 1;	    /* only after processing the remote a key */
 	u64	data_fin_tx_seq;
 	u32	remote_nonce;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 009d5c478062..f22dad482cd4 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -661,9 +661,11 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 	if (!mpext->dsn64) {
 		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
 				     mpext->data_seq);
+		subflow->use_64bit_ack = 0;
 		pr_debug("expanded seq=%llu", subflow->map_seq);
 	} else {
 		map_seq = mpext->data_seq;
+		subflow->use_64bit_ack = 1;
 	}
 
 	if (subflow->map_valid) {