
MPTCP stream performances

Message ID aa8d90d38dbbb5f97a5a883389378d979f32b186.camel@redhat.com
State RFC, archived
Series MPTCP stream performances

Commit Message

Paolo Abeni Oct. 26, 2020, 3:46 p.m. UTC
Hi all,

as reported in yesterday's meeting, I'm doing perf tests comparing MPTCP
(single stream) vs TCP on both the net-next and export branches.

Tests were done on loopback using netperf and:

https://github.com/pabeni/mptcp-tools/tree/master/use_mptcp

branch		TCP	MPTCP	delta
		(Gbps)	(Gbps)	(%)

net-next	38	24	-36
export		38	[3,13]	[-92,-65]

As said, on the export branch the netperf throughput decreases by one
order of magnitude or more after some variable amount of time. Since
netperf reports the average over the selected test time, the reported
throughput is both variable and very low.

So far I could not find a reasonable way to investigate this. Dumping
the traffic or enabling debug makes the issue disappear. Even 'perf'
has been misleading so far: without CPU pinning, 'perf' may end up
running on the same CPU as 'netperf', making the timing inside the
trace not relevant. With CPU pinning (netserver, netperf and perf
running on different cores) I can't reproduce the issue.

Anyhow, it looks like the receiver is not sending (or is delaying) some
required acks, specifically when a recvmsg() syscall causes a relevant
growth of the receive window.

If the receiver MPTCP rcv window is full and the data is sitting
inside the msk receive queue, we should send some additional ack when
memory is freed by recvmsg(), but currently we don't.

Additionally, the current tcp_cleanup_rbuf()/mptcp_send_ack() hooks
in __mptcp_move_skbs_from_subflow()/__mptcp_move_skbs() are called while
the data is still sitting in the msk receive queue, so they can't catch
the above corner case.
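
The idea boils down to something like the following (illustrative
helper, name made up; the actual hunk is in mptcp_recvmsg() in the
patch below):

static void mptcp_recv_window_check(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	int old_space = READ_ONCE(msk->old_wspace);

	/* the space we can announce has (at least) doubled since the
	 * last DSS ack: push a window update via tcp_cleanup_rbuf()
	 */
	if (tcp_space(sk) - old_space >= old_space)
		mptcp_send_ack(msk, false);
}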

The following hackish patch somewhat addresses the issue, but it has
some possibly serious problems:
- It needs to track the announced space - here a subject-to-race
WRITE_ONCE is used, but we will need another atomic operation to do
that properly.
- It creates a potentially high number of dup acks due to the
unconditional tcp_send_ack(). It looks like tcp_cleanup_rbuf() is not
enough / is fooled too easily by the problem described above.

I *think* (wild guessing) that a more correct solution would be:
- removing the workqueue, and
- ensuring that in-sequence data at the subflow level is either moved to
the msk or dropped (and not kept in the subflow because the msk receive
queue is full).

With the above 2 things in place, subflow_data_ready() will always be
able to update msk->ack_seq, and we should not need any later additional
ack, except for window opening.

I'm pretty sure the above is far from clear, but please send any kind
of feedback; I really do feel this is a serious problem.

Thanks!

Paolo

---

Comments

Paolo Abeni Oct. 27, 2020, 11:18 a.m. UTC | #1
Hello,

adding some comments to give a little more context. This is getting
close to my stream of consciousness, so I guess it will not be more
easily readable than 'Ulysses'...

On Mon, 2020-10-26 at 16:46 +0100, Paolo Abeni wrote:
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 4e02c259134f..2d07d099d038 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  		opts->ext_copy.ack64 = 0;
>  	}
>  	opts->ext_copy.use_ack = 1;
> +	WRITE_ONCE(msk->old_wspace, tcp_space((struct sock *)msk));

Here a cmpxchg would be needed to make the above safe, but at worst we
will have an additional unneeded call to tcp_cleanup_rbuf(), and the
latter will avoid dup acks, so I think it's not tragic.
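
One possible shape for that (sketch only, not part of the patch;
assumes old_wspace stays a plain int and that the recorded value should
never move backwards when writers race):

static void mptcp_set_old_wspace(struct mptcp_sock *msk, int new_space)
{
	int old = READ_ONCE(msk->old_wspace);

	/* retry only if another subflow updated the field under us */
	while (old < new_space &&
	       cmpxchg(&msk->old_wspace, old, new_space) != old)
		old = READ_ONCE(msk->old_wspace);
}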

>  
>  	/* Add kind/length/subtype/flag overhead if mapping is not populated */
>  	if (dss_size == 0)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index e69852fa32e2..328aa98c51f2 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -406,7 +406,7 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
>  	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
>  }
>  
> -static void mptcp_send_ack(struct mptcp_sock *msk)
> +static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
>  {
>  	struct mptcp_subflow_context *subflow;
>  
> @@ -414,8 +414,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk)
>  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>  
>  		lock_sock(ssk);
> -		tcp_send_ack(ssk);
> +		if (force)
> +			tcp_send_ack(ssk);
> +		else
> +			tcp_cleanup_rbuf(ssk, 1);
>  		release_sock(ssk);
> +		if (!force)
> +			break;
>  	}
>  }
>  
> @@ -467,7 +472,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
>  
>  		ret = true;
>  		mptcp_set_timeout(sk, NULL);
> -		mptcp_send_ack(msk);
> +		mptcp_send_ack(msk, true);
>  		if (sock_flag(sk, SOCK_DEAD))
>  			return ret;
>  
> @@ -490,7 +495,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  	unsigned int moved = 0;
>  	bool more_data_avail;
>  	struct tcp_sock *tp;
> -	u32 old_copied_seq;
>  	bool done = false;
>  	int sk_rbuf;
>  
> @@ -507,7 +511,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  
>  	pr_debug("msk=%p ssk=%p", msk, ssk);
>  	tp = tcp_sk(ssk);
> -	old_copied_seq = tp->copied_seq;
>  	do {
>  		u32 map_remaining, offset;
>  		u32 seq = tp->copied_seq;
> @@ -572,10 +575,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  		}
>  	} while (more_data_avail);
>  
> -	*bytes += moved;
> -	if (tp->copied_seq != old_copied_seq)
> -		tcp_cleanup_rbuf(ssk, 1);

__mptcp_move_skbs_from_subflow() is invoked by the subflow receive
path, via subflow_data_ready()/mptcp_data_ready() and by
mptcp_recvmsg(), via __mptcp_move_skbs().

In the first case we don't need to send additional acks: the TCP stack
will do that for us in tcp_rcv_established() via tcp_ack_snd_check(),
just after msk->ack_seq has been updated.

To cover the second case, it is better to move the ack hook elsewhere;
see below.

Note: in tcp_rcv_established() we always hit the TCP 'slow path', as
TCP header prediction always fails for packets carrying any type of
MPTCP options. We could change that, e.g. by including a DSS64 in the
predicted header len, but then the TCP ack would be generated _before_
msk->ack_seq is updated, so we are better off with header
misprediction ;)

> -
> +	*bytes = moved;
>  	return done;
>  }
>  
> @@ -1526,7 +1526,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
>  
>  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>  {
> -	unsigned int moved = 0;
> +	unsigned int moved_total = 0;
>  	bool done;
>  
>  	/* avoid looping forever below on racing close */
> @@ -1536,6 +1536,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>  	__mptcp_flush_join_list(msk);
>  	do {
>  		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
> +		unsigned int moved;
>  		bool slowpath;
>  
>  		if (!ssk)
> @@ -1543,12 +1544,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
>  
>  		slowpath = lock_sock_fast(ssk);
>  		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
> +		if (moved)
> +			tcp_send_ack(ssk);

This may generate a high volume of TCP dup acks, but calling
tcp_cleanup_rbuf() here is not enough, because the packets are still
accounted to the msk receive buffer: the msk receive window is not
changed by the above call, and neither is the subflow receive window.
tcp_cleanup_rbuf() will likely _not_ generate a new ack when needed.

Even replacing tcp_send_ack() with __tcp_ack_snd_check() will not
improve much: since the rcv window is not changed and the relevant
segment has already been acked at the TCP level, such a call will either
generate a dup ack or no ack at all. Additionally, __tcp_ack_snd_check()
is currently static.

All the above makes me think that trying to remove the workqueue from
the receive path could be a better solution: if incoming pkts are
always moved into the msk receive queue by subflow_data_ready(), the
ACK packets generated by the TCP stack will be enough to cover all the
above, without any need for dup acks.

With a caveat: currently we leave some packets in the subflow receive
queue while the msk receive queue is full. Possibly keeping the
incoming skbs always accounted to the ingress ssk would avoid that
case - currently we move the accounting from ssk to msk
in __mptcp_move_skbs_from_subflow(). Handling the skb freeing at receive
time and at subflow teardown will not be trivial.
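
For reference, the accounting move looks conceptually like this
(heavily simplified, helper name made up; the real code is in
__mptcp_move_skbs_from_subflow()):

static void move_skb_charge_to_msk(struct sock *msk_sk, struct sk_buff *skb)
{
	skb_orphan(skb);		/* uncharge the ingress ssk */
	skb_set_owner_r(skb, msk_sk);	/* charge the msk receive buffer */
	__skb_queue_tail(&msk_sk->sk_receive_queue, skb);
}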

/P
Florian Westphal Oct. 27, 2020, 7:19 p.m. UTC | #2
Paolo Abeni <pabeni@redhat.com> wrote:
> On Mon, 2020-10-26 at 16:46 +0100, Paolo Abeni wrote:
> > diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> > index 4e02c259134f..2d07d099d038 100644
> > --- a/net/mptcp/options.c
> > +++ b/net/mptcp/options.c
> > @@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
> >  		opts->ext_copy.ack64 = 0;
> >  	}
> >  	opts->ext_copy.use_ack = 1;
> > +	WRITE_ONCE(msk->old_wspace, tcp_space((struct sock *)msk));
> 
> Here a cmpxchg would be needed to make the above safe, but at worst we
> will have an additional unneeded call to tcp_cleanup_rbuf(), and the
> latter will avoid dup acks, so I think it's not tragic.

Agree.

> __mptcp_move_skbs_from_subflow() is invoked by the subflow receive
> path, via subflow_data_ready()/mptcp_data_ready() and by
> mptcp_recvmsg(), via __mptcp_move_skbs().
> 
> In the first case we don't need to send additional acks: the TCP stack
> will do that for us in tcp_rcv_established() via tcp_ack_snd_check(),
> just after msk->ack_seq has been updated.
> To cover the second case, it is better to move the ack hook elsewhere;
> see below.

[..]

> > -
> > +	*bytes = moved;
> >  	return done;
> >  }
> >  
> > @@ -1526,7 +1526,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
> >  
> >  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> >  {
> > -	unsigned int moved = 0;
> > +	unsigned int moved_total = 0;
> >  	bool done;
> >  
> >  	/* avoid looping forever below on racing close */
> > @@ -1536,6 +1536,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> >  	__mptcp_flush_join_list(msk);
> >  	do {
> >  		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
> > +		unsigned int moved;
> >  		bool slowpath;
> >  
> >  		if (!ssk)
> > @@ -1543,12 +1544,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
> >  
> >  		slowpath = lock_sock_fast(ssk);
> >  		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
> > +		if (moved)
> > +			tcp_send_ack(ssk);
> 
> This may generate a high volume of TCP dup acks, but calling
> tcp_cleanup_rbuf() here is not enough, because the packets are still
> accounted to the msk receive buffer: the msk receive window is not
> changed by the above call, and neither is the subflow receive window.
> tcp_cleanup_rbuf() will likely _not_ generate a new ack when needed.

Hmm, right.

> Even replacing tcp_send_ack() with __tcp_ack_snd_check() will not
> improve much: since the rcv window is not changed and the relevant
> segment has already been acked at the TCP level, such a call will either
> generate a dup ack or no ack at all. Additionally, __tcp_ack_snd_check()
> is currently static.
> 
> All the above makes me think that trying to remove the workqueue from
> the receive path could be a better solution: if incoming pkts are
> always moved into the msk receive queue by subflow_data_ready(), the
> ACK packets generated by the TCP stack will be enough to cover all the
> above, without any need for dup acks.

Agree, we will need to either remove or largely avoid the mptcp-level
ack_seq advancement: waiting for the work queue to schedule makes the
mptcp-level ack updates too bursty, so we should try to send those
updates more smoothly.

> With a caveat: currently we leave some packets in the subflow receive
> queue while the msk receive queue is full. Possibly keeping the
> incoming skbs always accounted to the ingress ssk would avoid that
> case - currently we move the accounting from ssk to msk
> in __mptcp_move_skbs_from_subflow(). Handling the skb freeing at receive
> time and at subflow teardown will not be trivial.

The problem is that we rely on the mptcp-socket accounting as well for
the receive window.

I guess you should try to make this patch less hackish, probably by
completely avoiding the __mptcp_move_skbs() change, and then try to
change the rx path to place skbs into the msk queue directly.

What about a 'pre queue' with a dedicated lock, and then have recv() or
the worker splice that into the msk rx queue?

This would be similar to what UDP is doing.
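
For illustration, such a pre queue could look roughly like this (sketch
only; names are hypothetical, and memory accounting, wakeups and
subflow teardown are glossed over - similar in spirit to UDP's
reader_queue):

struct mptcp_pre_queue {
	spinlock_t		lock;
	struct sk_buff_head	queue;
};

/* softirq side: subflow_data_ready() appends under the dedicated lock */
static void mptcp_preq_enqueue(struct mptcp_pre_queue *preq,
			       struct sk_buff *skb)
{
	spin_lock_bh(&preq->lock);
	__skb_queue_tail(&preq->queue, skb);
	spin_unlock_bh(&preq->lock);
}

/* reader side: recv() or the worker, with the msk socket lock held,
 * splices the pre queue into the msk receive queue; the dedicated lock
 * is held only for the O(1) splice
 */
static void mptcp_preq_splice(struct sock *sk, struct mptcp_pre_queue *preq)
{
	struct sk_buff_head tmp;

	__skb_queue_head_init(&tmp);

	spin_lock_bh(&preq->lock);
	skb_queue_splice_tail_init(&preq->queue, &tmp);
	spin_unlock_bh(&preq->lock);

	skb_queue_splice_tail_init(&tmp, &sk->sk_receive_queue);
}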

Patch

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 4e02c259134f..2d07d099d038 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -530,6 +530,7 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 		opts->ext_copy.ack64 = 0;
 	}
 	opts->ext_copy.use_ack = 1;
+	WRITE_ONCE(msk->old_wspace, tcp_space((struct sock *)msk));
 
 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
 	if (dss_size == 0)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e69852fa32e2..328aa98c51f2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -406,7 +406,7 @@  static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
 	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
 }
 
-static void mptcp_send_ack(struct mptcp_sock *msk)
+static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
 {
 	struct mptcp_subflow_context *subflow;
 
@@ -414,8 +414,13 @@  static void mptcp_send_ack(struct mptcp_sock *msk)
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
 		lock_sock(ssk);
-		tcp_send_ack(ssk);
+		if (force)
+			tcp_send_ack(ssk);
+		else
+			tcp_cleanup_rbuf(ssk, 1);
 		release_sock(ssk);
+		if (!force)
+			break;
 	}
 }
 
@@ -467,7 +472,7 @@  static bool mptcp_check_data_fin(struct sock *sk)
 
 		ret = true;
 		mptcp_set_timeout(sk, NULL);
-		mptcp_send_ack(msk);
+		mptcp_send_ack(msk, true);
 		if (sock_flag(sk, SOCK_DEAD))
 			return ret;
 
@@ -490,7 +495,6 @@  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 	unsigned int moved = 0;
 	bool more_data_avail;
 	struct tcp_sock *tp;
-	u32 old_copied_seq;
 	bool done = false;
 	int sk_rbuf;
 
@@ -507,7 +511,6 @@  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 	pr_debug("msk=%p ssk=%p", msk, ssk);
 	tp = tcp_sk(ssk);
-	old_copied_seq = tp->copied_seq;
 	do {
 		u32 map_remaining, offset;
 		u32 seq = tp->copied_seq;
@@ -572,10 +575,7 @@  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		}
 	} while (more_data_avail);
 
-	*bytes += moved;
-	if (tp->copied_seq != old_copied_seq)
-		tcp_cleanup_rbuf(ssk, 1);
-
+	*bytes = moved;
 	return done;
 }
 
@@ -1526,7 +1526,7 @@  static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 
 static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 {
-	unsigned int moved = 0;
+	unsigned int moved_total = 0;
 	bool done;
 
 	/* avoid looping forever below on racing close */
@@ -1536,6 +1536,7 @@  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
+		unsigned int moved;
 		bool slowpath;
 
 		if (!ssk)
@@ -1543,12 +1544,14 @@  static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 
 		slowpath = lock_sock_fast(ssk);
 		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+		if (moved)
+			tcp_send_ack(ssk);
 		unlock_sock_fast(ssk, slowpath);
+		moved_total += moved;
 	} while (!done);
 
-	if (mptcp_ofo_queue(msk) || moved > 0) {
-		if (!mptcp_check_data_fin((struct sock *)msk))
-			mptcp_send_ack(msk);
+	if (mptcp_ofo_queue(msk) || moved_total > 0) {
+		mptcp_check_data_fin((struct sock *)msk);
 		return true;
 	}
 	return false;
@@ -1573,7 +1576,7 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	__mptcp_flush_join_list(msk);
 
 	while (len > (size_t)copied) {
-		int bytes_read;
+		int bytes_read, old_space;
 
 		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
 		if (unlikely(bytes_read < 0)) {
@@ -1588,6 +1591,11 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    __mptcp_move_skbs(msk))
 			continue;
 
+		/* be sure to advertize window change */
+		old_space = READ_ONCE(msk->old_wspace);
+		if ((tcp_space(sk) - old_space) >= old_space)
+			mptcp_send_ack(msk, false);
+
 		/* only the master socket status is relevant here. The exit
 		 * conditions mirror closely tcp_recvmsg()
 		 */
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 5117093a4de4..091ecb586f58 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -220,6 +220,7 @@  struct mptcp_sock {
 	u64		rcv_data_fin_seq;
 	struct sock	*last_snd;
 	int		snd_burst;
+	int		old_wspace;
 	atomic64_t	snd_una;
 	atomic64_t	wnd_end;
 	unsigned long	timer_ival;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b210bc92fc32..07e44ffbeb30 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -850,8 +850,6 @@  static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
 		sk_eat_skb(ssk, skb);
 	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
 		subflow->map_valid = 0;
-	if (incr)
-		tcp_cleanup_rbuf(ssk, incr);
 }
 
 static bool subflow_check_data_avail(struct sock *ssk)