diff mbox series

Re: [PATCH v2 5/6] mptcp: protocol: re-check dsn before reading from subflow

Message ID 20200217221440.GI19559@breakpoint.cc
State Superseded, archived
Delegated to: Paolo Abeni
Headers show
Series Re: [PATCH v2 5/6] mptcp: protocol: re-check dsn before reading from subflow | expand

Commit Message

Florian Westphal Feb. 17, 2020, 10:14 p.m. UTC
Paolo Abeni <pabeni@redhat.com> wrote:
> On Mon, 2020-02-17 at 16:01 +0100, Florian Westphal wrote:
> > mptcp_subflow_data_available() is commonly called via
> > ssk->sk_data_ready(), in this case the mptcp socket lock
> > cannot be acquired.
> > 
> > Therefore, while we can safely discard subflow data that
> > was already received up to msk->ack_seq, we cannot be sure
> > that 'subflow->data_avail' will still be valid at the time
> > userspace wants to read the data -- a previous read on a
> > different subflow might have carried this data already.
> > 
> > In that (unlikely) event, msk->ack_seq will have been updated
> > and will be ahead of the subflow dsn.
> 
> Does the above mean that we can get wake-up events and found no actual
> data in the msk socket? (read will block)
> 
> Something alike:
> - user space is blocked on mptcp_recvmsg()/mptcp_wait_data() ->
> wait_data == true
> - subflow 1 delivers new in order data, wakes the user space which
> start reading
> - before mptcp_recvmsg updates the msk->ack_seq, subflow 2 delivers the
> same DSS as subflow 1, consider that to be in order and sets the
> MPTCP_DATA_READY bit
> - mptcp_recvmsg fills the user-space buffer and completes. Since
> wait_data == true, the MPTCP_DATA_READY bit is left untouched.
> - next mptcp_poll will return EPOLLIN even if no data is really
> available.
> 
> Should we move the 'wait_data = false' initialization inside some inner
> loop?

Hmm, what about this (delta, would squash this to "mptcp: update mptcp ack
sequence from work queue".

In your scenario above, we would now hit the 'msk->rx_queue is empty'
branch, DATA_READY gets cleared:

Comments

Paolo Abeni Feb. 18, 2020, 8:33 a.m. UTC | #1
On Mon, 2020-02-17 at 23:14 +0100, Florian Westphal wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> > On Mon, 2020-02-17 at 16:01 +0100, Florian Westphal wrote:
> > > mptcp_subflow_data_available() is commonly called via
> > > ssk->sk_data_ready(), in this case the mptcp socket lock
> > > cannot be acquired.
> > > 
> > > Therefore, while we can safely discard subflow data that
> > > was already received up to msk->ack_seq, we cannot be sure
> > > that 'subflow->data_avail' will still be valid at the time
> > > userspace wants to read the data -- a previous read on a
> > > different subflow might have carried this data already.
> > > 
> > > In that (unlikely) event, msk->ack_seq will have been updated
> > > and will be ahead of the subflow dsn.
> > 
> > Does the above mean that we can get wake-up events and found no actual
> > data in the msk socket? (read will block)
> > 
> > Something alike:
> > - user space is blocked on mptcp_recvmsg()/mptcp_wait_data() ->
> > wait_data == true
> > - subflow 1 delivers new in order data, wakes the user space which
> > start reading
> > - before mptcp_recvmsg updates the msk->ack_seq, subflow 2 delivers the
> > same DSS as subflow 1, consider that to be in order and sets the
> > MPTCP_DATA_READY bit
> > - mptcp_recvmsg fills the user-space buffer and completes. Since
> > wait_data == true, the MPTCP_DATA_READY bit is left untouched.
> > - next mptcp_poll will return EPOLLIN even if no data is really
> > available.
> > 
> > Should we move the 'wait_data = false' initialization inside some inner
> > loop?
> 
> Hmm, what about this (delta, would squash this to "mptcp: update mptcp ack
> sequence from work queue".
> 
> In your scenario above, we would now hit the 'msk->rx_queue is empty'
> branch, DATA_READY gets cleared:
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -787,10 +787,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  			 int nonblock, int flags, int *addr_len)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> -	bool more_data_avail = true;
> -	bool wait_data = false;
>  	struct socket *ssock;
> -	struct sock *ssk;
>  	int copied = 0;
>  	int target;
>  	long timeo;
> @@ -814,7 +811,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
>  	__mptcp_flush_join_list(msk);
>  
> -	while (more_data_avail) {
> +	while (len > (size_t)copied) {
>  		int bytes_read;
>  
>  		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
> @@ -826,12 +823,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  
>  		copied += bytes_read;
>  
> -		if (skb_queue_empty(&sk->sk_receive_queue)) {
> -			if (!__mptcp_move_skbs(msk))
> -				more_data_avail = false;
> -
> +		if (skb_queue_empty(&sk->sk_receive_queue) &&
> +		    __mptcp_move_skbs(msk))
>  			continue;
> -		}
>  
>  		/* only the master socket status is relevant here. The exit
>  		 * conditions mirror closely tcp_recvmsg()
> @@ -872,26 +866,24 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  		}
>  
>  		pr_debug("block timeout %ld", timeo);
> -		wait_data = true;
>  		mptcp_wait_data(sk, &timeo);
>  		if (unlikely(__mptcp_tcp_fallback(msk)))
>  			goto fallback;
>  	}
>  
> -	if (more_data_avail) {
> -		if (!test_bit(MPTCP_DATA_READY, &msk->flags))
> -			set_bit(MPTCP_DATA_READY, &msk->flags);
> -	} else if (!wait_data) {
> +	if (skb_queue_empty(&sk->sk_receive_queue)) {
> +		/* entire backlog drained, clear DATA_READY. */
>  		clear_bit(MPTCP_DATA_READY, &msk->flags);
>  
> -		/* .. race-breaker: ssk might get new data after last
> -		 * data_available() returns false.
> +		/* .. race-breaker: ssk might have gotten new data
> +		 * after last __mptcp_move_skbs() returned false.
>  		 */
> -		ssk = mptcp_subflow_recv_lookup(msk);
> -		if (unlikely(ssk))
> +		if (unlikely(__mptcp_move_skbs(msk)))
>  			set_bit(MPTCP_DATA_READY, &msk->flags);
> +	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
> +		/* data to read but mptcp_wait_data() cleared DATA_READY */
> +		set_bit(MPTCP_DATA_READY, &msk->flags);
>  	}
> -
>  out_err:
>  	release_sock(sk);
>  	return copied;
> 

LGTM, thanks for takling the concerns of my twisted mind ;)

Cheers,

Paolo
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -787,10 +787,7 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	bool more_data_avail = true;
-	bool wait_data = false;
 	struct socket *ssock;
-	struct sock *ssk;
 	int copied = 0;
 	int target;
 	long timeo;
@@ -814,7 +811,7 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 	__mptcp_flush_join_list(msk);
 
-	while (more_data_avail) {
+	while (len > (size_t)copied) {
 		int bytes_read;
 
 		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
@@ -826,12 +823,9 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		copied += bytes_read;
 
-		if (skb_queue_empty(&sk->sk_receive_queue)) {
-			if (!__mptcp_move_skbs(msk))
-				more_data_avail = false;
-
+		if (skb_queue_empty(&sk->sk_receive_queue) &&
+		    __mptcp_move_skbs(msk))
 			continue;
-		}
 
 		/* only the master socket status is relevant here. The exit
 		 * conditions mirror closely tcp_recvmsg()
@@ -872,26 +866,24 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 
 		pr_debug("block timeout %ld", timeo);
-		wait_data = true;
 		mptcp_wait_data(sk, &timeo);
 		if (unlikely(__mptcp_tcp_fallback(msk)))
 			goto fallback;
 	}
 
-	if (more_data_avail) {
-		if (!test_bit(MPTCP_DATA_READY, &msk->flags))
-			set_bit(MPTCP_DATA_READY, &msk->flags);
-	} else if (!wait_data) {
+	if (skb_queue_empty(&sk->sk_receive_queue)) {
+		/* entire backlog drained, clear DATA_READY. */
 		clear_bit(MPTCP_DATA_READY, &msk->flags);
 
-		/* .. race-breaker: ssk might get new data after last
-		 * data_available() returns false.
+		/* .. race-breaker: ssk might have gotten new data
+		 * after last __mptcp_move_skbs() returned false.
 		 */
-		ssk = mptcp_subflow_recv_lookup(msk);
-		if (unlikely(ssk))
+		if (unlikely(__mptcp_move_skbs(msk)))
 			set_bit(MPTCP_DATA_READY, &msk->flags);
+	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
+		/* data to read but mptcp_wait_data() cleared DATA_READY */
+		set_bit(MPTCP_DATA_READY, &msk->flags);
 	}
-
 out_err:
 	release_sock(sk);
 	return copied;