diff mbox series

[RFC,01/12] mptcp: msk is writable according to msk write space

Message ID aa6bae778983c8f5d02a15e443482fc24ea2a56a.1596216310.git.pabeni@redhat.com
State Superseded, archived
Headers show
Series mptcp: multiple xmit substreams support | expand

Commit Message

Paolo Abeni July 31, 2020, 5:39 p.m. UTC
Currently, when checking for the 'msk is writable' condition,
we look at the individual subflows write space. That works
well while we sends data via a single subflow, but will not
as soon as we will enable concurrent xmit on multiple subflows.

Let's use the msk write space instead.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/mptcp/protocol.c | 42 ++++++++++++++++++++++++------------------
 net/mptcp/subflow.c  |  3 +--
 2 files changed, 25 insertions(+), 20 deletions(-)

Comments

Florian Westphal Aug. 1, 2020, 10:23 p.m. UTC | #1
Paolo Abeni <pabeni@redhat.com> wrote:
> Currently, when checking for the 'msk is writable' condition,
> we look at the individual subflows write space. That works
> well while we sends data via a single subflow, but will not
> as soon as we will enable concurrent xmit on multiple subflows.
> 
> Let's use the msk write space instead.

Why do we need MPTCP_SEND_SPACE bit after this change?

> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  net/mptcp/protocol.c | 42 ++++++++++++++++++++++++------------------
>  net/mptcp/subflow.c  |  3 +--
>  2 files changed, 25 insertions(+), 20 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 3b9ae98c67bb..d51b1d95dfb8 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -612,8 +612,15 @@ static void mptcp_clean_una(struct sock *sk)
>  		sk_mem_reclaim_partial(sk);
>  
>  		/* Only wake up writers if a subflow is ready */
> -		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
> +		if (sk_stream_is_writeable(sk)) {
> +			set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);

mptcp sk has enough wmem, so MPTCP_SEND_SPACE is set.

> @@ -799,21 +806,27 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
>  	return ret;
>  }
>  
> -static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
> +static void mptcp_nospace(struct sock *sk, struct sock *ssk)
>  {
> +	struct mptcp_sock *msk = mptcp_sk(sk);
> +	struct socket *sock;
> +
>  	clear_bit(MPTCP_SEND_SPACE, &msk->flags);

This clears it, helper is called when msk wmem occupancy is too large.

> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 7e838bc6e364..bd2201d62ae6 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -926,8 +926,7 @@ static void subflow_write_space(struct sock *sk)
>  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
>  	struct sock *parent = subflow->conn;
>  
> -	sk_stream_write_space(sk);
> -	if (sk_stream_is_writeable(sk)) {
> +	if (sk_stream_is_writeable(parent)) {
>  		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);

here, its set again when wmem occupancy of msk is low enough again.

So, AFAICS we can just remove it completely.

If not, perhaps add a comment as to what its signalling vs. 'msk is
writeable/not writeable'.

Otherwise looks like a good cleanup to me.
Paolo Abeni Aug. 4, 2020, 5:14 p.m. UTC | #2
On Sun, 2020-08-02 at 00:23 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> > Currently, when checking for the 'msk is writable' condition,
> > we look at the individual subflows write space. That works
> > well while we sends data via a single subflow, but will not
> > as soon as we will enable concurrent xmit on multiple subflows.
> > 
> > Let's use the msk write space instead.
> 
> Why do we need MPTCP_SEND_SPACE bit after this change?

I'm testing without it, I see the following:

the msk is writable, but all the subflows have sk_stream_wspace() < 0.

The above can happen because we account the full truesize to ssk, while
only the page frag size to the msk.

Note that checking:

	sk_stream_wspace(sk) > 0 && sk_stream_is_writeable(parent)

in subflow_write_space() is not enough, we also need mptcp_poll()
returning a mask with EPOLLOUT cleared when all subflows have no write
space.

Overall I think that keeping the flag around is the simplest option.

Cheers,

Paolo
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3b9ae98c67bb..d51b1d95dfb8 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -612,8 +612,15 @@  static void mptcp_clean_una(struct sock *sk)
 		sk_mem_reclaim_partial(sk);
 
 		/* Only wake up writers if a subflow is ready */
-		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+		if (sk_stream_is_writeable(sk)) {
+			set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
+			smp_mb__after_atomic();
+
+			/* set SEND_SPACE before sk_stream_write_space clears
+			 * NOSPACE
+			 */
 			sk_stream_write_space(sk);
+		}
 	}
 }
 
@@ -799,21 +806,27 @@  static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	return ret;
 }
 
-static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+static void mptcp_nospace(struct sock *sk, struct sock *ssk)
 {
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct socket *sock;
+
 	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
 	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
 
 	/* enables sk->write_space() callbacks */
-	set_bit(SOCK_NOSPACE, &sock->flags);
+	sock = READ_ONCE(ssk->sk_socket);
+	if (sock)
+		set_bit(SOCK_NOSPACE, &sock->flags);
 }
 
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
 	struct sock *backup = NULL;
 
-	sock_owned_by_me((const struct sock *)msk);
+	sock_owned_by_me(sk);
 
 	if (!mptcp_ext_cache_refill(msk))
 		return NULL;
@@ -821,12 +834,8 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		if (!sk_stream_memory_free(ssk)) {
-			struct socket *sock = ssk->sk_socket;
-
-			if (sock)
-				mptcp_nospace(msk, sock);
-
+		if (!sk_stream_memory_free(sk)) {
+			mptcp_nospace(sk, ssk);
 			return NULL;
 		}
 
@@ -843,16 +852,12 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	return backup;
 }
 
-static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+static void ssk_check_wmem(struct sock *sk, struct sock *ssk)
 {
-	struct socket *sock;
-
-	if (likely(sk_stream_is_writeable(ssk)))
+	if (likely(sk_stream_is_writeable(sk)))
 		return;
 
-	sock = READ_ONCE(ssk->sk_socket);
-	if (sock)
-		mptcp_nospace(msk, sock);
+	mptcp_nospace(sk, ssk);
 }
 
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
@@ -976,6 +981,7 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 				 * Wakeup will happen via mptcp_clean_una().
 				 */
 				mptcp_set_timeout(sk, ssk);
+				mptcp_nospace(sk, ssk);
 				release_sock(ssk);
 				goto wait_for_sndbuf;
 			}
@@ -992,7 +998,7 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			mptcp_reset_timer(sk);
 	}
 
-	ssk_check_wmem(msk, ssk);
+	ssk_check_wmem(sk, ssk);
 	release_sock(ssk);
 out:
 	release_sock(sk);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 7e838bc6e364..bd2201d62ae6 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -926,8 +926,7 @@  static void subflow_write_space(struct sock *sk)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct sock *parent = subflow->conn;
 
-	sk_stream_write_space(sk);
-	if (sk_stream_is_writeable(sk)) {
+	if (sk_stream_is_writeable(parent)) {
 		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
 		smp_mb__after_atomic();
 		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */