diff mbox series

[v2,mptcp-next,2/4] mptcp: re-enable sndbuf autotune

Message ID ac2c6974c3b150a94a80e3b92b07afb9d97f601a.1610359105.git.pabeni@redhat.com
State Superseded, archived
Headers show
Series mptcp: re-enable snd buf autotune | expand

Commit Message

Paolo Abeni Jan. 11, 2021, 10:05 a.m. UTC
After commit 6e628cd3a8f7 ("mptcp: use mptcp release_cb for
delayed tasks"), MPTCP never sets the flag bit SOCK_NOSPACE
on its subflow. As a side effect, autotune never takes place,
as it happens inside tcp_new_space(), which in turn is called
only when the mentioned bit is set.

Let sendmsg() set the subflow's NOSPACE bit when looking for
more memory. Additionally, cleanup the sndbuf propagation from
subflow into the msk, leveraging the subflow write space callback
and dropping a bunch of duplicate code.

This also makes the SNDBUF_LIMITED chrono relevant again
for MPTCP subflows.

Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
 - leverage the previous commit, so we can always simply set the nospace
   bit on the msk/parent socket
---
 net/mptcp/protocol.c | 57 +++++++++++++++++++-------------------------
 net/mptcp/protocol.h | 20 ++++++++++++++++
 net/mptcp/subflow.c  | 10 +++++++-
 3 files changed, 53 insertions(+), 34 deletions(-)

Comments

Mat Martineau Jan. 12, 2021, 1:01 a.m. UTC | #1
On Mon, 11 Jan 2021, Paolo Abeni wrote:

> After commit 6e628cd3a8f7 ("mptcp: use mptcp release_cb for
> delayed tasks"), MPTCP never sets the flag bit SOCK_NOSPACE
> on its subflow. As a side effect, autotune never takes place,
> as it happens inside tcp_new_space(), which in turn is called
> only when the mentioned bit is set.
>
> Let's sendmsg() set the subflows NOSPACE bit when looking for
> more memory. Additionally, cleanup the sndbuf propagation from
> subflow into the msk, leveraging the subflow write space callback
> and dropping a bunch of duplicate code.
>
> This also makes the SNDBUF_LIMITED chrono relevant again
> for MPTCP subflows.
>
> Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v1 -> v2:
> - leverage the previous commit, so we can always simply set the nospace
>   bit on the msk/parent socket
> ---
> net/mptcp/protocol.c | 57 +++++++++++++++++++-------------------------
> net/mptcp/protocol.h | 20 ++++++++++++++++
> net/mptcp/subflow.c  | 10 +++++++-
> 3 files changed, 53 insertions(+), 34 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index ada69495a51b..a64b2f6fb17b 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c

> @@ -1421,7 +1410,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
> 		 send_info[1].ssk, send_info[1].ratio);
>
> 	/* pick the best backup if no other subflow is active */
> -	if (!nr_active)
> +	msk->use_backup = !nr_active;
> +	if (msk->use_backup)

This is the only place msk->use_backup is read. Is this a leftover from 
v1?


--
Mat Martineau
Intel
Paolo Abeni Jan. 12, 2021, 10:27 a.m. UTC | #2
On Mon, 2021-01-11 at 17:01 -0800, Mat Martineau wrote:
> On Mon, 11 Jan 2021, Paolo Abeni wrote:
> 
> > After commit 6e628cd3a8f7 ("mptcp: use mptcp release_cb for
> > delayed tasks"), MPTCP never sets the flag bit SOCK_NOSPACE
> > on its subflow. As a side effect, autotune never takes place,
> > as it happens inside tcp_new_space(), which in turn is called
> > only when the mentioned bit is set.
> > 
> > Let's sendmsg() set the subflows NOSPACE bit when looking for
> > more memory. Additionally, cleanup the sndbuf propagation from
> > subflow into the msk, leveraging the subflow write space callback
> > and dropping a bunch of duplicate code.
> > 
> > This also makes the SNDBUF_LIMITED chrono relevant again
> > for MPTCP subflows.
> > 
> > Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks")
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > v1 -> v2:
> > - leverage the previous commit, so we can always simply set the nospace
> >   bit on the msk/parent socket
> > ---
> > net/mptcp/protocol.c | 57 +++++++++++++++++++-------------------------
> > net/mptcp/protocol.h | 20 ++++++++++++++++
> > net/mptcp/subflow.c  | 10 +++++++-
> > 3 files changed, 53 insertions(+), 34 deletions(-)
> > 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index ada69495a51b..a64b2f6fb17b 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -1421,7 +1410,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
> > 		 send_info[1].ssk, send_info[1].ratio);
> > 
> > 	/* pick the best backup if no other subflow is active */
> > -	if (!nr_active)
> > +	msk->use_backup = !nr_active;
> > +	if (msk->use_backup)
> 
> This is the only place msk->use_backup is read. Is this a leftover from 
> v1?

Indeed! Thanks for noticing! I will drop that in the next iteration.

Paolo
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ada69495a51b..a64b2f6fb17b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -730,10 +730,14 @@  void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 
 void __mptcp_flush_join_list(struct mptcp_sock *msk)
 {
+	struct mptcp_subflow_context *subflow;
+
 	if (likely(list_empty(&msk->join_list)))
 		return;
 
 	spin_lock_bh(&msk->join_list_lock);
+	list_for_each_entry(subflow, &msk->join_list, node)
+		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
 	list_splice_tail_init(&msk->join_list, &msk->conn_list);
 	spin_unlock_bh(&msk->join_list_lock);
 }
@@ -1033,13 +1037,7 @@  static void __mptcp_clean_una(struct sock *sk)
 			__mptcp_update_wmem(sk);
 			sk_mem_reclaim_partial(sk);
 		}
-
-		if (sk_stream_is_writeable(sk)) {
-			/* pairs with memory barrier in mptcp_poll */
-			smp_mb();
-			if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
-				sk_stream_write_space(sk);
-		}
+		mptcp_write_space(sk);
 	}
 
 	if (snd_una == READ_ONCE(msk->snd_nxt)) {
@@ -1358,8 +1356,7 @@  struct subflow_send_info {
 	u64 ratio;
 };
 
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
-					   u32 *sndbuf)
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
 	struct subflow_send_info send_info[2];
 	struct mptcp_subflow_context *subflow;
@@ -1370,24 +1367,17 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 
 	sock_owned_by_me((struct sock *)msk);
 
-	*sndbuf = 0;
 	if (__mptcp_check_fallback(msk)) {
 		if (!msk->first)
 			return NULL;
-		*sndbuf = msk->first->sk_sndbuf;
 		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
 	}
 
 	/* re-use last subflow, if the burst allow that */
 	if (msk->last_snd && msk->snd_burst > 0 &&
 	    sk_stream_memory_free(msk->last_snd) &&
-	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
-		mptcp_for_each_subflow(msk, subflow) {
-			ssk =  mptcp_subflow_tcp_sock(subflow);
-			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
-		}
+	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
 		return msk->last_snd;
-	}
 
 	/* pick the subflow with the lower wmem/wspace ratio */
 	for (i = 0; i < 2; ++i) {
@@ -1400,7 +1390,6 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 			continue;
 
 		nr_active += !subflow->backup;
-		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
 		if (!sk_stream_memory_free(subflow->tcp_sock))
 			continue;
 
@@ -1421,7 +1410,8 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 		 send_info[1].ssk, send_info[1].ratio);
 
 	/* pick the best backup if no other subflow is active */
-	if (!nr_active)
+	msk->use_backup = !nr_active;
+	if (msk->use_backup)
 		send_info[0].ssk = send_info[1].ssk;
 
 	if (send_info[0].ssk) {
@@ -1430,6 +1420,7 @@  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 				       sk_stream_wspace(msk->last_snd));
 		return msk->last_snd;
 	}
+
 	return NULL;
 }
 
@@ -1450,7 +1441,6 @@  static void mptcp_push_pending(struct sock *sk, unsigned int flags)
 	};
 	struct mptcp_data_frag *dfrag;
 	int len, copied = 0;
-	u32 sndbuf;
 
 	while ((dfrag = mptcp_send_head(sk))) {
 		info.sent = dfrag->already_sent;
@@ -1461,12 +1451,7 @@  static void mptcp_push_pending(struct sock *sk, unsigned int flags)
 
 			prev_ssk = ssk;
 			__mptcp_flush_join_list(msk);
-			ssk = mptcp_subflow_get_send(msk, &sndbuf);
-
-			/* do auto tuning */
-			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
-			    sndbuf > READ_ONCE(sk->sk_sndbuf))
-				WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+			ssk = mptcp_subflow_get_send(msk);
 
 			/* try to keep the subflow socket lock across
 			 * consecutive xmit on the same socket
@@ -1533,11 +1518,6 @@  static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 		while (len > 0) {
 			int ret = 0;
 
-			/* do auto tuning */
-			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
-			    ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
-				WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
-
 			if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
 				__mptcp_update_wmem(sk);
 				sk_mem_reclaim_partial(sk);
@@ -1575,6 +1555,15 @@  static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 	}
 }
 
+static void mptcp_set_nospace(struct sock *sk)
+{
+	/* enable autotune */
+	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+	/* will be cleared on avail space */
+	set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1676,7 +1665,7 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		continue;
 
 wait_for_memory:
-		set_bit(MPTCP_NOSPACE, &msk->flags);
+		mptcp_set_nospace(sk);
 		mptcp_push_pending(sk, msg->msg_flags);
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
@@ -2347,6 +2336,7 @@  static int __mptcp_init_sock(struct sock *sk)
 	msk->tx_pending_data = 0;
 	msk->size_goal_cache = TCP_BASE_MSS;
 
+	msk->use_backup = false;
 	msk->ack_hint = NULL;
 	msk->first = NULL;
 	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -3269,6 +3259,7 @@  static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 
 		mptcp_copy_inaddrs(newsk, msk->first);
 		mptcp_rcv_space_init(msk, msk->first);
+		mptcp_propagate_sndbuf(newsk, msk->first);
 
 		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
 		 * This is needed so NOSPACE flag can be set from tcp stack.
@@ -3309,7 +3300,7 @@  static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 	if (sk_stream_is_writeable(sk))
 		return EPOLLOUT | EPOLLWRNORM;
 
-	set_bit(MPTCP_NOSPACE, &msk->flags);
+	mptcp_set_nospace(sk);
 	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
 	if (sk_stream_is_writeable(sk))
 		return EPOLLOUT | EPOLLWRNORM;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 65d200a1072b..adc56bcbdf68 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -248,6 +248,7 @@  struct mptcp_sock {
 	bool		snd_data_fin_enable;
 	bool		rcv_fastclose;
 	bool		use_64bit_ack; /* Set when we received a 64-bit DSN */
+	bool		use_backup;	/* only backup subflow available */
 	spinlock_t	join_list_lock;
 	struct sock	*ack_hint;
 	struct work_struct work;
@@ -522,6 +523,25 @@  static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
 	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
 }
 
+static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+	if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
+		return false;
+
+	WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+	return true;
+}
+
+static inline void mptcp_write_space(struct sock *sk)
+{
+	if (sk_stream_is_writeable(sk)) {
+		/* pairs with memory barrier in mptcp_poll */
+		smp_mb();
+		if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
+			sk_stream_write_space(sk);
+	}
+}
+
 void mptcp_destroy_common(struct mptcp_sock *msk);
 
 void __init mptcp_token_init(void);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 22313710d769..31cc362a4638 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -343,6 +343,7 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 	if (subflow->conn_finished)
 		return;
 
+	mptcp_propagate_sndbuf(parent, sk);
 	subflow->rel_write_seq = 1;
 	subflow->conn_finished = 1;
 	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -1040,7 +1041,13 @@  static void subflow_data_ready(struct sock *sk)
 
 static void subflow_write_space(struct sock *ssk)
 {
-	/* we take action in __mptcp_clean_una() */
+	struct socket *sock = ssk->sk_socket;
+
+	if (mptcp_propagate_sndbuf(mptcp_subflow_ctx(ssk)->conn, ssk))
+		mptcp_write_space(ssk);
+
+	if (sk_stream_is_writeable(ssk) && sock)
+		clear_bit(SOCK_NOSPACE, &sock->flags);
 }
 
 static struct inet_connection_sock_af_ops *
@@ -1302,6 +1309,7 @@  static void subflow_state_change(struct sock *sk)
 	__subflow_state_change(sk);
 
 	if (subflow_simultaneous_connect(sk)) {
+		mptcp_propagate_sndbuf(parent, sk);
 		mptcp_do_fallback(sk);
 		mptcp_rcv_space_init(mptcp_sk(parent), sk);
 		pr_fallback(mptcp_sk(parent));