diff mbox series

[v2,09/13] mptcp: sendmsg: transmit on backup if other subflows have been closed

Message ID 20191118214538.21931-10-fw@strlen.de
State Accepted, archived
Delegated to: Matthieu Baerts
Headers show
Series [v2] mptcp: wmem accounting and nonblocking io support | expand

Commit Message

Florian Westphal Nov. 18, 2019, 9:45 p.m. UTC
Currently we always pick the first ssk on the list and then have
mptcp_sendmsg_frag wait until more space becomes available in case that
ssk has no write space available.

Instead check the first subflow on the list.  If no more write space
is available, then we need to either return -EAGAIN to userspace
(nonblocking case) or wait until a subflow becomes available.

This is done by blocking the current thread via sk_stream_wait_memory()
and then having the subflow's sk_write_space() callback unblock the parent
mptcp socket.

We can't acquire the mptcp socket lock from the subflow callbacks, but
we can use the mptcp_sk->flags -- MPTCP_SEND_SPACE flag is added for this
purpose.  If it gets set, then at least one subflow has become available for
writing.

v1: dumb-down the selection: just pick the first ssk on the list and make
mptcp socket block if it has no wspace.
Backup is only used if no non-backup subflow exists.

v2: avoid another while loop and fold !ssk condition with wmem check
    on parent mptcp socket (Paolo).

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/mptcp/protocol.c | 58 +++++++++++++++++++++++++++++++++++++++-----
 net/mptcp/protocol.h |  1 +
 net/mptcp/subflow.c  |  5 +++-
 3 files changed, 57 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d0b050f6611e..be927f456a18 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -384,6 +384,43 @@  static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	return ret;
 }
 
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct sock *backup = NULL;
+
+	sock_owned_by_me((const struct sock *)msk);
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_socket(subflow)->sk;
+
+		if (!sk_stream_memory_free(ssk)) {
+			struct socket *sock = ssk->sk_socket;
+
+			if (sock) {
+				clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+				smp_mb__after_atomic();
+
+				/* enables sk->write_space() callbacks */
+				set_bit(SOCK_NOSPACE, &sock->flags);
+			}
+
+			return NULL;
+		}
+
+		if (subflow->backup) {
+			if (!backup)
+				backup = ssk;
+
+			continue;
+		}
+
+		return ssk;
+	}
+
+	return backup;
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	int mss_now = 0, size_goal = 0, ret = 0;
@@ -410,18 +447,19 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	mptcp_clean_una(sk);
 
-	while (!sk_stream_memory_free(sk)) {
+	ssk = mptcp_subflow_get_send(msk);
+	while (!sk_stream_memory_free(sk) || !ssk) {
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
 			goto out;
 
 		mptcp_clean_una(sk);
-	}
 
-	ssk = mptcp_subflow_get(msk);
-	if (!ssk) {
-		release_sock(sk);
-		return -ENOTCONN;
+		ssk = mptcp_subflow_get_send(msk);
+		if (list_empty(&msk->conn_list)) {
+			ret = -ENOTCONN;
+			goto out;
+		}
 	}
 
 	pr_debug("conn_list->subflow=%p", ssk);
@@ -1117,6 +1155,13 @@  bool mptcp_sk_is_subflow(const struct sock *sk)
 	return subflow->mp_join == 1;
 }
 
+static bool mptcp_memory_free(const struct sock *sk, int wake)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
+}
+
 static struct proto mptcp_prot = {
 	.name		= "MPTCP",
 	.owner		= THIS_MODULE,
@@ -1137,6 +1182,7 @@  static struct proto mptcp_prot = {
 	.sockets_allocated	= &mptcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
+	.stream_memory_free	= mptcp_memory_free,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
 	.sysctl_mem	= sysctl_tcp_mem,
 	.obj_size	= sizeof(struct mptcp_sock),
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 6e23da8c5024..ce5c5de6a5eb 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -77,6 +77,7 @@ 
 /* MPTCP socket flags */
 #define MPTCP_DATA_READY	BIT(0)
 #define MPTCP_WORK_RTX		BIT(1)
+#define MPTCP_SEND_SPACE	BIT(2)
 
 static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
 {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 976e49349276..32082c6e8552 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -670,8 +670,11 @@  static void subflow_write_space(struct sock *sk)
 	struct sock *parent = subflow->conn;
 
 	sk_stream_write_space(sk);
-	if (parent)
+	if (parent && sk_stream_is_writeable(sk)) {
+		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
+		smp_mb__after_atomic();
 		sk_stream_write_space(parent);
+	}
 }
 
 int mptcp_subflow_connect(struct sock *sk, struct sockaddr *local,