diff mbox series

[RFC,4/5] mptcp: update mptcp ack sequence from work queue

Message ID 20200213102742.18937-5-fw@strlen.de
State Superseded, archived
Delegated to: Paolo Abeni
Headers show
Series mptcp: perform mptcp ack update from work queue | expand

Commit Message

Florian Westphal Feb. 13, 2020, 10:27 a.m. UTC
This adds a new worker flag to indicate when the work queue should
drain a subflow socket.

skbs on the subflow socket are then placed on the mptcp socket receive
queue (which was not used so far).
This allows us to announce the correct mptcp ack sequence in the tcp
acks that we send back to the peer, even when the application does not
call recv() on the mptcp socket for some time.

We still wake the userspace task that polls for events -- we do not
depend on the work queue to have run: if the mptcp level receive queue
is empty, skbs are taken from in-sequence subflow sockets.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/mptcp/protocol.c | 162 +++++++++++++++++++++++++++++++++++++++++++
 net/mptcp/protocol.h |   1 +
 2 files changed, 163 insertions(+)

Comments

Paolo Abeni Feb. 13, 2020, 2:37 p.m. UTC | #1
On Thu, 2020-02-13 at 11:27 +0100, Florian Westphal wrote:
> +static void __mptcp_move_skbs(struct mptcp_sock *msk)
> +{
> +	struct mptcp_steal_arg arg = {
> +		.msk = msk,
> +		.fin = false,
> +	};
> +	read_descriptor_t desc = {
> +		.arg.data = &arg,
> +	};
> +
> +	for (;;) {
> +		struct mptcp_subflow_context *subflow;
> +		bool more_data_avail;
> +
> +		arg.ssk = mptcp_subflow_recv_lookup(msk);
> +		if (!arg.ssk)
> +			break;
> +
> +		subflow = mptcp_subflow_ctx(arg.ssk);
> +
> +		lock_sock(arg.ssk);
> +
> +		do {
> +			u32 map_remaining;
> +			int bytes_read;
> +
> +			/* try to read as much data as available */
> +			map_remaining = subflow->map_data_len -
> +					mptcp_subflow_get_map_offset(subflow);
> +			desc.count = map_remaining;
> +
> +			bytes_read = tcp_read_sock(arg.ssk, &desc, mptcp_steal_actor);
> +			if (bytes_read <= 0) {
> +				release_sock(arg.ssk);
> +				return;
> +			}
> +
> +			if (arg.fin) {
> +				struct tcp_sock *tp = tcp_sk(arg.ssk);
> +				u32 seq = READ_ONCE(tp->copied_seq);
> +
> +				WRITE_ONCE(tp->copied_seq, seq + 1);
> +			}
> +
> +			more_data_avail = mptcp_subflow_data_available(arg.ssk);
> +		} while (more_data_avail);
> +
> +		release_sock(arg.ssk);
> +	}
> +}

Overall the above moves the pending skbs to the msk receive queue,
increments ssk copied_seq and eventually lets TCP send an ack via
tcp_cleanup_rbuf().

I think we could avoid touching tcp_read_sock() in patch 2/5 by moving the
skbs directly from __mptcp_move_skbs() - no call to tcp_read_sock(),
just queue manipulation and copied_seq accounting.

Then we will call __mptcp_move_skbs() directly, as needed - we need to export
it.

We could additionally clean up the code by calling __mptcp_move_skbs()
from mptcp_recvmsg() before looking into the msk receive queue. Then there is
no need for the current recvmsg inner loop.

I fear we need some tweak to handle TCP fallback. If I read the RFC
correctly, the msk can fall back to TCP even after receiving some DSS
data (e.g. the msk receive queue contains some skbs and the msk falls back to
TCP; we can no longer call sock_recvmsg()).

WDYT?

Thanks,

Paolo
Mat Martineau Feb. 15, 2020, 1:39 a.m. UTC | #2
On Thu, 13 Feb 2020, Paolo Abeni wrote:

> On Thu, 2020-02-13 at 11:27 +0100, Florian Westphal wrote:
>> +static void __mptcp_move_skbs(struct mptcp_sock *msk)
>> +{
>> +	struct mptcp_steal_arg arg = {
>> +		.msk = msk,
>> +		.fin = false,
>> +	};
>> +	read_descriptor_t desc = {
>> +		.arg.data = &arg,
>> +	};
>> +
>> +	for (;;) {
>> +		struct mptcp_subflow_context *subflow;
>> +		bool more_data_avail;
>> +
>> +		arg.ssk = mptcp_subflow_recv_lookup(msk);
>> +		if (!arg.ssk)
>> +			break;
>> +
>> +		subflow = mptcp_subflow_ctx(arg.ssk);
>> +
>> +		lock_sock(arg.ssk);
>> +
>> +		do {
>> +			u32 map_remaining;
>> +			int bytes_read;
>> +
>> +			/* try to read as much data as available */
>> +			map_remaining = subflow->map_data_len -
>> +					mptcp_subflow_get_map_offset(subflow);
>> +			desc.count = map_remaining;
>> +
>> +			bytes_read = tcp_read_sock(arg.ssk, &desc, mptcp_steal_actor);
>> +			if (bytes_read <= 0) {
>> +				release_sock(arg.ssk);
>> +				return;
>> +			}
>> +
>> +			if (arg.fin) {
>> +				struct tcp_sock *tp = tcp_sk(arg.ssk);
>> +				u32 seq = READ_ONCE(tp->copied_seq);
>> +
>> +				WRITE_ONCE(tp->copied_seq, seq + 1);
>> +			}
>> +
>> +			more_data_avail = mptcp_subflow_data_available(arg.ssk);
>> +		} while (more_data_avail);
>> +
>> +		release_sock(arg.ssk);
>> +	}
>> +}
>
> Overall the above move the pending skbs to the msk receive queue,
> increment ssk copied_seq and eventually let TCP send ack via
> tcp_cleanup_rbuf().
>

It's only moving in-order skbs, right?

> I think we could avoid touching tcp_read_sock() in patch 2/5 moving the
> skb directly from __mptcp_move_skbs() - no call to tcp_read_sock(),
> just queue manipulation and copied_seq accounting.
>
> Than __mptcp_move_skbs() will call directly, as needed - need to export
> it.

This makes sense to me. We have to do another layer of reassembly for 
MPTCP and we already know the skbs are in-order for the subflow stream, 
and there's no need to handle partial skb reads like tcp_read_sock does.

>
> We could additionally clean-up the code calling __mptcp_move_skbs()
> from mptcp_recvmsg() before looking into the msk receive queue. Than no
> need for the current recvmsg inner loop.
>
> I fear we need some tweek to handle TCP fallback. If I read the RFC
> correctly the msk can fallback to TCP even after receiving some DSS
> data (e.g. msk receive queue contains some skbs and the msk fallback to
> TCP. We can't call anymore sock_recvmsg())

RFC 8684 section 3.7 (Fallback) says that "If a subflow breaks during 
operation... the subflow SHOULD be treated as broken and closed with a 
RST". Does that cover your concern, or are you referring to a fallback at 
connection time?


--
Mat Martineau
Intel
Paolo Abeni Feb. 17, 2020, 8:40 a.m. UTC | #3
On Fri, 2020-02-14 at 17:39 -0800, Mat Martineau wrote:
> On Thu, 13 Feb 2020, Paolo Abeni wrote:
> 
> > On Thu, 2020-02-13 at 11:27 +0100, Florian Westphal wrote:
> > > +static void __mptcp_move_skbs(struct mptcp_sock *msk)
> > > +{
> > > +	struct mptcp_steal_arg arg = {
> > > +		.msk = msk,
> > > +		.fin = false,
> > > +	};
> > > +	read_descriptor_t desc = {
> > > +		.arg.data = &arg,
> > > +	};
> > > +
> > > +	for (;;) {
> > > +		struct mptcp_subflow_context *subflow;
> > > +		bool more_data_avail;
> > > +
> > > +		arg.ssk = mptcp_subflow_recv_lookup(msk);
> > > +		if (!arg.ssk)
> > > +			break;
> > > +
> > > +		subflow = mptcp_subflow_ctx(arg.ssk);
> > > +
> > > +		lock_sock(arg.ssk);
> > > +
> > > +		do {
> > > +			u32 map_remaining;
> > > +			int bytes_read;
> > > +
> > > +			/* try to read as much data as available */
> > > +			map_remaining = subflow->map_data_len -
> > > +					mptcp_subflow_get_map_offset(subflow);
> > > +			desc.count = map_remaining;
> > > +
> > > +			bytes_read = tcp_read_sock(arg.ssk, &desc, mptcp_steal_actor);
> > > +			if (bytes_read <= 0) {
> > > +				release_sock(arg.ssk);
> > > +				return;
> > > +			}
> > > +
> > > +			if (arg.fin) {
> > > +				struct tcp_sock *tp = tcp_sk(arg.ssk);
> > > +				u32 seq = READ_ONCE(tp->copied_seq);
> > > +
> > > +				WRITE_ONCE(tp->copied_seq, seq + 1);
> > > +			}
> > > +
> > > +			more_data_avail = mptcp_subflow_data_available(arg.ssk);
> > > +		} while (more_data_avail);
> > > +
> > > +		release_sock(arg.ssk);
> > > +	}
> > > +}
> > 
> > Overall the above move the pending skbs to the msk receive queue,
> > increment ssk copied_seq and eventually let TCP send ack via
> > tcp_cleanup_rbuf().
> > 
> 
> It's only moving in-order skbs, right?
> 
> > I think we could avoid touching tcp_read_sock() in patch 2/5 moving the
> > skb directly from __mptcp_move_skbs() - no call to tcp_read_sock(),
> > just queue manipulation and copied_seq accounting.
> > 
> > Than __mptcp_move_skbs() will call directly, as needed - need to export
> > it.
> 
> This makes sense to me. We have to do another layer of reassembly for 
> MPTCP and we already know the skbs are in-order for the subflow stream, 
> and there's no need to handle partial skb reads like tcp_read_sock does.
> 
> > We could additionally clean-up the code calling __mptcp_move_skbs()
> > from mptcp_recvmsg() before looking into the msk receive queue. Than no
> > need for the current recvmsg inner loop.
> > 
> > I fear we need some tweek to handle TCP fallback. If I read the RFC
> > correctly the msk can fallback to TCP even after receiving some DSS
> > data (e.g. msk receive queue contains some skbs and the msk fallback to
> > TCP. We can't call anymore sock_recvmsg())
> 
> RFC 8684 section 3.7 (Fallback) says that "If a subflow breaks during 
> operation... the subflow SHOULD be treated as broken and closed with a 
> RST". Does that cover your concern, or are you referring to a fallback at 
> connection time?

uhm... it looks like fallback scenarios are quite rich/complex.

@Florian, I think this discussion is important, but does not relate
strictly to this series; the problem is independent, and could be
addressed separately - if we need to address anything ;)

The scenario I was thinking about is (e.g.):

- the client sends a data segment + DSS mapping
- the server does not send DACKs, just plain TCP acks (because e.g. MPTCP
options are not allowed by a middle-box after the syn)
- the client should fall back (we currently don't do any check there,
so we don't) and sends some additional plain TCP data segments
- the server sends some data segments, and should fall back for the same
reasons
- the server tries to read from the msk socket. The older data is in
the msk receive queue, the newer in the subflow receive queue; if we
call sock_recvmsg(), that will corrupt the stream.

I'm unsure if the above wording from 3.7 applies here: I think that
the sentence "If a subflow breaks during operation" applies to the msk
socket after the first data segment is DACK-ed; otherwise
the two paragraphs just before that[1] would be unneeded, right?

@Christoph, could you please assert the correct interpretation here?
And what about the other critical fallback scenario discussed during
the last mtg? (fallback due to out-of-order DACK / plain TCP ACK)

Thanks,

Paolo

[1] "If, however, an ACK is received for data (not just for the
SYN)"...
Christoph Paasch Feb. 17, 2020, 6:27 p.m. UTC | #4
Hello,

On 17/02/20 - 09:40:10, Paolo Abeni wrote:
> On Fri, 2020-02-14 at 17:39 -0800, Mat Martineau wrote:
> > On Thu, 13 Feb 2020, Paolo Abeni wrote:
> > 
> > > On Thu, 2020-02-13 at 11:27 +0100, Florian Westphal wrote:
> > > > +static void __mptcp_move_skbs(struct mptcp_sock *msk)
> > > > +{
> > > > +	struct mptcp_steal_arg arg = {
> > > > +		.msk = msk,
> > > > +		.fin = false,
> > > > +	};
> > > > +	read_descriptor_t desc = {
> > > > +		.arg.data = &arg,
> > > > +	};
> > > > +
> > > > +	for (;;) {
> > > > +		struct mptcp_subflow_context *subflow;
> > > > +		bool more_data_avail;
> > > > +
> > > > +		arg.ssk = mptcp_subflow_recv_lookup(msk);
> > > > +		if (!arg.ssk)
> > > > +			break;
> > > > +
> > > > +		subflow = mptcp_subflow_ctx(arg.ssk);
> > > > +
> > > > +		lock_sock(arg.ssk);
> > > > +
> > > > +		do {
> > > > +			u32 map_remaining;
> > > > +			int bytes_read;
> > > > +
> > > > +			/* try to read as much data as available */
> > > > +			map_remaining = subflow->map_data_len -
> > > > +					mptcp_subflow_get_map_offset(subflow);
> > > > +			desc.count = map_remaining;
> > > > +
> > > > +			bytes_read = tcp_read_sock(arg.ssk, &desc, mptcp_steal_actor);
> > > > +			if (bytes_read <= 0) {
> > > > +				release_sock(arg.ssk);
> > > > +				return;
> > > > +			}
> > > > +
> > > > +			if (arg.fin) {
> > > > +				struct tcp_sock *tp = tcp_sk(arg.ssk);
> > > > +				u32 seq = READ_ONCE(tp->copied_seq);
> > > > +
> > > > +				WRITE_ONCE(tp->copied_seq, seq + 1);
> > > > +			}
> > > > +
> > > > +			more_data_avail = mptcp_subflow_data_available(arg.ssk);
> > > > +		} while (more_data_avail);
> > > > +
> > > > +		release_sock(arg.ssk);
> > > > +	}
> > > > +}
> > > 
> > > Overall the above move the pending skbs to the msk receive queue,
> > > increment ssk copied_seq and eventually let TCP send ack via
> > > tcp_cleanup_rbuf().
> > > 
> > 
> > It's only moving in-order skbs, right?
> > 
> > > I think we could avoid touching tcp_read_sock() in patch 2/5 moving the
> > > skb directly from __mptcp_move_skbs() - no call to tcp_read_sock(),
> > > just queue manipulation and copied_seq accounting.
> > > 
> > > Than __mptcp_move_skbs() will call directly, as needed - need to export
> > > it.
> > 
> > This makes sense to me. We have to do another layer of reassembly for 
> > MPTCP and we already know the skbs are in-order for the subflow stream, 
> > and there's no need to handle partial skb reads like tcp_read_sock does.
> > 
> > > We could additionally clean-up the code calling __mptcp_move_skbs()
> > > from mptcp_recvmsg() before looking into the msk receive queue. Than no
> > > need for the current recvmsg inner loop.
> > > 
> > > I fear we need some tweek to handle TCP fallback. If I read the RFC
> > > correctly the msk can fallback to TCP even after receiving some DSS
> > > data (e.g. msk receive queue contains some skbs and the msk fallback to
> > > TCP. We can't call anymore sock_recvmsg())
> > 
> > RFC 8684 section 3.7 (Fallback) says that "If a subflow breaks during 
> > operation... the subflow SHOULD be treated as broken and closed with a 
> > RST". Does that cover your concern, or are you referring to a fallback at 
> > connection time?
> 
> uhm... it looks like fallback scenarios are quite rich/complex.
> 
> @Florian, I think this discussion in important, but does not relate
> strictly to this series; the problem is independent, and could be
> addresses separatelly - if we need to address anything ;)
> 
> The scenario I was thinking about is (e.g.):
> 

I'm numbering the steps for easier reference:

> 1. the client send a data segment + DSS mapping
> 2. the server do not send DACKs, just plain TCP acks (because e.g. MPTCP
> options are allowed by middle-box after syn)
> 3. the client should fall-back (we currently don't do any check there,
> so we don't) and sends some additional plain TCP data segments
> 4. the server sends some data seg, and should fall-back for the same
> resons

*if* in Step 2 the server received data without a DSS-option, it already fell
back to regular TCP right away, because at the beginning a sender MUST
include the DSS-options in each segment:

"A sender MUST include a DSS option with data sequence mapping in
 every segment until one of the sent segments has been acknowledged
 with a DSS option containing a Data ACK."

 Nevertheless, the following steps can still happen if the middlebox is just
 in one direction (i.e., server to client, and thus the DATA_ACK gets
 removed).

> 5. the server tries to read from the msk socket. The older data is in
> the msk receive queue, the newer in the subflow receive queue, if we
> call sock_recvmsg(), that will corrupt the stream.
> 
> I'm unsure if the above wording from 3.7 applies here: it think that
> the sentence "If a subflow breaks during operation" applies to the msk
> socket after that the first data segment is DACK-ed other, otherwise
> the 2 paragraph just before that[1] would be unneeded, right?

Yes, this scenario here is about the start of the connection, and the
paragraph [1] applies, as well as "In the case of such an ACK being received
on the first subflow...".

Thus, once the sender receives the ACK without a DATA_ACK, it sends an
infinite mapping option (DSS with data-len == 0) to the server.
That is the indication that it has to fall back. It is guaranteed that all
data is in-order because the client should not yet have established a second
subflow or sent data in an out-of-order way on the subflow.

> @Christoph, could you please assert the correct interpretation here?
> And what about the other critical fallback scenario discussed during
> the last mtg? (fallback due to out-of-order DACK / plain TCP ACK)

If that happens, the sender will receive the plain TCP ACK and fall back to
regular TCP. It will indicate that to the receiver with an infinite mapping.
(Cf.: "The sender will send one final data sequence mapping...").

Once the receiver receives that, it knows that it also has to fall back to
regular TCP.


In general, it is best for a host to always include a DATA_ACK option in
every segment that it sends. That avoids these kind of issues :-)


Christoph


> 
> Thanks,
> 
> Paolo
> 
> [1] "If, however, an ACK is received for data (not just for the
> SYN)"...
> 
> 
> 
>
diff mbox series

Patch

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1ce5105e980a..e9e87e62a184 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -34,6 +34,12 @@  struct mptcp6_sock {
 
 static struct percpu_counter mptcp_sockets_allocated;
 
+struct mptcp_skb_cb {
+	u32 offset;
+};
+
+#define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+
 /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
  * completed yet or has failed, return the subflow socket.
  * Otherwise return NULL.
@@ -162,6 +168,13 @@  void mptcp_data_ready(struct sock *sk)
 
 	set_bit(MPTCP_DATA_READY, &msk->flags);
 	sk->sk_data_ready(sk);
+
+	if (test_and_set_bit(MPTCP_WORK_DATA_READY, &msk->flags))
+		return;
+
+	sock_hold(sk);
+	if (!schedule_work(&msk->rtx_work))
+		sock_put(sk);
 }
 
 static void mptcp_stop_timer(struct sock *sk)
@@ -643,6 +656,45 @@  static void mptcp_wait_data(struct sock *sk, long *timeo)
 	remove_wait_queue(sk_sleep(sk), &wait);
 }
 
+static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
+				struct msghdr *msg,
+				size_t len)
+{
+	struct sock *sk = (struct sock *)msk;
+	struct sk_buff *skb;
+	int copied = 0;
+
+	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+		u32 offset = MPTCP_SKB_CB(skb)->offset;
+		u32 data_len = skb->len - offset;
+		u32 count = min_t(size_t, len - copied, data_len);
+		int err;
+
+		err = skb_copy_datagram_msg(skb, offset, msg, count);
+		if (unlikely(err < 0)) {
+			if (!copied)
+				return err;
+			break;
+		}
+
+		copied += count;
+
+		if (count < data_len) {
+			MPTCP_SKB_CB(skb)->offset += count;
+			break;
+		}
+
+		__skb_unlink(skb, &sk->sk_receive_queue);
+		__kfree_skb(skb);
+
+		if (copied >= len)
+			break;
+		break;
+	}
+
+	return copied;
+}
+
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
@@ -688,6 +740,24 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		u32 map_remaining;
 		int bytes_read;
 
+		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
+		if (unlikely(bytes_read < 0)) {
+			if (!copied)
+				copied = bytes_read;
+
+			goto out_err;
+		}
+
+		copied += bytes_read;
+
+		if (!skb_queue_empty(&sk->sk_receive_queue)) {
+			if (copied == len) {
+				more_data_avail = true;
+				done = true;
+			}
+			continue;
+		}
+
 		ssk = mptcp_subflow_recv_lookup(msk);
 		pr_debug("msk=%p ssk=%p", msk, ssk);
 		if (!ssk)
@@ -793,6 +863,7 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			set_bit(MPTCP_DATA_READY, &msk->flags);
 	}
 
+out_err:
 	release_sock(sk);
 	return copied;
 }
@@ -906,6 +977,90 @@  static void mptcp_check_for_eof(struct mptcp_sock *msk)
 	}
 }
 
+struct mptcp_steal_arg {
+	struct mptcp_sock *msk;
+	struct sock *ssk;
+	bool fin;
+};
+
+static int mptcp_steal_actor(read_descriptor_t *desc, struct sk_buff *skb,
+			     unsigned int offset, size_t len)
+{
+	struct mptcp_steal_arg *arg = desc->arg.data;
+	struct sock *sk, *ssk;
+	size_t copy_len = min(desc->count, len);
+
+	sk = (struct sock *)arg->msk;
+	ssk = arg->ssk;
+
+	/* skb_orphan is not good (should keep skb charged to subflow).
+	 * Removing it causes WARN at ssk close time, as tcp_done() call
+	 * path may see ssk->sk_forward_alloc > 0.
+	 */
+	__skb_unlink(skb, &ssk->sk_receive_queue);
+	skb_orphan(skb);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	arg->msk->ack_seq += copy_len;
+	desc->count -= copy_len;
+	MPTCP_SKB_CB(skb)->offset = offset;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		arg->fin = true;
+
+	return (int)copy_len;
+}
+
+static void __mptcp_move_skbs(struct mptcp_sock *msk)
+{
+	struct mptcp_steal_arg arg = {
+		.msk = msk,
+		.fin = false,
+	};
+	read_descriptor_t desc = {
+		.arg.data = &arg,
+	};
+
+	for (;;) {
+		struct mptcp_subflow_context *subflow;
+		bool more_data_avail;
+
+		arg.ssk = mptcp_subflow_recv_lookup(msk);
+		if (!arg.ssk)
+			break;
+
+		subflow = mptcp_subflow_ctx(arg.ssk);
+
+		lock_sock(arg.ssk);
+
+		do {
+			u32 map_remaining;
+			int bytes_read;
+
+			/* try to read as much data as available */
+			map_remaining = subflow->map_data_len -
+					mptcp_subflow_get_map_offset(subflow);
+			desc.count = map_remaining;
+
+			bytes_read = tcp_read_sock(arg.ssk, &desc, mptcp_steal_actor);
+			if (bytes_read <= 0) {
+				release_sock(arg.ssk);
+				return;
+			}
+
+			if (arg.fin) {
+				struct tcp_sock *tp = tcp_sk(arg.ssk);
+				u32 seq = READ_ONCE(tp->copied_seq);
+
+				WRITE_ONCE(tp->copied_seq, seq + 1);
+			}
+
+			more_data_avail = mptcp_subflow_data_available(arg.ssk);
+		} while (more_data_avail);
+
+		release_sock(arg.ssk);
+	}
+}
+
 static void mptcp_worker(struct work_struct *work)
 {
 	int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
@@ -923,6 +1078,9 @@  static void mptcp_worker(struct work_struct *work)
 	lock_sock(sk);
 	mptcp_clean_una(sk);
 
+	if (test_and_clear_bit(MPTCP_WORK_DATA_READY, &msk->flags))
+		__mptcp_move_skbs(msk);
+
 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
 
@@ -1087,6 +1245,8 @@  static void mptcp_close(struct sock *sk, long timeout)
 
 	mptcp_cancel_work(sk);
 
+	__skb_queue_purge(&sk->sk_receive_queue);
+
 	sk_common_release(sk);
 }
 
@@ -1757,6 +1917,8 @@  void mptcp_proto_init(void)
 		panic("Failed to register MPTCP proto.\n");
 
 	inet_register_protosw(&mptcp_protosw);
+
+	BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
 }
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 760607e3e65f..e755ed4f273d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -82,6 +82,7 @@ 
 #define MPTCP_SEND_SPACE	BIT(1)
 #define MPTCP_WORK_RTX		BIT(2)
 #define MPTCP_WORK_EOF		BIT(3)
+#define MPTCP_WORK_DATA_READY	BIT(4)
 
 static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
 {