diff mbox series

[mptcp-next,v3] mptcp: track window announced to peer

Message ID 20201022212738.24258-1-fw@strlen.de
State Accepted, archived
Commit 99873062750d038e4f5f520d62771ed4d8f86eb8
Delegated to: Matthieu Baerts
Headers show
Series [mptcp-next,v3] mptcp: track window announced to peer | expand

Commit Message

Florian Westphal Oct. 22, 2020, 9:27 p.m. UTC
OoO handling attemtps to detect when packet is out-of-window by testing
current ack sequence and remaining space vs. sequence number.

This doesn't work reliably.  Store the highest allowed sequence number
that we've announced and use it to detect oow packets.

Do this when mptcp options get written to the packet (wire format).
For this to work we need to move the write_options call until after stack
selected a new tcp window.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Changes in v3:
  - add a few line breaks
  - update rcvwnd even for non-dss packets

 include/net/mptcp.h   |  3 ++-
 net/ipv4/tcp_output.c | 11 +++++++----
 net/mptcp/options.c   | 22 +++++++++++++++++++++-
 net/mptcp/protocol.c  | 12 +++++++-----
 net/mptcp/protocol.h  |  1 +
 5 files changed, 38 insertions(+), 11 deletions(-)

Comments

Mat Martineau Oct. 23, 2020, 12:43 a.m. UTC | #1
On Thu, 22 Oct 2020, Florian Westphal wrote:

> OoO handling attemtps to detect when packet is out-of-window by testing
> current ack sequence and remaining space vs. sequence number.
>
> This doesn't work reliably.  Store the highest allowed sequence number
> that we've announced and use it to detect oow packets.
>
> Do this when mptcp options get written to the packet (wire format).
> For this to work we need to move the write_options call until after stack
> selected a new tcp window.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> Changes in v3:
>  - add a few line breaks
>  - update rcvwnd even for non-dss packets
>
> include/net/mptcp.h   |  3 ++-
> net/ipv4/tcp_output.c | 11 +++++++----
> net/mptcp/options.c   | 22 +++++++++++++++++++++-
> net/mptcp/protocol.c  | 12 +++++++-----
> net/mptcp/protocol.h  |  1 +
> 5 files changed, 38 insertions(+), 11 deletions(-)
>
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index 6e706d838e4e..b6cf07143a8a 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -88,7 +88,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
> 			       struct mptcp_out_options *opts);
> void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
>
> -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts);
> +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
> +			 struct mptcp_out_options *opts);
>
> /* move the skb extension owership, with the assumption that 'to' is
>  * newly allocated
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 9abf5a0358d5..a127856ab7df 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -445,11 +445,12 @@ struct tcp_out_options {
> 	struct mptcp_out_options mptcp;
> };
>
> -static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
> +static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
> +				struct tcp_out_options *opts)
> {
> #if IS_ENABLED(CONFIG_MPTCP)
> 	if (unlikely(OPTION_MPTCP & opts->options))
> -		mptcp_write_options(ptr, &opts->mptcp);
> +		mptcp_write_options(ptr, tp, &opts->mptcp);
> #endif
> }
>
> @@ -701,7 +702,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
>
> 	smc_options_write(ptr, &options);
>
> -	mptcp_options_write(ptr, opts);
> +	mptcp_options_write(ptr, tp, opts);
> }
>
> static void smc_set_option(const struct tcp_sock *tp,
> @@ -1348,7 +1349,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
> 		}
> 	}
>
> -	tcp_options_write((__be32 *)(th + 1), tp, &opts);
> 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
> 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
> 		th->window      = htons(tcp_select_window(sk));

These lines of code between the old tcp_options_write() location and the 
new location were last touched by Eric, doing some cache miss 
optimizations. It doesn't seem like populating the TCP options just after 
th->window instead of just before should behave similarly with the data 
cache - are you concerned about this at all?

> @@ -1359,6 +1359,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
> 		 */
> 		th->window	= htons(min(tp->rcv_wnd, 65535U));
> 	}
> +
> +	tcp_options_write((__be32 *)(th + 1), tp, &opts);
> +
> #ifdef CONFIG_TCP_MD5SIG
> 	/* Calculate the MD5 hash, as we have all we need now */
> 	if (md5) {
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index f2d1e27a2bc1..2e9f1c4ea008 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -991,7 +991,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
> 	}
> }
>
> -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
> +static void mptcp_set_rwin(const struct tcp_sock *tp)
> +{
> +	const struct sock *ssk = (const struct sock *)tp;
> +	const struct mptcp_subflow_context *subflow;
> +	struct mptcp_sock *msk;
> +	u64 ack_seq;
> +
> +	subflow = mptcp_subflow_ctx(ssk);
> +	msk = mptcp_sk(subflow->conn);
> +
> +	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
> +
> +	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
> +		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);

This looks racy if packet headers on multiple subflows are created 
concurrently: msk->rcv_wnd_sent could end up with an earlier sequence 
number than it should have, leading to possible extra drops. It looks like 
a hard race to trigger, but do you think rcv_wnd_sent should use cmpxchg 
like wnd_end and snd_una?

Mat


> +}
> +
> +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
> +			 struct mptcp_out_options *opts)
> {
> 	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
> 	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
> @@ -1148,4 +1165,7 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
> 					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
> 		}
> 	}
> +
> +	if (tp)
> +		mptcp_set_rwin(tp);
> }
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 6c131759f3e5..cd747696e7af 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -168,19 +168,19 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
> 	struct rb_node **p, *parent;
> 	u64 seq, end_seq, max_seq;
> 	struct sk_buff *skb1;
> -	int space;
>
> 	seq = MPTCP_SKB_CB(skb)->map_seq;
> 	end_seq = MPTCP_SKB_CB(skb)->end_seq;
> -	space = tcp_space(sk);
> -	max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;
> +	max_seq = READ_ONCE(msk->rcv_wnd_sent);
>
> 	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
> 		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
> -	if (after64(seq, max_seq)) {
> +	if (after64(end_seq, max_seq)) {
> 		/* out of window */
> 		mptcp_drop(sk, skb);
> -		pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq);
> +		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
> +			 (unsigned long long)end_seq - (unsigned long)max_seq,
> +			 (unsigned long long)msk->rcv_wnd_sent);
> 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
> 		return;
> 	}
> @@ -2258,6 +2258,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
> 		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
> 		ack_seq++;
> 		WRITE_ONCE(msk->ack_seq, ack_seq);
> +		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> 	}
>
> #if !IS_ENABLED(CONFIG_KASAN)
> @@ -2567,6 +2568,7 @@ void mptcp_finish_connect(struct sock *ssk)
> 	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
> 	WRITE_ONCE(msk->snd_nxt, msk->write_seq);
> 	WRITE_ONCE(msk->ack_seq, ack_seq);
> +	WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> 	WRITE_ONCE(msk->can_ack, 1);
> 	atomic64_set(&msk->snd_una, msk->write_seq);
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index b4c8dbe9236b..bf15c2fc0ba1 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -211,6 +211,7 @@ struct mptcp_sock {
> 	u64		write_seq;
> 	u64		snd_nxt;
> 	u64		ack_seq;
> +	u64		rcv_wnd_sent;
> 	u64		rcv_data_fin_seq;
> 	struct sock	*last_snd;
> 	int		snd_burst;
> -- 
> 2.26.2

--
Mat Martineau
Intel
Paolo Abeni Oct. 23, 2020, 8:33 a.m. UTC | #2
On Thu, 2020-10-22 at 17:43 -0700, Mat Martineau wrote:
> On Thu, 22 Oct 2020, Florian Westphal wrote:
> @@ -1348,7 +1349,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
> > 		}
> > 	}
> > 
> > -	tcp_options_write((__be32 *)(th + 1), tp, &opts);
> > 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
> > 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
> > 		th->window      = htons(tcp_select_window(sk));
> 
> These lines of code between the old tcp_options_write() location and the 
> new location were last touched by Eric, doing some cache miss 
> optimizations. It doesn't seem like populating the TCP options just after 
> th->window instead of just before should behave similarly with the data 
> cache - are you concerned about this at all?

Thank you for the carefully review!

commit 51466a7545b73b7ad7bcfb33410d2823ccfaa501
Author: Eric Dumazet <edumazet@google.com>
Date:   Thu Jun 11 09:15:16 2015 -0700

    tcp: fill shinfo->gso_type at last moment

moves 'skb_shinfo(skb)->gso_type' initialization just after the option
write. Looking at the commit message, what actually makes a difference
in such commit is moving the mentioned assigment after the last xmit
check to avoid touching extra cache lines if not needed - and we
preserve that fact.

I think the above is sane performace-wise, but I guess Eric could still
teach me better ;)

> 
> > @@ -1359,6 +1359,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
> > 		 */
> > 		th->window	= htons(min(tp->rcv_wnd, 65535U));
> > 	}
> > +
> > +	tcp_options_write((__be32 *)(th + 1), tp, &opts);
> > +
> > #ifdef CONFIG_TCP_MD5SIG
> > 	/* Calculate the MD5 hash, as we have all we need now */
> > 	if (md5) {
> > diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> > index f2d1e27a2bc1..2e9f1c4ea008 100644
> > --- a/net/mptcp/options.c
> > +++ b/net/mptcp/options.c
> > @@ -991,7 +991,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
> > 	}
> > }
> > 
> > -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
> > +static void mptcp_set_rwin(const struct tcp_sock *tp)
> > +{
> > +	const struct sock *ssk = (const struct sock *)tp;
> > +	const struct mptcp_subflow_context *subflow;
> > +	struct mptcp_sock *msk;
> > +	u64 ack_seq;
> > +
> > +	subflow = mptcp_subflow_ctx(ssk);
> > +	msk = mptcp_sk(subflow->conn);
> > +
> > +	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
> > +
> > +	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
> > +		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
> 
> This looks racy if packet headers on multiple subflows are created 
> concurrently: msk->rcv_wnd_sent could end up with an earlier sequence 
> number than it should have, leading to possible extra drops. It looks like 
> a hard race to trigger, but do you think rcv_wnd_sent should use cmpxchg 
> like wnd_end and snd_una?

I personally don't have a strong opinion on this, I guess we can live
with the potential race, we have MIBs reporting both MPTCP
retransmissions and OOW events, which would be both triggered if this
race happens and have any conseguence.

Possibly related: as some point we will probabaly need to consolidate
the cmpxchg()s with some other locking mechanims to reduce the atomic
operations needed.

Cheers,

/P
Florian Westphal Oct. 23, 2020, 11:59 a.m. UTC | #3
Paolo Abeni <pabeni@redhat.com> wrote:
> On Thu, 2020-10-22 at 17:43 -0700, Mat Martineau wrote:
> > On Thu, 22 Oct 2020, Florian Westphal wrote:
> > @@ -1348,7 +1349,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
> > > 		}
> > > 	}
> > > 
> > > -	tcp_options_write((__be32 *)(th + 1), tp, &opts);
> > > 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
> > > 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
> > > 		th->window      = htons(tcp_select_window(sk));
> > 
> > These lines of code between the old tcp_options_write() location and the 
> > new location were last touched by Eric, doing some cache miss 
> > optimizations. It doesn't seem like populating the TCP options just after 
> > th->window instead of just before should behave similarly with the data 
> > cache - are you concerned about this at all?
> 
> Thank you for the carefully review!
> 
> commit 51466a7545b73b7ad7bcfb33410d2823ccfaa501
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Thu Jun 11 09:15:16 2015 -0700
> 
>     tcp: fill shinfo->gso_type at last moment
> 
> moves 'skb_shinfo(skb)->gso_type' initialization just after the option
> write. Looking at the commit message, what actually makes a difference
> in such commit is moving the mentioned assigment after the last xmit
> check to avoid touching extra cache lines if not needed - and we
> preserve that fact.
> 
> I think the above is sane performace-wise, but I guess Eric could still
> teach me better ;)

If neither additional hooking nor change in tcp stack is possible only choice
is to relax OOW test:

max_seq = READ_ONCE(msk->ack_seq) + READ_ONCE(sk->sk_rcvbuf);
if (after64(end_seq, max_seq)) ...

that means we will queue up OOW packets if those are still within
current rcvbuf size but its fine from protocol pov.
Paolo Abeni Oct. 23, 2020, 5 p.m. UTC | #4
On Fri, 2020-10-23 at 13:59 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni@redhat.com> wrote:
> > On Thu, 2020-10-22 at 17:43 -0700, Mat Martineau wrote:
> > > On Thu, 22 Oct 2020, Florian Westphal wrote:
> > > @@ -1348,7 +1349,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
> > > > 		}
> > > > 	}
> > > > 
> > > > -	tcp_options_write((__be32 *)(th + 1), tp, &opts);
> > > > 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
> > > > 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
> > > > 		th->window      = htons(tcp_select_window(sk));
> > > 
> > > These lines of code between the old tcp_options_write() location and the 
> > > new location were last touched by Eric, doing some cache miss 
> > > optimizations. It doesn't seem like populating the TCP options just after 
> > > th->window instead of just before should behave similarly with the data 
> > > cache - are you concerned about this at all?
> > 
> > Thank you for the carefully review!
> > 
> > commit 51466a7545b73b7ad7bcfb33410d2823ccfaa501
> > Author: Eric Dumazet <edumazet@google.com>
> > Date:   Thu Jun 11 09:15:16 2015 -0700
> > 
> >     tcp: fill shinfo->gso_type at last moment
> > 
> > moves 'skb_shinfo(skb)->gso_type' initialization just after the option
> > write. Looking at the commit message, what actually makes a difference
> > in such commit is moving the mentioned assigment after the last xmit
> > check to avoid touching extra cache lines if not needed - and we
> > preserve that fact.
> > 
> > I think the above is sane performace-wise, but I guess Eric could still
> > teach me better ;)
> 
> If neither additional hooking nor change in tcp stack is possible only choice

I'm sorry, I was likely misleading. I mean: the patch looks fine and
safe to me. Perhpas we can try send that upstream as RFC to explicitly
ask for feedback ?!?

Thanks!

Paolo
Mat Martineau Oct. 23, 2020, 5:26 p.m. UTC | #5
On Fri, 23 Oct 2020, Paolo Abeni wrote:

> On Fri, 2020-10-23 at 13:59 +0200, Florian Westphal wrote:
>> Paolo Abeni <pabeni@redhat.com> wrote:
>>> On Thu, 2020-10-22 at 17:43 -0700, Mat Martineau wrote:
>>>> On Thu, 22 Oct 2020, Florian Westphal wrote:
>>>> @@ -1348,7 +1349,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
>>>>> 		}
>>>>> 	}
>>>>>
>>>>> -	tcp_options_write((__be32 *)(th + 1), tp, &opts);
>>>>> 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
>>>>> 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
>>>>> 		th->window      = htons(tcp_select_window(sk));
>>>>
>>>> These lines of code between the old tcp_options_write() location and the
>>>> new location were last touched by Eric, doing some cache miss
>>>> optimizations. It doesn't seem like populating the TCP options just after
>>>> th->window instead of just before should behave similarly with the data
>>>> cache - are you concerned about this at all?

I made a mistake re-editing my last sentence here - was trying to say it 
*does* seem like the cache behavior should be similar.

>>>
>>> Thank you for the carefully review!
>>>
>>> commit 51466a7545b73b7ad7bcfb33410d2823ccfaa501
>>> Author: Eric Dumazet <edumazet@google.com>
>>> Date:   Thu Jun 11 09:15:16 2015 -0700
>>>
>>>     tcp: fill shinfo->gso_type at last moment
>>>
>>> moves 'skb_shinfo(skb)->gso_type' initialization just after the option
>>> write. Looking at the commit message, what actually makes a difference
>>> in such commit is moving the mentioned assigment after the last xmit
>>> check to avoid touching extra cache lines if not needed - and we
>>> preserve that fact.
>>>
>>> I think the above is sane performace-wise, but I guess Eric could still
>>> teach me better ;)
>>
>> If neither additional hooking nor change in tcp stack is possible only choice
>
> I'm sorry, I was likely misleading. I mean: the patch looks fine and
> safe to me. Perhpas we can try send that upstream as RFC to explicitly
> ask for feedback ?!?
>

Agreed!


--
Mat Martineau
Intel
Matthieu Baerts Oct. 27, 2020, 8:14 p.m. UTC | #6
Hi Florian, Mat, Paolo,

On 22/10/2020 23:27, Florian Westphal wrote:
> OoO handling attemtps to detect when packet is out-of-window by testing
> current ack sequence and remaining space vs. sequence number.
> 
> This doesn't work reliably.  Store the highest allowed sequence number
> that we've announced and use it to detect oow packets.
> 
> Do this when mptcp options get written to the packet (wire format).
> For this to work we need to move the write_options call until after stack
> selected a new tcp window.

Thank you for the patch and reviews!

- 99873062750d: mptcp: track window announced to peer
   - I added Mat's Reviewed-by and Paolo's Acked-by tags, please tell me 
if you prefer when I don't add those if they are not explicit. It is 
just to avoid multiple messages on the netdev ML.
   - (I fixed a small typo with 'attempts' in the commit message and 
reported by codespell)
- Results: 9bceebfe0a58..c041aa4e115c

Tests + export are in progress!

Cheers,
Matt
diff mbox series

Patch

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 6e706d838e4e..b6cf07143a8a 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -88,7 +88,8 @@  bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 			       struct mptcp_out_options *opts);
 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
 
-void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts);
+void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+			 struct mptcp_out_options *opts);
 
 /* move the skb extension owership, with the assumption that 'to' is
  * newly allocated
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9abf5a0358d5..a127856ab7df 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -445,11 +445,12 @@  struct tcp_out_options {
 	struct mptcp_out_options mptcp;
 };
 
-static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
+static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
+				struct tcp_out_options *opts)
 {
 #if IS_ENABLED(CONFIG_MPTCP)
 	if (unlikely(OPTION_MPTCP & opts->options))
-		mptcp_write_options(ptr, &opts->mptcp);
+		mptcp_write_options(ptr, tp, &opts->mptcp);
 #endif
 }
 
@@ -701,7 +702,7 @@  static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
 	smc_options_write(ptr, &options);
 
-	mptcp_options_write(ptr, opts);
+	mptcp_options_write(ptr, tp, opts);
 }
 
 static void smc_set_option(const struct tcp_sock *tp,
@@ -1348,7 +1349,6 @@  static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
-	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
 		th->window      = htons(tcp_select_window(sk));
@@ -1359,6 +1359,9 @@  static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		 */
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
+
+	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index f2d1e27a2bc1..2e9f1c4ea008 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -991,7 +991,24 @@  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
+static void mptcp_set_rwin(const struct tcp_sock *tp)
+{
+	const struct sock *ssk = (const struct sock *)tp;
+	const struct mptcp_subflow_context *subflow;
+	struct mptcp_sock *msk;
+	u64 ack_seq;
+
+	subflow = mptcp_subflow_ctx(ssk);
+	msk = mptcp_sk(subflow->conn);
+
+	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+
+	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
+		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+}
+
+void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+			 struct mptcp_out_options *opts)
 {
 	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
 	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
@@ -1148,4 +1165,7 @@  void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
 		}
 	}
+
+	if (tp)
+		mptcp_set_rwin(tp);
 }
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6c131759f3e5..cd747696e7af 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -168,19 +168,19 @@  static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 	struct rb_node **p, *parent;
 	u64 seq, end_seq, max_seq;
 	struct sk_buff *skb1;
-	int space;
 
 	seq = MPTCP_SKB_CB(skb)->map_seq;
 	end_seq = MPTCP_SKB_CB(skb)->end_seq;
-	space = tcp_space(sk);
-	max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;
+	max_seq = READ_ONCE(msk->rcv_wnd_sent);
 
 	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
 		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
-	if (after64(seq, max_seq)) {
+	if (after64(end_seq, max_seq)) {
 		/* out of window */
 		mptcp_drop(sk, skb);
-		pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq);
+		pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
+			 (unsigned long long)end_seq - (unsigned long)max_seq,
+			 (unsigned long long)msk->rcv_wnd_sent);
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
 		return;
 	}
@@ -2258,6 +2258,7 @@  struct sock *mptcp_sk_clone(const struct sock *sk,
 		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
 		ack_seq++;
 		WRITE_ONCE(msk->ack_seq, ack_seq);
+		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
 	}
 
 #if !IS_ENABLED(CONFIG_KASAN)
@@ -2567,6 +2568,7 @@  void mptcp_finish_connect(struct sock *ssk)
 	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
 	WRITE_ONCE(msk->snd_nxt, msk->write_seq);
 	WRITE_ONCE(msk->ack_seq, ack_seq);
+	WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
 	WRITE_ONCE(msk->can_ack, 1);
 	atomic64_set(&msk->snd_una, msk->write_seq);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index b4c8dbe9236b..bf15c2fc0ba1 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -211,6 +211,7 @@  struct mptcp_sock {
 	u64		write_seq;
 	u64		snd_nxt;
 	u64		ack_seq;
+	u64		rcv_wnd_sent;
 	u64		rcv_data_fin_seq;
 	struct sock	*last_snd;
 	int		snd_burst;