diff mbox series

[3/3] mptcp: process MP_CAPABLE data option.

Message ID 20191202171504.88739-4-cpaasch@apple.com
State Superseded, archived
Headers show
Series mptcp: support for rfc6824 MP CAPABLE hanshake | expand

Commit Message

Christoph Paasch Dec. 2, 2019, 5:15 p.m. UTC
This patch implement the handling of MP_CAPABLE + data option, as
per RFC 6824 bis - MPTCP v1.

On the server side we can receive the remote key after that the connection
is established. We need to explicitly track the 'missing remote key'
status and avoid emtting mptcp ack till we get such info.

When a late/retransmitted/OoO pkt carrying MP_CAPABLE[+data] option
is received, we have to propagate the mptcp seq number info to
the msk socket. To avoid ABBA locking issue, explicitly check for
that in recvmsg(), where we own msk and subflow sock locks.

The above also means that an established mp_capable subflow - still
waiting for the remote key - can be 'downgraded' to plain TCP.

Such change could potentially block forever reader waiting for new
data - as they hook to msk, while later wake-up after the downgrade
will be on subflow only.

The above issue is not handled here, we likely have to get rid of
msk->fallback to handle that cleanly.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
---
 net/mptcp/options.c  | 48 +++++++++++++++++++++++++++++++++++---------
 net/mptcp/protocol.c | 20 ++++++++++++------
 net/mptcp/protocol.h |  8 ++++++--
 net/mptcp/subflow.c  | 37 +++++++++++++++++++++++++++++-----
 4 files changed, 91 insertions(+), 22 deletions(-)

Comments

Peter Krystad Dec. 5, 2019, 2:06 a.m. UTC | #1
Hi, uh, Christoph,

On Mon, 2019-12-02 at 09:15 -0800, Christoph Paasch wrote:
> This patch implement the handling of MP_CAPABLE + data option, as
> per RFC 6824 bis - MPTCP v1.
> 
> On the server side we can receive the remote key after that the connection
> is established. We need to explicitly track the 'missing remote key'
> status and avoid emtting mptcp ack till we get such info.
> 
> When a late/retransmitted/OoO pkt carrying MP_CAPABLE[+data] option
> is received, we have to propagate the mptcp seq number info to
> the msk socket. To avoid ABBA locking issue, explicitly check for
> that in recvmsg(), where we own msk and subflow sock locks.
> 
> The above also means that an established mp_capable subflow - still
> waiting for the remote key - can be 'downgraded' to plain TCP.
> 
> Such change could potentially block forever reader waiting for new
> data - as they hook to msk, while later wake-up after the downgrade
> will be on subflow only.
> 
> The above issue is not handled here, we likely have to get rid of
> msk->fallback to handle that cleanly.
> 
> Signed-off-by: Christoph Paasch <cpaasch@apple.com>
> ---
>  net/mptcp/options.c  | 48 +++++++++++++++++++++++++++++++++++---------
>  net/mptcp/protocol.c | 20 ++++++++++++------
>  net/mptcp/protocol.h |  8 ++++++--
>  net/mptcp/subflow.c  | 37 +++++++++++++++++++++++++++++-----
>  4 files changed, 91 insertions(+), 22 deletions(-)
> 
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 299af8e5456d..0b80a24c34b4 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -299,6 +299,7 @@ void mptcp_rcv_synsent(struct sock *sk)
>  	pr_debug("subflow=%p", subflow);
>  	if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
>  		subflow->mp_capable = 1;
> +		subflow->can_ack = 1;
>  		subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
>  	}
>  }
> @@ -365,6 +366,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  					  unsigned int remaining,
>  					  struct mptcp_out_options *opts)
>  {
> +	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
>  	unsigned int dss_size = 0;
>  	struct mptcp_ext *mpext;
>  	struct mptcp_sock *msk;
> @@ -383,6 +385,11 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  			opts->ext_copy = *mpext;
>  	}
>  
> +	opts->ext_copy.use_ack = 0;
> +	msk = mptcp_sk(subflow->conn);
> +	if (!msk || !READ_ONCE(msk->can_ack))
> +		return false;
> +
>  	ack_size = TCPOLEN_MPTCP_DSS_ACK64;
>  
>  	/* Add kind/length/subtype/flag overhead if mapping is not populated */
> @@ -391,15 +398,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  
>  	dss_size += ack_size;
>  
> -	msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn);
> -	if (msk) {
> -		opts->ext_copy.data_ack = msk->ack_seq;
> -	} else {
> -		mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key,
> -				     NULL, &opts->ext_copy.data_ack);
> -		opts->ext_copy.data_ack++;
> -	}
> -
> +	opts->ext_copy.data_ack = msk->ack_seq;
>  	opts->ext_copy.ack64 = 1;
>  	opts->ext_copy.use_ack = 1;
>  
> @@ -451,6 +450,35 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
>  	return false;
>  }
>  
> +static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
> +			     struct sk_buff *skb,
> +			     struct mptcp_options_received *mp_opt)
> +{
> +	/* here we can process OoO, in-window pkts, only in-sequence 4th ack
> +	 * are relevant
> +	 */
> +	if (likely(subflow->fourth_ack ||
> +		   TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
> +		return true;
> +
> +	if (mp_opt->use_ack)
> +		subflow->fourth_ack = 1;
> +
> +	if (subflow->can_ack)
> +		return true;
> +
> +	/* if still waiting for remote/client key, we exepct an
> +	 * MP_CAPABLE + data as first establish packet
> +	 */

Maybe this comment can clearly state that this is a fallback?
"If the first established packet does not contain MP_CAPABLE + data then
fallback to TCP". 
 
> +	if (!mp_opt->mp_capable) {
> +		subflow->mp_capable = 0;
> +		return false;
> +	}
> +	subflow->remote_key = mp_opt->sndr_key;
> +	subflow->can_ack = 1;
> +	return true;
> +}
> +
>  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
>  			    struct tcp_options_received *opt_rx)
>  {
> @@ -462,6 +490,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
>  		return;
>  
>  	mp_opt = &opt_rx->mptcp;
> +	if (!check_fourth_ack(subflow, skb, mp_opt))
> +		return;
>  
>  	if (!mp_opt->dss)
>  		return;
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 4b000f842d6b..1076f0887cd5 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -347,6 +347,12 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  
>  		lock_sock(ssk);
>  		while (mptcp_subflow_data_available(ssk) && !done) {
> +			if (unlikely(!msk->can_ack)) {
> +				msk->remote_key = subflow->remote_key;
> +				msk->ack_seq = subflow->map_seq;
> +				msk->can_ack = true;
> +			}
> +
>  			/* try to read as much data as available */
>  			map_remaining = subflow->map_data_len -
>  					mptcp_subflow_get_map_offset(subflow);
> @@ -505,17 +511,20 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
>  		__mptcp_init_sock(new_mptcp_sock);
>  
>  		msk = mptcp_sk(new_mptcp_sock);
> -		msk->remote_key = subflow->remote_key;
>  		msk->local_key = subflow->local_key;
>  		msk->token = subflow->token;
>  
>  		mptcp_token_update_accept(new_sock->sk, new_mptcp_sock);
>  		msk->subflow = NULL;
>  
> -		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
>  		msk->write_seq = subflow->idsn + 1;
> -		ack_seq++;
> -		msk->ack_seq = ack_seq;
> +		if (subflow->can_ack) {
> +			msk->can_ack = true;
> +			msk->remote_key = subflow->remote_key;
> +			mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
> +			ack_seq++;
> +			msk->ack_seq = ack_seq;
> +		}
>  		newsk = new_mptcp_sock;
>  		list_add(&subflow->node, &msk->conn_list);
>  		bh_unlock_sock(new_mptcp_sock);
> @@ -527,8 +536,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
>  		 * the receive path and process the pending ones
>  		 */
>  		lock_sock(new_sock->sk);
> -		subflow->map_seq = ack_seq;
> -		subflow->map_subflow_seq = 1;
>  		subflow->rel_write_seq = 1;
>  		subflow->tcp_sock = new_sock;
>  		subflow->conn = new_mptcp_sock;
> @@ -656,6 +663,7 @@ void mptcp_finish_connect(struct sock *sk, int mp_capable)
>  		msk->write_seq = subflow->idsn + 1;
>  		ack_seq++;
>  		msk->ack_seq = ack_seq;
> +		msk->can_ack = 1;
>  		subflow->map_seq = ack_seq;
>  		subflow->map_subflow_seq = 1;
>  		subflow->rel_write_seq = 1;
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index bdff743cf2b3..7630d8860319 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -66,6 +66,7 @@ struct mptcp_sock {
>  	u64		ack_seq;
>  	u32		token;
>  	unsigned long	flags;
> +	bool		can_ack;
>  	struct list_head conn_list;
>  	struct socket	*subflow; /* outgoing connect/listener/!mp_capable */
>  };
> @@ -80,9 +81,10 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
>  
>  struct mptcp_subflow_request_sock {
>  	struct	tcp_request_sock sk;
> -	u8	mp_capable : 1,
> +	u16	mp_capable : 1,
>  		mp_join : 1,
>  		backup : 1,
> +		remote_key_valid : 1,
>  		version : 4;
>  	u64	local_key;
>  	u64	remote_key;
> @@ -116,8 +118,10 @@ struct mptcp_subflow_context {
>  		fourth_ack : 1,	    /* send initial DSS */
>  		conn_finished : 1,
>  		map_valid : 1,
> +		mp_capable_map : 1,
>  		data_avail : 1,
> -		rx_eof : 1;
> +		rx_eof : 1,
> +		can_ack : 1;	    /* only after processing the remote a key */
>  
>  	struct	socket *tcp_sock;   /* underlying tcp_sock */
>  	struct	sock *conn;	    /* parent mptcp_sock */
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 96cc80f8d751..255e3762b39c 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -61,6 +61,7 @@ static void subflow_init_req(struct request_sock *req,
>  	mptcp_get_options(skb, &rx_opt);
>  
>  	subflow_req->mp_capable = 0;
> +	subflow_req->remote_key_valid = 0;
>  
>  #ifdef CONFIG_TCP_MD5SIG
>  	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
> @@ -188,17 +189,28 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
>  
>  	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
>  
> -	/* if the sk is MP_CAPABLE, we need to fetch the client key */
> +	/* if the sk is MP_CAPABLE, we try to fetch the client key */

Maybe this comment should be moved down to where the code fetching the
remote key is now that more code has been added here.

>  	subflow_req = mptcp_subflow_rsk(req);
>  	if (subflow_req->mp_capable) {
> +		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
> +			/* here we can receive and accept an in-window,
> +			 * out-of-order pkt, which will not carry the MP_CAPABLE
> +			 * opt even on mptcp enabled paths
> +			 */
> +			goto create_child;
> +		}

So this means we will create the child socket even without receiving the
MP_CAPABLE ACK? Will there be side-effects of there are a lot of such packets?

> +
>  		opt_rx.mptcp.mp_capable = 0;
>  		mptcp_get_options(skb, &opt_rx);
> -		if (!opt_rx.mptcp.mp_capable)
> -			subflow_req->mp_capable = 0;
> -		else
> +		if (opt_rx.mptcp.mp_capable) {
>  			subflow_req->remote_key = opt_rx.mptcp.sndr_key;
> +			subflow_req->remote_key_valid = 1;
> +		} else {
> +			subflow_req->mp_capable = 0;
> +		}
>  	}
>  
> +create_child:
>  	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
>  						     req_unhash, own_req);
>  
> @@ -363,6 +375,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
>  	subflow->map_subflow_seq = mpext->subflow_seq;
>  	subflow->map_data_len = mpext->data_len;
>  	subflow->map_valid = 1;
> +	subflow->mp_capable_map = mpext->mp_capable_map;
>  	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
>  		 subflow->map_seq, subflow->map_subflow_seq,
>  		 subflow->map_data_len);
> @@ -414,6 +427,16 @@ static bool subflow_check_data_avail(struct sock *ssk)
>  		if (WARN_ON_ONCE(!skb))
>  			return false;
>  
> +		/* if msk lacks the remote key, this subflow must provide an
> +		 * MP_CAPABLE-based mapping
> +		 */
> +		if (unlikely(!READ_ONCE(msk->ack_seq))) {
> +			if (subflow->mp_capable_map)
> +				break;
> +			ssk->sk_err = EBADMSG;
> +			goto fatal;
> +		}
> +
>  		old_ack = READ_ONCE(msk->ack_seq);
>  		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
>  		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
> @@ -651,8 +674,12 @@ static void subflow_ulp_clone(const struct request_sock *req,
>  	new_ctx->tcp_sk_data_ready = old_ctx->tcp_sk_data_ready;
>  
>  	if (subflow_req->mp_capable) {
> +		/* see comments in subflow_syn_recv_sock(), MPTCP connection
> +		 * is fully establishged only after we receive the remote key

Detail: "established".

Peter.

> +		 */
>  		new_ctx->mp_capable = 1;
> -		new_ctx->fourth_ack = 1;
> +		new_ctx->fourth_ack = subflow_req->remote_key_valid;
> +		new_ctx->can_ack = subflow_req->remote_key_valid;
>  		new_ctx->remote_key = subflow_req->remote_key;
>  		new_ctx->local_key = subflow_req->local_key;
>  		new_ctx->token = subflow_req->token;
Paolo Abeni Dec. 5, 2019, 10:32 a.m. UTC | #2
On Wed, 2019-12-04 at 18:06 -0800, Peter Krystad wrote:
> @@ -188,17 +189,28 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
> >  
> >  	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
> >  
> > -	/* if the sk is MP_CAPABLE, we need to fetch the client key */
> > +	/* if the sk is MP_CAPABLE, we try to fetch the client key */
> 
> Maybe this comment should be moved down to where the code fetching the
> remote key is now that more code has been added here.
> 
> >  	subflow_req = mptcp_subflow_rsk(req);
> >  	if (subflow_req->mp_capable) {
> > +		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
> > +			/* here we can receive and accept an in-window,
> > +			 * out-of-order pkt, which will not carry the MP_CAPABLE
> > +			 * opt even on mptcp enabled paths
> > +			 */
> > +			goto create_child;
> > +		}
> 
> So this means we will create the child socket even without receiving the
> MP_CAPABLE ACK? Will there be side-effects of there are a lot of such packets?

The RFC says we must always create such sockets (if e.g. middle-boxes
drop the 3rd MP_CAPABLE opt) so we can't escape this scenario.

The worst effect I can think of is:

- a new socket being created server side with mp_capable = 1, 
- user-space calls recvmsg()/sendmsg() and blocks
- the 3rd ack is received without the MP_CAPABLE opt, and the socket
fallback to plain tcp.
- will ever the kernel wake the user-space on the fallback socket?

AFAICS, with the last patches from Florian both recvmsg() and sendmsg()
will be woken-up correctly - but than we have to explicitly check for
fallback.

Anyhow I think we need some pktdrill to test this in a reliable way, I
would not overlookup too much at it now.

Thanks,

Paolo
diff mbox series

Patch

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 299af8e5456d..0b80a24c34b4 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -299,6 +299,7 @@  void mptcp_rcv_synsent(struct sock *sk)
 	pr_debug("subflow=%p", subflow);
 	if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
 		subflow->mp_capable = 1;
+		subflow->can_ack = 1;
 		subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
 	}
 }
@@ -365,6 +366,7 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 					  unsigned int remaining,
 					  struct mptcp_out_options *opts)
 {
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	unsigned int dss_size = 0;
 	struct mptcp_ext *mpext;
 	struct mptcp_sock *msk;
@@ -383,6 +385,11 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 			opts->ext_copy = *mpext;
 	}
 
+	opts->ext_copy.use_ack = 0;
+	msk = mptcp_sk(subflow->conn);
+	if (!msk || !READ_ONCE(msk->can_ack))
+		return false;
+
 	ack_size = TCPOLEN_MPTCP_DSS_ACK64;
 
 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
@@ -391,15 +398,7 @@  static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 
 	dss_size += ack_size;
 
-	msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn);
-	if (msk) {
-		opts->ext_copy.data_ack = msk->ack_seq;
-	} else {
-		mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key,
-				     NULL, &opts->ext_copy.data_ack);
-		opts->ext_copy.data_ack++;
-	}
-
+	opts->ext_copy.data_ack = msk->ack_seq;
 	opts->ext_copy.ack64 = 1;
 	opts->ext_copy.use_ack = 1;
 
@@ -451,6 +450,35 @@  bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 	return false;
 }
 
+static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
+			     struct sk_buff *skb,
+			     struct mptcp_options_received *mp_opt)
+{
+	/* here we can process OoO, in-window pkts, only in-sequence 4th ack
+	 * are relevant
+	 */
+	if (likely(subflow->fourth_ack ||
+		   TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
+		return true;
+
+	if (mp_opt->use_ack)
+		subflow->fourth_ack = 1;
+
+	if (subflow->can_ack)
+		return true;
+
+	/* if still waiting for remote/client key, we exepct an
+	 * MP_CAPABLE + data as first establish packet
+	 */
+	if (!mp_opt->mp_capable) {
+		subflow->mp_capable = 0;
+		return false;
+	}
+	subflow->remote_key = mp_opt->sndr_key;
+	subflow->can_ack = 1;
+	return true;
+}
+
 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 			    struct tcp_options_received *opt_rx)
 {
@@ -462,6 +490,8 @@  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 		return;
 
 	mp_opt = &opt_rx->mptcp;
+	if (!check_fourth_ack(subflow, skb, mp_opt))
+		return;
 
 	if (!mp_opt->dss)
 		return;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4b000f842d6b..1076f0887cd5 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -347,6 +347,12 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		lock_sock(ssk);
 		while (mptcp_subflow_data_available(ssk) && !done) {
+			if (unlikely(!msk->can_ack)) {
+				msk->remote_key = subflow->remote_key;
+				msk->ack_seq = subflow->map_seq;
+				msk->can_ack = true;
+			}
+
 			/* try to read as much data as available */
 			map_remaining = subflow->map_data_len -
 					mptcp_subflow_get_map_offset(subflow);
@@ -505,17 +511,20 @@  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		__mptcp_init_sock(new_mptcp_sock);
 
 		msk = mptcp_sk(new_mptcp_sock);
-		msk->remote_key = subflow->remote_key;
 		msk->local_key = subflow->local_key;
 		msk->token = subflow->token;
 
 		mptcp_token_update_accept(new_sock->sk, new_mptcp_sock);
 		msk->subflow = NULL;
 
-		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
 		msk->write_seq = subflow->idsn + 1;
-		ack_seq++;
-		msk->ack_seq = ack_seq;
+		if (subflow->can_ack) {
+			msk->can_ack = true;
+			msk->remote_key = subflow->remote_key;
+			mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
+			ack_seq++;
+			msk->ack_seq = ack_seq;
+		}
 		newsk = new_mptcp_sock;
 		list_add(&subflow->node, &msk->conn_list);
 		bh_unlock_sock(new_mptcp_sock);
@@ -527,8 +536,6 @@  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		 * the receive path and process the pending ones
 		 */
 		lock_sock(new_sock->sk);
-		subflow->map_seq = ack_seq;
-		subflow->map_subflow_seq = 1;
 		subflow->rel_write_seq = 1;
 		subflow->tcp_sock = new_sock;
 		subflow->conn = new_mptcp_sock;
@@ -656,6 +663,7 @@  void mptcp_finish_connect(struct sock *sk, int mp_capable)
 		msk->write_seq = subflow->idsn + 1;
 		ack_seq++;
 		msk->ack_seq = ack_seq;
+		msk->can_ack = 1;
 		subflow->map_seq = ack_seq;
 		subflow->map_subflow_seq = 1;
 		subflow->rel_write_seq = 1;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index bdff743cf2b3..7630d8860319 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -66,6 +66,7 @@  struct mptcp_sock {
 	u64		ack_seq;
 	u32		token;
 	unsigned long	flags;
+	bool		can_ack;
 	struct list_head conn_list;
 	struct socket	*subflow; /* outgoing connect/listener/!mp_capable */
 };
@@ -80,9 +81,10 @@  static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 
 struct mptcp_subflow_request_sock {
 	struct	tcp_request_sock sk;
-	u8	mp_capable : 1,
+	u16	mp_capable : 1,
 		mp_join : 1,
 		backup : 1,
+		remote_key_valid : 1,
 		version : 4;
 	u64	local_key;
 	u64	remote_key;
@@ -116,8 +118,10 @@  struct mptcp_subflow_context {
 		fourth_ack : 1,	    /* send initial DSS */
 		conn_finished : 1,
 		map_valid : 1,
+		mp_capable_map : 1,
 		data_avail : 1,
-		rx_eof : 1;
+		rx_eof : 1,
+		can_ack : 1;	    /* only after processing the remote a key */
 
 	struct	socket *tcp_sock;   /* underlying tcp_sock */
 	struct	sock *conn;	    /* parent mptcp_sock */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 96cc80f8d751..255e3762b39c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -61,6 +61,7 @@  static void subflow_init_req(struct request_sock *req,
 	mptcp_get_options(skb, &rx_opt);
 
 	subflow_req->mp_capable = 0;
+	subflow_req->remote_key_valid = 0;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
@@ -188,17 +189,28 @@  static struct sock *subflow_syn_recv_sock(const struct sock *sk,
 
 	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
 
-	/* if the sk is MP_CAPABLE, we need to fetch the client key */
+	/* if the sk is MP_CAPABLE, we try to fetch the client key */
 	subflow_req = mptcp_subflow_rsk(req);
 	if (subflow_req->mp_capable) {
+		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
+			/* here we can receive and accept an in-window,
+			 * out-of-order pkt, which will not carry the MP_CAPABLE
+			 * opt even on mptcp enabled paths
+			 */
+			goto create_child;
+		}
+
 		opt_rx.mptcp.mp_capable = 0;
 		mptcp_get_options(skb, &opt_rx);
-		if (!opt_rx.mptcp.mp_capable)
-			subflow_req->mp_capable = 0;
-		else
+		if (opt_rx.mptcp.mp_capable) {
 			subflow_req->remote_key = opt_rx.mptcp.sndr_key;
+			subflow_req->remote_key_valid = 1;
+		} else {
+			subflow_req->mp_capable = 0;
+		}
 	}
 
+create_child:
 	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
 						     req_unhash, own_req);
 
@@ -363,6 +375,7 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 	subflow->map_subflow_seq = mpext->subflow_seq;
 	subflow->map_data_len = mpext->data_len;
 	subflow->map_valid = 1;
+	subflow->mp_capable_map = mpext->mp_capable_map;
 	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
 		 subflow->map_seq, subflow->map_subflow_seq,
 		 subflow->map_data_len);
@@ -414,6 +427,16 @@  static bool subflow_check_data_avail(struct sock *ssk)
 		if (WARN_ON_ONCE(!skb))
 			return false;
 
+		/* if msk lacks the remote key, this subflow must provide an
+		 * MP_CAPABLE-based mapping
+		 */
+		if (unlikely(!READ_ONCE(msk->ack_seq))) {
+			if (subflow->mp_capable_map)
+				break;
+			ssk->sk_err = EBADMSG;
+			goto fatal;
+		}
+
 		old_ack = READ_ONCE(msk->ack_seq);
 		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
 		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
@@ -651,8 +674,12 @@  static void subflow_ulp_clone(const struct request_sock *req,
 	new_ctx->tcp_sk_data_ready = old_ctx->tcp_sk_data_ready;
 
 	if (subflow_req->mp_capable) {
+		/* see comments in subflow_syn_recv_sock(), MPTCP connection
+		 * is fully establishged only after we receive the remote key
+		 */
 		new_ctx->mp_capable = 1;
-		new_ctx->fourth_ack = 1;
+		new_ctx->fourth_ack = subflow_req->remote_key_valid;
+		new_ctx->can_ack = subflow_req->remote_key_valid;
 		new_ctx->remote_key = subflow_req->remote_key;
 		new_ctx->local_key = subflow_req->local_key;
 		new_ctx->token = subflow_req->token;