diff mbox series

[RFC,net-next,1/2] net: mptcp: improve fallback to TCP

Message ID 621872a39f41ec139aa4fa0b16d5321d3f590775.1589819601.git.dcaratti@redhat.com
State Superseded, archived
Delegated to: Mat Martineau
Headers show
Series improve MPTCP fallback | expand

Commit Message

Davide Caratti May 18, 2020, 4:37 p.m. UTC
currently, MPTCP resets the connection when the DSS option is not received
after a successful three-way handshake, while RFC 8684 §3.7 suggests a
fall-back to regular TCP when only the first subflow has been established.
Introduce support for "infinite mapping" to allow the connection to
continue without DSS in these cases.

Changes since RFC v1:
 - use a dedicated member of struct msk to indicate that a fallback has
   happened, use it in case of infinite mapping
 - don't delete skb_ext in case of infinite mapping (Mat)
 - test the value of pm.subflows on reception of an infinite map to
   ensure that no other subflow is currently open (Mat)
 - in mptcp_established_options(), avoid adding TCP options in case of
   fallback indication; simplify sendmsg()/recvmsg()/poll() to keep using
   the MPTCP socket in case of TCP fallback. Set the fallback indication
   in case subflow is not mp_capable after successful 3-way handshake,
   instead of flipping 'is_mptcp' (Paolo/Mat)
 - remove dead code in mptcp_finish_connect, and increment
   MPTCP_MIB_MPCAPABLEACTIVEFALLBACK in subflow_finish_connect (Paolo)

BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/11
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/mptcp/options.c  |  9 ++++-
 net/mptcp/protocol.c | 80 ++++++++++++--------------------------------
 net/mptcp/protocol.h | 14 ++++++++
 net/mptcp/subflow.c  | 63 ++++++++++++++++++++++++----------
 4 files changed, 88 insertions(+), 78 deletions(-)

Comments

Mat Martineau May 19, 2020, 12:14 a.m. UTC | #1
On Mon, 18 May 2020, Davide Caratti wrote:

> currently, MPTCP resets the connection when the DSS option is not received
> after a successful three-way handshake, while RFC 8684 §3.7 suggests a
> fall-back to regular TCP when only the first subflow has been established.
> Introduce support for "infinite mapping" to allow the connection to
> continue without DSS in these cases.

If I'm understanding the intent of the changes, the idea in this patch is 
to keep using the MPTCP segmentation & reassembly after fallback, but to 
short-circuit most of it by (1) not writing the DSS option to the TCP 
header on the tx side and (2) generating dummy mappings on the rx side.

That seems like a lot of overhead, but if optimizing sendmsg/recvmsg on 
fallback is introducing problems, then we need to take that into 
consideration. It's not clear to me what we gain by using MPTCP 
segmentation & reassembly after the fallback is complete (other than 
possibly working around bugs in the existing fallback code). If the tricky 
part is completing the transmit or receive call where fallback happens 
*during* the call, maybe the fallback technique used in this patch could 
help complete an in-progress send/recv, but future calls would use 
sock_sendmsg/sock_recvmsg?


I'd also like to avoid overloading the term "infinite mapping", which has 
specific meaning in the RFC: a DSS mapping with the data-level length set 
to 0 that is sent under specific circumstances. If that specific DSS 
mapping isn't present on the wire, maybe "dummy mapping" or "fake mapping" 
is a useful term to use instead.
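
To make the distinction concrete, here is a sketch (not from this patch):
the RFC 8684 infinite mapping is an actual DSS option on the wire with a
data-level length of 0, while the mapping generated here never appears in
any option. Using struct mptcp_ext from include/net/mptcp.h:

        /* sketch only: detect an on-the-wire infinite mapping */
        static bool dss_is_infinite_map(const struct mptcp_ext *mpext)
        {
                return mpext && mpext->use_map && mpext->data_len == 0;
        }

The dummy/fake mapping, by contrast, is synthesized locally from the skb,
with no mpext involved at all.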


>
> Changes since RFC v1:
> - use a dedicated member of struct msk to indicate that a fallback has
>   happened, use it in case of infinite mapping
> - don't delete skb_ext in case of infinite mapping (Mat)
> - test the value of pm.subflows on reception of an infinite map to
>   ensure that no other subflow is currently open (Mat)
> - in mptcp_established_options(), avoid adding TCP options in case of
>   fallback indication; simplify sendmsg()/recvmsg()/poll() to keep using
>   the MPTCP socket in case of TCP fallback. Set the fallback indication
>   in case subflow is not mp_capable after successful 3-way handshake,
>   instead of flipping 'is_mptcp' (Paolo/Mat)
> - remove dead code in mptcp_finish_connect, and increment
>   MPTCP_MIB_MPCAPABLEACTIVEFALLBACK in subflow_finish_connect (Paolo)
>
> BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/11
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
> net/mptcp/options.c  |  9 ++++-
> net/mptcp/protocol.c | 80 ++++++++++++--------------------------------
> net/mptcp/protocol.h | 14 ++++++++
> net/mptcp/subflow.c  | 63 ++++++++++++++++++++++++----------
> 4 files changed, 88 insertions(+), 78 deletions(-)
>
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index ece6f92cf7d11..361e605c32aa2 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -623,6 +623,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
>
> 	opts->suboptions = 0;
>
> +	if (unlikely(mptcp_check_fallback(sk)))
> +		return false;
> +
> 	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
> 		ret = true;
> 	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
> @@ -713,7 +716,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
> 	 */
> 	if (!mp_opt->mp_capable) {
> 		subflow->mp_capable = 0;
> -		tcp_sk(sk)->is_mptcp = 0;
> +		pr_fallback(msk);
> +		msk->fallback = true;
> 		return false;
> 	}
>
> @@ -813,6 +817,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
> 	struct mptcp_options_received mp_opt;
> 	struct mptcp_ext *mpext;
>
> +	if (msk->fallback)
> +		return;
> +
> 	mptcp_get_options(skb, &mp_opt);
> 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
> 		return;
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index e3a628bea2b81..44489ab95f115 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -52,11 +52,6 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
> 	return msk->subflow;
> }
>
> -static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
> -{
> -	return msk->first && !sk_is_mptcp(msk->first);
> -}
> -
> static struct socket *mptcp_is_tcpsk(struct sock *sk)
> {
> 	struct socket *sock = sk->sk_socket;
> @@ -94,7 +89,7 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
> 	if (unlikely(sock))
> 		return sock;
>
> -	if (likely(!__mptcp_needs_tcp_fallback(msk)))
> +	if (likely(!msk->fallback))
> 		return NULL;
>
> 	return msk->subflow;
> @@ -212,6 +207,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
> 		if (!skb)
> 			break;
>
> +		if (msk->fallback) {
> +			/* if we are running under the workqueue, TCP could have
> +			 * collapsed skbs between dummy map creation and now
> +			 * be sure to adjust the size
> +			 */
> +			map_remaining = skb->len;
> +			subflow->map_data_len = skb->len;
> +		}
> +
> 		offset = seq - TCP_SKB_CB(skb)->seq;
> 		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
> 		if (fin) {
> @@ -428,8 +432,15 @@ static void mptcp_clean_una(struct sock *sk)
> {
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	struct mptcp_data_frag *dtmp, *dfrag;
> -	u64 snd_una = atomic64_read(&msk->snd_una);
> 	bool cleaned = false;
> +	u64 snd_una;
> +
> +	/* on fallback we just need to ignore snd_una, as this is really
> +	 * plain TCP
> +	 */
> +	if (msk->fallback)
> +		atomic64_set(&msk->snd_una, msk->write_seq);
> +	snd_una = atomic64_read(&msk->snd_una);

An alternative to flushing the rtx_queue here would be to not add anything 
to the rtx_queue when in fallback, and to do a one-time flush when going 
into fallback.
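
Rough sketch of that alternative (untested; reuses dfrag_clear() from
protocol.c and assumes the msk socket lock is held when entering
fallback):

        /* one-time flush on entering fallback: from this point on,
         * retransmissions are handled by plain TCP and nothing new is
         * added to the rtx_queue
         */
        static void __mptcp_fallback_flush_rtx(struct sock *sk)
        {
                struct mptcp_sock *msk = mptcp_sk(sk);
                struct mptcp_data_frag *dtmp, *dfrag;

                /* dfrag_clear() also unlinks the dfrag from the list */
                list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
                        dfrag_clear(sk, dfrag);
        }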

>
> 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
> 		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
> @@ -702,7 +713,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 	int mss_now = 0, size_goal = 0, ret = 0;
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> 	struct page_frag *pfrag;
> -	struct socket *ssock;
> 	size_t copied = 0;
> 	struct sock *ssk;
> 	bool tx_ok;
> @@ -721,15 +731,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 			goto out;
> 	}
>
> -fallback:

Could it make sense to remove this label (and the goto below), but keep 
the fallback passthrough to sock_sendmsg() if the connection is already in 
fallback when mptcp_sendmsg() is called?
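
Something like this at the top of mptcp_sendmsg() (rough sketch,
untested; 'copied' is still 0 at this point, so the plain return is
enough):

        lock_sock(sk);
        ssock = __mptcp_tcp_fallback(msk);
        if (unlikely(ssock)) {
                /* the connection already fell back before this call */
                release_sock(sk);
                pr_debug("fallback passthrough");
                return sock_sendmsg(ssock, msg);
        }

so the transmit loop itself would never need a 'goto fallback'.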

> -	ssock = __mptcp_tcp_fallback(msk);
> -	if (unlikely(ssock)) {
> -		release_sock(sk);
> -		pr_debug("fallback passthrough");
> -		ret = sock_sendmsg(ssock, msg);
> -		return ret >= 0 ? ret + copied : (copied ? copied : ret);
> -	}
> -
> 	pfrag = sk_page_frag(sk);
> restart:
> 	mptcp_clean_una(sk);
> @@ -781,17 +782,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 			}
> 			break;
> 		}
> -		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
> -			/* Can happen for passive sockets:
> -			 * 3WHS negotiated MPTCP, but first packet after is
> -			 * plain TCP (e.g. due to middlebox filtering unknown
> -			 * options).
> -			 *
> -			 * Fall back to TCP.
> -			 */
> -			release_sock(ssk);
> -			goto fallback;
> -		}
>
> 		copied += ret;
>
> @@ -934,7 +924,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> 			 int nonblock, int flags, int *addr_len)
> {
> 	struct mptcp_sock *msk = mptcp_sk(sk);
> -	struct socket *ssock;
> 	int copied = 0;
> 	int target;
> 	long timeo;
> @@ -943,16 +932,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> 		return -EOPNOTSUPP;
>
> 	lock_sock(sk);
> -	ssock = __mptcp_tcp_fallback(msk);
> -	if (unlikely(ssock)) {
> -fallback:

Same comment as with the mptcp_sendmsg() fallback label above.

> -		release_sock(sk);
> -		pr_debug("fallback-read subflow=%p",
> -			 mptcp_subflow_ctx(ssock->sk));
> -		copied = sock_recvmsg(ssock, msg, flags);
> -		return copied;
> -	}
> -
> 	timeo = sock_rcvtimeo(sk, nonblock);
>
> 	len = min_t(size_t, len, INT_MAX);
> @@ -1015,8 +994,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>
> 		pr_debug("block timeout %ld", timeo);
> 		mptcp_wait_data(sk, &timeo);
> -		if (unlikely(__mptcp_tcp_fallback(msk)))
> -			goto fallback;
> 	}
>
> 	if (skb_queue_empty(&sk->sk_receive_queue)) {
> @@ -1491,6 +1468,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
> 		 */
> 		if (WARN_ON_ONCE(!new_mptcp_sock)) {
> 			tcp_sk(newsk)->is_mptcp = 0;
> +			pr_fallback(msk);
> 			return newsk;
> 		}
>
> @@ -1512,6 +1490,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
> 		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
> 		local_bh_enable();
> 	} else {
> +		pr_fallback(msk);
> +		msk->fallback = true;
> 		MPTCP_INC_STATS(sock_net(sk),
> 				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
> 	}
> @@ -1635,12 +1615,6 @@ void mptcp_finish_connect(struct sock *ssk)
> 	sk = subflow->conn;
> 	msk = mptcp_sk(sk);
>
> -	if (!subflow->mp_capable) {
> -		MPTCP_INC_STATS(sock_net(sk),
> -				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
> -		return;
> -	}
> -
> 	pr_debug("msk=%p, token=%u", sk, subflow->token);
>
> 	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
> @@ -1916,21 +1890,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
> {
> 	struct sock *sk = sock->sk;
> 	struct mptcp_sock *msk;
> -	struct socket *ssock;
> 	__poll_t mask = 0;
>
> 	msk = mptcp_sk(sk);
> -	lock_sock(sk);
> -	ssock = __mptcp_tcp_fallback(msk);
> -	if (!ssock)
> -		ssock = __mptcp_nmpc_socket(msk);
> -	if (ssock) {
> -		mask = ssock->ops->poll(file, ssock, wait);
> -		release_sock(sk);
> -		return mask;
> -	}
> -
> -	release_sock(sk);
> 	sock_poll_wait(file, sock, wait);
> 	lock_sock(sk);
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index f5adca93e8fb5..cf37bca9d1a5a 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -202,6 +202,7 @@ struct mptcp_sock {
> 	u32		token;
> 	unsigned long	flags;
> 	bool		can_ack;
> +	bool		fallback;
> 	spinlock_t	join_list_lock;
> 	struct work_struct work;
> 	struct list_head conn_list;
> @@ -340,6 +341,15 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
> 	return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
> }
>
> +
> +static inline bool mptcp_check_fallback(struct sock *sk)
> +{
> +	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
> +	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
> +
> +	return msk->fallback;
> +}
> +
> int mptcp_is_enabled(struct net *net);
> bool mptcp_subflow_data_available(struct sock *sk);
> void mptcp_subflow_init(void);
> @@ -459,4 +469,8 @@ static inline bool before64(__u64 seq1, __u64 seq2)
>
> void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
>
> +#define pr_fallback(a) do { pr_debug("%s:fallback to TCP (msk=%p)",\
> +				      __FUNCTION__, a); } while (0)
> +
> +
> #endif /* __MPTCP_PROTOCOL_H */
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 0020d356233dd..dc64e8c78e36d 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -222,7 +222,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
> 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
> 	struct mptcp_options_received mp_opt;
> 	struct sock *parent = subflow->conn;
> -	struct tcp_sock *tp = tcp_sk(sk);
>
> 	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
>
> @@ -236,6 +235,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
> 		return;
>
> 	subflow->conn_finished = 1;
> +	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
> +	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
>
> 	mptcp_get_options(skb, &mp_opt);
> 	if (subflow->request_mptcp && mp_opt.mp_capable) {
> @@ -251,21 +252,19 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
> 		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
> 			 subflow->thmac, subflow->remote_nonce);
> 	} else if (subflow->request_mptcp) {
> -		tp->is_mptcp = 0;
> +		pr_fallback(mptcp_sk(subflow->conn));
> +		mptcp_sk(subflow->conn)->fallback = true;
> +		MPTCP_INC_STATS(sock_net(sk),
> +				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
> 	}
>
> -	if (!tp->is_mptcp)
> +	if (mptcp_sk(subflow->conn)->fallback)
> 		return;
>
> 	if (subflow->mp_capable) {
> 		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
> 			 subflow->remote_key);
> 		mptcp_finish_connect(sk);
> -
> -		if (skb) {
> -			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
> -			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
> -		}
> 	} else if (subflow->mp_join) {
> 		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
> 			 subflow, subflow->thmac,
> @@ -281,9 +280,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
> 				      subflow->remote_nonce,
> 				      subflow->hmac);
>
> -		if (skb)
> -			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
> -
> 		if (!mptcp_finish_join(sk))
> 			goto do_reset;
>
> @@ -547,7 +543,8 @@ enum mapping_status {
> 	MAPPING_OK,
> 	MAPPING_INVALID,
> 	MAPPING_EMPTY,
> -	MAPPING_DATA_FIN
> +	MAPPING_DATA_FIN,
> +	MAPPING_INFINITE
> };
>
> static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
> @@ -602,6 +599,7 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
> static enum mapping_status get_mapping_status(struct sock *ssk)
> {
> 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
> +	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
> 	struct mptcp_ext *mpext;
> 	struct sk_buff *skb;
> 	u16 data_len;
> @@ -611,6 +609,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
> 	if (!skb)
> 		return MAPPING_EMPTY;
>
> +	if (msk->fallback)
> +		return MAPPING_INFINITE;
> +
> 	mpext = mptcp_get_ext(skb);
> 	if (!mpext || !mpext->use_map) {
> 		if (!subflow->map_valid && !skb->len) {
> @@ -627,8 +628,13 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
> 			return MAPPING_EMPTY;
> 		}
>
> -		if (!subflow->map_valid)
> +		if (!subflow->map_valid) {
> +			if (!mpext && !subflow->local_id) {
> +				pr_debug("no mpext and 0 local id");
> +				return MAPPING_INFINITE;
> +			}
> 			return MAPPING_INVALID;
> +		}
>
> 		goto validate_seq;
> 	}
> @@ -639,8 +645,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
>
> 	data_len = mpext->data_len;
> 	if (data_len == 0) {
> -		pr_err("Infinite mapping not handled");
> +		/* XXX TODO this should be allowed only if we didn't
> +		 * receive a DACK yet. Paolo suggests to use a dedicated
> +		 * flag in msk, but can't we use msk->can_ack? */

The RFC has a lot of requirements for allowing an infinite mapping, and I 
don't think they align perfectly with what msk->can_ack means. A dedicated 
"allow_fallback" flag could be set to 'false' when fallback is no longer 
allowed: a subflow is added, out-of-order MPTCP mappings are encountered, 
etc.
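
For illustration only (the flag name is invented, it is not in this
patch):

        /* sketch: clear the flag at every point where RFC 8684 no
         * longer permits a fallback, e.g. when a second subflow is
         * established or an out-of-order mapping is observed
         */
        static inline void mptcp_disallow_fallback(struct mptcp_sock *msk)
        {
                WRITE_ONCE(msk->allow_fallback, false);
        }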

The way I read the RFC (and I could be misreading), there are cases where 
an infinite mapping can be used after DATA_ACK if other conditions are 
met (like the checksum error section in 3.7).

> 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
> +		if (!subflow->local_id && !READ_ONCE(msk->pm.subflows))
> +			return MAPPING_INFINITE;
> 		return MAPPING_INVALID;
> 	}
>
> @@ -752,6 +762,19 @@ static bool subflow_check_data_avail(struct sock *ssk)
> 			ssk->sk_err = EBADMSG;
> 			goto fatal;
> 		}
> +		if (status == MAPPING_INFINITE) {
> +			if (!msk->fallback) {
> +				pr_fallback(msk);
> +				msk->fallback = true;
> +			}
> +			skb = skb_peek(&ssk->sk_receive_queue);
> +			subflow->map_valid = 1;
> +			subflow->map_seq = READ_ONCE(msk->ack_seq);
> +			subflow->map_data_len = skb->len;
> +			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
> +						   subflow->ssn_offset;
> +			return true;
> +		}
>
> 		if (status != MAPPING_OK)
> 			return false;
> @@ -875,14 +898,18 @@ static void subflow_data_ready(struct sock *sk)
> {
> 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
> 	struct sock *parent = subflow->conn;
> +	struct mptcp_sock *msk;
>
> -	if (!subflow->mp_capable && !subflow->mp_join) {
> -		subflow->tcp_data_ready(sk);
> -
> +	msk = mptcp_sk(parent);
> +	if (inet_sk_state_load(sk) == TCP_LISTEN) {
> +		set_bit(MPTCP_DATA_READY, &msk->flags);
> 		parent->sk_data_ready(parent);
> 		return;
> 	}
>
> +	WARN_ON_ONCE(!msk->fallback && !subflow->mp_capable &&
> +		     !subflow->mp_join);
> +
> 	if (mptcp_subflow_data_available(sk))
> 		mptcp_data_ready(parent, sk);
> }
> @@ -1105,7 +1132,7 @@ static void subflow_state_change(struct sock *sk)
> 	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
> 	 * the data available machinery here.
> 	 */
> -	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
> +	if (mptcp_subflow_data_available(sk))
> 		mptcp_data_ready(parent, sk);
>
> 	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
> -- 
> 2.26.2

--
Mat Martineau
Intel
Paolo Abeni May 19, 2020, 9:42 a.m. UTC | #2
On Mon, 2020-05-18 at 17:14 -0700, Mat Martineau wrote:
> On Mon, 18 May 2020, Davide Caratti wrote:
> 
> > currently, MPTCP resets the connection when the DSS option is not received
> > after a successful three-way handshake, while RFC 8684 §3.7 suggests a
> > fall-back to regular TCP when only the first subflow has been established.
> > Introduce support for "infinite mapping" to allow the connection to
> > continue without DSS in these cases.
> 
> If I'm understanding the intent of the changes, the idea in this patch is 
> to keep using the MPTCP segmentation & reassembly after fallback, but to 
> short-circuit most of it by (1) not writing the DSS option to the TCP 
> header on the tx side and (2) generating dummy mappings on the rx side.

The above has a few goals, which can't be reached with the current
optimization for sendmsg/recvmsg on fallback:

* support for fallback in scenarios we currently can't handle, e.g. on
passive sockets we can't fall back due to an infinite mapping with the
current infra, as passive sockets have 'msk->subflow == NULL'.

* clean up the non-fallback code, removing most of the 'if (<fallback>)'
checks in the critical path

* better performance in the non-fallback case - e.g. we drop a lock pair
in poll() and could likely drop the remaining one as well (this will
also improve the fallback scenario).

Moreover, this allows follow-up patch[es] to address other current
shortcomings, e.g. setsockopt()/getsockopt() on passive sockets
currently always return -EOPNOTSUPP as they rely on 'msk->subflow'.

The rationale is that I'm pretty sure we should try to remove
'msk->subflow' usage completely ;)

> That seems like a lot of overhead, but if optimizing sendmsg/recvmsg on 
> fallback is introducing problems, then we need to take that into 
> consideration. It's not clear to me what we gain by using MPTCP 
> segmentation & reassembly after the fallback is complete (other than 
> possibly working around bugs in the existing fallback code). If the tricky 
> part is completing the transmit or receive call where fallback happens 
> *during* the call, maybe the fallback technique used in this patch could 
> help complete an in-progress send/recv, but future calls would use 
> sock_sendmsg/sock_recvmsg?

I would personally favour non-fallback scenarios over fallback ones.
Keeping the current fallback 'shortcuts' badly affects the non-fallback
ones due to e.g. poll() locking.

> I'd also like to avoid overloading the term "infinite mapping", which has 
> specific meaning in the RFC: a DSS mapping with the data-level length set 
> to 0 that is sent under specific circumstances. If that specific DSS 
> mapping isn't present on the wire, maybe "dummy mapping" or "fake mapping" 
> is a useful term to use instead.

Agreed! :)

> > @@ -428,8 +432,15 @@ static void mptcp_clean_una(struct sock *sk)
> > {
> > 	struct mptcp_sock *msk = mptcp_sk(sk);
> > 	struct mptcp_data_frag *dtmp, *dfrag;
> > -	u64 snd_una = atomic64_read(&msk->snd_una);
> > 	bool cleaned = false;
> > +	u64 snd_una;
> > +
> > +	/* on fallback we just need to ignore snd_una, as this is really
> > +	 * plain TCP
> > +	 */
> > +	if (msk->fallback)
> > +		atomic64_set(&msk->snd_una, msk->write_seq);
> > +	snd_una = atomic64_read(&msk->snd_una);
> 
> An alternative to flushing the rtx_queue here would be to not add anything 
> to the rtx_queue when in fallback, and to do a one-time flush when going 
> into fallback.

I *think* we will end up doing the one-time flushing here, as fallback
may happen in the subflow ctx, where the msk socket lock is not held.

Yep, we can improve memory usage by avoiding the enqueue in the fallback
case.

Thanks!

Paolo
Davide Caratti May 19, 2020, 11:05 a.m. UTC | #3
hello Mat, thanks for looking at this.

On Mon, 2020-05-18 at 17:14 -0700, Mat Martineau wrote:
> On Mon, 18 May 2020, Davide Caratti wrote:
> 
> > currently, MPTCP resets the connection when the DSS option is not received
> > after a successful three-way handshake, while RFC 8684 §3.7 suggests a
> > fall-back to regular TCP when only the first subflow has been established.
> > Introduce support for "infinite mapping" to allow the connection to
> > continue without DSS in these cases.
> 
> If I'm understanding the intent of the changes, the idea in this patch is 
> to keep using the MPTCP segmentation & reassembly after fallback, but to 
> short-circuit most of it by (1) not writing the DSS option to the TCP 
> header on the tx side and (2) generating dummy mappings on the rx side.

on second thought, it would have been better to split this and the
support for infinite maps into two separate patches.

Initially I cleared "is_mptcp" on reception of an infinite map, but then
those "goto fallback" paths were creating so much trouble that the
approach was flipped to always use the msk, also for fallback sockets
(except the ones passing through subflow_ulp_fallback()).


> I'd also like to avoid overloading the term "infinite mapping", which has 
> specific meaning in the RFC: a DSS mapping with the data-level length set 
> to 0 that is sent under specific circumstances. If that specific DSS 
> mapping isn't present on the wire, maybe "dummy mapping" or "fake mapping" 
> is a useful term to use instead.

good point. Looking at the specification again, not only must the kernel
be able to process an incoming infinite map; it must also tentatively
emit an infinite map in case it detects a middlebox that filters/corrupts
the DSS sub-option.
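
On the tx side that would need something like the following (invented
helper, not part of this patch, and untested):

        /* sketch: per RFC 8684, a DSS mapping with data-level length 0
         * is the infinite mapping; emit it once before dropping the DSS
         * option from later segments
         */
        static void mptcp_write_infinite_map(struct mptcp_ext *mpext)
        {
                mpext->data_len = 0;
                mpext->use_map = 1;
                mpext->dsn64 = 1;
        }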

[...]
> > 	}
> > @@ -639,8 +645,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
> > 
> > 	data_len = mpext->data_len;
> > 	if (data_len == 0) {
> > -		pr_err("Infinite mapping not handled");
> > +		/* XXX TODO this should be allowed only if we didn't
> > +		 * receive a DACK yet. Paolo suggests to use a dedicated
> > +		 * flag in msk, but can't we use msk->can_ack? */
> 
> The RFC has a lot of requirements for allowing an infinite mapping, and I 
> don't think they align perfectly with what msk->can_ack means. A dedicated 
> "allow_fallback" flag could be set to 'false' when fallback is no longer 
> allowed: a subflow is added, out-of-order MPTCP mappings are encountered, 
> etc.

sure, and also talking with Paolo, "can_ack" didn't really fit, as it means
"can send an ack because we have both local and remote keys". I was in
favor of using subflow->conn_finished (and of removing all the XXX stuff
before posting to the ML :) ), but that doesn't fit well either.

> The way I read the RFC (and I could be misreading), there are cases where 
> an infinite mapping can be used after DATA_ACK if other conditions are 
> met (like the checksum error section in 3.7).

let's see if I can add a scenario for this (in addition to those at 
https://github.com/dcaratti/packetdrill/tree/fallback-after3whs )

> > 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
> > +		if (!subflow->local_id && !READ_ONCE(msk->pm.subflows))
> > +			return MAPPING_INFINITE;
> > 		return MAPPING_INVALID;
> > 	}
Mat Martineau May 19, 2020, 6:12 p.m. UTC | #4
On Tue, 19 May 2020, Davide Caratti wrote:

> hello Mat, thanks for looking at this.
>
> On Mon, 2020-05-18 at 17:14 -0700, Mat Martineau wrote:
>> On Mon, 18 May 2020, Davide Caratti wrote:
>>
>>> currently, MPTCP resets the connection when the DSS option is not received
>>> after a successful three-way handshake, while RFC 8684 §3.7 suggests a
>>> fall-back to regular TCP when only the first subflow has been established.
>>> Introduce support for "infinite mapping" to allow the connection to
>>> continue without DSS in these cases.
>>
>> If I'm understanding the intent of the changes, the idea in this patch is
>> to keep using the MPTCP segmentation & reassembly after fallback, but to
>> short-circuit most of it by (1) not writing the DSS option to the TCP
>> header on the tx side and (2) generating dummy mappings on the rx side.
>
> on second thought, it would have been better to split this and the
> support for infinite maps into two separate patches.
>

That would help!

> Initially I cleared "is_mptcp" on reception of an infinite map, but then
> those "goto fallback" paths were creating so much trouble that the
> approach was flipped to always use the msk, also for fallback sockets
> (except the ones passing through subflow_ulp_fallback()).
>

Not depending on is_mptcp for fallback behavior is good. Subflow sockets 
with is_mptcp == false do lead to some confusing situations.

>
>> I'd also like to avoid overloading the term "infinite mapping", which has
>> specific meaning in the RFC: a DSS mapping with the data-level length set
>> to 0 that is sent under specific circumstances. If that specific DSS
>> mapping isn't present on the wire, maybe "dummy mapping" or "fake mapping"
>> is a useful term to use instead.
>
> good point. Looking at the specification again, not only must the kernel
> be able to process an incoming infinite map; it must also tentatively
> emit an infinite map in case it detects a middlebox that filters/corrupts
> the DSS sub-option.
>
> [...]
>>> 	}
>>> @@ -639,8 +645,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
>>>
>>> 	data_len = mpext->data_len;
>>> 	if (data_len == 0) {
>>> -		pr_err("Infinite mapping not handled");
>>> +		/* XXX TODO this should be allowed only if we didn't
>>> +		 * receive a DACK yet. Paolo suggests to use a dedicated
>>> +		 * flag in msk, but can't we use msk->can_ack? */
>>
>> The RFC has a lot of requirements for allowing an infinite mapping, and I
>> don't think they align perfectly with what msk->can_ack means. A dedicated
>> "allow_fallback" flag could be set to 'false' when fallback is no longer
>> allowed: a subflow is added, out-of-order MPTCP mappings are encountered,
>> etc.
>
> sure, and also talking with Paolo, "can_ack" didn't really fit, as it means
> "can send an ack because we have both local and remote keys". I was in
> favor of using subflow->conn_finished (and of removing all the XXX stuff
> before posting to the ML :) ), but that doesn't fit well either.

It has been a challenge to keep track of what each flag means and how they 
all interact. A fallback state machine is a possible way to approach the 
problem, but I'm not sure how well that works with our locking 
arrangement. Hard to narrow down what to recommend here; the tradeoffs are 
complicated!
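
If we ever try the state machine route, one possible shape (purely
illustrative):

        enum mptcp_fallback_state {
                MPTCP_FB_POSSIBLE,      /* single subflow, no blocker seen yet */
                MPTCP_FB_FORBIDDEN,     /* e.g. 2nd subflow or OoO mapping seen */
                MPTCP_FB_DONE,          /* fallback happened, plain TCP now */
        };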


>
>> The way I read the RFC (and I could be misreading), there are cases where
>> an infinite mapping can be used after DATA_ACK if other conditions are
>> met (like the checksum error section in 3.7).
>
> let's see if I can add a scenario for this (in addition to those at
> https://github.com/dcaratti/packetdrill/tree/fallback-after3whs )

Great, thank you.

>
>>> 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
>>> +		if (!subflow->local_id && !READ_ONCE(msk->pm.subflows))
>>> +			return MAPPING_INFINITE;
>>> 		return MAPPING_INVALID;
>>> 	}

Thanks for sharing the patch!

--
Mat Martineau
Intel
Mat Martineau May 19, 2020, 6:28 p.m. UTC | #5
On Tue, 19 May 2020, Paolo Abeni wrote:

> On Mon, 2020-05-18 at 17:14 -0700, Mat Martineau wrote:
>> On Mon, 18 May 2020, Davide Caratti wrote:
>>
>>> currently, MPTCP resets the connection when the DSS option is not received
>>> after a successful three-way handshake, while RFC 8684 §3.7 suggests a
>>> fall-back to regular TCP when only the first subflow has been established.
>>> Introduce support for "infinite mapping" to allow the connection to
>>> continue without DSS in these cases.
>>
>> If I'm understanding the intent of the changes, the idea in this patch is
>> to keep using the MPTCP segmentation & reassembly after fallback, but to
>> short-circuit most of it by (1) not writing the DSS option to the TCP
>> header on the tx side and (2) generating dummy mappings on the rx side.
>
> The above has a few goals, which can't be reached with the current
> optimization for sendmsg/recvmsg on fallback:
>
> * support for fallback in scenarios we currently can't handle, e.g. on
> passive sockets we can't fall back due to an infinite mapping with the
> current infra, as passive sockets have 'msk->subflow == NULL'.
>
> * clean up the non-fallback code, removing most of the 'if (<fallback>)'
> checks in the critical path
>
> * better performance in the non-fallback case - e.g. we drop a lock pair
> in poll() and could likely drop the remaining one as well (this will
> also improve the fallback scenario).
>
> Moreover, this allows follow-up patch[es] to address other current
> shortcomings, e.g. setsockopt()/getsockopt() on passive sockets
> currently always return -EOPNOTSUPP as they rely on 'msk->subflow'.
>
> The rationale is that I'm pretty sure we should try to remove
> 'msk->subflow' usage completely ;)

Removing msk->subflow sounds good to me!

Thanks for explaining the goals for this approach to fallback.

>
>> That seems like a lot of overhead, but if optimizing sendmsg/recvmsg on
>> fallback is introducing problems, then we need to take that into
>> consideration. It's not clear to me what we gain by using MPTCP
>> segmentation & reassembly after the fallback is complete (other than
>> possibly working around bugs in the existing fallback code). If the tricky
>> part is completing the transmit or receive call where fallback happens
>> *during* the call, maybe the fallback technique used in this patch could
>> help complete an in-progress send/recv, but future calls would use
>> sock_sendmsg/sock_recvmsg?
>
> I would personally favour non-fallback scenarios over fallback ones.
> Keeping the current fallback 'shortcuts' badly affects the non-fallback
> ones due to e.g. poll() locking.

I agree, we don't want to add overhead to MPTCP connections in order to 
optimize fallback connections. The impact on polling and other operations 
wasn't clear to me before.

>
>> I'd also like to avoid overloading the term "infinite mapping", which has
>> specific meaning in the RFC: a DSS mapping with the data-level length set
>> to 0 that is sent under specific circumstances. If that specific DSS
>> mapping isn't present on the wire, maybe "dummy mapping" or "fake mapping"
>> is a useful term to use instead.
>
> Agreed! :)
>
>>> @@ -428,8 +432,15 @@ static void mptcp_clean_una(struct sock *sk)
>>> {
>>> 	struct mptcp_sock *msk = mptcp_sk(sk);
>>> 	struct mptcp_data_frag *dtmp, *dfrag;
>>> -	u64 snd_una = atomic64_read(&msk->snd_una);
>>> 	bool cleaned = false;
>>> +	u64 snd_una;
>>> +
>>> +	/* on fallback we just need to ignore snd_una, as this is really
>>> +	 * plain TCP
>>> +	 */
>>> +	if (msk->fallback)
>>> +		atomic64_set(&msk->snd_una, msk->write_seq);
>>> +	snd_una = atomic64_read(&msk->snd_una);
>>
>> An alternative to flushing the rtx_queue here would be to not add anything
>> to the rtx_queue when in fallback, and to do a one-time flush when going
>> into fallback.
>
> I *think* we will end up doing the one-time flushing here, as fallback
> may happen in the subflow ctx, where the msk socket lock is not held.
>
> Yep, we can improve memory usage by avoiding the enqueue in the fallback
> case.
>

Agreed, thanks!

--
Mat Martineau
Intel
diff mbox series

Patch

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index ece6f92cf7d11..361e605c32aa2 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -623,6 +623,9 @@  bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 
 	opts->suboptions = 0;
 
+	if (unlikely(mptcp_check_fallback(sk)))
+		return false;
+
 	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
 		ret = true;
 	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -713,7 +716,8 @@  static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
 	 */
 	if (!mp_opt->mp_capable) {
 		subflow->mp_capable = 0;
-		tcp_sk(sk)->is_mptcp = 0;
+		pr_fallback(msk);
+		msk->fallback = true;
 		return false;
 	}
 
@@ -813,6 +817,9 @@  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 	struct mptcp_options_received mp_opt;
 	struct mptcp_ext *mpext;
 
+	if (msk->fallback)
+		return;
+
 	mptcp_get_options(skb, &mp_opt);
 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
 		return;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e3a628bea2b81..44489ab95f115 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -52,11 +52,6 @@  static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
 	return msk->subflow;
 }
 
-static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
-{
-	return msk->first && !sk_is_mptcp(msk->first);
-}
-
 static struct socket *mptcp_is_tcpsk(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
@@ -94,7 +89,7 @@  static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
 	if (unlikely(sock))
 		return sock;
 
-	if (likely(!__mptcp_needs_tcp_fallback(msk)))
+	if (likely(!msk->fallback))
 		return NULL;
 
 	return msk->subflow;
@@ -212,6 +207,15 @@  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		if (!skb)
 			break;
 
+		if (msk->fallback) {
+			/* if we are running under the workqueue, TCP could have
+			 * collapsed skbs between dummy map creation and now
+			 * be sure to adjust the size
+			 */
+			map_remaining = skb->len;
+			subflow->map_data_len = skb->len;
+		}
+
 		offset = seq - TCP_SKB_CB(skb)->seq;
 		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
 		if (fin) {
@@ -428,8 +432,15 @@  static void mptcp_clean_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
-	u64 snd_una = atomic64_read(&msk->snd_una);
 	bool cleaned = false;
+	u64 snd_una;
+
+	/* on fallback we just need to ignore snd_una, as this is really
+	 * plain TCP
+	 */
+	if (msk->fallback)
+		atomic64_set(&msk->snd_una, msk->write_seq);
+	snd_una = atomic64_read(&msk->snd_una);
 
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
 		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
@@ -702,7 +713,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int mss_now = 0, size_goal = 0, ret = 0;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct page_frag *pfrag;
-	struct socket *ssock;
 	size_t copied = 0;
 	struct sock *ssk;
 	bool tx_ok;
@@ -721,15 +731,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			goto out;
 	}
 
-fallback:
-	ssock = __mptcp_tcp_fallback(msk);
-	if (unlikely(ssock)) {
-		release_sock(sk);
-		pr_debug("fallback passthrough");
-		ret = sock_sendmsg(ssock, msg);
-		return ret >= 0 ? ret + copied : (copied ? copied : ret);
-	}
-
 	pfrag = sk_page_frag(sk);
 restart:
 	mptcp_clean_una(sk);
@@ -781,17 +782,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			}
 			break;
 		}
-		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
-			/* Can happen for passive sockets:
-			 * 3WHS negotiated MPTCP, but first packet after is
-			 * plain TCP (e.g. due to middlebox filtering unknown
-			 * options).
-			 *
-			 * Fall back to TCP.
-			 */
-			release_sock(ssk);
-			goto fallback;
-		}
 
 		copied += ret;
 
@@ -934,7 +924,6 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *ssock;
 	int copied = 0;
 	int target;
 	long timeo;
@@ -943,16 +932,6 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return -EOPNOTSUPP;
 
 	lock_sock(sk);
-	ssock = __mptcp_tcp_fallback(msk);
-	if (unlikely(ssock)) {
-fallback:
-		release_sock(sk);
-		pr_debug("fallback-read subflow=%p",
-			 mptcp_subflow_ctx(ssock->sk));
-		copied = sock_recvmsg(ssock, msg, flags);
-		return copied;
-	}
-
 	timeo = sock_rcvtimeo(sk, nonblock);
 
 	len = min_t(size_t, len, INT_MAX);
@@ -1015,8 +994,6 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		pr_debug("block timeout %ld", timeo);
 		mptcp_wait_data(sk, &timeo);
-		if (unlikely(__mptcp_tcp_fallback(msk)))
-			goto fallback;
 	}
 
 	if (skb_queue_empty(&sk->sk_receive_queue)) {
@@ -1491,6 +1468,7 @@  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		 */
 		if (WARN_ON_ONCE(!new_mptcp_sock)) {
 			tcp_sk(newsk)->is_mptcp = 0;
+			pr_fallback(msk);
 			return newsk;
 		}
 
@@ -1512,6 +1490,8 @@  static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
 		local_bh_enable();
 	} else {
+		pr_fallback(msk);
+		msk->fallback = true;
 		MPTCP_INC_STATS(sock_net(sk),
 				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
 	}
@@ -1635,12 +1615,6 @@  void mptcp_finish_connect(struct sock *ssk)
 	sk = subflow->conn;
 	msk = mptcp_sk(sk);
 
-	if (!subflow->mp_capable) {
-		MPTCP_INC_STATS(sock_net(sk),
-				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
-		return;
-	}
-
 	pr_debug("msk=%p, token=%u", sk, subflow->token);
 
 	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -1916,21 +1890,9 @@  static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct mptcp_sock *msk;
-	struct socket *ssock;
 	__poll_t mask = 0;
 
 	msk = mptcp_sk(sk);
-	lock_sock(sk);
-	ssock = __mptcp_tcp_fallback(msk);
-	if (!ssock)
-		ssock = __mptcp_nmpc_socket(msk);
-	if (ssock) {
-		mask = ssock->ops->poll(file, ssock, wait);
-		release_sock(sk);
-		return mask;
-	}
-
-	release_sock(sk);
 	sock_poll_wait(file, sock, wait);
 	lock_sock(sk);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index f5adca93e8fb5..cf37bca9d1a5a 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -202,6 +202,7 @@  struct mptcp_sock {
 	u32		token;
 	unsigned long	flags;
 	bool		can_ack;
+	bool		fallback;
 	spinlock_t	join_list_lock;
 	struct work_struct work;
 	struct list_head conn_list;
@@ -340,6 +341,15 @@  mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
 	return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
 }
 
+
+static inline bool mptcp_check_fallback(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+	return msk->fallback;
+}
+
 int mptcp_is_enabled(struct net *net);
 bool mptcp_subflow_data_available(struct sock *sk);
 void mptcp_subflow_init(void);
@@ -459,4 +469,8 @@  static inline bool before64(__u64 seq1, __u64 seq2)
 
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
+#define pr_fallback(a) do { pr_debug("%s:fallback to TCP (msk=%p)",\
+				      __FUNCTION__, a); } while (0)
+
+
 #endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 0020d356233dd..dc64e8c78e36d 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -222,7 +222,6 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_options_received mp_opt;
 	struct sock *parent = subflow->conn;
-	struct tcp_sock *tp = tcp_sk(sk);
 
 	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
 
@@ -236,6 +235,8 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		return;
 
 	subflow->conn_finished = 1;
+	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
 
 	mptcp_get_options(skb, &mp_opt);
 	if (subflow->request_mptcp && mp_opt.mp_capable) {
@@ -251,21 +252,19 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
 			 subflow->thmac, subflow->remote_nonce);
 	} else if (subflow->request_mptcp) {
-		tp->is_mptcp = 0;
+		pr_fallback(mptcp_sk(subflow->conn));
+		mptcp_sk(subflow->conn)->fallback = true;
+		MPTCP_INC_STATS(sock_net(sk),
+				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
 	}
 
-	if (!tp->is_mptcp)
+	if (mptcp_sk(subflow->conn)->fallback)
 		return;
 
 	if (subflow->mp_capable) {
 		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
 			 subflow->remote_key);
 		mptcp_finish_connect(sk);
-
-		if (skb) {
-			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
-			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-		}
 	} else if (subflow->mp_join) {
 		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
 			 subflow, subflow->thmac,
@@ -281,9 +280,6 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 				      subflow->remote_nonce,
 				      subflow->hmac);
 
-		if (skb)
-			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-
 		if (!mptcp_finish_join(sk))
 			goto do_reset;
 
@@ -547,7 +543,8 @@  enum mapping_status {
 	MAPPING_OK,
 	MAPPING_INVALID,
 	MAPPING_EMPTY,
-	MAPPING_DATA_FIN
+	MAPPING_DATA_FIN,
+	MAPPING_INFINITE
 };
 
 static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
@@ -602,6 +599,7 @@  static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
 static enum mapping_status get_mapping_status(struct sock *ssk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
 	struct mptcp_ext *mpext;
 	struct sk_buff *skb;
 	u16 data_len;
@@ -611,6 +609,9 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 	if (!skb)
 		return MAPPING_EMPTY;
 
+	if (msk->fallback)
+		return MAPPING_INFINITE;
+
 	mpext = mptcp_get_ext(skb);
 	if (!mpext || !mpext->use_map) {
 		if (!subflow->map_valid && !skb->len) {
@@ -627,8 +628,13 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 			return MAPPING_EMPTY;
 		}
 
-		if (!subflow->map_valid)
+		if (!subflow->map_valid) {
+			if (!mpext && !subflow->local_id) {
+				pr_debug("no mpext and 0 local id");
+				return MAPPING_INFINITE;
+			}
 			return MAPPING_INVALID;
+		}
 
 		goto validate_seq;
 	}
@@ -639,8 +645,12 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 
 	data_len = mpext->data_len;
 	if (data_len == 0) {
-		pr_err("Infinite mapping not handled");
+		/* XXX TODO this should be allowed only if we didn't
+		 * receive a DACK yet. Paolo suggests to use a dedicated
+		 * flag in msk, but can't we use msk->can_ack? */
 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
+		if (!subflow->local_id && !READ_ONCE(msk->pm.subflows))
+			return MAPPING_INFINITE;
 		return MAPPING_INVALID;
 	}
 
@@ -752,6 +762,19 @@  static bool subflow_check_data_avail(struct sock *ssk)
 			ssk->sk_err = EBADMSG;
 			goto fatal;
 		}
+		if (status == MAPPING_INFINITE) {
+			if (!msk->fallback) {
+				pr_fallback(msk);
+				msk->fallback = true;
+			}
+			skb = skb_peek(&ssk->sk_receive_queue);
+			subflow->map_valid = 1;
+			subflow->map_seq = READ_ONCE(msk->ack_seq);
+			subflow->map_data_len = skb->len;
+			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
+						   subflow->ssn_offset;
+			return true;
+		}
 
 		if (status != MAPPING_OK)
 			return false;
@@ -875,14 +898,18 @@  static void subflow_data_ready(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct sock *parent = subflow->conn;
+	struct mptcp_sock *msk;
 
-	if (!subflow->mp_capable && !subflow->mp_join) {
-		subflow->tcp_data_ready(sk);
-
+	msk = mptcp_sk(parent);
+	if (inet_sk_state_load(sk) == TCP_LISTEN) {
+		set_bit(MPTCP_DATA_READY, &msk->flags);
 		parent->sk_data_ready(parent);
 		return;
 	}
 
+	WARN_ON_ONCE(!msk->fallback && !subflow->mp_capable &&
+		     !subflow->mp_join);
+
 	if (mptcp_subflow_data_available(sk))
 		mptcp_data_ready(parent, sk);
 }
@@ -1105,7 +1132,7 @@  static void subflow_state_change(struct sock *sk)
 	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
 	 * the data available machinery here.
 	 */
-	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
+	if (mptcp_subflow_data_available(sk))
 		mptcp_data_ready(parent, sk);
 
 	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&