diff mbox series

[mptcp-next,v2] net: mptcp: improve fallback to TCP

Message ID 20200605092227.2420308-1-dcaratti@redhat.com
State Accepted, archived
Delegated to: Matthieu Baerts
Headers show
Series [mptcp-next,v2] net: mptcp: improve fallback to TCP | expand

Commit Message

Davide Caratti June 5, 2020, 9:22 a.m. UTC
keep using MPTCP sockets and a "dummy mapping" in case of fallback to
regular TCP. Skip adding DSS option on send, if TCP fallback has been
done earlier.

Notes: I'm unsure on what to do in mptcp_clean_una() to do a one-time
flush of the retransmit queue, as per Mat's suggestion. Any advice?

Changes since v1
 - rebase on top of Paolo's fix for NULL dereference in mptcp_recvmsg()

Changes since RFC v2:
 - use a bit in msk->flags, rather than a dedicated boolean in struct
   msk. This bit is going to be used in combination with another one,
   TCP_FALLBACK_ALLOWED, that is 1 at the first subflow creation
   and gets cleared once TCP fallback is no more allowed.
 - separate code that adds support for "infinite mapping", and use
   the term "dummy" instead of "infinite". Suggested by Mat
 - remove inappropriate call to __mptcp_do_fallback() in
   mptcp_accept() (Paolo)

Changes since RFC v1:
 - use a dedicated member of struct msk to indicate that a fallback
   ha happened, use it in case of infinite mapping
 - don't delete skb_ext in case of infinite mapping (Mat)
 - test the value of pm.subflows on reception of an infinite map to
   ensure that no other subflow is currently opened (Mat)
 - in mptcp_established_options(), avoid adding TCP options in case
   of fallback indication; simplify sendmsg()/recvmsg()/poll() to
   keep using the MPTCP socket in case of TCP fallback. Set the
   fallback indication in case subflow is not mp_capable after
   successful 3-way handshake, instead of flipping 'is_mptcp'
   (Paolo/Mat)
 - remove deadcode in mptcp_finish_connect, and increment
   MPTCP_MIB_MPCAPABLEACTIVEFALLBACK in subflow_finish_connect
   (Paolo)

BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/11
BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/22
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/mptcp/options.c  |  9 ++++-
 net/mptcp/protocol.c | 78 ++++++++++----------------------------------
 net/mptcp/protocol.h | 34 +++++++++++++++++++
 net/mptcp/subflow.c  | 46 +++++++++++++++++---------
 4 files changed, 90 insertions(+), 77 deletions(-)

Comments

Paolo Abeni June 8, 2020, 10:16 a.m. UTC | #1
On Fri, 2020-06-05 at 11:22 +0200, Davide Caratti wrote:
> keep using MPTCP sockets and a "dummy mapping" in case of fallback to
> regular TCP. Skip adding DSS option on send, if TCP fallback has been
> done earlier.
> 
> Notes: I'm unsure on what to do in mptcp_clean_una() to do a one-time
> flush of the retransmit queue, as per Mat's suggestion. Any advice?
> 
> Changes since v1
>  - rebase on top of Paolo's fix for NULL dereference in mptcp_recvmsg()
> 
> Changes since RFC v2:
>  - use a bit in msk->flags, rather than a dedicated boolean in struct
>    msk. This bit is going to be used in combination with another one,
>    TCP_FALLBACK_ALLOWED, that is 1 at the first subflow creation
>    and gets cleared once TCP fallback is no more allowed.
>  - separate code that adds support for "infinite mapping", and use
>    the term "dummy" instead of "infinite". Suggested by Mat
>  - remove inappropriate call to __mptcp_do_fallback() in
>    mptcp_accept() (Paolo)
> 
> Changes since RFC v1:
>  - use a dedicated member of struct msk to indicate that a fallback
>    ha happened, use it in case of infinite mapping
>  - don't delete skb_ext in case of infinite mapping (Mat)
>  - test the value of pm.subflows on reception of an infinite map to
>    ensure that no other subflow is currently opened (Mat)
>  - in mptcp_established_options(), avoid adding TCP options in case
>    of fallback indication; simplify sendmsg()/recvmsg()/poll() to
>    keep using the MPTCP socket in case of TCP fallback. Set the
>    fallback indication in case subflow is not mp_capable after
>    successful 3-way handshake, instead of flipping 'is_mptcp'
>    (Paolo/Mat)
>  - remove deadcode in mptcp_finish_connect, and increment
>    MPTCP_MIB_MPCAPABLEACTIVEFALLBACK in subflow_finish_connect
>    (Paolo)
> 
> BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/11
> BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/22
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
>  net/mptcp/options.c  |  9 ++++-
>  net/mptcp/protocol.c | 78 ++++++++++----------------------------------
>  net/mptcp/protocol.h | 34 +++++++++++++++++++
>  net/mptcp/subflow.c  | 46 +++++++++++++++++---------
>  4 files changed, 90 insertions(+), 77 deletions(-)
> 
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 01f1f4cf4902..cf0b59ead1e4 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -624,6 +624,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
>  
>  	opts->suboptions = 0;
>  
> +	if (unlikely(mptcp_check_fallback(sk)))
> +		return false;
> +
>  	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
>  		ret = true;
>  	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
> @@ -714,7 +717,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
>  	 */
>  	if (!mp_opt->mp_capable) {
>  		subflow->mp_capable = 0;
> -		tcp_sk(sk)->is_mptcp = 0;
> +		pr_fallback(msk);
> +		__mptcp_do_fallback(msk);
>  		return false;
>  	}
>  
> @@ -814,6 +818,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
>  	struct mptcp_options_received mp_opt;
>  	struct mptcp_ext *mpext;
>  
> +	if (__mptcp_check_fallback(msk))
> +		return;
> +
>  	mptcp_get_options(skb, &mp_opt);
>  	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
>  		return;
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 5d7f4cd2c8e6..1d907519c223 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -52,11 +52,6 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
>  	return msk->subflow;
>  }
>  
> -static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
> -{
> -	return msk->first && !sk_is_mptcp(msk->first);
> -}
> -
>  static struct socket *mptcp_is_tcpsk(struct sock *sk)
>  {
>  	struct socket *sock = sk->sk_socket;
> @@ -94,7 +89,7 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
>  	if (unlikely(sock))
>  		return sock;
>  
> -	if (likely(!__mptcp_needs_tcp_fallback(msk)))
> +	if (likely(!__mptcp_check_fallback(msk)))
>  		return NULL;
>  
>  	return msk->subflow;
> @@ -229,6 +224,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
>  		if (!skb)
>  			break;
>  
> +		if (__mptcp_check_fallback(msk)) {
> +			/* if we are running under the workqueue, TCP could have
> +			 * collapsed skbs between dummy map creation and now
> +			 * be sure to adjust the size
> +			 */
> +			map_remaining = skb->len;
> +			subflow->map_data_len = skb->len;
> +		}
> +
>  		offset = seq - TCP_SKB_CB(skb)->seq;
>  		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
>  		if (fin) {
> @@ -445,8 +449,15 @@ static void mptcp_clean_una(struct sock *sk)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  	struct mptcp_data_frag *dtmp, *dfrag;
> -	u64 snd_una = atomic64_read(&msk->snd_una);
>  	bool cleaned = false;
> +	u64 snd_una;
> +
> +	/* on fallback we just need to ignore snd_una, as this is really
> +	 * plain TCP
> +	 */
> +	if (__mptcp_check_fallback(msk))
> +		atomic64_set(&msk->snd_una, msk->write_seq);
> +	snd_una = atomic64_read(&msk->snd_una);
>  
>  	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
>  		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
> @@ -719,7 +730,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  	int mss_now = 0, size_goal = 0, ret = 0;
>  	struct mptcp_sock *msk = mptcp_sk(sk);
>  	struct page_frag *pfrag;
> -	struct socket *ssock;
>  	size_t copied = 0;
>  	struct sock *ssk;
>  	bool tx_ok;
> @@ -738,15 +748,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  			goto out;
>  	}
>  
> -fallback:
> -	ssock = __mptcp_tcp_fallback(msk);
> -	if (unlikely(ssock)) {
> -		release_sock(sk);
> -		pr_debug("fallback passthrough");
> -		ret = sock_sendmsg(ssock, msg);
> -		return ret >= 0 ? ret + copied : (copied ? copied : ret);
> -	}
> -
>  	pfrag = sk_page_frag(sk);
>  restart:
>  	mptcp_clean_una(sk);
> @@ -798,17 +799,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  			}
>  			break;
>  		}
> -		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
> -			/* Can happen for passive sockets:
> -			 * 3WHS negotiated MPTCP, but first packet after is
> -			 * plain TCP (e.g. due to middlebox filtering unknown
> -			 * options).
> -			 *
> -			 * Fall back to TCP.
> -			 */
> -			release_sock(ssk);
> -			goto fallback;
> -		}
>  
>  		copied += ret;
>  
> @@ -951,7 +941,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  			 int nonblock, int flags, int *addr_len)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> -	struct socket *ssock;
>  	int copied = 0;
>  	int target;
>  	long timeo;
> @@ -960,16 +949,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  		return -EOPNOTSUPP;
>  
>  	lock_sock(sk);
> -	ssock = __mptcp_tcp_fallback(msk);
> -	if (unlikely(ssock)) {
> -fallback:
> -		release_sock(sk);
> -		pr_debug("fallback-read subflow=%p",
> -			 mptcp_subflow_ctx(ssock->sk));
> -		copied = sock_recvmsg(ssock, msg, flags);
> -		return copied;
> -	}
> -
>  	timeo = sock_rcvtimeo(sk, nonblock);
>  
>  	len = min_t(size_t, len, INT_MAX);
> @@ -1032,9 +1011,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  
>  		pr_debug("block timeout %ld", timeo);
>  		mptcp_wait_data(sk, &timeo);
> -		ssock = __mptcp_tcp_fallback(msk);
> -		if (unlikely(ssock))
> -			goto fallback;
>  	}
>  
>  	if (skb_queue_empty(&sk->sk_receive_queue)) {
> @@ -1659,12 +1635,6 @@ void mptcp_finish_connect(struct sock *ssk)
>  	sk = subflow->conn;
>  	msk = mptcp_sk(sk);
>  
> -	if (!subflow->mp_capable) {
> -		MPTCP_INC_STATS(sock_net(sk),
> -				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
> -		return;
> -	}
> -
>  	pr_debug("msk=%p, token=%u", sk, subflow->token);
>  
>  	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
> @@ -1964,21 +1934,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
>  {
>  	struct sock *sk = sock->sk;
>  	struct mptcp_sock *msk;
> -	struct socket *ssock;
>  	__poll_t mask = 0;
>  
>  	msk = mptcp_sk(sk);
> -	lock_sock(sk);
> -	ssock = __mptcp_tcp_fallback(msk);
> -	if (!ssock)
> -		ssock = __mptcp_nmpc_socket(msk);
> -	if (ssock) {
> -		mask = ssock->ops->poll(file, ssock, wait);
> -		release_sock(sk);
> -		return mask;
> -	}
> -
> -	release_sock(sk);
>  	sock_poll_wait(file, sock, wait);
>  	lock_sock(sk);
>  
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 0592fec3418b..2b39d5344d95 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -89,6 +89,7 @@
>  #define MPTCP_SEND_SPACE	1
>  #define MPTCP_WORK_RTX		2
>  #define MPTCP_WORK_EOF		3
> +#define MPTCP_FALLBACK_DONE	4
>  
>  struct mptcp_options_received {
>  	u64	sndr_key;
> @@ -461,4 +462,37 @@ static inline bool before64(__u64 seq1, __u64 seq2)
>  
>  void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
>  
> +static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
> +{
> +	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
> +}
> +
> +static inline bool mptcp_check_fallback(struct sock *sk)
> +{
> +	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
> +	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
> +
> +	return __mptcp_check_fallback(msk);
> +}
> +
> +static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
> +{
> +	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
> +		pr_debug("TCP fallback already done (msk=%p)", msk);
> +		return;
> +	}
> +	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
> +}
> +
> +static inline void mptcp_do_fallback(struct sock *sk)
> +{
> +	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
> +	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
> +
> +	__mptcp_do_fallback(msk);
> +}
> +
> +#define pr_fallback(a) do { pr_debug("%s:fallback to TCP (msk=%p)",\
> +				      __FUNCTION__, a); } while (0)
> +
>  #endif /* __MPTCP_PROTOCOL_H */
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 10b4770a1419..8245395f8354 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -223,7 +223,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
>  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
>  	struct mptcp_options_received mp_opt;
>  	struct sock *parent = subflow->conn;
> -	struct tcp_sock *tp = tcp_sk(sk);
>  
>  	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
>  
> @@ -237,6 +236,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
>  		return;
>  
>  	subflow->conn_finished = 1;
> +	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
> +	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
>  
>  	mptcp_get_options(skb, &mp_opt);
>  	if (subflow->request_mptcp && mp_opt.mp_capable) {
> @@ -252,21 +253,19 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
>  		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
>  			 subflow->thmac, subflow->remote_nonce);
>  	} else if (subflow->request_mptcp) {
> -		tp->is_mptcp = 0;
> +		mptcp_do_fallback(sk);
> +		pr_fallback(mptcp_sk(subflow->conn));
> +		MPTCP_INC_STATS(sock_net(sk),
> +				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
>  	}
>  
> -	if (!tp->is_mptcp)
> +	if (mptcp_check_fallback(sk))
>  		return;
>  
>  	if (subflow->mp_capable) {
>  		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
>  			 subflow->remote_key);
>  		mptcp_finish_connect(sk);
> -
> -		if (skb) {
> -			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
> -			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
> -		}
>  	} else if (subflow->mp_join) {
>  		u8 hmac[SHA256_DIGEST_SIZE];
>  
> @@ -286,9 +285,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
>  
>  		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
>  
> -		if (skb)
> -			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
> -
>  		if (!mptcp_finish_join(sk))
>  			goto do_reset;
>  
> @@ -564,7 +560,8 @@ enum mapping_status {
>  	MAPPING_OK,
>  	MAPPING_INVALID,
>  	MAPPING_EMPTY,
> -	MAPPING_DATA_FIN
> +	MAPPING_DATA_FIN,
> +	MAPPING_DUMMY
>  };
>  
>  static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
> @@ -628,6 +625,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
>  	if (!skb)
>  		return MAPPING_EMPTY;
>  
> +	if (mptcp_check_fallback(ssk))
> +		return MAPPING_DUMMY;
> +
>  	mpext = mptcp_get_ext(skb);
>  	if (!mpext || !mpext->use_map) {
>  		if (!subflow->map_valid && !skb->len) {
> @@ -769,6 +769,16 @@ static bool subflow_check_data_avail(struct sock *ssk)
>  			ssk->sk_err = EBADMSG;
>  			goto fatal;
>  		}
> +		if (status == MAPPING_DUMMY) {
> +			__mptcp_do_fallback(msk);
> +			skb = skb_peek(&ssk->sk_receive_queue);
> +			subflow->map_valid = 1;
> +			subflow->map_seq = READ_ONCE(msk->ack_seq);
> +			subflow->map_data_len = skb->len;
> +			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
> +						   subflow->ssn_offset;
> +			return true;
> +		}
>  
>  		if (status != MAPPING_OK)
>  			return false;
> @@ -892,14 +902,18 @@ static void subflow_data_ready(struct sock *sk)
>  {
>  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
>  	struct sock *parent = subflow->conn;
> +	struct mptcp_sock *msk;
>  
> -	if (!subflow->mp_capable && !subflow->mp_join) {
> -		subflow->tcp_data_ready(sk);
> -
> +	msk = mptcp_sk(parent);
> +	if (inet_sk_state_load(sk) == TCP_LISTEN) {
> +		set_bit(MPTCP_DATA_READY, &msk->flags);
>  		parent->sk_data_ready(parent);
>  		return;
>  	}
>  
> +	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
> +		     !subflow->mp_join);
> +
>  	if (mptcp_subflow_data_available(sk))
>  		mptcp_data_ready(parent, sk);
>  }
> @@ -1122,7 +1136,7 @@ static void subflow_state_change(struct sock *sk)
>  	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
>  	 * the data available machinery here.
>  	 */
> -	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
> +	if (mptcp_subflow_data_available(sk))
>  		mptcp_data_ready(parent, sk);
>  
>  	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&

The patch LGTM, modolo the follow-up to cope with issues/35

I propose to merge this as is, and handle fixes (and commit message
rewrite) with squash-to patches.

WDYT?

Thanks!

Paolo
Matthieu Baerts June 8, 2020, 10:21 a.m. UTC | #2
Hi Paolo, Davide,

On 08/06/2020 12:16, Paolo Abeni wrote:
> On Fri, 2020-06-05 at 11:22 +0200, Davide Caratti wrote:

(...)

> The patch LGTM, modolo the follow-up to cope with issues/35
> 
> I propose to merge this as is, and handle fixes (and commit message
> rewrite) with squash-to patches.
> 
> WDYT?

Sounds good to me! This will unblock Florian as well.

Cheers,
Matt
Matthieu Baerts June 8, 2020, 11:57 a.m. UTC | #3
Hi Davide, Paolo, Mat,

On 08/06/2020 12:16, Paolo Abeni wrote:
> On Fri, 2020-06-05 at 11:22 +0200, Davide Caratti wrote:
>> keep using MPTCP sockets and a "dummy mapping" in case of fallback to
>> regular TCP. Skip adding DSS option on send, if TCP fallback has been
>> done earlier.
>>
>> Notes: I'm unsure on what to do in mptcp_clean_una() to do a one-time
>> flush of the retransmit queue, as per Mat's suggestion. Any advice?
>>
>> Changes since v1
>>   - rebase on top of Paolo's fix for NULL dereference in mptcp_recvmsg()
>>
>> Changes since RFC v2:
>>   - use a bit in msk->flags, rather than a dedicated boolean in struct
>>     msk. This bit is going to be used in combination with another one,
>>     TCP_FALLBACK_ALLOWED, that is 1 at the first subflow creation
>>     and gets cleared once TCP fallback is no more allowed.
>>   - separate code that adds support for "infinite mapping", and use
>>     the term "dummy" instead of "infinite". Suggested by Mat
>>   - remove inappropriate call to __mptcp_do_fallback() in
>>     mptcp_accept() (Paolo)
>>
>> Changes since RFC v1:
>>   - use a dedicated member of struct msk to indicate that a fallback
>>     ha happened, use it in case of infinite mapping
>>   - don't delete skb_ext in case of infinite mapping (Mat)
>>   - test the value of pm.subflows on reception of an infinite map to
>>     ensure that no other subflow is currently opened (Mat)
>>   - in mptcp_established_options(), avoid adding TCP options in case
>>     of fallback indication; simplify sendmsg()/recvmsg()/poll() to
>>     keep using the MPTCP socket in case of TCP fallback. Set the
>>     fallback indication in case subflow is not mp_capable after
>>     successful 3-way handshake, instead of flipping 'is_mptcp'
>>     (Paolo/Mat)
>>   - remove deadcode in mptcp_finish_connect, and increment
>>     MPTCP_MIB_MPCAPABLEACTIVEFALLBACK in subflow_finish_connect
>>     (Paolo)
>>
>> BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/11
>> BugLink: https://github.com/multipath-tcp/mptcp_net-next/issues/22
>> Signed-off-by: Davide Caratti <dcaratti@redhat.com>

Thank you for the patch and the reviews!

This commit is now at the end of the export branch (before the two 
DO-NOT-MERGE ones), tests are all passing!

@Davide: do not hesitate to create a new PR for Packetdrill ;-)

Cheers,
Matt
diff mbox series

Patch

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 01f1f4cf4902..cf0b59ead1e4 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -624,6 +624,9 @@  bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 
 	opts->suboptions = 0;
 
+	if (unlikely(mptcp_check_fallback(sk)))
+		return false;
+
 	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
 		ret = true;
 	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -714,7 +717,8 @@  static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
 	 */
 	if (!mp_opt->mp_capable) {
 		subflow->mp_capable = 0;
-		tcp_sk(sk)->is_mptcp = 0;
+		pr_fallback(msk);
+		__mptcp_do_fallback(msk);
 		return false;
 	}
 
@@ -814,6 +818,9 @@  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 	struct mptcp_options_received mp_opt;
 	struct mptcp_ext *mpext;
 
+	if (__mptcp_check_fallback(msk))
+		return;
+
 	mptcp_get_options(skb, &mp_opt);
 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
 		return;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5d7f4cd2c8e6..1d907519c223 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -52,11 +52,6 @@  static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
 	return msk->subflow;
 }
 
-static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
-{
-	return msk->first && !sk_is_mptcp(msk->first);
-}
-
 static struct socket *mptcp_is_tcpsk(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
@@ -94,7 +89,7 @@  static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
 	if (unlikely(sock))
 		return sock;
 
-	if (likely(!__mptcp_needs_tcp_fallback(msk)))
+	if (likely(!__mptcp_check_fallback(msk)))
 		return NULL;
 
 	return msk->subflow;
@@ -229,6 +224,15 @@  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		if (!skb)
 			break;
 
+		if (__mptcp_check_fallback(msk)) {
+			/* if we are running under the workqueue, TCP could have
+			 * collapsed skbs between dummy map creation and now
+			 * be sure to adjust the size
+			 */
+			map_remaining = skb->len;
+			subflow->map_data_len = skb->len;
+		}
+
 		offset = seq - TCP_SKB_CB(skb)->seq;
 		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
 		if (fin) {
@@ -445,8 +449,15 @@  static void mptcp_clean_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
-	u64 snd_una = atomic64_read(&msk->snd_una);
 	bool cleaned = false;
+	u64 snd_una;
+
+	/* on fallback we just need to ignore snd_una, as this is really
+	 * plain TCP
+	 */
+	if (__mptcp_check_fallback(msk))
+		atomic64_set(&msk->snd_una, msk->write_seq);
+	snd_una = atomic64_read(&msk->snd_una);
 
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
 		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
@@ -719,7 +730,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int mss_now = 0, size_goal = 0, ret = 0;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct page_frag *pfrag;
-	struct socket *ssock;
 	size_t copied = 0;
 	struct sock *ssk;
 	bool tx_ok;
@@ -738,15 +748,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			goto out;
 	}
 
-fallback:
-	ssock = __mptcp_tcp_fallback(msk);
-	if (unlikely(ssock)) {
-		release_sock(sk);
-		pr_debug("fallback passthrough");
-		ret = sock_sendmsg(ssock, msg);
-		return ret >= 0 ? ret + copied : (copied ? copied : ret);
-	}
-
 	pfrag = sk_page_frag(sk);
 restart:
 	mptcp_clean_una(sk);
@@ -798,17 +799,6 @@  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			}
 			break;
 		}
-		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
-			/* Can happen for passive sockets:
-			 * 3WHS negotiated MPTCP, but first packet after is
-			 * plain TCP (e.g. due to middlebox filtering unknown
-			 * options).
-			 *
-			 * Fall back to TCP.
-			 */
-			release_sock(ssk);
-			goto fallback;
-		}
 
 		copied += ret;
 
@@ -951,7 +941,6 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *ssock;
 	int copied = 0;
 	int target;
 	long timeo;
@@ -960,16 +949,6 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return -EOPNOTSUPP;
 
 	lock_sock(sk);
-	ssock = __mptcp_tcp_fallback(msk);
-	if (unlikely(ssock)) {
-fallback:
-		release_sock(sk);
-		pr_debug("fallback-read subflow=%p",
-			 mptcp_subflow_ctx(ssock->sk));
-		copied = sock_recvmsg(ssock, msg, flags);
-		return copied;
-	}
-
 	timeo = sock_rcvtimeo(sk, nonblock);
 
 	len = min_t(size_t, len, INT_MAX);
@@ -1032,9 +1011,6 @@  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		pr_debug("block timeout %ld", timeo);
 		mptcp_wait_data(sk, &timeo);
-		ssock = __mptcp_tcp_fallback(msk);
-		if (unlikely(ssock))
-			goto fallback;
 	}
 
 	if (skb_queue_empty(&sk->sk_receive_queue)) {
@@ -1659,12 +1635,6 @@  void mptcp_finish_connect(struct sock *ssk)
 	sk = subflow->conn;
 	msk = mptcp_sk(sk);
 
-	if (!subflow->mp_capable) {
-		MPTCP_INC_STATS(sock_net(sk),
-				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
-		return;
-	}
-
 	pr_debug("msk=%p, token=%u", sk, subflow->token);
 
 	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -1964,21 +1934,9 @@  static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	struct mptcp_sock *msk;
-	struct socket *ssock;
 	__poll_t mask = 0;
 
 	msk = mptcp_sk(sk);
-	lock_sock(sk);
-	ssock = __mptcp_tcp_fallback(msk);
-	if (!ssock)
-		ssock = __mptcp_nmpc_socket(msk);
-	if (ssock) {
-		mask = ssock->ops->poll(file, ssock, wait);
-		release_sock(sk);
-		return mask;
-	}
-
-	release_sock(sk);
 	sock_poll_wait(file, sock, wait);
 	lock_sock(sk);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 0592fec3418b..2b39d5344d95 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -89,6 +89,7 @@ 
 #define MPTCP_SEND_SPACE	1
 #define MPTCP_WORK_RTX		2
 #define MPTCP_WORK_EOF		3
+#define MPTCP_FALLBACK_DONE	4
 
 struct mptcp_options_received {
 	u64	sndr_key;
@@ -461,4 +462,37 @@  static inline bool before64(__u64 seq1, __u64 seq2)
 
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
+static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+{
+	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline bool mptcp_check_fallback(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+	return __mptcp_check_fallback(msk);
+}
+
+static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
+{
+	if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
+		pr_debug("TCP fallback already done (msk=%p)", msk);
+		return;
+	}
+	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline void mptcp_do_fallback(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+	__mptcp_do_fallback(msk);
+}
+
+#define pr_fallback(a) do { pr_debug("%s:fallback to TCP (msk=%p)",\
+				      __FUNCTION__, a); } while (0)
+
 #endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 10b4770a1419..8245395f8354 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -223,7 +223,6 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_options_received mp_opt;
 	struct sock *parent = subflow->conn;
-	struct tcp_sock *tp = tcp_sk(sk);
 
 	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
 
@@ -237,6 +236,8 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		return;
 
 	subflow->conn_finished = 1;
+	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
 
 	mptcp_get_options(skb, &mp_opt);
 	if (subflow->request_mptcp && mp_opt.mp_capable) {
@@ -252,21 +253,19 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
 			 subflow->thmac, subflow->remote_nonce);
 	} else if (subflow->request_mptcp) {
-		tp->is_mptcp = 0;
+		mptcp_do_fallback(sk);
+		pr_fallback(mptcp_sk(subflow->conn));
+		MPTCP_INC_STATS(sock_net(sk),
+				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
 	}
 
-	if (!tp->is_mptcp)
+	if (mptcp_check_fallback(sk))
 		return;
 
 	if (subflow->mp_capable) {
 		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
 			 subflow->remote_key);
 		mptcp_finish_connect(sk);
-
-		if (skb) {
-			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
-			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-		}
 	} else if (subflow->mp_join) {
 		u8 hmac[SHA256_DIGEST_SIZE];
 
@@ -286,9 +285,6 @@  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 
 		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
 
-		if (skb)
-			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-
 		if (!mptcp_finish_join(sk))
 			goto do_reset;
 
@@ -564,7 +560,8 @@  enum mapping_status {
 	MAPPING_OK,
 	MAPPING_INVALID,
 	MAPPING_EMPTY,
-	MAPPING_DATA_FIN
+	MAPPING_DATA_FIN,
+	MAPPING_DUMMY
 };
 
 static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
@@ -628,6 +625,9 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 	if (!skb)
 		return MAPPING_EMPTY;
 
+	if (mptcp_check_fallback(ssk))
+		return MAPPING_DUMMY;
+
 	mpext = mptcp_get_ext(skb);
 	if (!mpext || !mpext->use_map) {
 		if (!subflow->map_valid && !skb->len) {
@@ -769,6 +769,16 @@  static bool subflow_check_data_avail(struct sock *ssk)
 			ssk->sk_err = EBADMSG;
 			goto fatal;
 		}
+		if (status == MAPPING_DUMMY) {
+			__mptcp_do_fallback(msk);
+			skb = skb_peek(&ssk->sk_receive_queue);
+			subflow->map_valid = 1;
+			subflow->map_seq = READ_ONCE(msk->ack_seq);
+			subflow->map_data_len = skb->len;
+			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
+						   subflow->ssn_offset;
+			return true;
+		}
 
 		if (status != MAPPING_OK)
 			return false;
@@ -892,14 +902,18 @@  static void subflow_data_ready(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct sock *parent = subflow->conn;
+	struct mptcp_sock *msk;
 
-	if (!subflow->mp_capable && !subflow->mp_join) {
-		subflow->tcp_data_ready(sk);
-
+	msk = mptcp_sk(parent);
+	if (inet_sk_state_load(sk) == TCP_LISTEN) {
+		set_bit(MPTCP_DATA_READY, &msk->flags);
 		parent->sk_data_ready(parent);
 		return;
 	}
 
+	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
+		     !subflow->mp_join);
+
 	if (mptcp_subflow_data_available(sk))
 		mptcp_data_ready(parent, sk);
 }
@@ -1122,7 +1136,7 @@  static void subflow_state_change(struct sock *sk)
 	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
 	 * the data available machinery here.
 	 */
-	if (subflow->mp_capable && mptcp_subflow_data_available(sk))
+	if (mptcp_subflow_data_available(sk))
 		mptcp_data_ready(parent, sk);
 
 	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&