diff mbox series

[net-next,08/12] mptcp: Use full MPTCP-level disconnect state machine

Message ID 20200728221210.92841-9-mathew.j.martineau@linux.intel.com
State Accepted, archived
Headers show
Series Exchange MPTCP DATA_FIN/DATA_ACK before TCP FIN | expand

Commit Message

Mat Martineau July 28, 2020, 10:12 p.m. UTC
RFC 8684 appendix D describes the connection state machine for
MPTCP. This patch implements the DATA_FIN / DATA_ACK exchanges and
MPTCP-level socket state changes described in that appendix, rather than
simply sending DATA_FIN along with TCP FIN when disconnecting subflows.

DATA_FIN is now sent and acknowledged before shutting down the
subflows. Received DATA_FIN information (if not part of a data packet)
is written to the MPTCP socket when the incoming DSS option is parsed by
the subflow, and the MPTCP worker is scheduled to process the
flag. DATA_FIN received as part of a full DSS mapping will be handled
when the mapping is processed.

The DATA_FIN is acknowledged by the worker if the reader is caught
up. If there is still data to be moved to the MPTCP-level queue, ack_seq
will be incremented to account for the DATA_FIN when it reaches the end
of the stream and a DATA_ACK will be sent to the peer.

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/mptcp/options.c  | 11 ++++++
 net/mptcp/protocol.c | 87 +++++++++++++++++++++++++++++++++++++-------
 net/mptcp/subflow.c  | 11 ++++--
 3 files changed, 92 insertions(+), 17 deletions(-)
diff mbox series

Patch

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 38583d1b9b5f..b4458ecd01f8 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -868,6 +868,17 @@  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 	if (mp_opt.use_ack)
 		update_una(msk, &mp_opt);
 
+	/* Zero-length packets, like bare ACKs carrying a DATA_FIN, are
+	 * dropped by the caller and not propagated to the MPTCP layer.
+	 * Copy the DATA_FIN information now.
+	 */
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+		if (mp_opt.data_fin && mp_opt.data_len == 1 &&
+		    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) &&
+		    schedule_work(&msk->work))
+			sock_hold(subflow->conn);
+	}
+
 	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
 	if (!mpext)
 		return;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index b3350830e14d..f264ea15e081 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -381,6 +381,15 @@  static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 	*bytes = moved;
 
+	/* If the moves have caught up with the DATA_FIN sequence number
+	 * it's time to ack the DATA_FIN and change socket state, but
+	 * this is not a good place to change state. Let the workqueue
+	 * do it.
+	 */
+	if (mptcp_pending_data_fin(sk, NULL) &&
+	    schedule_work(&msk->work))
+		sock_hold(sk);
+
 	return done;
 }
 
@@ -466,7 +475,8 @@  void mptcp_data_acked(struct sock *sk)
 {
 	mptcp_reset_timer(sk);
 
-	if (!sk_stream_is_writeable(sk) &&
+	if ((!sk_stream_is_writeable(sk) ||
+	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
 	    schedule_work(&mptcp_sk(sk)->work))
 		sock_hold(sk);
 }
@@ -1384,6 +1394,7 @@  static void mptcp_worker(struct work_struct *work)
 
 	lock_sock(sk);
 	mptcp_clean_una(sk);
+	mptcp_check_data_fin_ack(sk);
 	__mptcp_flush_join_list(msk);
 	__mptcp_move_skbs(msk);
 
@@ -1393,6 +1404,8 @@  static void mptcp_worker(struct work_struct *work)
 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
 
+	mptcp_check_data_fin(sk);
+
 	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
 		goto unlock;
 
@@ -1515,7 +1528,7 @@  static void mptcp_cancel_work(struct sock *sk)
 		sock_put(sk);
 }
 
-static void mptcp_subflow_shutdown(struct sock *ssk, int how)
+static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
 {
 	lock_sock(ssk);
 
@@ -1528,8 +1541,15 @@  static void mptcp_subflow_shutdown(struct sock *ssk, int how)
 		tcp_disconnect(ssk, O_NONBLOCK);
 		break;
 	default:
-		ssk->sk_shutdown |= how;
-		tcp_shutdown(ssk, how);
+		if (__mptcp_check_fallback(mptcp_sk(sk))) {
+			pr_debug("Fallback");
+			ssk->sk_shutdown |= how;
+			tcp_shutdown(ssk, how);
+		} else {
+			pr_debug("Sending DATA_FIN on subflow %p", ssk);
+			mptcp_set_timeout(sk, ssk);
+			tcp_send_ack(ssk);
+		}
 		break;
 	}
 
@@ -1570,9 +1590,35 @@  static void mptcp_close(struct sock *sk, long timeout)
 	LIST_HEAD(conn_list);
 
 	lock_sock(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		inet_sk_state_store(sk, TCP_CLOSE);
+		goto cleanup;
+	} else if (sk->sk_state == TCP_CLOSE) {
+		goto cleanup;
+	}
+
+	if (__mptcp_check_fallback(msk)) {
+		goto update_state;
+	} else if (mptcp_close_state(sk)) {
+		pr_debug("Sending DATA_FIN sk=%p", sk);
+		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+		WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+			mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
+		}
+	}
 
+	sk_stream_wait_close(sk, timeout);
+
+update_state:
 	inet_sk_state_store(sk, TCP_CLOSE);
 
+cleanup:
 	/* be sure to always acquire the join list lock, to sync vs
 	 * mptcp_finish_join().
 	 */
@@ -1581,8 +1627,6 @@  static void mptcp_close(struct sock *sk, long timeout)
 	spin_unlock_bh(&msk->join_list_lock);
 	list_splice_init(&msk->conn_list, &conn_list);
 
-	msk->snd_data_fin_enable = 1;
-
 	__mptcp_clear_xmit(sk);
 
 	release_sock(sk);
@@ -2265,11 +2309,8 @@  static int mptcp_shutdown(struct socket *sock, int how)
 	pr_debug("sk=%p, how=%d", msk, how);
 
 	lock_sock(sock->sk);
-	if (how == SHUT_WR || how == SHUT_RDWR)
-		inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
 
 	how++;
-
 	if ((how & ~SHUTDOWN_MASK) || !how) {
 		ret = -EINVAL;
 		goto out_unlock;
@@ -2283,13 +2324,31 @@  static int mptcp_shutdown(struct socket *sock, int how)
 			sock->state = SS_CONNECTED;
 	}
 
-	__mptcp_flush_join_list(msk);
-	msk->snd_data_fin_enable = 1;
+	/* If we've already sent a FIN, or it's a closed state, skip this. */
+	if (__mptcp_check_fallback(msk)) {
+		if (how == SHUT_WR || how == SHUT_RDWR)
+			inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
 
-		mptcp_subflow_shutdown(tcp_sk, how);
+			mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
+		}
+	} else if ((how & SEND_SHUTDOWN) &&
+		   ((1 << sock->sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+		     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
+		   mptcp_close_state(sock->sk)) {
+		__mptcp_flush_join_list(msk);
+
+		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+		WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+			mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
+		}
 	}
 
 	/* Wake up anyone sleeping in poll. */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index e645483d1200..7ab2a52ad150 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -598,7 +598,8 @@  static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
 	return true;
 }
 
-static enum mapping_status get_mapping_status(struct sock *ssk)
+static enum mapping_status get_mapping_status(struct sock *ssk,
+					      struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct mptcp_ext *mpext;
@@ -648,7 +649,8 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 
 	if (mpext->data_fin == 1) {
 		if (data_len == 1) {
-			pr_debug("DATA_FIN with no payload");
+			mptcp_update_rcv_data_fin(msk, mpext->data_seq);
+			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
 			if (subflow->map_valid) {
 				/* A DATA_FIN might arrive in a DSS
 				 * option before the previous mapping
@@ -660,6 +662,9 @@  static enum mapping_status get_mapping_status(struct sock *ssk)
 			} else {
 				return MAPPING_DATA_FIN;
 			}
+		} else {
+			mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
+			pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);
 		}
 
 		/* Adjust for DATA_FIN using 1 byte of sequence space */
@@ -748,7 +753,7 @@  static bool subflow_check_data_avail(struct sock *ssk)
 		u64 ack_seq;
 		u64 old_ack;
 
-		status = get_mapping_status(ssk);
+		status = get_mapping_status(ssk, msk);
 		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
 		if (status == MAPPING_INVALID) {
 			ssk->sk_err = EBADMSG;